# Random grammar generation

# Creates a scale-free random grammar with specified parameters

We start by obtaining the parameters of the grammar

In [14]:
# Sliders for grammar parameters
from ipywidgets import interact, interactive, fixed, interact_manual

def GetParams(num_words, num_classes, num_class_connectors, connectors_limit):
    return (num_words, num_classes, num_class_connectors, connectors_limit)
grammar_params = interactive(GetParams, 
                             num_words = (15, 34), # vocabulary size
                             num_classes = (3, 5), # number of grammar classes
                             num_class_connectors = (5, 11), # connectors between grammar classes (directed)
                             connectors_limit = (1, 3)) # max connectors a word can have
display(grammar_params)

interactive(children=(IntSlider(value=24, description='num_words', max=34, min=15), IntSlider(value=4, descrip…

In [15]:
# Set Grammar parameters
num_words = grammar_params.result[0]
num_classes = grammar_params.result[1]
num_class_connectors = grammar_params.result[2]
connectors_limit = grammar_params.result[3]

In [16]:
# Populate grammar classes following a Zipf distribution
from fractions import Fraction
import numpy as np

harmonic_number = sum(Fraction(1, d) for d in range(1, num_classes + 1))

zipf_fracs = [1 / x / harmonic_number for x in range(1, num_classes + 1)]

words_per_class = np.array(np.round(np.array(zipf_fracs) * num_words), dtype = "int")
cumul_words = np.cumsum(words_per_class) # boundaries for class words

In [17]:
# Plot vocab fraction vs rank for grammar classes
from plotly.offline import init_notebook_mode, iplot
from plotly.graph_objs import *

init_notebook_mode(connected=True)         # initiate notebook for offline plot

class_freqs = Scatter(
    x = np.array(range(1, num_classes + 1)),
    y = words_per_class / num_words
)
Zipf_law = Scatter(
    x = np.array(range(1, num_classes + 1)),
    y = zipf_fracs
)
data = [class_freqs, Zipf_law]
iplot(data)

In [18]:
# Create random connectors between grammar classes
import random as rand
connectors = set()
for i in range(0, num_class_connectors - 1):
    randint1 = rand.randint(0, num_classes - 1)
    randint2 = rand.randint(0, num_classes - 1)
    if randint1 != randint2: # avoid classes connecting to themselves; may cause less connections than param
        connectors.add((randint1, randint2))

#connectors = set([(rand.randint(0, num_classes - 1), rand.randint(0, num_classes - 1)) for i in range(0, num_class_connectors)])
print(connectors)

{(1, 2), (3, 0), (1, 0), (3, 1), (0, 3), (0, 2)}


In [19]:
# Assign connectors to classes
# Translate connectors into connector labels
connectors_dict = {k:[] for k in range(num_classes)}
connectors_dict_text = {k:[] for k in range(num_classes)}
for connector in connectors:
    connector_text = "C" + str(connector[0]) + "_" + str(connector[1])
    connectors_dict[connector[0]].append(connector)
    connectors_dict[connector[1]].append(connector)
    connectors_dict_text[connector[0]].append(connector_text + "+")
    connectors_dict_text[connector[1]].append(connector_text + "-")
    
print(connectors_dict_text)
print(connectors_dict)

{0: ['C3_0-', 'C1_0-', 'C0_3+', 'C0_2+'], 1: ['C1_2+', 'C1_0+', 'C3_1-'], 2: ['C1_2-', 'C0_2-'], 3: ['C3_0+', 'C3_1+', 'C0_3-']}
{0: [(3, 0), (1, 0), (0, 3), (0, 2)], 1: [(1, 2), (1, 0), (3, 1)], 2: [(1, 2), (0, 2)], 3: [(3, 0), (3, 1), (0, 3)]}


In [20]:
# Build the disjuncts randomly, with some directives
dict_disjuncts = {}
for gramm_class, connects in connectors_dict.items():
     # don't conjunct more connectors than available ones, nor than limit
    max_connectors = min(connectors_limit, len(connects))
    disjuncts = []
    
    for connector in connects: # create one conjunct per connector; arbitrary choice
        num_connectors = rand.randint(1, max_connectors) # determine how many connectors for this conjunct
        conjunct = [connector] # current connector always goes in conjunct
        
        diff_connects = connects[:] # make independent copy
        diff_connects.remove(connector) # don't repeat connector in a conjunct
        conjunct.extend(rand.sample(diff_connects, num_connectors - 1)) # add random connectors to conjunct; no repeats
            
        disjuncts.append(conjunct)
        
    dict_disjuncts[gramm_class] = set(tuple(d) for d in disjuncts) # set eliminates duplicate disjuncts

print(dict_disjuncts)

{0: {((0, 2),), ((1, 0), (0, 2)), ((3, 0),), ((0, 3), (1, 0))}, 1: {((1, 0), (1, 2)), ((3, 1), (1, 0)), ((1, 2), (1, 0))}, 2: {((1, 2),), ((0, 2), (1, 2))}, 3: {((0, 3),), ((3, 0),), ((3, 1),)}}


In [36]:
# Translate grammar to dictionary format
grammar_text = f"""
% RANDOM GRAMMAR with parameters:
% num_words = {num_words}
% num_classes = {num_classes}
% num_class_connectors = {num_class_connectors}
% connectors_limit = {connectors_limit}
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%

"""

for curr_class, disjunct in dict_disjuncts.items():
    class_entry = f"% Class: {curr_class}\n" # Description of current disjunct
    # Calculate initial word_id for class
    if curr_class == 0:
        lower_id = 0
    else:
        lower_id = cumul_words[curr_class - 1]
        
    # Add word list to class_entry
    class_words = [f'"W{i}_{curr_class}"' for i in range(lower_id, cumul_words[curr_class])]
    class_entry += ", ".join(class_words) + ":\n"
    
    # Add every conjunct to disjunct
    curr_disjunct = []
    for conjunct in disjunct:
        curr_conjunct = []
        for connector in conjunct:
            connector_text = "C" + str(connector[0]) + "_" + str(connector[1])
            sign = "+" if connector[0] == curr_class else "-" # choose connector sign
            curr_conjunct.append(connector_text + sign)
        curr_disjunct.append(" & ".join(curr_conjunct))
        
    class_entry += "(" + ") or (".join(curr_disjunct) + ");\n\n"
    grammar_text += class_entry
    
print(grammar_text)


% RANDOM GRAMMAR with parameters:
% num_words = 24
% num_classes = 4
% num_class_connectors = 8
% connectors_limit = 2
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%

% Class: 0
"W0_0", "W1_0", "W2_0", "W3_0", "W4_0", "W5_0", "W6_0", "W7_0", "W8_0", "W9_0", "W10_0", "W11_0":
(C0_2+) or (C1_0- & C0_2+) or (C3_0-) or (C0_3+ & C1_0-);

% Class: 1
"W12_1", "W13_1", "W14_1", "W15_1", "W16_1", "W17_1":
(C1_0+ & C1_2+) or (C3_1- & C1_0+) or (C1_2+ & C1_0+);

% Class: 2
"W18_2", "W19_2", "W20_2", "W21_2":
(C1_2-) or (C0_2- & C1_2-);

% Class: 3
"W22_3", "W23_3", "W24_3":
(C0_3-) or (C3_0+) or (C3_1+);




In [29]:
for i in {3,4,5}:
    print(i)

3
4
5


# As grammar is ready, let's generate corpus from it

In [9]:
# Corpus parameters
num_sentences = 10

In [10]:

seed_words = [rand.randint(0, num_words - 1) for i in range(0, num_sentences - 1)]

np.digitize(seed_words, cumul_words)

array([4, 0, 0, 5, 2, 0, 1, 4, 1])

In [11]:
for counter in range(0, num_sentences - 1):
    # Choose random word as sentence seed
    seed_id = rand.randint(0, num_words - 1)
    seed_class = 

SyntaxError: invalid syntax (<ipython-input-11-970184b3c07e>, line 4)

In [None]:
# Choose class of seed words (this brings a Zipfian dist of words??)
class_seeds = [rand.randint(0, num_classes - 1) for i in range(0, num_sentences - 1)]
for i in range(0, num_sentences - 1):
    seed_class = rand.randint(0, num_classes - 1)
    curr_conjunct = rand.sample(disjuncts[seed_class], 1)[0]
    