# # Random grammar generation

# Creates a scale-free random grammar with specified parameters

We start by obtaining the parameters of the grammar

In [1]:
# Sliders for grammar parameters
from ipywidgets import interact, interactive, fixed, interact_manual

def GetParams(num_words, num_classes, num_class_connectors, connectors_limit):
    return (num_words, num_classes, num_class_connectors, connectors_limit)
grammar_params = interactive(GetParams, 
                             num_words = (10, 50), # vocabulary size
                             num_classes = (3, 10), # number of grammar classes
                             num_class_connectors = (5, 40), # connectors between grammar classes (directed)
                             connectors_limit = (1, 5)) # max connectors a word can have
display(grammar_params)

interactive(children=(IntSlider(value=30, description='num_words', max=50, min=10), IntSlider(value=6, descrip…

In [2]:
# Set Grammar parameters
num_words = grammar_params.result[0]
num_classes = grammar_params.result[1]
num_class_connectors = grammar_params.result[2]
connectors_limit = grammar_params.result[3]

In [80]:
# Populate grammar classes following a Zipf distribution
from fractions import Fraction
import numpy as np

harmonic_number = sum(Fraction(1, d) for d in range(1, num_classes + 1))

zipf_fracs = [1 / x / harmonic_number for x in range(1, num_classes + 1)]

words_per_class = np.array(np.round(np.array(zipf_fracs) * num_words), dtype = "int")
cumul_words = np.cumsum(words_per_class) # boundaries for class words

[12  6  4  3]
[12 18 22 25]


In [74]:
# Plot vocab fraction vs rank for grammar classes
from plotly.offline import init_notebook_mode, iplot
from plotly.graph_objs import *

init_notebook_mode(connected=True)         # initiate notebook for offline plot

class_freqs = Scatter(
    x = np.array(range(1, num_classes + 1)),
    y = words_per_class / num_words
)
Zipf_law = Scatter(
    x = np.array(range(1, num_classes + 1)),
    y = zipf_fracs
)
data = [class_freqs, Zipf_law]
iplot(data)

In [75]:
# Create random connectors between grammar classes
import random as rand
connectors = set()
for i in range(0, num_class_connectors - 1):
    randint1 = rand.randint(0, num_classes - 1)
    randint2 = rand.randint(0, num_classes - 1)
    if randint1 != randint2: # avoid classes connecting to themselves; may cause less connections than param
        connectors.add((randint1, randint2))

#connectors = set([(rand.randint(0, num_classes - 1), rand.randint(0, num_classes - 1)) for i in range(0, num_class_connectors)])
print(connectors)

{(3, 0), (3, 2), (1, 3)}


In [76]:
# Assign connectors to classes
# Translate connectors into connector labels
connectors_dict = {k:[] for k in range(num_classes)}
connectors_dict_text = {k:[] for k in range(num_classes)}
for connector in connectors:
    connector_text = "C" + str(connector[0]) + "_" + str(connector[1])
    connectors_dict[connector[0]].append(connector)
    connectors_dict[connector[1]].append(connector)
    connectors_dict_text[connector[0]].append(connector_text + "+")
    connectors_dict_text[connector[1]].append(connector_text + "-")
    
print(connectors_dict_text)
print(connectors_dict)

{0: ['C3_0-'], 1: ['C1_3+'], 2: ['C3_2-'], 3: ['C3_0+', 'C3_2+', 'C1_3-']}
{0: [(3, 0)], 1: [(1, 3)], 2: [(3, 2)], 3: [(3, 0), (3, 2), (1, 3)]}


In [77]:
# Build the disjuncts randomly, with some directives
dict_disjuncts = {}
for gramm_class, connects in connectors_dict.items():
     # don't conjunct more connectors than available ones, nor than limit
    max_connectors = min(connectors_limit, len(connects))
    disjuncts = []
    
    for connector in connects: # create one conjunct per connector; arbitrary choice
        num_connectors = rand.randint(1, max_connectors) # determine how many connectors for this conjunct
        conjunct = [connector] # current connector always goes in conjunct
        
        diff_connects = connects[:] # make independent copy
        diff_connects.remove(connector) # don't repeat connector in a conjunct
        conjunct.extend(rand.sample(diff_connects, num_connectors - 1)) # add random connectors to conjunct; no repeats
            
        disjuncts.append(conjunct)
        
    dict_disjuncts[gramm_class] = set(tuple(d) for d in disjuncts) # set eliminates duplicate disjuncts

print(dict_disjuncts)

{0: {((3, 0),)}, 1: {((1, 3),)}, 2: {((3, 2),)}, 3: {((3, 2),), ((1, 3),), ((3, 0), (3, 2))}}


In [96]:
# Translate grammar to dictionary format
grammar_text = f"""
% RANDOM GRAMMAR with parameters:
% num_words = {num_words}
% num_classes = {num_classes}
% num_class_connectors = {num_class_connectors}
% connectors_limit = {connectors_limit}
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
"""

for curr_class, disjunct in dict_disjuncts.items():
    paragraph = f"% Class: {curr_class}\n"
    # Calculate initial word_id for class
    if curr_class == 0:
        lower_id = 0
    else:
        lower_id = cumul_words[curr_class - 1]
        
    class_words = [f'"W{i}_{curr_class}"' for i in range(lower_id, cumul_words[curr_class])]
    paragraph += ", ".join(class_words) + ":\n"
    
print(paragraph)
print(grammar_text)

% Class: 3
"W22_3", "W23_3", "W24_3":


% RANDOM GRAMMAR with parameters:
% num_words = 25
% num_classes = 4
% num_class_connectors = 8
% connectors_limit = 2
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%



# As grammar is ready, let's generate corpus from it

In [37]:
# Corpus parameters
num_sentences = 10

In [16]:

seed_words = [rand.randint(0, num_words - 1) for i in range(0, num_sentences - 1)]

np.digitize(seed_words, cumul_words)

array([0, 3, 1, 0, 1, 2, 3, 5, 0])

In [13]:
for counter in range(0, num_sentences - 1):
    # Choose random word as sentence seed
    seed_id = rand.randint(0, num_words - 1)
    seed_class = 

SyntaxError: invalid syntax (<ipython-input-13-70bdf8776fdf>, line 5)

In [17]:
# Choose class of seed words (this brings a Zipfian dist of words??)
class_seeds = [rand.randint(0, num_classes - 1) for i in range(0, num_sentences - 1)]
for i in range(0, num_sentences - 1):
    seed_class = rand.randint(0, num_classes - 1)
    curr_conjunct = rand.sample(disjuncts[seed_class], 1)[0]
    