### Simple Sentence Generation
This notebook demonstrates how to pair an RNN with rule-based sentence extraction from a corpus in order to train a generative model that produces simple sentences. Such a model can be used, e.g., to generate simple practice sentences for language-learning applications. Here the generative model mainly serves as a convenient way to store and reproduce the prototypes identified by the corpus-based approach, while retaining the ability to apply minor perturbations to the generated text without hard-coding exhaustive grammatical rules.

In [25]:
import ast
import csv
import os
import pickle
import time
from enum import Enum

import en_core_web_sm
import nltk
import numpy as np
import pandas as pd
import spacy
import tensorflow as tf
from nltk.corpus import wordnet as wn
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from spacy.matcher import Matcher
from tensorflow.keras.layers.experimental import preprocessing
from tqdm.notebook import tqdm

In [2]:
# Shared spaCy pipeline; every Matcher below is built on its vocabulary.
nlp = en_core_web_sm.load()

# Passive voice: a passive nominal subject directly followed by a passive auxiliary.
# NOTE(review): passive_matcher is not referenced by any other cell visible here.
passive_rule = [
    {'DEP': 'nsubjpass'},
    {'DEP': 'auxpass'},
]
passive_matcher = Matcher(nlp.vocab)
passive_matcher.add('Passive', [passive_rule])

# Simple clause: optional determiners/adjectives, subject, root, optional
# determiners/direct objects, then a preposition followed by its object.
clause_rule = [
    {'DEP': 'det', 'OP': '*'},
    {'DEP': 'amod', 'OP': '*'},
    {'DEP': 'nsubj'},
    {'DEP': 'ROOT'},
    {'DEP': 'det', 'OP': '*'},
    {'DEP': 'dobj', 'OP': '*'},
    {'DEP': 'prep'},
    {'DEP': 'pobj'},
]
clause_matcher = Matcher(nlp.vocab)
clause_matcher.add('Clause', [clause_rule])

# Prepositional phrase: preposition, optional determiners/adjectives, object.
pp_rule = [
    {'DEP': 'prep'},
    {'DEP': 'det', 'OP': '*'},
    {'DEP': 'amod', 'OP': '*'},
    {'DEP': 'pobj'},
]
pp_matcher = Matcher(nlp.vocab)
pp_matcher.add('PrepPhrase', [pp_rule])

In [None]:
# load data
# Verb table: one row per verb; the Verb class reads its conjugation columns.
df = pd.read_csv('res/verb_table.csv')
# Subtitle corpus scanned by get_examples(). The default is the original
# machine-specific location; set the SUBTITLES_CORPUS environment variable to
# point at the corpus on another machine instead of editing this cell.
corpus_filename = os.environ.get(
    'SUBTITLES_CORPUS',
    'D:\\archive\\20200625\\en-es.txt\\subtitles.en')

In [3]:
# Class definitions.
class Tense(Enum):
    """Verb tenses for which the Verb class stores conjugated forms."""
    SIMPLEPRES = 0
    SIMPLEPAST = 1
    PRESPERFECT = 2
    PASTPERFECT = 3
    PRESPROG = 4
    PASTPROG = 5
    PASTPERFPROG = 6

    def __str__(self):
        """Return the human-readable name of the tense."""
        # Built inside the method: a dict in the class body would itself be
        # turned into an enum member.
        labels = {
            Tense.SIMPLEPRES: 'Simple present',
            Tense.SIMPLEPAST: 'Simple past',
            Tense.PRESPERFECT: 'Present perfect',
            Tense.PASTPERFECT: 'Past perfect',
            Tense.PRESPROG: 'Present progressive',
            Tense.PASTPROG: 'Past progressive',
            Tense.PASTPERFPROG: 'Past perfect progressive',
        }
        return labels.get(self)

    def __repr__(self):
        """Use the same readable label for repr() as for str()."""
        return self.__str__()
    

class Person(Enum):
    """Grammatical person groups used as conjugation keys."""
    I = 0
    WETHEY = 1
    YOU = 2
    HSI = 3

    def __str__(self):
        """Return the pronoun(s) that the member stands for."""
        # Built inside the method: a dict in the class body would itself be
        # turned into an enum member.
        pronouns = {
            Person.I: 'I',
            Person.WETHEY: 'We | they',
            Person.YOU: 'You',
            Person.HSI: 'He | she | it',
        }
        return pronouns.get(self)

    def __repr__(self):
        """Use the same readable label for repr() as for str()."""
        return self.__str__()
    

class Verb:
    """Conjugation table for one verb, built from a row of the verb table.

    Attributes:
        lemma: the value of the row's 'lemma' column.
        forms: dict mapping (Tense, Person) to the conjugated verb phrase.
    """

    def __init__(self, df_row):
        """Populate ``forms`` from ``df_row``.

        Args:
            df_row: mapping (e.g. a pandas row) with the keys 'lemma',
                '<person>_pres' and '<person>_past' for the persons
                i / wethey / you / hsi, plus 'pastpart' and 'prespart'.
        """
        self.lemma = df_row['lemma']
        past_part = df_row['pastpart']
        pres_part = df_row['prespart']
        # Organize tenses and person in dictionary with (tense, person) key
        self.forms = {}
        # Simple present for all four persons
        self.forms[(Tense.SIMPLEPRES, Person.I)] = df_row['i_pres']
        self.forms[(Tense.SIMPLEPRES, Person.WETHEY)] = df_row['wethey_pres']
        self.forms[(Tense.SIMPLEPRES, Person.YOU)] = df_row['you_pres']
        self.forms[(Tense.SIMPLEPRES, Person.HSI)] = df_row['hsi_pres']
        # Simple past for all four persons
        self.forms[(Tense.SIMPLEPAST, Person.I)] = df_row['i_past']
        self.forms[(Tense.SIMPLEPAST, Person.WETHEY)] = df_row['wethey_past']
        self.forms[(Tense.SIMPLEPAST, Person.YOU)] = df_row['you_past']
        self.forms[(Tense.SIMPLEPAST, Person.HSI)] = df_row['hsi_past']
        # The remaining tenses combine an auxiliary ("have"/"be") with the
        # past or present participle.
        # Present perfect
        self.forms[(Tense.PRESPERFECT, Person.I)] = 'have ' + past_part
        self.forms[(Tense.PRESPERFECT, Person.WETHEY)] = 'have ' + past_part
        self.forms[(Tense.PRESPERFECT, Person.YOU)] = 'have ' + past_part
        self.forms[(Tense.PRESPERFECT, Person.HSI)] = 'has ' + past_part
        # Past perfect
        self.forms[(Tense.PASTPERFECT, Person.I)] = 'had ' + past_part
        self.forms[(Tense.PASTPERFECT, Person.WETHEY)] = 'had ' + past_part
        self.forms[(Tense.PASTPERFECT, Person.YOU)] = 'had ' + past_part
        self.forms[(Tense.PASTPERFECT, Person.HSI)] = 'had ' + past_part
        # Present progressive
        self.forms[(Tense.PRESPROG, Person.I)] = 'am ' + pres_part
        self.forms[(Tense.PRESPROG, Person.WETHEY)] = 'are ' + pres_part
        self.forms[(Tense.PRESPROG, Person.YOU)] = 'are ' + pres_part
        self.forms[(Tense.PRESPROG, Person.HSI)] = 'is ' + pres_part
        # Past progressive. These must be keyed by PASTPROG: keying them by
        # PRESPROG would overwrite the present-progressive entries above.
        self.forms[(Tense.PASTPROG, Person.I)] = 'was ' + pres_part
        self.forms[(Tense.PASTPROG, Person.WETHEY)] = 'were ' + pres_part
        self.forms[(Tense.PASTPROG, Person.YOU)] = 'were ' + pres_part
        self.forms[(Tense.PASTPROG, Person.HSI)] = 'was ' + pres_part
        # Past perfect progressive
        # NOTE(review): "have/has been ..." is the present perfect progressive;
        # the past perfect progressive would be "had been ...". The strings are
        # left as they were -- confirm which one is intended.
        self.forms[(Tense.PASTPERFPROG, Person.I)] = 'have been ' + pres_part
        self.forms[(Tense.PASTPERFPROG, Person.WETHEY)] = 'have been ' + pres_part
        self.forms[(Tense.PASTPERFPROG, Person.YOU)] = 'have been ' + pres_part
        self.forms[(Tense.PASTPERFPROG, Person.HSI)] = 'has been ' + pres_part

    def get_form(self, tense, person):
        ''' Return the correct verb phrase given the tense and the person.

        Raises KeyError if no form is stored for the (tense, person) pair.
        '''
        return self.forms[(tense, person)]
   
class Model(tf.keras.Model):
    """Character-level language model: embedding -> GRU -> dense logits."""

    def __init__(self, vocab_size, embedding_dim, rnn_units):
        """Create the three layers.

        Args:
            vocab_size: number of token ids; sizes the embedding input and the
                dense output.
            embedding_dim: width of the embedding vectors.
            rnn_units: number of GRU units.
        """
        # No positional arguments: passing ``self`` again would hand the
        # instance to the Keras base constructor as an extra argument.
        super().__init__()
        self.embedding = tf.keras.layers.Embedding(vocab_size, embedding_dim)
        self.gru = tf.keras.layers.GRU(rnn_units,
                                       return_sequences=True,
                                       return_state=True)
        self.dense = tf.keras.layers.Dense(vocab_size)

    def call(self, inputs, states=None, return_state=False, training=False):
        """Run the network on a batch of token ids.

        Args:
            inputs: token ids fed to the embedding layer.
            states: GRU state to start from; when None the GRU's own initial
                state is used.
            return_state: if True, also return the final GRU state.
            training: forwarded to every layer.

        Returns:
            The dense layer's output, or ``(output, states)`` when
            ``return_state`` is True.
        """
        x = inputs
        x = self.embedding(x, training=training)
        if states is None:
            states = self.gru.get_initial_state(x)
        x, states = self.gru(x, initial_state=states, training=training)
        x = self.dense(x, training=training)

        if return_state:
            return x, states
        else:
            return x

class OneStep(tf.keras.Model):
    """Wraps a trained model so that text can be sampled one character at a time."""

    def __init__(self, model, chars_from_ids, ids_from_chars, temperature=1.0):
        """Store the model and lookup layers and precompute the "[UNK]" mask.

        Args:
            model: callable as ``model(inputs=..., states=..., return_state=True)``
                returning ``(logits, states)``.
            chars_from_ids: lookup mapping token ids to characters.
            ids_from_chars: lookup mapping characters to token ids; must
                expose ``get_vocabulary()``.
            temperature: divisor applied to the logits before sampling.
        """
        super().__init__()
        self.temperature = temperature
        self.model = model
        self.chars_from_ids = chars_from_ids
        self.ids_from_chars = ids_from_chars

        # Additive mask: -inf at the id of "[UNK]" and 0 everywhere else, so
        # that "[UNK]" is never sampled.
        unk_ids = self.ids_from_chars(['[UNK]'])[:, None]
        vocab_len = len(ids_from_chars.get_vocabulary())
        unk_penalty = tf.SparseTensor(
            values=[-float('inf')] * len(unk_ids),
            indices=unk_ids,
            # One entry per vocabulary item.
            dense_shape=[vocab_len])
        self.prediction_mask = tf.sparse.to_dense(unk_penalty)

    @tf.function
    def generate_one_step(self, inputs, states=None):
        """Sample the next character for every string in ``inputs``.

        Args:
            inputs: string tensor holding the text generated so far.
            states: model state from the previous step, or None.

        Returns:
            ``(predicted_chars, states)``: the sampled characters and the
            model state to pass to the next call.
        """
        # Strings -> characters -> dense tensor of token ids.
        token_ids = self.ids_from_chars(
            tf.strings.unicode_split(inputs, 'UTF-8')).to_tensor()

        # logits.shape is [batch, char, next_char_logits]
        logits, states = self.model(inputs=token_ids, states=states,
                                    return_state=True)

        # Keep the last position only, apply the temperature, then block "[UNK]".
        last_logits = logits[:, -1, :] / self.temperature + self.prediction_mask

        # Draw one token id per batch element and map it back to a character.
        sampled_ids = tf.squeeze(
            tf.random.categorical(last_logits, num_samples=1), axis=-1)
        return self.chars_from_ids(sampled_ids), states

In [5]:
# Function definitions.
def get_examples(lemma, verb_set):
    """Collect the corpus lines that contain any of the given verb forms.

    Reads the file named by the module-level ``corpus_filename`` line by line.
    Double quotes and hyphens are removed from each line before the (substring,
    case-sensitive) check; matching lines are returned lower-cased. A dot is
    printed as a progress marker roughly every million lines.

    Args:
        lemma: not used by the search itself; kept for the callers.
        verb_set: iterable of verb phrases to look for.

    Returns:
        List of matching lines, cleaned and lower-cased, in corpus order.
    """
    matches = []
    with open(corpus_filename, 'rb') as corpus:
        # The counter starts at 2 because it is checked after each line has
        # been handled, i.e. it already points at the next line.
        for counter, raw_line in enumerate(corpus, start=2):
            cleaned = raw_line.strip().decode('utf-8').replace('"', '').replace('-', '')
            for form in verb_set:
                if form in cleaned:
                    matches.append(cleaned.lower())
                    break
            if counter % 1000000 == 0:
                print('.', end='')

    return matches


def cosine_sim_vectors(vec1, vec2):
    """Return the cosine similarity of two vectors as a single number.

    Both arguments are reshaped into one-row matrices, which is the input form
    ``cosine_similarity`` is given here; the only cell of the resulting
    similarity matrix is returned.
    """
    row_a, row_b = (vec.reshape(1, -1) for vec in (vec1, vec2))
    similarity_matrix = cosine_similarity(row_a, row_b)
    return similarity_matrix[0][0]


def get_pieces(examples_list, lemma, verbose=False, min_similarity=0.84):
    """Reduce corpus lines to short subject-verb-(object)-prepositional sentences.

    Each string in ``examples_list`` is parsed with the module-level ``nlp``
    pipeline and split into sentences; every sentence's text is then parsed
    again as a document of its own, and all matching below runs on that
    document. A sentence is kept when it contains no '?', matches
    ``clause_matcher`` and has ``lemma`` as its verbal ROOT. For a kept
    sentence a "proposed" sentence is assembled from the first token of every
    non-empty slot, in this order: subject determiner / modifier / head, root
    verb, object determiner / modifier / head, preposition, and the
    prepositional phrase's determiner / modifier / object. The proposal is
    recorded only if its bag-of-words cosine similarity to the original
    sentence is greater than ``min_similarity``.

    Args:
        examples_list: iterable of raw sentences/lines (strings).
        lemma: lemma the sentence's ROOT verb must have.
        verbose: when True, print every accepted proposal.
        min_similarity: similarity a proposal has to exceed to be accepted.

    Returns:
        List of dicts with the keys 'source' (original sentence text),
        'parsed' (proposed sentence) and 'similarity'.
    """
    pieces = []

    lemma_matcher = Matcher(nlp.vocab)
    lemma_rule = [{'DEP': 'ROOT', 'LEMMA': lemma, 'POS': 'VERB'}]
    # Same (name, [patterns]) call form as the module-level matchers; the
    # older (name, None, pattern) form is not compatible with it.
    lemma_matcher.add('LemmaInclusive', [lemma_rule])

    for item in examples_list:
        doc = nlp(item)

        for sent in doc.sents:
            original = sent.text
            if '?' in original:
                continue

            sent_doc = nlp(original)
            if not clause_matcher(sent_doc) or not lemma_matcher(sent_doc):
                continue

            # Slots of the simplified sentence; each stays empty when the
            # sentence has no token for it.
            d_np, m_np, h_np = [], [], []
            verb = []
            d_vp, m_vp, h_vp = [], [], []
            prep = []
            d_pp, m_pp, h_pp = [], [], []

            for tok in sent_doc:
                if tok.dep_ == 'nsubj':
                    d_np = [desc for desc in tok.subtree if desc.dep_ == 'det']
                    m_np = [desc for desc in tok.subtree if desc.dep_ == 'amod']
                    h_np = [tok]
                elif tok.dep_ == 'ROOT':
                    verb = [tok]
                elif tok.dep_ == 'dobj':
                    d_vp = [desc for desc in tok.subtree if desc.dep_ == 'det']
                    m_vp = [desc for desc in tok.subtree if desc.dep_ == 'amod']
                    h_vp = [tok]

            p_phrase = pp_matcher(sent_doc)
            if p_phrase:
                # Only the last reported match is used.
                _, start, end = p_phrase[-1]
                # Match offsets index into sent_doc (the document that was
                # matched), not into the multi-sentence ``doc``.
                span = sent_doc[start:end]
                p_doc = nlp(span.text)
                # First token acting as ROOT or preposition of the re-parsed
                # phrase; stays empty if there is none.
                prep = [tok for tok in p_doc if tok.dep_ in ('ROOT', 'prep')][:1]
                d_pp = [tok for tok in p_doc if tok.dep_ == 'det']
                m_pp = [tok for tok in p_doc if tok.dep_ == 'amod']
                h_pp = [tok for tok in p_doc if tok.dep_ == 'pobj']

            # This will need to be altered so we can also account for empty objects etc.
            slots = (d_np, m_np, h_np, verb, d_vp, m_vp, h_vp, prep, d_pp, m_pp, h_pp)
            proposed = ' '.join(slot[0].text for slot in slots if slot) + '.'

            # Bag-of-words vectors over the two sentences' shared vocabulary.
            vectors = CountVectorizer().fit_transform([original, proposed]).toarray()
            similarity = cosine_sim_vectors(vectors[0], vectors[1])

            if similarity > min_similarity:
                pieces.append({'source': original, 'parsed': proposed, 'similarity': similarity})
                if verbose is True:
                    print('\t' + proposed)
                    print('\n')

    return pieces


def export_examples(examples_list, lemma):
    """Append one CSV row (lemma, list of parsed sentences) to the lemma file.

    The row goes to 'example_sentences_lemmata.csv' in the working directory;
    no header row is written. The 'examples' cell holds the list of the
    'parsed' values of ``examples_list``.
    """
    columns = ['lemma', 'examples']
    with open('example_sentences_lemmata.csv', 'a', newline='', encoding='utf-8') as out_file:
        writer = csv.DictWriter(out_file, delimiter=',', fieldnames=columns)
        parsed_sentences = [entry['parsed'] for entry in examples_list]
        writer.writerow({'lemma': lemma, 'examples': parsed_sentences})
        
        
def export_sentences_flat(examples):
    """Append the 'parsed' sentence of every example to the flat corpus file.

    One sentence is written per line to 'res/example_sentences_flat.txt'.
    The file is written as UTF-8 explicitly because train_model() reads it
    back as UTF-8; relying on the platform default encoding could produce a
    file that cannot be decoded there.

    Args:
        examples: iterable of dicts that have a 'parsed' key.
    """
    with open('res/example_sentences_flat.txt', 'a', encoding='utf-8') as f:
        f.writelines(item['parsed'] + '\n' for item in examples)
            
def build_corpus(verbs, start, end):
    """Rebuild the flat sentence corpus from the verbs in ``verbs[start:end]``.

    The flat file is emptied first. Then, per verb, the corpus is searched for
    lines containing any of the verb's forms, the lines are reduced with
    get_pieces(), and the accepted sentences are appended to the flat file in
    order of decreasing similarity.
    """
    # Opening in 'w' mode empties the sentence corpus.
    with open('res/example_sentences_flat.txt', 'w') as corpus_file:
        corpus_file.write('')

    for verb in tqdm(verbs[start:end]):
        lemma = verb.lemma
        # Distinct surface forms over all (tense, person) combinations.
        verb_set = set(verb.forms.values())

        print('Examining verb: {}'.format(lemma))
        candidates = get_examples(lemma, verb_set)
        print('\tNumber of examples: {}\n'.format(len(candidates)))
        print('\nExtracting plausible examples ...\n')
        ranked = sorted(
            get_pieces(candidates, lemma),
            key=lambda piece: piece['similarity'],
            reverse=True)
        print('Number of plausible example sentences: {}'.format(len(ranked)))
        export_sentences_flat(ranked)
        
        
def split_input_target(sequence):
    """Split a sequence into an (input, target) pair shifted by one position.

    The input is everything except the last element and the target is
    everything except the first, so target[i] is the element that follows
    input[i].
    """
    return sequence[:-1], sequence[1:]


def text_from_ids(ids, id_to_char):
    """Map token ids to characters and join them along the last axis."""
    chars = id_to_char(ids)
    return tf.strings.reduce_join(chars, axis=-1)


def train_model(epochs=30, seq_length=100, embedding_dim=256, rnn_units=1024,
                buffer_size=10000):
    """Train the character-level model on 'res/example_sentences_flat.txt'.

    Every sentence is right-padded with '.' to ``seq_length`` characters and
    followed by one space, so each sentence fills exactly one training window
    of ``seq_length + 1`` characters. The batch size is read from the
    module-level ``BATCH_SIZE``.

    Args:
        epochs: number of training epochs.
        seq_length: characters per training input; also the padding width.
        embedding_dim: embedding width passed to Model.
        rnn_units: number of GRU units passed to Model.
        buffer_size: shuffle buffer size.

    Returns:
        ``(id_to_char, char_to_id, model)``: the two lookup layers and the
        trained model.
    """
    with open('res/example_sentences_flat.txt', 'r', encoding='utf-8') as f:
        texts = f.read().splitlines()

    # pad all texts to seq_length so we can easily batch the entire dataset
    # using tf methods. The separator is appended to every sentence, including
    # the last one: if it only sat between sentences, the text would be one
    # character short of a whole number of windows and the final sentence would
    # be discarded by drop_remainder below.
    # NOTE: ljust() does not truncate, so a sentence longer than seq_length
    # shifts the window boundaries of all sentences after it.
    text = ''.join(t.ljust(seq_length, '.') + ' ' for t in texts)
    vocab = sorted(set(text))

    char_to_id = preprocessing.StringLookup(
        vocabulary=list(vocab),
        mask_token=None,
    )
    id_to_char = preprocessing.StringLookup(
        vocabulary=char_to_id.get_vocabulary(),
        invert=True,
        mask_token=None,
    )

    all_ids = char_to_id(tf.strings.unicode_split(text, 'UTF-8'))
    ids_dataset = tf.data.Dataset.from_tensor_slices(all_ids)
    # One window per sentence: seq_length inputs plus the shifted-by-one target.
    sequences = ids_dataset.batch(seq_length + 1, drop_remainder=True)
    dataset = (
        sequences
        .map(split_input_target)
        .shuffle(buffer_size)
        .batch(BATCH_SIZE, drop_remainder=True)
        .prefetch(tf.data.experimental.AUTOTUNE))

    model = Model(
        # Be sure the vocabulary size matches the `StringLookup` layers.
        vocab_size=len(char_to_id.get_vocabulary()),
        embedding_dim=embedding_dim,
        rnn_units=rnn_units)

    loss = tf.losses.SparseCategoricalCrossentropy(from_logits=True)
    model.compile(optimizer='adam', loss=loss)
    model.fit(dataset, epochs=epochs, callbacks=[])

    return id_to_char, char_to_id, model


def predict_with_onestep(query, temperature, verbose=False):
    """Generate and print one sentence that starts with ``query``.

    A new OneStep sampler is built from the module-level ``model``,
    ``id_to_char`` and ``char_to_id``. Characters are sampled until a '.' is
    produced or 100 characters have been generated.

    Args:
        query: text the sentence starts with.
        temperature: sampling temperature handed to OneStep.
        verbose: when True, also print the elapsed time.

    Returns:
        The OneStep instance used for sampling.
    """
    one_step_model = OneStep(model, id_to_char, char_to_id, temperature=temperature)
    started_at = time.time()

    states = None
    current = tf.constant([query])
    generated = [current]
    for _ in range(100):
        current, states = one_step_model.generate_one_step(current, states=states)
        generated.append(current)
        # A full stop ends the sentence.
        if current == '.':
            break

    sentence = tf.strings.join(generated)
    finished_at = time.time()

    print(sentence[0].numpy().decode('utf-8'), '\n\n' + '_'*80)
    if verbose is True:
        print('\nRun time:', finished_at - started_at)

    return one_step_model

In [6]:
# Mini-batch size; train_model() reads this global when it batches the dataset.
BATCH_SIZE = 64

In [None]:
# One Verb object per row of the verb table.
verbs = [Verb(row) for _, row in df.iterrows()]

# NOTE(review): the slice 4:-1 skips the first four verbs and the last one --
# confirm that this is intended.
build_corpus(verbs, 4, -1)

In [None]:
# Rebuild the flat training corpus from a per-lemma CSV whose 'examples' column
# holds a Python list literal of sentences. This overwrites the flat file.
# NOTE(review): no visible cell writes 'res/example_sentences.csv';
# export_examples() writes a differently named file without a header row --
# confirm where this file comes from.
# A separate name is used so the verb table held in `df` is not overwritten.
examples_df = pd.read_csv('res/example_sentences.csv')
# Written as UTF-8 because train_model() reads the file back as UTF-8.
with open('res/example_sentences_flat.txt', 'w', encoding='utf-8') as f:
    for tpl in examples_df.itertuples():
        # literal_eval only accepts Python literals, so the file contents
        # cannot execute code the way eval() would allow.
        for text in ast.literal_eval(tpl.examples):
            f.write(text.lower())
            f.write('\n')

In [7]:
# Report how many GPU devices TensorFlow can see.
print("Num GPUs Available: ", len(tf.config.list_physical_devices('GPU')))

Num GPUs Available:  1


In [8]:
# Train the model. The three results are kept as globals because
# predict_with_onestep() reads `model`, `id_to_char` and `char_to_id`.
id_to_char, char_to_id, model = train_model()

Epoch 1/30
Epoch 2/30
Epoch 3/30
Epoch 4/30
Epoch 5/30
Epoch 6/30
Epoch 7/30
Epoch 8/30
Epoch 9/30
Epoch 10/30
Epoch 11/30
Epoch 12/30
Epoch 13/30
Epoch 14/30
Epoch 15/30
Epoch 16/30
Epoch 17/30
Epoch 18/30
Epoch 19/30
Epoch 20/30
Epoch 21/30
Epoch 22/30
Epoch 23/30
Epoch 24/30
Epoch 25/30
Epoch 26/30
Epoch 27/30
Epoch 28/30
Epoch 29/30
Epoch 30/30


In [11]:
# increasing temperature encourages the model to deviate from the training examples
for temperature in np.linspace(0.1, 2, 10):
    print(temperature)
    predict_with_onestep("he ", temperature)
    print('\n')

0.1
he shaved the faces of gentlemen. 

________________________________________________________________________________


0.3111111111111111
he shaved the faces of gentlemen. 

________________________________________________________________________________


0.5222222222222223
he gave a measure of sugar. 

________________________________________________________________________________


0.7333333333333333
he drove boss for ferraldo. 

________________________________________________________________________________


0.9444444444444444
he fell in love. 

________________________________________________________________________________


1.1555555555555557
he robbed the science of someone. 

________________________________________________________________________________


1.3666666666666667
he owns the girl for number. 

________________________________________________________________________________


1.577777777777778
he tires the city of government. 

______________________________

In [12]:
# we can now easily generate a variety of grammatical sentences
for _ in range(10):
    predict_with_onestep("he ", 0.5)
    print('\n')

he bled to death. 

________________________________________________________________________________


he practiced karate on the weekends. 

________________________________________________________________________________


he shares with me. 

________________________________________________________________________________


he found a picture of you. 

________________________________________________________________________________


he fell in love. 

________________________________________________________________________________


he fell in love. 

________________________________________________________________________________


he slipped by me. 

________________________________________________________________________________


he retired in florida. 

________________________________________________________________________________


he brought this on himself. 

________________________________________________________________________________


he forgot about it. 

_________

In [30]:
# Export a OneStep sampler (default temperature 1.0) wrapped around the trained
# model as a SavedModel under res/one_step.
tf.saved_model.save(OneStep(model, id_to_char, char_to_id), 'res/one_step')





INFO:tensorflow:Assets written to: res/one_step\assets


INFO:tensorflow:Assets written to: res/one_step\assets
