# Demo version of the descriptive sentence generator for the taboo implementation project

## Import

In [15]:
import numpy as np
import matplotlib.pyplot as plt
import nltk
import pandas as pd
import random
import string
import torch
import torch.nn as nn
from torch.autograd import Variable
import math
import os
import pickle
import time
import gs_probdist as gspd
import semrel as sr
import gensim
import cardgen as cg

## Loading gensim model to use the card generator and the semantic relations finder

In [16]:
card_model = gensim.models.KeyedVectors.load_word2vec_format('GoogleNews-vectors-negative300.bin', binary=True)

## Reading and structuring corpus

In [17]:
#opening and reading the corpus
#we will be using the full version of the descriptive corpus we made ~115k sentences
f = open('description-corpus-115k.txt', 'r')
text = f.readlines() # List with sentences as elements
f.close()

# getting lower case and splitting it
sentences = [text[i].lower().split() for i in range(len(text))]

#getting the avg length of a sentence
lengths = [len(sent) for sent in sentences]
avg_sent_length = sum(lengths)/len(lengths) # ~27

## Setting up trigrams, context and target tensors

In [4]:
# Sentence by sentence
# this structure allows us to create context/target sets for each word. 
trigrams = []
for sentence in sentences:
    trigrams += [([sentence[i], sentence[i+1]], sentence[i+2]) for i in range(len(sentence) - 2)]


#using all trigrams led to kernel death every time
# we will randomly sample 50000 of them
random.seed(163)
trigrams = random.sample(trigrams, 50000)

In [20]:
# getting set of words in vocab, it's length and the frequency of each word
# our vocab consists of the words appearing in trigrams, so no need to take the vocab over the whole text if we are not using all trigrams.
voc = set()
for tri in trigrams:
    voc = voc.union(set(np.union1d(np.array(tri[0]), np.asarray(tri[1]))))
voc_length = len(voc) 
word_to_freq = {word: i for i, word in enumerate(voc)}

#creating lists where we will store the input tensors
cont = []
tar = []
for context, target in trigrams:
    #creates a tensor with the frequency of both current context words
    context_freqs = torch.tensor([word_to_freq[word] for word in context], dtype = torch.long)
    #adds the tensor to inp
    cont.append(context_freqs)
    # does the same for the target and its frequency
    target_freq = torch.tensor([word_to_freq[target]], dtype = torch.long)
    tar.append(target_freq)

## Defining GRU class

In [18]:
class GRU(nn.Module):
    #init for input size, hidden size, output size and number of hidden layers.
    def __init__(self, input_s, hidden_s, output_s,n_layers = 1):
        super(GRU, self).__init__()
        self.input_s = input_s
        self.hidden_s = hidden_s
        self.output_s = output_s
        self.n_layers = n_layers
        # our encoder will be nn.Embedding
        # reminder: the encoder takes the input and outputs a feature tensor holding the information representing the input.
        self.encoder = nn.Embedding(input_s, hidden_s)
        #defining the GRU cell, still have to determine which parameters work best
        self.gru = nn.GRU(2*hidden_s, hidden_s, n_layers, batch_first=True, bidirectional=False)
        # defining linear decoder
        self.decoder = nn.Linear(hidden_s, output_s)

    def forward(self, input, hidden):
        #making sure that the input is a row vector
        input = self.encoder(input.view(1, -1))
        output, hidden = self.gru(input.view(1, 1, -1), hidden)
        output = self.decoder(output.view(1,-1))
        return output, hidden

    def init_hidden(self):
        return Variable(torch.zeros(self.n_layers, 1, self.hidden_s))


## Loading trained model
Choosing which trained model to load. They were both trained on CPU over 100 epochs using 50000 trigrams sampled randomly. 
* 1:
    * GRU model with 1 hidden layer consisting of 150 nodes. Note that since this model was trained before adding the random seed to the trigram sampling step, it is necessary to load its corresponding set of trigrams.
* 2: 
    * GRU model with 2 hidden layers consisting of 75 nodes each. Unfortunately we didn't include a random seed for this trial either, and we did not save the corresponding set of trigrams. Although the generation step might work, it is not advised to use this model.

In [19]:
def model_selection(x):
    if x ==1:
        path = os.getcwd()+'/test5_trained_inference.pt'
        hidden_s = 150
        n_layers = 1
        lr = 0.015
    if x==2:
        path = os.getcwd()+'/test4_trained_inference.pt'
        hidden_s = 75
        n_layers = 2
        lr = 0.015
    decoder = GRU(voc_length, hidden_s, voc_length, n_layers)
    decoder_optimizer = torch.optim.Adam(decoder.parameters(), lr=lr)
    criterion = nn.CrossEntropyLoss()
    decoder = torch.load(path)
    decoder.eval()
    return decoder

decoder = model_selection(1)
with open("trigrams_test5.txt", "rb") as fp:
    trigrams = pickle.load(fp)

## Loading description generation scripts

In [21]:
def next_token_generator(seed, generation_length=100):
    hidden = decoder.init_hidden()

    for p in range(generation_length):
        
        prime_input = torch.tensor([word_to_freq[w] for w in seed.split()], dtype=torch.long)
        cont = prime_input[-2:] #last two words as input
        output, hidden = decoder(cont, hidden)
        
        # Sample from the network as a multinomial distribution
        output_dist = output.data.view(-1).exp()
        top_choice = torch.multinomial(output_dist, 1)[0]
        
        # Add predicted word to string and use as next input
        predicted_word = list(word_to_freq.keys())[list(word_to_freq.values()).index(top_choice)]
        seed += " " + predicted_word
#         inp = torch.tensor(word_to_ix[predicted_word], dtype=torch.long)

    return seed

def gen_input_words(mw, model):
    #mw = main word
    #model = embeddings used to generate the cards

    #generating the corresponding taboo card
    card_words = cg.card_generator(mw, cg.get_gold_probdist(), model)
    #set of words that we hope will appear in the description
    input_words = card_words[mw] + [mw]

    # extending the input_words set using semantic relations. Bigger set --> better chances of generating an approved word!
    # we will use the make_semrel_dict function to get synonyms, hyponyms and hypernyms of the MW.
    # we considered adding also semrel words from the tw, but the loose connection to the MW very fast
    # we will leave out antonyms as they might make they are "riskier" to use in a description.

    adds = []
    temp = sr.make_semrel_dict(mw)
    for k in temp.keys():
        if k != 'semrel_antonym':
            new = list(temp[k])
            adds += new
    adds = np.unique(adds)
    adds = [x.lower() for x in adds]
    input_words = np.unique(input_words + adds)

    # filtering out the input words that are not in our vocab. Shouldn't be a thing when using larger corpus
    input_words = [word for word in input_words if word in voc]
    return input_words

def description_generator(mw, model, n_seeds = 3, n_iterations = 10, debugging = False, printing = False):
    #mw = main word
    #model = embeddings used to generate the cards
    #n_seeds = if we are using 2 or 3 seeds during the sentence generation step
    #n_iterations = how many iterations we will do in the generation step
    #debugging = True if we want to print some statistics about the process. False if we only want the last 5 generated sentences.
    #printing = True will print something, based on debugging. If false, it will only return the final sentence
    
    #generating the input_words we are aiming to include in our description
    input_words = gen_input_words(mw, model)    
    #on average a descriptive sentence had 27 words/symbols.
    # we will equally divide them between our seeds
    
    
    # iterate until nice sentence comes up
    # we will add safety measure to not break everything
    i = 0
    index_in_sentence = -1
    
    
    #if we are using 3 seeds
    #the 3 most frequent ones in our corpus were "x is", 'x means' and "x can be found"
    if n_seeds == 3:
        #create the first sentence
        sentence_parts = np.array([next_token_generator(mw+' means', 7), next_token_generator(mw+' is', 7), next_token_generator(mw+' can be found', 5)])
        sentence =  " ".join(sentence_parts)
        eval_sentence = sentence.split()   
    
        # to keep track of scores
        scores = np.zeros(n_iterations)
        #first score vector and score
        #and accounting for the 3 times the TW appears already in the seeds
        score_vector = np.array([eval_sentence.count(word) for word in input_words])
        score_vector[input_words.index(mw)] -= 3 
        score = np.sum(score_vector)  

        # the covered vector will take care that we don't replace a segment that we already "like"
        covered = np.array([0,0,0])
        changes = np.zeros(len(score_vector))

        #known positions of input words in our sentence
        positions = np.zeros(len(eval_sentence))

        #we know the positions of the seeds
        positions[0] = 1
        positions[9] = 1
        positions[18] = 1
        
        while i < n_iterations:
            #aware that with this flow we are doing one iteration after reaching the desired score, but it's no big deal because score is designed to only go up.

            #checking if score improved
            new_score_vector = np.array([eval_sentence.count(word) for word in input_words])
            new_score_vector[input_words.index(mw)] -= 3 
            changes = new_score_vector - score_vector

            if True in (changes>0): #there was a change. Assuming there is max 1 change per iteration from now on
                index = np.where(changes == 1)[0][0] #looking for the position in which an input_word was added
                word_that_was_added = input_words[index] #if we stop assuming that, here we have to keep track of location and magnitude of changes
                
                #finding in which segment that new added word is in order to leave the segment untouched

                #this detects the index of the word that just came up in case that word was already in our sentence
                indices_in_sentence = np.where(np.array(eval_sentence) == word_that_was_added)[0]
                if len(indices_in_sentence) >1: #word appears at least twice
                    for d in indices_in_sentence:
                        if positions[d] != 1:
                            index_in_sentence = d
                            positions[d] = 1
                else:
                    index_in_sentence = indices_in_sentence[0]
                    positions[index_in_sentence] = 1
                #keeping the segment in which the improvement took place
                if index_in_sentence in range(9) & covered[0]!=1:
                    sentence_parts[1] = next_token_generator(mw+' is', 7)
                    sentence_parts[2] = next_token_generator(mw+' can be found', 5)
                    sentence = ' '.join(sentence_parts)
                    covered[0] = 1
                elif index_in_sentence in range(9, 18) & covered[1] !=1:
                    sentence_parts[0] = next_token_generator(mw+' means', 7)
                    sentence_parts[2] = next_token_generator(mw+' can be found', 5)
                    sentence = ' '.join(sentence_parts)
                    covered[1] = 1
                elif index_in_sentence in range(18, 27) & covered[2] != 1:
                    sentence_parts[1] = next_token_generator(mw+' is', 7)
                    sentence_parts[0] = next_token_generator(mw+' means', 7)
                    sentence = ' '.join(sentence_parts)
                    covered[2] = 1
                eval_sentence = sentence.split()
                changes = np.zeros(len(score_vector))
                index_in_sentence = 0
                score_vector = new_score_vector
                score = np.sum(score_vector)

            #if there was no change
            else: #based on what is already covered
                if covered[0] ==0:
                    sentence_parts[0] = next_token_generator(mw+' means', 7) +' '
                #if the first part is already covered we can add it as input to generate the second
                if covered[1] ==0:
                    if covered[0]==1:
                        temp =  next_token_generator(sentence_parts[0]+' '+ mw+' is', 7) +' '
                        #taking off the first part from it
                        temp = temp.split()
                        sentence_parts[1] = " ".join(temp[9:])   
                    else:
                        sentence_parts[1] = next_token_generator(mw+' is', 7) +' '
                # same logic for the third part.
                if covered[2] == 0:
                    if covered[1] == 0:
                        sentence_parts[2] = next_token_generator(mw+' can be found', 5)
                    else:
                        temp =  next_token_generator(sentence_parts[1]+' '+ mw+' can be found', 5) +' '
                        #taking off the second part from it
                        temp = temp.split()
                        sentence_parts[2] = " ".join(temp[9:])
                sentence = ' '.join(sentence_parts)
                eval_sentence = sentence.split()
                score_vector = new_score_vector
                score = np.sum(score_vector)
            if printing == True:
                if debugging ==True:
                    print("Sentence number: " + str(i+1))
                    print(sentence)
                    if True in (changes>0):
                        print(changes)
                    print(covered)
                    print(positions)
                else:
                    if i in range(n_iterations-5, n_iterations):
                        print("Sentence number: " + str(i+1))
                        print(sentence)
            scores[i] = score
            i +=1
            
    #if we are using 2 seeds
    #the 2 most frequent ones in our corpus were "x is" and 'x means'
    if n_seeds == 2:
        #create the first sentence
        sentence_parts = np.array([next_token_generator(mw+' means', 11), next_token_generator(mw+' is', 12)])
        sentence =  " ".join(sentence_parts)
        eval_sentence = sentence.split()   
    
        # to keep track of scores
        scores = np.zeros(n_iterations)
        #first score vector and score
        #and accounting for the 3 times the TW appears already in the seeds
        score_vector = np.array([eval_sentence.count(word) for word in input_words])
        score_vector[input_words.index(mw)] -= 3 
        score = np.sum(score_vector)  

        # the covered vector will take care that we don't replace a segment that we already "like"
        covered = np.array([0,0])
        changes = np.zeros(len(score_vector))

        #known positions of input words in our sentence
        positions = np.zeros(len(eval_sentence))

        #we know the positions of the seeds
        positions[0] = 1
        positions[14] = 1
        
        while i < n_iterations:
            #aware that with this flow we are doing one iteration after reaching the desired score, but it's no big deal because score is designed to only go up.

            #checking if score improved
            new_score_vector = np.array([eval_sentence.count(word) for word in input_words])
            new_score_vector[input_words.index(mw)] -= 3 
            changes = new_score_vector - score_vector

            if True in (changes>0): #there was a change. Assuming there is max 1 change per iteration from now on
                index = np.where(changes == 1)[0][0] #looking for the position in which an input_word was added
                word_that_was_added = input_words[index] #if we stop assuming that, here we have to keep track of location and magnitude of changes
                
                #finding in which segment that new added word is in order to leave the segment untouched

                #this detects the index of the word that just came up in case that word was already in our sentence
                indices_in_sentence = np.where(np.array(eval_sentence) == word_that_was_added)[0]
                if len(indices_in_sentence) >1: #word appears at least twice
                    for d in indices_in_sentence:
                        if positions[d] != 1:
                            index_in_sentence = d
                            positions[d] = 1
                else:
                    index_in_sentence = indices_in_sentence[0]
                    positions[index_in_sentence] = 1
                #keeping the segment in which the improvement took place
                if index_in_sentence in range(14):
                    sentence_parts[1] = next_token_generator(mw+' is', 12)
                    sentence = ' '.join(sentence_parts)
                    covered[0] = 1
                elif index_in_sentence in range(14, 27):
                    sentence_parts[0] = next_token_generator(mw+' means', 11)
                    sentence = ' '.join(sentence_parts)
                    covered[1] = 1
                eval_sentence = sentence.split()
                changes = np.zeros(len(score_vector))
                index_in_sentence = 0
                score_vector = new_score_vector
                score = np.sum(score_vector)

            #if there was no change
            else: #based on what is already covered
                if covered[0] ==0:
                    sentence_parts[0] = next_token_generator(mw+' means', 11) +' '
                #if the first part is already covered we can add it as input to generate the second
                if covered[1] ==0:
                    if covered[0]==1:
                        temp =  next_token_generator(sentence_parts[0]+' '+ mw+' is', 12) +' '
                        #taking off the first part from it
                        temp = temp.split()
                        sentence_parts[1] = " ".join(temp[12:])   
                    else:
                        sentence_parts[1] = next_token_generator(mw+' is', 7) +' '
                sentence = ' '.join(sentence_parts)
                eval_sentence = sentence.split()
                score_vector = new_score_vector
                score = np.sum(score_vector)
            
            if printing == True:
                if debugging ==True:
                    print("Sentence number: " + str(i+1))
                    print(sentence)
                    if True in (changes>0):
                        print(changes)
                    print(covered)
                    print(positions)
                else:
                    if i in range(n_iterations-5, n_iterations):
                        print("Sentence number: " + str(i+1))
                        print(sentence)
            scores[i] = score
            i +=1
    return sentence


def sentence_cleaner(sentence, mw, model):
    #replacing MW with "the main word" and TWs appearing in the sentence with one of their synonyms
    sentence = sentence.replace(mw, 'The main word')

    #replacing any TWs appearing in our sentence with some allowed synonym
    taboo_words = cg.card_generator(mw, cg.get_gold_probdist(), model)[mw]

    spl = np.array(sentence.split())
    for tw in taboo_words:
        if tw in spl:
           #getting synonyms of detected tw
            syns = sr.get_synonyms(tw)
            if len(syns) > 0:
                syns = list(syns)
                choice = np.random.choice(syns)
                sentence = sentence.replace(tw, choice)
    return sentence

def final_output(mw, model, n_seeds = 3, n_iterations = 10, debugging = False, printing = False):
    sentence = description_generator(mw, model, n_seeds, n_iterations, debugging, printing)
    output = sentence_cleaner(sentence, mw, model)
    return output

## Example with 'cake' as main word, 3 seeds, debugging mode on to show covered and position vectors

In [22]:
final_output(mw = 'cake', model = card_model, n_seeds=3, n_iterations = 100, debugging = True, printing = True)

The set of input words we are aiming to have in the descriptive sentence consists of: ['block', 'cake', 'coat', 'cookie', 'cover', 'dessert', 'dish', 'patty', 'tablet']
Sentence number: 1
cake means monkey millennium semicolon annulment latest remaining conside cake is altered import korzybski cheating car simulating piracy  cake can be found hour dissect 2500 cheating jams
[0 0 0]
[1. 0. 0. 0. 0. 0. 0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 1. 0. 0. 0. 0. 0.
 0. 0. 0.]
Sentence number: 2
cake means monkey inward lag divers global_objects precedent compressed  cake is seen enumbindings falcons rootstock sportsman simulate fantastic  cake can be found cron bench pro-europeans overhaul integrated
[0 0 0]
[1. 0. 0. 0. 0. 0. 0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 1. 0. 0. 0. 0. 0.
 0. 0. 0.]
Sentence number: 3
cake means monkey sata lag enhanced semicolon unecessary flowers  cake is optimal loop rehashing reconstruction lineal kneaded disastrous  cake can be found poland logical carer tabout train


'The main word means monkey sata lag flowers center removing kcal  The main word is initial jumping rootstock acetate maltese maltese fictional  The main word can be found personalized jams secret peoples theoretically'

## Example with 'cake' as main word, 2 seeds, debugging mode on to show covered and position vectors

In [23]:
final_output(mw = 'cake', model = card_model, n_seeds=2, n_iterations = 20, debugging = True, printing = True)

The set of input words we are aiming to have in the descriptive sentence consists of: ['block', 'cake', 'coat', 'cookie', 'cover', 'dessert', 'dish', 'patty', 'tablet']
Sentence number: 1
cake means monkey miracle stall overhaul introduce sermon semicolon monkey gifts lag stall  cake is sutil 160 represents celebratory arises ramp align 
[0 0]
[1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0.
 0. 0. 0.]
Sentence number: 2
cake means monkey sata lag bench cant turned cheating slightest bench cambodia semicolon  cake is 160 dolled intrusive gladiola 17:20 tomato fictional 
[0 0]
[1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0.
 0. 0. 0.]
Sentence number: 3
cake means monkey sata lag flowers center undead core cheating introduce python contractor  cake is hands-on bodies churchman variety thou housebuilders amateur 
[0 0]
[1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0.
 0. 0. 0.]
Sentence number: 4
cake means monkey i

'The main word means monkey gifts semicolon alphabetical notebooks illuminate intents weapon ring cheating cms  The main word is genome ferdinand monkey waste drawer statistical rainfall '

## Example with 'airplane' as main word, 3 seeds, simple printing mode

In [25]:
final_output(mw = 'airplane', model = card_model, n_seeds=3, n_iterations = 150, debugging = False, printing = True)

The set of input words we are aiming to have in the descriptive sentence consists of: ['airplane', 'fighter', 'flight', 'jet']
airplane means bald distinction width lib time maupassant cios  airplane is takeover characters 27:12 cios carrying keystone introdu airplane can be found wherever presented sizes going into
airplane means introduce variables peer introduce relief discusses m airplane is scathing jumping exaptation electricity blackout linux c airplane can be found contemptibly considered scan audiometer archiv
airplane means introduce sitaraman despair stall slatterns monkey pe airplane is think suzhou start-address inspire operates textual lee  airplane can be found searching communal grey-thompson bringing stal
airplane means monkey gifts jams removing arises cios nano  airplane is scathing heating unusual hopped dishes need lover  airplane can be found believable rhianna remaining infidel vice
airplane means introduce accident non-reactionary monkey guinea-pig  airplane is 

'The main word means introduce accident non-reactionary monkey guinea-pig  The main word is generalisation mouse spouses herald glutamine fact multi The main word can be found personalized mission excitement weblog pertine'

## Example with 'airplane' as main word, 2 seeds, only final output is shown

In [26]:
final_output(mw = 'airplane', model = card_model, n_seeds=2, n_iterations = 10, debugging = False, printing = False)

'The main word means introduce homicide contractor flowers removing poisoning miss slatterns economist cheating subsc The main word is barbara scheduling heightened rush grey-thompson latest cheating '