In [7]:
import csv
import re
import random

In [8]:
# Full names of the characters whose dialogue is modelled; load_character compares
# these strings against the first column of the CSV
characters = [
    "Leslie Knope",
    "Tom Haverford",
    "April Ludgate",
    "Ron Swanson",
    "Perd Hapley",
    "Chris Traeger",
    "Jean-Ralphio Saperstein",
]
# Short aliases, in the same order as the list above
leslie, tom, april, ron, perd, chris, jean = characters

In [9]:
def load_character(character, s_token_count):
    """
        Parses through csv file of all Parks and Rec dialogue, only keeping given characters dialogue
        The first row of sorted_name_all.csv is skipped as a header; a row is kept when its first
        column equals character, and its second column is used as the dialogue text
        Text is lower-cased and split on whitespace; every character that is not an ASCII letter or
        digit is removed from each word, and words left empty are dropped
        Each sentence is padded with s_token_count of start and end sentence tokens
        e.g. s_token_count = 2 => [<s> <s> w1 ....  </s> </s>]
        The sentences are shuffled (fixed seed 0, so the order is reproducible) so the data can be
        broken up into training and testing sets where data is evenly spread over the entire series
        Returns shuffled formatted sentences (a list of lists of string tokens)
    """
    # `with` closes the file even when reading a row raises
    with open("sorted_name_all.csv", newline='') as all_chars:
        reader = csv.reader(all_chars, delimiter=",", quotechar='"')
        next(reader, None)  # skip the header row
        # csv.reader yields [] for blank lines, and a one-column row has no dialogue; skip both
        lines = [row[1] for row in reader if len(row) > 1 and row[0] == character]

    start = ["<s>"] * s_token_count
    end = ["</s>"] * s_token_count
    sentences = []
    for line in lines:
        clean_text = []
        for word in line.lower().split():
            no_grammar = re.sub('[^A-Za-z0-9]+', '', word)
            if no_grammar:
                clean_text.append(no_grammar)
        sentences.append(start + clean_text + end)
    # Re-seeding the global generator keeps the shuffle identical from run to run
    random.seed(0)
    random.shuffle(sentences)
    return sentences

In [10]:
# Create training/testing files for statistical model

# n-gram size for the statistical-model data files: create_statistical_files pads every
# sentence with NGRAM - 1 start tokens and NGRAM - 1 end tokens
NGRAM = 4

def create_data_txt(data, character):
    """
        Creates training and testing data for a statistical model
        The first two thirds of data are written to fourgram-<character>-train.txt and the rest
        to fourgram-<character>-test.txt, one sentence per line with tokens joined by a space
        Existing files with those names are overwritten
        data: list of sentences, each sentence a list of string tokens
        character: name inserted into both file names
    """
    split = 2 * len(data) // 3
    partitions = (("train", data[:split]), ("test", data[split:]))
    for suffix, sentences in partitions:
        # `with` flushes and closes each file before the next one is opened
        with open(f'fourgram-{character}-{suffix}.txt', "w") as f:
            for sentence in sentences:
                f.write(" ".join(sentence) + '\n')
    
    
def create_statistical_files():
    """
        Writes the statistical-model train/test text files for every name in characters
        Each sentence is padded with NGRAM - 1 start and end tokens
    """
    pad = NGRAM - 1
    for name in characters:
        create_data_txt(load_character(name, pad), name)


create_statistical_files()

In [11]:
from keras.preprocessing.text import Tokenizer
from keras.utils import to_categorical
from keras.models import Sequential
from keras.layers import Dense
from keras.layers import SimpleRNN, LSTM
from keras.layers import Embedding
from keras.metrics import TopKCategoricalAccuracy
from keras.initializers import Constant
from keras.preprocessing.sequence import pad_sequences
from keras.models import load_model
import numpy as np 

def get_tokenizer_and_encoded(data):
    """
        Builds a Tokenizer with default settings and fits it on data
        Returns a tuple of (fitted tokenizer, result of tokenizer.texts_to_sequences(data))
    """
    fitted = Tokenizer()
    fitted.fit_on_texts(data)
    return fitted, fitted.texts_to_sequences(data)

In [12]:

def get_training_data(tokenizer, encoded):
    """
        Creates training and testing data from encoded sentences
        Every sentence [w1 .. wN] contributes one sample per k in 1 .. N-1:
        input w1:wk, target w(k+1)
        All inputs are left-padded with zeroes to the length of the longest input
        Targets are one-hot encoded over len(tokenizer.word_index) + 1 classes
        Training data is the first 75% of the samples (in order), testing data the last 25%
        Returns (trainX, trainY, testX, testY)
        Raises ValueError when no sentence has at least two tokens
    """
    vocab_size = len(tokenizer.word_index) + 1

    x = []
    y = []
    for encoded_sent in encoded:
        for k in range(1, len(encoded_sent)):
            x.append(encoded_sent[:k])
            y.append(encoded_sent[k])

    if not x:
        raise ValueError("encoded must contain at least one sentence with two or more tokens")

    maxlen = max(len(sent) for sent in x)
    # A single call pads the whole batch; padding one sample per call gives the same array
    # but is far slower
    x = pad_sequences(x, maxlen=maxlen, padding='pre')
    y = to_categorical(np.array(y), num_classes=vocab_size)

    # x and y have one row per sample, so one index splits both consistently
    split = 3 * len(x) // 4
    return (x[:split], y[:split], x[split:], y[split:])



In [13]:
def create_model(vocab_size, maxlen):
    """
        Builds the (uncompiled) next-word network:
        Embedding(vocab_size -> 200, input_length=maxlen) -> LSTM(400) -> Dense softmax over vocab_size
        Returns the Sequential model
    """
    layers = [
        Embedding(vocab_size, 200, input_length=maxlen),
        LSTM(400),
        Dense(vocab_size, activation='softmax'),
    ]
    return Sequential(layers)

In [48]:
def train_model(trainX, trainY, model, epochs=25, batch_size=256):
    """
        Trains given model on given data
        Compiles the model with categorical cross-entropy loss, the adam optimizer and two
        metrics (top-5 categorical accuracy, then plain accuracy), then fits it
        epochs: number of passes over the training data (default 25)
        batch_size: samples per gradient update (default 256)
        Returns the same model object after fitting
    """
    acc = TopKCategoricalAccuracy(k=5)
    model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=[acc, "accuracy"])
    model.fit(trainX, trainY, epochs=epochs, batch_size=batch_size)
    return model

In [26]:
def index_to_word(tokenizer):
    """
        Inverts tokenizer.word_index
        Returns a dict mapping each index to its word
    """
    return {idx: token for token, idx in tokenizer.word_index.items()}

def generate_text(seed, max_words, model, maxlen, n, tokenizer, index_to_word,
                  min_chars=15, max_chars=147):
    """ 
        Generates n sentences with max length max_words. Uses given seed to begin generating words. 
        Each sentence starts from seed and grows one word at a time: the text so far is encoded
        with tokenizer.word_index, left-padded to maxlen, and the next word is sampled from the
        model's predicted distribution
        A sentence stops after max_words sampling steps, when "</s>" is sampled, or once the text
        is longer than max_chars characters
        Sentences shorter than min_chars characters are discarded and regenerated
        index_to_word: dict mapping token index -> word
        Raises KeyError if a word of the seed is not in tokenizer.word_index
        NOTE: the outer loop runs until n sentences pass the min_chars check, so it does not
        terminate if the model can never produce a long enough sentence
        Returns sentences
    """
    output = []
    while len(output) < n:
        seed_text = seed
        for _ in range(max_words):
            token_list = [tokenizer.word_index[word] for word in seed_text.split()]
            token_list = pad_sequences([token_list], maxlen=maxlen, padding='pre')
            # verbose=0 suppresses the progress bar predict would print for every word
            prediction = model.predict([token_list], verbose=0)[0]
            # The float32 softmax output may miss summing to 1 by more than np.random.choice
            # tolerates; renormalise in float64 so sampling cannot raise ValueError
            prediction = np.asarray(prediction, dtype=np.float64)
            prediction /= prediction.sum()
            index = np.random.choice(len(prediction), p=prediction)

            # 0 is the padding value and has no word; the step is still used up
            if index == 0:
                continue

            predicted_word = index_to_word[index]
            seed_text += " " + predicted_word
            if predicted_word == "</s>":
                break
            if len(seed_text) > max_chars:
                break
        if len(seed_text) < min_chars:
            continue
        output.append(seed_text)
    return output

In [33]:
def save(character, sentences):
    """
        Saves generated sentences to file for specific character
        Writes <character>-rnn-sentences.txt (overwriting any existing file), one sentence per line
        sentences: iterable of strings
    """
    # `with` closes the file even if a write fails part-way through
    with open(f'{character}-rnn-sentences.txt', "w") as f:
        f.writelines(sentence + '\n' for sentence in sentences)

def create_and_train_model(character):
    """
        Runs the whole RNN pipeline for one character:
        load dialogue, fit tokenizer, build and train the model, evaluate on the held-out split,
        generate 50 sentences, then save the sentences and the model
        The model is saved under the first whitespace-separated word of character
    """
    print("CHARACTER:", character)

    # One <s> / </s> token on each side of every sentence
    sentences = load_character(character, 1)
    tokenizer, encoded = get_tokenizer_and_encoded(sentences)
    trainX, trainY, testX, testY = get_training_data(tokenizer, encoded)
    vocab_size = len(tokenizer.word_index) + 1
    maxlen = len(trainX[0])  # padded input width

    model = create_model(vocab_size, maxlen)
    train_model(trainX, trainY, model)

    # evaluate returns the loss followed by the compiled metrics, in order
    results = model.evaluate(testX, testY, batch_size=256)
    print(f'Results - Loss: {results[0]}, Top-K Accuracy: {results[1]}, Accuracy:{results[2]} ')

    lookup = index_to_word(tokenizer)
    generated = generate_text("<s>", 40, model, maxlen, 50, tokenizer, lookup)

    save(character, generated)
    first_name = character.split()[0]
    model.save(first_name)
    print(generated)

In [32]:
# Each character is trained in its own cell rather than in one loop, so that every model's
# training log, evaluation metrics and generated sentences appear under a separate cell.
# This cell covers Leslie Knope.
create_and_train_model(leslie)

CHARACTER: Leslie Knope


KeyboardInterrupt: 

In [None]:
# Train, evaluate and sample the RNN for Tom Haverford
create_and_train_model(tom)

In [None]:
# Train, evaluate and sample the RNN for April Ludgate
create_and_train_model(april)

In [None]:
# Train, evaluate and sample the RNN for Ron Swanson
create_and_train_model(ron)

In [34]:
# Train, evaluate and sample the RNN for Perd Hapley
create_and_train_model(perd)

CHARACTER: Perd Hapley
Epoch 1/2
Epoch 2/2




Results - Loss: 5.786198616027832, Top-K Accuracy: 0.2222222238779068, Accuracy:0.0958605632185936 




INFO:tensorflow:Assets written to: Perd/assets


INFO:tensorflow:Assets written to: Perd/assets


['<s> is ya been i that one didnt you the show thing leslie almost horse of piece and jen gonna is called defendant is box real sense </s>', '<s> to you at one is is show having will there with department deputy we to some the an greatest begins films a two been what like well ive a our a heard', '<s> tonight not a unconfirmed is with hosting story points thats im </s>', '<s> and vote a talking story and to i evening you four 35 begin watering microphone </s>', '<s> half joke department show </s>', '<s> strong well </s>', '<s> word seconds return this knope right this a jamm is misplaced i tonight stop story is define a a </s>', '<s> first ive a a </s>', '<s> of which unconfirmed </s>', '<s> and of name story at knope well a our </s>', '<s> one going the question a brandi gokart whats did gonna abstinence </s>', '<s> show to box revelation in the wont knope two we that the also break go im trying go jobs to and that are your the </s>', '<s> adult you dance the </s>', '<s> 200 et i situ

In [None]:
# Train, evaluate and sample the RNN for Chris Traeger
create_and_train_model(chris)

In [None]:
# Train, evaluate and sample the RNN for Jean-Ralphio Saperstein
create_and_train_model(jean)

In [49]:
from statistical_model import LanguageModel, test_model

In [50]:
def create_and_train_stat_model(character):
    """
        Trains a LanguageModel on fourgram-<character>-train.txt, then hands the model,
        fourgram-<character>-test.txt and the character name to test_model
    """
    train_path = f'fourgram-{character}-train.txt'
    test_path = f'fourgram-{character}-test.txt'
    # NOTE(review): 4 is presumably the n-gram order (matching the fourgram-* files) and
    # True an option flag of LanguageModel -- confirm against statistical_model
    model = LanguageModel(4, True)
    model.train(train_path)
    test_model(model, test_path, character)

In [51]:
# Train the statistical model on Leslie Knope's train file and run test_model on the matching test file
create_and_train_stat_model(leslie)

Model: Leslie Knope
Sentences:
<s> <s> <s> no you have to be better </s> </s> </s>
<s> <s> <s> and you have every right to be but were hoping what were gonna do this </s> </s> </s>
<s> <s> <s> youve never had a budget shortage </s> </s> </s>
<s> <s> <s> yeah </s> </s> </s>
<s> <s> <s> ann you <UNK> bastard </s> </s> </s>
<s> <s> <s> i knew it </s> </s> </s>
<s> <s> <s> the interview is back on </s> </s> </s>
<s> <s> <s> okay we have a new item up for bid is two <UNK> vip passes to the unity concert </s> </s> </s>
<s> <s> <s> no more <UNK> in modern society and theyre embarrassing to pawnee </s> </s> </s>
<s> <s> <s> i dont really trust these guys </s> </s> </s>
<s> <s> <s> sorry </s> </s> </s>
<s> <s> <s> i wouldnt wish it on my <UNK> i held the position of deputy director of parks and recreation department </s> </s> </s>
<s> <s> <s> okay </s> </s> </s>
<s> <s> <s> just one item jennifer </s> </s> </s>
<s> <s> <s> this is a little fun i was having you know </s> </s> </s>
<s> <s> <s> th

In [52]:
# Train the statistical model on Tom Haverford's train file and run test_model on the matching test file
create_and_train_stat_model(tom)

Model: Tom Haverford
Sentences:
<s> <s> <s> well jerry what the hell </s> </s> </s>
<s> <s> <s> we gotta send this dude off with the perfect gift </s> </s> </s>
<s> <s> <s> are we saying <UNK> anytime we think <UNK> dope </s> </s> </s>
<s> <s> <s> listening to that tree lighting is gonna be fine </s> </s> </s>
<s> <s> <s> okay </s> </s> </s>
<s> <s> <s> yeah jessica is a gold digger digger </s> </s> </s>
<s> <s> <s> what the hell jerry </s> </s> </s>
<s> <s> <s> no </s> </s> </s>
<s> <s> <s> i own my own restaurant and several other properties </s> </s> </s>
<s> <s> <s> <UNK> <UNK> <UNK> <UNK> martin and bruno mars </s> </s> </s>
<s> <s> <s> ill think of ideas in the tree </s> </s> </s>
<s> <s> <s> <UNK> a phone that smells good </s> </s> </s>
<s> <s> <s> <UNK> </s> </s> </s>
<s> <s> <s> they had a <UNK> </s> </s> </s>
<s> <s> <s> <UNK> you want </s> </s> </s>
<s> <s> <s> tom <UNK> in the <UNK> </s> </s> </s>
<s> <s> <s> nice job man </s> </s> </s>
<s> <s> <s> what can i say </s> </s> 

In [42]:
# Train the statistical model on April Ludgate's train file and run test_model on the matching test file
create_and_train_stat_model(april)

Model: April Ludgate
Sentences:
<s> <s> <s> you <UNK> me <UNK> i swear to god my arms cant move that way </s> </s> </s>
<s> <s> <s> check your <UNK> </s> </s> </s>
<s> <s> <s> yeah he does being a <UNK> and a three car <UNK> </s> </s> </s>
<s> <s> <s> because this prom reminds me that you would be the worst person ive ever met </s> </s> </s>
<s> <s> <s> i just heard one <UNK> <UNK> </s> </s> </s>
<s> <s> <s> yeah i just had a gut feeling that it wasnt right for me then nothing is </s> </s> </s>
<s> <s> <s> no </s> </s> </s>
<s> <s> <s> your kids are like <UNK> awesome </s> </s> </s>
<s> <s> <s> i think youre fine </s> </s> </s>
<s> <s> <s> yay </s> </s> </s>
<s> <s> <s> how does this sound officer dwyer pawnee <UNK> </s> </s> </s>
<s> <s> <s> fine then ill make out with three <UNK> <UNK> make out with him when im drunk sometimes </s> </s> </s>
<s> <s> <s> god shes always just leaving and not telling me </s> </s> </s>
<s> <s> <s> is it like a <UNK> how long it takes to get here </s> </s

In [43]:
# Train the statistical model on Perd Hapley's train file and run test_model on the matching test file
create_and_train_stat_model(perd)

Model: Perd Hapley
Sentences:
<s> <s> <s> <UNK> <UNK> would you <UNK> <UNK> </s> </s> </s>
<s> <s> <s> the story of this story <UNK> even more <UNK> </s> </s> </s>
<s> <s> <s> <UNK> <UNK> </s> </s> </s>
<s> <s> <s> and its razor <UNK> and <UNK> and even <UNK> glass </s> </s> </s>
<s> <s> <s> theyre <UNK> theyre <UNK> but now theyre <UNK> <UNK> at the <UNK> little <UNK> <UNK> </s> </s> </s>
<s> <s> <s> and our <UNK> poll <UNK> to <UNK> <UNK> in the pawnee <UNK> the <UNK> </s> </s> </s>
<s> <s> <s> the story of this situation is its <UNK> <UNK> </s> </s> </s>
<s> <s> <s> <UNK> </s> </s> </s>
<s> <s> <s> and leslie knope closing statement </s> </s> </s>
<s> <s> <s> <UNK> show <UNK> now </s> </s> </s>
<s> <s> <s> the story of this story <UNK> even more <UNK> </s> </s> </s>
<s> <s> <s> there are some <UNK> that id like to ask you about the <UNK> <UNK> </s> </s> </s>
<s> <s> <s> there is <UNK> a thing as <UNK> integrity and it is <UNK> that i have as a <UNK> with integrity </s> </s> </s>
<s>

In [44]:
# Train the statistical model on Ron Swanson's train file and run test_model on the matching test file
create_and_train_stat_model(ron)

Model: Ron Swanson
Sentences:
<s> <s> <s> shes a <UNK> a bitch </s> </s> </s>
<s> <s> <s> lets get to the <UNK> </s> </s> </s>
<s> <s> <s> here you are </s> </s> </s>
<s> <s> <s> i just gave her the day off </s> </s> </s>
<s> <s> <s> and yes i will absolutely go back to get my shoes <UNK> soon </s> </s> </s>
<s> <s> <s> another stupid government rule </s> </s> </s>
<s> <s> <s> spending the day outside alone sounds like a dream </s> </s> </s>
<s> <s> <s> you know what makes a good person does something bad they own up to it </s> </s> </s>
<s> <s> <s> i just <UNK> there <UNK> breathing </s> </s> </s>
<s> <s> <s> lets go son </s> </s> </s>
<s> <s> <s> and soon itll have to </s> </s> </s>
<s> <s> <s> <UNK> </s> </s> </s>
<s> <s> <s> nice </s> </s> </s>
<s> <s> <s> i think youre being hard on yourself </s> </s> </s>
<s> <s> <s> tommy boy </s> </s> </s>
<s> <s> <s> its <UNK> </s> </s> </s>
<s> <s> <s> i dont know what you did to em but it worked like <UNK> </s> </s> </s>
<s> <s> <s> why </s>

In [45]:
# Train the statistical model on Chris Traeger's train file and run test_model on the matching test file
create_and_train_stat_model(chris)

Model: Chris Traeger
Sentences:
<s> <s> <s> get in on this </s> </s> </s>
<s> <s> <s> was that ann </s> </s> </s>
<s> <s> <s> im going to do </s> </s> </s>
<s> <s> <s> you know the old chris wouldve loved this super <UNK> health <UNK> and i feel like we should ask for an <UNK> to stay here </s> </s> </s>
<s> <s> <s> let me take you there and you can talk to the <UNK> of whats going on with me and <UNK> daughter and i thought id <UNK> up and say hi </s> </s> </s>
<s> <s> <s> leslie and ben are engaged </s> </s> </s>
<s> <s> <s> im so excited to be working with you all again </s> </s> </s>
<s> <s> <s> and did you use it </s> </s> </s>
<s> <s> <s> you caught me before my first run of the day </s> </s> </s>
<s> <s> <s> i know </s> </s> </s>
<s> <s> <s> i know who im gonna use to <UNK> <UNK> and my <UNK> is the size of a <UNK> <UNK> to her and tell her i will not break it a <UNK> </s> </s> </s>
<s> <s> <s> that said the show was pretty <UNK> </s> </s> </s>
<s> <s> <s> you know the meeting t

In [46]:
# Train the statistical model on Jean-Ralphio Saperstein's train file and run test_model on the matching test file
create_and_train_stat_model(jean)

Model: Jean-Ralphio Saperstein
Sentences:
<s> <s> <s> <UNK> </s> </s> </s>
<s> <s> <s> that should be enough right </s> </s> </s>
<s> <s> <s> im gonna have a <UNK> <UNK> </s> </s> </s>
<s> <s> <s> i heard <UNK> of <UNK> </s> </s> </s>
<s> <s> <s> what do you say you and i get <UNK> in a <UNK> way </s> </s> </s>
<s> <s> <s> i cant do this <UNK> you </s> </s> </s>
<s> <s> <s> <UNK> should be coming to us </s> </s> </s>
<s> <s> <s> this guy <UNK> in </s> </s> </s>
<s> <s> <s> six gs </s> </s> </s>
<s> <s> <s> what the <UNK> is but when you <UNK> it can you take care of it for us </s> </s> </s>
<s> <s> <s> uhoh </s> </s> </s>
<s> <s> <s> <UNK> with <UNK> </s> </s> </s>
<s> <s> <s> no way </s> </s> </s>
<s> <s> <s> and those guys are <UNK> out <UNK> <UNK> with my <UNK> </s> </s> </s>
<s> <s> <s> that is unbelievable but listen to me listen to me </s> </s> </s>
<s> <s> <s> you <UNK> to stop by </s> </s> </s>
<s> <s> <s> <UNK> it </s> </s> </s>
<s> <s> <s> that makes <UNK> </s> </s> </s>
<s> 