# Parks and Recreation Tweet Generator

### Evan Hiroshige, Kira Traynor, Mirza Ahmed

#### Running Notebook
1. Ensure that "sorted_name_all.csv" is located in the same directory as this notebook
2. Run notebook from top to bottom

In [1]:
import csv
import re
import random

In [2]:
# Full names of the supported characters; load_character compares these
# against the first column of the dialogue csv
leslie = "Leslie Knope"
tom = "Tom Haverford"
april = "April Ludgate"
ron = "Ron Swanson"
perd = "Perd Hapley"
chris = "Chris Traeger"
jean = "Jean-Ralphio Saperstein"

# Every character that data files are generated for, in processing order
characters = [
    leslie,
    tom,
    april,
    ron,
    perd,
    chris,
    jean,
]

In [3]:
def load_character(character, s_token_count):
    """
        Loads and cleans the dialogue of one character from "sorted_name_all.csv"

        The first csv row is skipped as a header. For every other row whose first column
        equals character, the second column is lowercased, split on whitespace and stripped
        of all non-alphanumeric characters; words that end up empty are dropped.
        Rows with fewer than two columns (e.g. blank lines) are ignored.
        Each sentence is padded with s_token_count of start and end sentence tokens
        e.g. s_token_count = 2 => [<s> <s> w1 ....  </s> </s>]
        The sentences are shuffled with a fixed seed so the data can be broken up into
        training and testing sets where data is evenly spread over the entire series
        Returns shuffled formatted sentences (a list of lists of tokens)
    """
    # The context manager closes the file even if reading fails part-way through
    with open("sorted_name_all.csv", newline='') as all_chars:
        reader = csv.reader(all_chars, delimiter=",", quotechar='"')
        next(reader, None)  # skip the header row
        lines = [row[1] for row in reader if len(row) > 1 and row[0] == character]

    start = ["<s>"] * s_token_count
    end = ["</s>"] * s_token_count
    non_alnum = re.compile('[^A-Za-z0-9]+')  # compiled once, applied to every word

    sentences = []
    for line in lines:
        stripped = (non_alnum.sub('', word) for word in line.lower().split())
        # Words made only of punctuation become empty strings and are dropped
        sentences.append(start + [word for word in stripped if word] + end)

    # Fixed seed keeps the shuffle (and so the later train/test split) reproducible.
    # Note: this reseeds the global `random` module, exactly as before.
    random.seed(0)
    random.shuffle(sentences)
    return sentences

In [4]:
# Create training/testing files for statistical model

# Order of the n-gram model: sentences are padded with NGRAM - 1 start/end tokens
# when the data files are created, and NGRAM is passed to LanguageModel later on
NGRAM = 4

def create_data_txt(data, character):
    """
        Creates training and testing data for a statistical model

        data is a list of sentences, each a list of tokens. The first two thirds are written
        to 'fourgram-<character>-train.txt' and the rest to 'fourgram-<character>-test.txt',
        one sentence per line with tokens separated by single spaces.
        Existing files with those names are overwritten.
    """
    split = 2 * len(data) // 3
    partitions = (("train", data[:split]), ("test", data[split:]))
    for suffix, sentences in partitions:
        # `with` flushes and closes each file; the previous `f.close` (no call
        # parentheses) never actually closed them
        with open(f'fourgram-{character}-{suffix}.txt', "w") as f:
            f.writelines(" ".join(sentence) + '\n' for sentence in sentences)
    
    
def create_statistical_files():
    """
        Writes the train/test text files of the statistical model for every character
        Sentences are padded with NGRAM - 1 start and end tokens
    """
    pad_width = NGRAM - 1
    for name in characters:
        create_data_txt(load_character(name, pad_width), name)


create_statistical_files()

# RNN Language Model

In [5]:
from keras.preprocessing.text import Tokenizer
from keras.utils import to_categorical
from keras.models import Sequential
from keras.layers import Dense
from keras.layers import SimpleRNN, LSTM
from keras.layers import Embedding
from keras.metrics import TopKCategoricalAccuracy
from keras.initializers import Constant
from keras.preprocessing.sequence import pad_sequences
from keras.models import load_model
import numpy as np 

def get_tokenizer_and_encoded(data):
    """
        Builds a keras Tokenizer fitted on the given data and uses it to encode that data
        Returns (tokenizer, encoded data)
    """
    fitted = Tokenizer()
    fitted.fit_on_texts(data)
    return fitted, fitted.texts_to_sequences(data)

In [6]:

def get_training_data(tokenizer, encoded):
    """
        Creates training and testing data from encoded sentences

        Every sentence [w1 .. wN] is expanded into next-word samples: for k = 1 .. N-1 the
        input is the prefix w1..wk and the target is w(k+1). Sentences with fewer than two
        tokens produce no samples.
        All inputs are padded on the left with zeroes to the length of the longest prefix
        Targets are one-hot encoded over len(tokenizer.word_index) + 1 classes
        Training data is the first 75% of the samples, testing data the remaining 25%
        Returns (trainX, trainY, testX, testY)
        Raises ValueError if no sample can be built from encoded
    """
    vocab_size = len(tokenizer.word_index) + 1

    x = []
    y = []
    for encoded_sent in encoded:
        for k in range(1, len(encoded_sent)):
            x.append(encoded_sent[:k])
            y.append(encoded_sent[k])

    if not x:
        # Previously this surfaced as an opaque "max() arg is an empty sequence"
        raise ValueError("no training samples: every encoded sentence has fewer than 2 tokens")

    maxlen = max(len(prefix) for prefix in x)
    # One batched call instead of one pad_sequences call per prefix
    x = pad_sequences(x, maxlen=maxlen, padding='pre')
    y = to_categorical(np.array(y), num_classes=vocab_size)

    # x and y have the same length, so a single split index serves both
    split = 3 * len(x) // 4
    return (x[:split], y[:split], x[split:], y[split:])



In [7]:
def create_model(vocab_size, maxlen):
    """
        Builds the (uncompiled) RNN language model:
        Embedding (200 dimensions) -> LSTM (400 units) -> softmax Dense over vocab_size
    """
    layers = [
        Embedding(vocab_size, 200, input_length=maxlen),
        LSTM(400),
        Dense(vocab_size, activation='softmax'),
    ]
    network = Sequential()
    for layer in layers:
        network.add(layer)
    return network

In [8]:
def train_model(trainX, trainY, model):
    """
        Compiles the given model (categorical cross-entropy, adam, top-5 accuracy and accuracy)
        and fits it on the given data for 25 epochs with batch size 256
        Returns the same model
    """
    model.compile(
        loss='categorical_crossentropy',
        optimizer='adam',
        metrics=[TopKCategoricalAccuracy(k=5), "accuracy"],
    )
    model.fit(trainX, trainY, epochs=25, batch_size=256)
    return model

In [9]:
def index_to_word(tokenizer):
    """
        Inverts tokenizer.word_index: returns a dictionary mapping word index to word
    """
    return {position: token for token, position in tokenizer.word_index.items()}

def generate_text(seed, sentence_length, model, maxlen, n, tokenizer, index_to_word):
    """
        Generates n sentences, each started from the given seed text.
        A sentence grows by at most sentence_length sampling steps: the current text is
        encoded with the tokenizer, left-padded to maxlen, and the next word is sampled from
        the model's predicted distribution. Growth stops early on "</s>" or once the text is
        longer than 147 characters. Sentences shorter than 15 characters are discarded and
        generated again.
        Returns the list of sentences
    """
    sentences = []
    while len(sentences) < n:
        current = seed
        for _ in range(sentence_length):
            token_ids = [tokenizer.word_index[token] for token in current.split()]
            padded = pad_sequences([token_ids], maxlen=maxlen, padding='pre')
            probabilities = model.predict([padded])[0]
            choice = np.random.choice(len(probabilities), p=probabilities)

            # Index 0 has no word (presumably the padding index); the step is spent anyway
            if choice == 0:
                continue

            next_word = index_to_word[choice]
            current = current + " " + next_word
            if next_word == "</s>" or len(current) > 147:
                break
        if len(current) >= 15:
            sentences.append(current)
    return sentences

In [10]:
def save(character, sentences):
    """
        Saves generated sentences to file for specific character
        Writes one sentence per line to '<character>-rnn-sentences.txt', overwriting any
        existing file
    """
    # `with` closes the file even if a write fails (e.g. a non-string sentence)
    with open(f'{character}-rnn-sentences.txt', "w") as f:
        f.writelines(sentence + '\n' for sentence in sentences)

def create_and_train_model(character):
    """
        Runs the full RNN pipeline for one character: loads the dialogue, trains a model,
        evaluates it on the held-out data, then generates, saves and prints 50 sentences
    """
    print("CHARACTER:", character)

    # Prepare the data (one start and one end token per sentence)
    sentences = load_character(character, 1)
    tokenizer, encoded = get_tokenizer_and_encoded(sentences)
    trainX, trainY, testX, testY = get_training_data(tokenizer, encoded)
    vocab_size = len(tokenizer.word_index) + 1
    maxlen = len(trainX[0])

    # Build and fit the network
    model = create_model(vocab_size, maxlen)
    train_model(trainX, trainY, model)

    # Evaluate on the held-out samples
    results = model.evaluate(testX, testY, batch_size=256)
    print(f'Results - Loss: {results[0]}, Top-K Accuracy: {results[1]}, Accuracy:{results[2]} ')

    # Sample sentences starting from the start token, then persist and show them
    generated = generate_text(
        "<s>", 40, model, maxlen, 50, tokenizer, index_to_word(tokenizer)
    )
    save(character, generated)
    print(generated)

In [None]:
# Intentionally breaking up model creation and training for each character 
# Easier to read and interpret the results
create_and_train_model(leslie)

CHARACTER: Leslie Knope
Epoch 1/25
Epoch 2/25

In [None]:
create_and_train_model(tom)

In [None]:
create_and_train_model(april)

In [None]:
create_and_train_model(ron)

In [None]:
create_and_train_model(perd)

In [None]:
create_and_train_model(chris)

In [None]:
create_and_train_model(jean)

# Statistical Model

In [None]:
from statistical_model import LanguageModel, test_model

def create_and_train_stat_model(character):
    """
        Builds a statistical language model of order NGRAM (4-gram) for the given character,
        trains it on the character's training file and hands it to test_model together with
        the character's testing file
        (per the original notes, test_model tests the model, calculates probability and
        std dev, and generates 50 sentences - see statistical_model for details)
    """
    model = LanguageModel(NGRAM, True)
    train_path = f'fourgram-{character}-train.txt'
    model.train(train_path)
    test_path = f'fourgram-{character}-test.txt'
    test_model(model, test_path, character)

In [None]:
# Again, separated each model into own cell to improve readability
create_and_train_stat_model(leslie)

In [None]:
create_and_train_stat_model(tom)

In [None]:
create_and_train_stat_model(april)

In [None]:
create_and_train_stat_model(perd)

In [None]:
create_and_train_stat_model(ron)

In [None]:
create_and_train_stat_model(chris)

In [None]:
create_and_train_stat_model(jean)