# TV script generator

Generate TV script from the Simpsons.

This is longer than the original as I've added intermediate steps. Helper functions are incorporated here rather than being in a separate file. 

In [3]:
import os
import pickle
from collections import Counter

import numpy as np

### Get data

In [7]:
data_file = './data/simpsons/moes_tavern_lines.txt'

with open(data_file) as inf:
     text= inf.read()

# Ignore the "notice" section
text = text[81:]
text[:100]

"Moe_Szyslak: (INTO PHONE) Moe's Tavern. Where the elite meet to drink.\nBart_Simpson: Eh, yeah, hello"

### Explore different parts of the data

In [9]:
view_sentence_range = (0, 10)

In [17]:
# original:
#num_unique_words = len({word: None for word in text.split()})
#print('Roughly the number of unique words: {}'.format()

## Here we use a counter to find the size of the vocab

wc = Counter(text.lower().split())

print('Vocab:', len(wc))
print('Total words:', sum(wc.values()))
print('Common words:', wc.most_common(10), '...')

Vocab: 10347
Total words: 48974
Common words: [('the', 1276), ('i', 1242), ('moe_szyslak:', 1180), ('you', 1069), ('a', 1043), ('homer_simpson:', 975), ('to', 847), ('and', 606), ('of', 480), ('my', 467)] ...


In [18]:
scenes = text.split('\n\n')
print('Number of scenes: {}'.format(len(scenes)))

Number of scenes: 262


In [19]:
sentence_count_scene = [scene.count('\n') for scene in scenes]
print('Average number of sentences in each scene: {}'.format(np.average(sentence_count_scene)))

Average number of sentences in each scene: 15.248091603053435


In [20]:
sentences = [sentence for scene in scenes for sentence in scene.split('\n')]
print('Number of lines: {}'.format(len(sentences)))

Number of lines: 4257


In [21]:
word_count_sentence = [len(sentence.split()) for sentence in sentences]
print('Average number of words in each line: {}'.format(np.average(word_count_sentence)))

Average number of words in each line: 11.50434578341555


In [24]:
print()
print('Sentence {} to {}:'.format(*view_sentence_range))
print()
print('\n'.join(text.split('\n')[view_sentence_range[0]:view_sentence_range[1]]))


Sentence 0 to 10:

Moe_Szyslak: (INTO PHONE) Moe's Tavern. Where the elite meet to drink.
Bart_Simpson: Eh, yeah, hello, is Mike there? Last name, Rotch.
Moe_Szyslak: (INTO PHONE) Hold on, I'll check. (TO BARFLIES) Mike Rotch. Mike Rotch. Hey, has anybody seen Mike Rotch, lately?
Moe_Szyslak: (INTO PHONE) Listen you little puke. One of these days I'm gonna catch you, and I'm gonna carve my name on your back with an ice pick.
Moe_Szyslak: What's the matter Homer? You're not your normal effervescent self.
Homer_Simpson: I got my problems, Moe. Give me another one.
Moe_Szyslak: Homer, hey, you should not drink to forget your problems.
Barney_Gumble: Yeah, you should only drink to enhance your social skills.




### Convert vocab to integer representation

In [38]:
# Create vocab_to_int and int_to_vocab.
# This also sorts from most frequent to least frequent.
# For example, integer 0 represents the most frequently used vocab.

vocab_to_int = {}
int_to_vocab = {}

for i, (word, cnt) in enumerate(wc.most_common()):
    vocab_to_int[word] = i
    int_to_vocab[i] = word

In [40]:
int_to_vocab

{0: 'the',
 1: 'i',
 2: 'moe_szyslak:',
 3: 'you',
 4: 'a',
 5: 'homer_simpson:',
 6: 'to',
 7: 'and',
 8: 'of',
 9: 'my',
 10: 'in',
 11: 'that',
 12: 'is',
 13: 'this',
 14: "i'm",
 15: 'your',
 16: 'for',
 17: 'it',
 18: 'lenny_leonard:',
 19: 'me',
 20: 'on',
 21: 'with',
 22: 'all',
 23: 'just',
 24: 'hey,',
 25: 'what',
 26: 'barney_gumble:',
 27: 'but',
 28: 'like',
 29: 'be',
 30: 'carl_carlson:',
 31: "don't",
 32: "it's",
 33: 'have',
 34: 'got',
 35: 'get',
 36: 'we',
 37: 'was',
 38: "that's",
 39: 'oh,',
 40: 'are',
 41: "you're",
 42: 'so',
 43: 'can',
 44: 'yeah,',
 45: 'do',
 46: 'not',
 47: 'well,',
 48: 'at',
 49: '/',
 50: 'out',
 51: 'how',
 52: 'one',
 53: 'no',
 54: 'if',
 55: 'uh,',
 56: 'marge_simpson:',
 57: 'about',
 58: 'gonna',
 59: '--',
 60: 'up',
 61: 'homer,',
 62: 'he',
 63: 'they',
 64: 'moe,',
 65: 'who',
 66: 'an',
 67: 'moe.',
 68: 'oh',
 69: 'his',
 70: 'from',
 71: 'now',
 72: 'some',
 73: 'know',
 74: 'little',
 75: 'as',
 76: "i'll",
 77: 'here'

In [39]:
vocab_to_int

{'the': 0,
 'i': 1,
 'moe_szyslak:': 2,
 'you': 3,
 'a': 4,
 'homer_simpson:': 5,
 'to': 6,
 'and': 7,
 'of': 8,
 'my': 9,
 'in': 10,
 'that': 11,
 'is': 12,
 'this': 13,
 "i'm": 14,
 'your': 15,
 'for': 16,
 'it': 17,
 'lenny_leonard:': 18,
 'me': 19,
 'on': 20,
 'with': 21,
 'all': 22,
 'just': 23,
 'hey,': 24,
 'what': 25,
 'barney_gumble:': 26,
 'but': 27,
 'like': 28,
 'be': 29,
 'carl_carlson:': 30,
 "don't": 31,
 "it's": 32,
 'have': 33,
 'got': 34,
 'get': 35,
 'we': 36,
 'was': 37,
 "that's": 38,
 'oh,': 39,
 'are': 40,
 "you're": 41,
 'so': 42,
 'can': 43,
 'yeah,': 44,
 'do': 45,
 'not': 46,
 'well,': 47,
 'at': 48,
 '/': 49,
 'out': 50,
 'how': 51,
 'one': 52,
 'no': 53,
 'if': 54,
 'uh,': 55,
 'marge_simpson:': 56,
 'about': 57,
 'gonna': 58,
 '--': 59,
 'up': 60,
 'homer,': 61,
 'he': 62,
 'they': 63,
 'moe,': 64,
 'who': 65,
 'an': 66,
 'moe.': 67,
 'oh': 68,
 'his': 69,
 'from': 70,
 'now': 71,
 'some': 72,
 'know': 73,
 'little': 74,
 'as': 75,
 "i'll": 76,
 'here': 77

In [49]:
## make it into a function suitable for the original project

def create_lookup_tables(text):
    """
    Create lookup tables for vocabulary
    :param text: The text of tv scripts split into words
    :return: A tuple of dicts (vocab_to_int, int_to_vocab)
    """
    wc = Counter(text)
    
    vocab_to_int = {}
    int_to_vocab = {}
    
    for i, (word, cnt) in enumerate(wc.most_common()):
        vocab_to_int[word] = i
        int_to_vocab[i] = word
        
    return vocab_to_int, int_to_vocab

In [50]:
## quick test

text = "today is an interesting day, is it not?"
text_tokens = text.lower().split()
v2i, i2v = create_lookup_tables(text_tokens)

In [51]:
v2i

{'an': 2, 'day,': 4, 'interesting': 3, 'is': 0, 'it': 5, 'not?': 6, 'today': 1}

In [52]:
i2v

{0: 'is', 1: 'today', 2: 'an', 3: 'interesting', 4: 'day,', 5: 'it', 6: 'not?'}

In [53]:
## test with given unit test

import problem_unittests as tests
tests.test_create_lookup_tables(create_lookup_tables)

Tests Passed


## tokenize punctuations

We need to distinguish between "bye" and "bye!".

So we will translate punctuations to tokens. For example, change "!" into "||Exclamation_Mark||". 

* Period ( . )
* Comma ( , )
* Quotation Mark ( " )
* Semicolon ( ; )
* Exclamation mark ( ! )
* Question mark ( ? )
* Left Parentheses ( ( )
* Right Parentheses ( ) )
* Dash ( -- )
* Return ( \n )

In [64]:
## the string package has a pre-defined list of punctuations

import string
type(string.punctuation), string.punctuation

(str, '!"#$%&\'()*+,-./:;<=>?@[\\]^_`{|}~')

In [58]:
for p in string.punctuation:
    print(p, '-->', '||'+p+'||')

! --> ||!||
" --> ||"||
# --> ||#||
$ --> ||$||
% --> ||%||
& --> ||&||
' --> ||'||
( --> ||(||
) --> ||)||
* --> ||*||
+ --> ||+||
, --> ||,||
- --> ||-||
. --> ||.||
/ --> ||/||
: --> ||:||
; --> ||;||
< --> ||<||
= --> ||=||
> --> ||>||
? --> ||?||
@ --> ||@||
[ --> ||[||
\ --> ||\||
] --> ||]||
^ --> ||^||
_ --> ||_||
` --> ||`||
{ --> ||{||
| --> |||||
} --> ||}||
~ --> ||~||


In [72]:
## create a dictionary according to project

def token_lookup():
    """
    Generate a dict to turn punctuation into a token.
    :return: Tokenize dictionary where the key is the punctuation and the value is the token
    """
    #import string
    punct_to_token = {'.':'||PERIOD||', 
                      ',':'||COMMA||', 
                      '"':'||QUOTE||', 
                      ';':'||SEMICOLON||', 
                      '!':'||EXCMARK||', 
                      '?':'||Q||', 
                      '(':'||OPENP||', 
                      ')':'||CLOSEP||', 
                      '--':'||DASH||',
                      '\n':'||NEWLINE||'}
    return punct_to_token

In [75]:
# quick test
token_dict = token_lookup()
token_dict

{'\n': '||NEWLINE||',
 '!': '||EXCMARK||',
 '"': '||QUOTE||',
 '(': '||OPENP||',
 ')': '||CLOSEP||',
 ',': '||COMMA||',
 '--': '||DASH||',
 '.': '||PERIOD||',
 ';': '||SEMICOLON||',
 '?': '||Q||'}

In [74]:
## test with built-in test
tests.test_tokenize(token_lookup)

Tests Passed


## Preprocess data

In [76]:
for key, token in token_dict.items():
    text = text.replace(key, ' {} '.format(token))

text = text.lower()
text = text.split()

vocab_to_int, int_to_vocab = create_lookup_tables(text)
int_text = [vocab_to_int[word] for word in text]

with open('preprocess.p', 'wb') as outf:
    pickle.dump((int_text, vocab_to_int, int_to_vocab, token_dict), outf)

In [78]:
## check current directory
os.listdir('.')

['preprocess.p',
 '.ipynb_checkpoints',
 'tv_script_generator.ipynb',
 '__pycache__',
 'problem_unittests.py',
 'data']

# Check Point
Simply reload saved files from this point forward.

In [79]:
with open('preprocess.p', mode='rb') as inf:
    int_text, vocab_to_int, int_to_vocab, token_dict = \
        pickle.load(inf)
        
len(int_text), len(vocab_to_int), len(int_to_vocab), len(token_dict)

(10, 9, 9, 10)

## Build NN