<a href="https://colab.research.google.com/github/guru3/the_office_series_analysis/blob/master/The%20Office%20Transcript%20Generator.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import pickle
import random
import sys
import keras
import numpy as np
from keras.layers import LSTM, Dense
from keras.models import Sequential

Using TensorFlow backend.


In [0]:
# Load the pre-scraped transcript bundle: the parsed season map, the cleaned
# season map and the IMDB ratings, stored together as one pickled 3-item list.
# NOTE(security): pickle.load can execute arbitrary code while unpickling;
# only load a transcript pickle that comes from a trusted source.
with open('./the_office_transcript.pickle', 'rb') as transcript_file:
    # The context manager closes the file handle once the data is loaded.
    [season_map_parsed, season_map_cleaned, theOfficeIMDBRating] = pickle.load(transcript_file)

#### Let's write a dialogue generator for Michael, Dwight and Creed!

In [0]:
CHARACTERS = ['MICHAEL', 'DWIGHT', 'CREED']

# Start every tracked character with an empty list of spoken lines.
chr_dialogue_map = {name: [] for name in CHARACTERS}

# Walk season -> episode -> dialogue entry. Each entry holds the speaker at
# index 0 and the spoken text at index 1; lines by other speakers are skipped.
for episodes in season_map_parsed.values():
    for dialogues in episodes.values():
        for dialogue in dialogues:
            speaker = dialogue[0]
            if speaker in CHARACTERS:
                chr_dialogue_map[speaker].append(dialogue[1])

In [0]:
# Number of tokens in every input sequence fed to the model.
maxlen = 50
# Stride between the start positions of consecutive training sequences.
step = 3

In [0]:
def getDataForCharacter( char, dialogue_map=None, seq_len=None, stride=None ):
    """Build one-hot, character-level training data from one speaker's lines.

    Every line is cut into overlapping windows of ``seq_len`` characters taken
    every ``stride`` characters; the character right after each window is the
    prediction target.

    Args:
        char: Key of the speaker whose lines are used (e.g. ``'MICHAEL'``).
        dialogue_map: Mapping of speaker -> list of line strings. Defaults to
            the notebook-level ``chr_dialogue_map``.
        seq_len: Window length in characters. Defaults to the notebook-level
            ``maxlen``.
        stride: Distance between window starts. Defaults to the notebook-level
            ``step``.

    Returns:
        A tuple ``(x, y, char_indices, dialogues)`` where ``x`` is a boolean
        array of shape ``(n_windows, seq_len, n_chars)``, ``y`` is a boolean
        array of shape ``(n_windows, n_chars)``, ``char_indices`` maps each
        character to its one-hot position (inserted in index order) and
        ``dialogues`` is the speaker's list of lines.

    Raises:
        KeyError: If ``char`` is not a key of ``dialogue_map``.
    """
    # Fall back to the notebook globals only when the caller gave no override.
    if dialogue_map is None:
        dialogue_map = chr_dialogue_map
    if seq_len is None:
        seq_len = maxlen
    if stride is None:
        stride = step

    dialogues = dialogue_map[char]

    sentences = []   # input windows
    next_chars = []  # character following each window
    for dialogue in dialogues:
        # Lines no longer than seq_len yield an empty range and are skipped.
        for start in range(0, len(dialogue) - seq_len, stride):
            sentences.append(dialogue[start: start + seq_len])
            next_chars.append(dialogue[start + seq_len])

    # Joining with ' ' means a space is in the vocabulary whenever there are
    # at least two lines, exactly as before.
    chars = sorted(set(' '.join(dialogues)))
    # enumerate gives the same indices as chars.index() without the O(V^2) scan.
    char_indices = {symbol: index for index, symbol in enumerate(chars)}

    # The builtin bool replaces np.bool, which was removed in NumPy 1.24.
    x = np.zeros((len(sentences), seq_len, len(chars)), dtype=bool)
    y = np.zeros((len(sentences), len(chars)), dtype=bool)
    for i, sentence in enumerate(sentences):
        for t, symbol in enumerate(sentence):
            x[i, t, char_indices[symbol]] = 1
        y[i, char_indices[next_chars[i]]] = 1
    return x, y, char_indices, dialogues

In [0]:
def getModelLSTM(chars):
    """Create and compile the stacked-LSTM next-character classifier.

    The network is a 256-unit LSTM that returns full sequences, a 128-unit
    LSTM, and a softmax Dense layer with one output per vocabulary entry.
    Inputs have shape ``(maxlen, len(chars))``, where ``maxlen`` is read from
    the notebook-level variable at call time.

    Args:
        chars: The vocabulary; only its length is used.

    Returns:
        The compiled Sequential model (categorical cross-entropy loss,
        Adadelta optimizer).
    """
    vocab_size = len(chars)
    network = Sequential([
        LSTM(256, return_sequences=True, input_shape=(maxlen, vocab_size)),
        LSTM(128),
        Dense(vocab_size, activation='softmax'),
    ])
    network.compile(loss='categorical_crossentropy', optimizer='adaDelta')
    return network

In [0]:
def sample(preds, temperature=1.0):
    """Draw one index from a probability vector after temperature scaling.

    The probabilities are converted to float64, their logs are divided by
    ``temperature``, and the result is exponentiated and renormalised before
    a single multinomial draw is made.

    Args:
        preds: Sequence of probabilities (e.g. a softmax output).
        temperature: Divisor applied to the log-probabilities; values below 1
            sharpen the distribution, values above 1 flatten it.

    Returns:
        The index of the sampled entry.
    """
    scaled_logits = np.log(np.asarray(preds).astype('float64')) / temperature
    weights = np.exp(scaled_logits)  # equivalent to preds ** (1 / temperature)
    distribution = weights / np.sum(weights)
    # One trial, one experiment: the result is a 1 x n one-hot row.
    draw = np.random.multinomial(1, distribution, 1)
    return np.argmax(draw)

In [0]:
def runForCharacter( charName ):
    """Train a character-level LSTM on one speaker and print sampled text.

    Runs 6 training rounds of 20 epochs each (batch size 128). After every
    round a random seed window of ``maxlen`` characters is taken from one of
    the speaker's lines and 400 characters are generated from it at each of
    the temperatures 0.2, 0.5, 1.0 and 1.2.

    Args:
        charName: Key of the speaker, as accepted by ``getDataForCharacter``.

    Raises:
        ValueError: If none of the speaker's lines is longer than ``maxlen``,
            so no seed window can be drawn.
    """
    x, y, char_indices, dialogues = getDataForCharacter( charName )
    # Order the vocabulary by index so chars[k] is the character that the
    # model's k-th output stands for, independent of dict insertion order.
    chars = sorted(char_indices, key=char_indices.get)

    # Only lines strictly longer than maxlen can supply a full seed window.
    # Choosing among them directly avoids the old
    # random.randint(0, len(dialogues)) call, whose inclusive upper bound could
    # index one past the end, and avoids looping forever when no line qualifies.
    seed_candidates = [line for line in dialogues if len(line) > maxlen]
    if not seed_candidates:
        raise ValueError(
            'no dialogue of %r is longer than maxlen=%d' % (charName, maxlen))

    model = getModelLSTM(chars)

    for _ in range(6):
        model.fit(x, y, batch_size=128, epochs=20)

        dialogue = random.choice(seed_candidates)
        # randint is inclusive on both ends, so the window always fits.
        start_index = random.randint(0, len(dialogue) - maxlen - 1)

        for temperature in [0.2, 0.5, 1.0, 1.2]:
            generated_text = dialogue[start_index: start_index + maxlen]
            print('--- Generating with seed: "' + generated_text + '"')
            print('------ temperature:', temperature)
            sys.stdout.write(generated_text)

            # We generate 400 characters
            for _ in range(400):
                # One-hot encode the current window as a batch of one.
                sampled = np.zeros((1, maxlen, len(chars)))
                for t, char in enumerate(generated_text):
                    sampled[0, t, char_indices[char]] = 1.

                preds = model.predict(sampled, verbose=0)[0]
                next_index = sample(preds, temperature)
                next_char = chars[next_index]

                # Slide the window: drop the oldest character, add the new one.
                generated_text = generated_text[1:] + next_char

                sys.stdout.write(next_char)
            print()

        sys.stdout.flush()

In [9]:
# Train the character-level model on Michael's lines and print the generated samples.
runForCharacter('MICHAEL')

Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20
nd that is the watch that you are going to wear? N
--- Generating with seed: "nd that is the watch that you are going to wear? N"
------ temperature: 0.2
nd that is the watch that you are going to wear? Now this the and and the wat in the a do he to the to the the ke to the to the want the and the and the the the kere the than that and the and the se to go the a bane wo he to go the me to the wank hat and and the and and wase I go the and the wase and what wase the and and we the wes and and the were ho he the a want the a to the and wowe want the to to and and of and the the the and of an have wo
nd that is the watch that you are going to wear? N
--- Generating with seed: "nd that is the watch that you are going to wear? N"
------ temperature: 0.5
nd that is

### Alright, after some tweaking of the model structure, we finally got something that is close to making sense, yet still far from it!
#### Example : 
"ke it's hot. Forward it like it's hot. "Old Schoole, I'm goung to sreat it at out out like me... I'm not! we, gotan. I'm and n't eace?... I'm not goy forgt. . I would toll you that.... I wout't gunting it outfreaver."
#### MANUALLY (and poorly) corrected to:
"ke it's hot. Forward it like it's hot. "Old School, I'm young to sweat it at out out like me... I'm not! we, gotan. I'm and n't eace?... I'm not guy <who> forgets. . I would tell you that.... I wouldn't gunting it outforever. "

#### Other Example :
I dnow. No, don'l whave I and overyy? doont, I manz ha1p topre soreace.... Mim.... I wantte furt2n the kus'me the youc. 

#### MANUALLY (and poorly) corrected to:
I know. No, don't have I and over? dont, I may he1p to pre soreace.... Mim.... I wantted further the kus'me the your.

#### Not satisfactory enough though :(

#### Let's attempt to generate the transcript itself! This time we will use words, rather than characters, as tokens.

In [0]:
# Word-level context: 5 words per input sequence.
# This rebinds the notebook-level `maxlen`, which the character-level cells set to 50.
maxlen = 5

def getData(season_map=None, seq_len=None, stride=None):
    """Build one-hot, word-level training data from every line of the show.

    Each line is split on whitespace and cut into overlapping windows of
    ``seq_len`` words taken every ``stride`` words; the word right after each
    window is the prediction target. The vocabulary consists of the words
    that occur in at least one window or target.

    Args:
        season_map: Mapping season -> episode -> list of dialogue entries,
            where index 1 of each entry is the spoken text. Defaults to the
            notebook-level ``season_map_cleaned``.
        seq_len: Window length in words. Defaults to the notebook-level
            ``maxlen``.
        stride: Distance between window starts. Defaults to the notebook-level
            ``step``.

    Returns:
        A tuple ``(x, y, word_indices, transcripts)`` where ``x`` is a boolean
        array of shape ``(n_windows, seq_len, n_words)``, ``y`` is a boolean
        array of shape ``(n_windows, n_words)``, ``word_indices`` maps each
        word to its one-hot position (inserted in index order) and
        ``transcripts`` is the list of tokenised lines.
    """
    # Fall back to the notebook globals only when the caller gave no override.
    if season_map is None:
        season_map = season_map_cleaned
    if seq_len is None:
        seq_len = maxlen
    if stride is None:
        stride = step

    transcripts = [
        dialogue[1].split()
        for episodes in season_map.values()
        for dialogues in episodes.values()
        for dialogue in dialogues
    ]

    sentences = []   # input windows
    next_words = []  # word following each window
    for words in transcripts:
        # Lines no longer than seq_len yield an empty range and are skipped.
        for start in range(0, len(words) - seq_len, stride):
            sentences.append(words[start: start + seq_len])
            next_words.append(words[start + seq_len])

    # Collect the vocabulary in a set; repeatedly concatenating lists and then
    # calling list.index() per word was quadratic in the corpus size.
    vocabulary = set(next_words)
    for sentence in sentences:
        vocabulary.update(sentence)
    all_words = sorted(vocabulary)
    word_indices = {word: index for index, word in enumerate(all_words)}

    # The builtin bool replaces np.bool, which was removed in NumPy 1.24.
    x = np.zeros((len(sentences), seq_len, len(all_words)), dtype=bool)
    y = np.zeros((len(sentences), len(all_words)), dtype=bool)
    for i, sentence in enumerate(sentences):
        for t, word in enumerate(sentence):
            x[i, t, word_indices[word]] = 1
        y[i, word_indices[next_words[i]]] = 1
    return x, y, word_indices, transcripts

In [0]:
def runTranscript():
    """Train a word-level LSTM on the whole transcript and print sampled text.

    Runs 6 training rounds of 20 epochs each (batch size 512). After every
    round a random seed window is taken from one tokenised line and 400 words
    are generated from it at each of the temperatures 0.2, 0.5, 1.0 and 1.2.

    Raises:
        ValueError: If no line is longer than the window length, so no seed
            window can be drawn.
    """
    x, y, word_indices, dialogues = getData()
    # Order the vocabulary by index so words[k] is the word that the model's
    # k-th output stands for, independent of dict insertion order.
    words = sorted(word_indices, key=word_indices.get)
    # Take the window length from the data itself so the model's input shape
    # always matches x, instead of repeating a hard-coded 5 here.
    maxlen = x.shape[1]

    # Only lines strictly longer than maxlen can supply a full seed window.
    # Choosing among them directly avoids the old
    # random.randint(0, len(dialogues)) call, whose inclusive upper bound could
    # index one past the end, and avoids looping forever when no line qualifies.
    seed_candidates = [line for line in dialogues if len(line) > maxlen]
    if not seed_candidates:
        raise ValueError('no dialogue is longer than maxlen=%d' % maxlen)

    model = Sequential()
    model.add( LSTM(256, input_shape=(maxlen, len(words))))
    model.add( Dense(len(words), activation='softmax'))
    model.compile(loss='categorical_crossentropy', optimizer='adaDelta')

    for _ in range(6):
        model.fit(x, y, batch_size=512, epochs=20)

        dialogue = random.choice(seed_candidates)
        # randint is inclusive on both ends, so the window always fits.
        start_index = random.randint(0, len(dialogue) - maxlen - 1)

        for temperature in [0.2, 0.5, 1.0, 1.2]:
            # Slicing copies, so every temperature starts from the same seed.
            generated_text = dialogue[start_index: start_index + maxlen]
            print('--- Generating with seed: "' + ' '.join(generated_text) + '"')
            print('------ temperature:', temperature)
            sys.stdout.write(' '.join(generated_text))

            # We generate 400 words
            for _ in range(400):
                # One-hot encode the current window as a batch of one.
                sampled = np.zeros((1, maxlen, len(words)))
                for t, word in enumerate(generated_text):
                    sampled[0, t, word_indices[word]] = 1.

                preds = model.predict(sampled, verbose=0)[0]
                next_index = sample(preds, temperature)
                next_word = words[next_index]

                # Slide the window: drop the oldest word, add the new one.
                generated_text = generated_text[1:] + [next_word]

                sys.stdout.write(' ' + next_word)
            print('\n\n')

        sys.stdout.flush()

In [61]:
# Train the word-level model on the full transcript and print the generated samples.
runTranscript()

Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20
--- Generating with seed: "i really dont like spending"
------ temperature: 0.2
i really dont like spending i i i like i i i i na just i just i i i know na i i just just i i i i i i i just i i i just i like i i just i i na i just i just na like i i i i i just i just i i just i just i na i just like just i just just just na i like i i just i i i like i i na i just i i i i i i i just i like i i i i i i i just just i i i i just i just i i na just na i i i i i just i i i just i just i like just i i like like i i just like just like just i i like just just i na just just like just i just just just i i just i na just just just i i just i i just just just just i i just i i just just like i i just i i i i i i na just just just just just like just just just i like i i

In [0]:
 ### Not much success there....