In [1]:
import numpy as np
import pandas as pd

import warnings
warnings.filterwarnings("ignore")

In [2]:
import os
files = []
for i in os.listdir('data/'):
    if i.endswith('.txt'):
        files.append(i)



In [3]:
df = pd.DataFrame(columns=['dialogue'])
for i in files:
    print(i)
    df_loop = pd.read_table('data/'+i, sep="\n", header=None)
    df_loop = df_loop.rename(columns={0:'dialogue'})
    df = df.append(df_loop)


3.txt
10.txt
2_17.txt
21.txt
2_18.txt
11.txt
2_22.txt
2.txt
2_24.txt
5.txt
2_16.txt
2_11.txt
2_6.txt
15.txt
2_4.txt
19.txt
13.txt
2_20.txt
1.txt
2_1.txt
2_9.txt
6.txt
20.txt
24.txt
23.txt
2_14.txt
2_3.txt
14.txt
8.txt
12.txt
2_19.txt
2_12.txt
2_23.txt
2_21.txt
16.txt
2_7.txt
2_5.txt
7.txt
17.txt
22.txt
2_2.txt
9.txt
18.txt
4.txt
2_15.txt


In [4]:
df.head(2)

Unnamed: 0,dialogue
0,Phoebe: (entering) Hi guys!
1,"All: Hey, Pheebs! Hi!"


In [5]:
import re

def delete_blank(x):
    if x=='':
        return None
    else:
        return x
df.dialogue = df.dialogue.map(delete_blank)

In [6]:
df = df.dropna().reset_index(drop=True)

In [7]:
def delete_parentheses(x):
    return re.sub(r'\(.*\)|\[.*\]', '', x)


In [8]:
df.dialogue = df.dialogue.map(lambda x: delete_parentheses(x))
df.dialogue = df.dialogue.map(delete_blank)
df = df.dropna().reset_index(drop=True)

In [9]:
def separate_punctuation(x):
    x = re.sub(r'\.',' . ', x)
    x = re.sub(r'\,',' , ', x)
    x = re.sub(r'\!',' ! ', x)
    x = re.sub(r'\?',' ? ', x)
    x = re.sub(r'\:',' : ', x) 
    return x

def delete_large_spaces(x):
    return re.sub(r'\s{2,}', ' ', x)

In [10]:
df.dialogue = df.dialogue.map(lambda x: separate_punctuation(x))

In [11]:
df.dialogue = df.dialogue.str.lower()

In [12]:
def change_apostrophe(x):
    x = re.sub("let's", 'let us', x)
    x = re.sub("c'mon", 'come on', x)
    x = re.sub("there's", 'there is', x)
    x = re.sub("you're", 'you are', x)
    x = re.sub("we're", 'we are', x)
    x = re.sub("i'm", 'i am', x)
    x = re.sub("y'", 'you', x)
    x = re.sub("how'd", 'how did', x)
    
    return x

In [13]:
df.dialogue = df.dialogue.map(change_apostrophe)

In [14]:
def delete_no_dialogue(x):
    if ':' in x:
        return x

In [15]:
df.head()

Unnamed: 0,dialogue
0,phoebe : hi guys !
1,"all : hey , pheebs ! hi !"
2,"ross : hey . oh , oh , how did it go ?"
3,"phoebe : um , not so good . he walked me to..."
4,all : ohh . ouch .


In [16]:
df.dialogue = df.dialogue.map(delete_no_dialogue)

In [17]:
df = df.dropna().reset_index(drop=True)

# Open glove and tokenize-pad

In [18]:
def loadGloveModel(gloveFile):
    print("Loading Glove Model")
    f = open(gloveFile,'r')
    model = {}
    for line in f:
        splitLine = line.split()
        word = splitLine[0]
        embedding = np.array([float(val) for val in splitLine[1:]])
        model[word] = embedding
    print( "Done.",len(model)," words loaded!")
    return model

In [19]:
model_glove = loadGloveModel('../glove/glove.6B.100d.txt')

Loading Glove Model
Done. 400000  words loaded!


In [20]:
glove_df = pd.DataFrame(model_glove).T

unk_pad_df = pd.DataFrame(columns=glove_df.columns)
unk_pad_df.loc['<PAD>'] = np.zeros(glove_df.shape[1])
unk_pad_df.loc['<UNK>'] = glove_df.mean()

glove_unk_df = pd.concat([unk_pad_df,glove_df])

#### for memory reasons, use just the vocabulary from friends

In [21]:
script_string = list(df.as_matrix().flatten())
vocab_friends = pd.Series(''.join(script_string).split()).drop_duplicates().reset_index(drop=True)
vocab_friends = pd.Series(['<PAD>', '<UNK>']).append(vocab_friends)


In [22]:
glove_unk_friends_df = glove_unk_df[glove_unk_df.index.isin(vocab_friends)]


In [23]:
voc_df = pd.DataFrame(glove_unk_friends_df.index, columns=['voc']).reset_index()
voc_df.voc = voc_df.voc.str.lower()
voc_df = voc_df.set_index('voc')
voc_dic = voc_df.to_dict()['index']

### Tokenize and pad

In [31]:
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences


class TokenizerCustom(Tokenizer):
    def __init__(self, voc, max_len=20, *args, **kwargs):
        super(TokenizerCustom, self).__init__(*args, **kwargs)
        self.max_len = max_len
        self.word_index = voc
        self.oov_token = '<unk>'
        self.filters = '#$%&()*+-/<=>@[\]^_`{|}~'
    
    def pad_string(self, x):
        return pad_sequences(x, maxlen=self.max_len)
    
    def tokenize_string(self, x):
        tok_str = self.texts_to_sequences(pd.Series(x).values)
        return self.pad_string(tok_str)[0]

        

In [32]:
# test unk
tc = TokenizerCustom(voc=voc_dic, oov_token = voc_dic['<unk>'])
tc.tokenize_string("skdjfnjf")

array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1],
      dtype=int32)

##### tokenize data

In [33]:
df_tok = df.copy()
df_tok.dialogue = df_tok.dialogue.map(lambda x: tc.tokenize_string(x))

# Train model

In [34]:
X = df_tok.dialogue[:len(df_tok)-1]
X = np.array(X.tolist())
y = df_tok.shift(-1).dialogue[:len(df_tok)-1]
y = np.array(y.tolist())

In [35]:
max_seq_len = len(X[0])
emb_dim = glove_unk_friends_df.shape[1]
vocab_dim = glove_unk_friends_df.shape[0]

In [36]:
y.shape

(11243, 20)

In [37]:
from keras.utils import to_categorical

# one hot encode target sequence
def encode_output(sequences, vocab_size):
    ylist = list()
    for sequence in sequences:
        encoded = to_categorical(sequence, num_classes=vocab_size)
        ylist.append(encoded)
    y = np.array(ylist)
    print(y.shape)
    y = y.reshape(sequences.shape[0], vocab_size, sequences.shape[1])
    return y

y_enc = encode_output(y, vocab_dim)

MemoryError: 

In [38]:
y_enc = y

In [None]:
def encode_output(sequences, vocab_size):
    ylist = list()
    for sequence in sequences:
        encoded = to_categorical(sequence, num_classes=vocab_size)
        ylist.append(encoded)
    y = np.array(ylist)
    print(y.shape)
    return y

y_enc = encode_output(y, vocab_dim)

In [58]:
from keras.models import Model
from keras.layers import Embedding, Input, Dense, LSTM, Dropout, RepeatVector, Flatten
from keras.layers.wrappers import TimeDistributed

from keras.callbacks import EarlyStopping
from keras.optimizers import Adam
from keras import regularizers



inp = Input(shape=(max_seq_len,))
emb = Embedding(vocab_dim, emb_dim, weights=[glove_unk_friends_df], 
                input_length=max_seq_len, trainable=False)(inp)
lstm_in = LSTM(200, return_sequences=True)(emb)

lstm_out = LSTM(200)(lstm_in)
#d1 = Dense(400, activation='relu')(lstm_out)
#d2 = Dense(100, activation='relu')(d1)
out  = Dense(max_seq_len, activation='relu')(lstm_out)

'''

rep_vec = RepeatVector(vocab_dim)(lstm_in)
lstm_out = LSTM(256, return_sequences=True)(rep_vec)
out = TimeDistributed(Dense(max_seq_len, activation='softmax'))(lstm_out)

'''

"\n\nrep_vec = RepeatVector(vocab_dim)(lstm_in)\nlstm_out = LSTM(256, return_sequences=True)(rep_vec)\nout = TimeDistributed(Dense(max_seq_len, activation='softmax'))(lstm_out)\n\n"

In [69]:
earlystop = EarlyStopping(monitor='acc', min_delta=0.001, patience=5)
model = Model(inputs=inp, outputs=out)
model.compile(loss='mean_squared_error', optimizer=Adam(lr=0.0001), metrics=['accuracy'])

In [70]:
model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_4 (InputLayer)         (None, 20)                0         
_________________________________________________________________
embedding_4 (Embedding)      (None, 20, 100)           603800    
_________________________________________________________________
lstm_7 (LSTM)                (None, 20, 200)           240800    
_________________________________________________________________
lstm_8 (LSTM)                (None, 200)               320800    
_________________________________________________________________
dense_9 (Dense)              (None, 20)                4020      
Total params: 1,169,420
Trainable params: 565,620
Non-trainable params: 603,800
_________________________________________________________________


In [71]:
model.fit(X, y_enc, epochs = 100, callbacks=[earlystop])

Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100


<keras.callbacks.History at 0x7f34045866a0>

In [None]:
model.predict(X[0].reshape(-1,50)) 

In [None]:
X[0].shape