In [143]:
import numpy as np
import pandas as pd

import warnings
warnings.filterwarnings("ignore")

In [2]:
import os
files = []
for i in os.listdir('data/'):
    if i.endswith('.txt'):
        files.append(i)



In [3]:
df = pd.DataFrame(columns=['dialogue'])
for i in files:
    print(i)
    df_loop = pd.read_table('data/'+i, sep="\n", header=None)
    df_loop = df_loop.rename(columns={0:'dialogue'})
    df = df.append(df_loop)


3.txt
2.txt
5.txt
1.txt
4.txt


In [128]:
df.head(2)

Unnamed: 0,dialogue
0,phoebe : hi guys !
1,"all : hey , pheebs ! hi !"


In [6]:
import re

def delete_blank(x):
    if x=='':
        return None
    else:
        return x
df.dialogue = df.dialogue.map(delete_blank)

In [7]:
df = df.dropna().reset_index(drop=True)

In [8]:
def delete_parentheses(x):
    return re.sub(r'\(.*\)|\[.*\]', '', x)


In [9]:
df.dialogue = df.dialogue.map(lambda x: delete_parentheses(x))
df.dialogue = df.dialogue.map(delete_blank)
df = df.dropna().reset_index(drop=True)

In [10]:
def separate_punctuation(x):
    x = re.sub(r'\.',' . ', x)
    x = re.sub(r'\,',' , ', x)
    x = re.sub(r'\!',' ! ', x)
    x = re.sub(r'\?',' ? ', x)
    x = re.sub(r'\:',' : ', x) 
    return x

def delete_large_spaces(x):
    return re.sub(r'\s{2,}', ' ', x)

In [11]:
df.dialogue = df.dialogue.map(lambda x: separate_punctuation(x))

In [12]:
df.dialogue = df.dialogue.str.lower()

In [13]:
def change_apostrophe(x):
    x = re.sub("let's", 'let us', x)
    x = re.sub("c'mon", 'come on', x)
    x = re.sub("there's", 'there is', x)
    x = re.sub("you're", 'you are', x)
    x = re.sub("we're", 'we are', x)
    x = re.sub("i'm", 'i am', x)
    x = re.sub("y'", 'you', x)
    x = re.sub("how'd", 'how did', x)
    
    return x

In [14]:
df.dialogue = df.dialogue.map(change_apostrophe)

In [15]:
def delete_no_dialogue(x):
    if ':' in x:
        return x

In [16]:
df.dialogue = df.dialogue.map(delete_no_dialogue)

In [30]:
df = df.dropna()

# Open glove and tokenize-pad

In [18]:
def loadGloveModel(gloveFile):
    print("Loading Glove Model")
    f = open(gloveFile,'r')
    model = {}
    for line in f:
        splitLine = line.split()
        word = splitLine[0]
        embedding = np.array([float(val) for val in splitLine[1:]])
        model[word] = embedding
    print( "Done.",len(model)," words loaded!")
    return model

In [19]:
model_glove = loadGloveModel('../glove/glove.6B.100d.txt')

Loading Glove Model
Done. 400000  words loaded!


In [96]:
glove_df = pd.DataFrame(model_glove).T

unk_pad_df = pd.DataFrame(columns=glove_df.columns)
unk_pad_df.loc['<PAD>'] = np.zeros(glove_df.shape[1])
unk_pad_df.loc['<UNK>'] = glove_df.mean()

glove_unk_df = pd.concat([unk_pad_df,glove_df])

#### for memory reasons, use just the vocabulary from friends

In [144]:
script_string = list(df.as_matrix().flatten())
vocab_friends = pd.Series(''.join(script_string).split()).drop_duplicates().reset_index(drop=True)
vocab_friends = pd.Series(['<PAD>', '<UNK>']).append(vocab_friends)


In [145]:
glove_unk_friends_df = glove_unk_df[glove_unk_df.index.isin(vocab_friends)]


In [146]:
voc_df = pd.DataFrame(glove_unk_friends_df.index, columns=['voc']).reset_index()
voc_df.voc = voc_df.voc.str.lower()
voc_df = voc_df.set_index('voc')
voc_dic = voc_df.to_dict()['index']

### Tokenize and pad

In [147]:
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences


class TokenizerCustom(Tokenizer):
    def __init__(self, voc, max_len=30, *args, **kwargs):
        super(TokenizerCustom, self).__init__(*args, **kwargs)
        self.max_len = max_len
        self.word_index = voc
        self.oov_token = '<unk>'
        self.filters = '#$%&()*+-/<=>@[\]^_`{|}~'
    
    def pad_string(self, x):
        return pad_sequences(x, maxlen=self.max_len)
    
    def tokenize_string(self, x):
        tok_str = self.texts_to_sequences(pd.Series(x).values)
        return self.pad_string(tok_str)[0]

        

In [148]:
# test unk
tc = TokenizerCustom(voc=voc_dic, oov_token = voc_dic['<unk>'])
tc.tokenize_string("skdjfnjf")

array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 1], dtype=int32)

##### tokenize data

In [149]:
df_tok = df.copy()
df_tok.dialogue = df_tok.dialogue.map(lambda x: tc.tokenize_string(x))

# Train model

In [None]:
yl

In [152]:
X = df_tok.dialogue[:len(df_tok)-1]
X = np.array(X.tolist())
y = df_tok.shift(-1).dialogue[:len(df_tok)-1]
y = np.array(y.tolist())

In [164]:
max_seq_len = len(X[0])
emb_dim = glove_unk_friends_df.shape[1]
vocab_dim = glove_unk_friends_df.shape[0]

In [197]:
from keras.utils import to_categorical

# one hot encode target sequence
def encode_output(sequences, vocab_size):
    ylist = list()
    for sequence in sequences:
        encoded = to_categorical(sequence, num_classes=vocab_size)
        ylist.append(encoded)
    y = np.array(ylist)
    y = y.reshape(sequences.shape[0], vocab_size, sequences.shape[1])
    return y

y_enc = encode_output(y, vocab_dim)

In [None]:
 (1673, 30) but got array with shape (1290, 1673)

In [192]:
y_enc.shape
(1673, 30) but got array with shape (30, 1673)


(1290, 30, 1673)

In [209]:
from keras.models import Model
from keras.layers import Embedding, Input, Dense, LSTM, Dropout, RepeatVector, Flatten
from keras.layers.wrappers import TimeDistributed

from keras.callbacks import EarlyStopping
from keras.optimizers import Adam
from keras import regularizers



inp = Input(shape=(max_seq_len,))
emb = Embedding(vocab_dim, emb_dim, weights=[glove_unk_friends_df], 
                input_length=max_seq_len, trainable=False)(inp)
lstm_in = LSTM(20)(emb)
rep_vec = RepeatVector(vocab_dim)(lstm_in)
lstm_out = LSTM(20, return_sequences=True)(rep_vec)
out = TimeDistributed(Dense(max_seq_len, activation='softmax'))(lstm_out)


In [217]:
earlystop = EarlyStopping(monitor='acc', min_delta=0.0011, patience=5)
model = Model(inputs=inp, outputs=out)
model.compile(loss='categorical_crossentropy', optimizer=Adam(lr=0.01), metrics=['accuracy'])

In [218]:
model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_12 (InputLayer)        (None, 30)                0         
_________________________________________________________________
embedding_11 (Embedding)     (None, 30, 100)           167300    
_________________________________________________________________
lstm_16 (LSTM)               (None, 20)                9680      
_________________________________________________________________
repeat_vector_7 (RepeatVecto (None, 1673, 20)          0         
_________________________________________________________________
lstm_17 (LSTM)               (None, 1673, 20)          3280      
_________________________________________________________________
time_distributed_7 (TimeDist (None, 1673, 30)          630       
Total params: 180,890
Trainable params: 13,590
Non-trainable params: 167,300
_________________________________________________________________

In [None]:
model.fit(X, y_enc, epochs = 100, callbacks=[earlystop])

Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
 256/1290 [====>.........................] - ETA: 22s - loss: 0.0603 - acc: 0.9829

In [184]:
model.predict(X[0].reshape(-1,50))

(1290, 30, 1673)

In [334]:
X[0].shape

(50,)