In [None]:
cd ../

In [None]:
import numpy as np
import pandas as pd
import os

from friends_bot.glove_loader import GloveLoader
from friends_bot.dialogue_cleaner import DialogueCleaner
from friends_bot.data_cleaner import TokenizerCustom, remap_words_overall
from friends_bot.model_trainer import define_model, compile_model

import warnings
# Suppress every warning category for the rest of the session. Note that this
# also hides deprecation notices raised by the libraries used below.
warnings.filterwarnings("ignore")

# Load Glove

In [None]:
# Instantiate the project's GloveLoader; the following cell calls its
# run_glove_loader method on the embedding file.
glove_loader = GloveLoader()


In [None]:
# Location of the embedding file, relative to the current working directory
# (the file name suggests 100-dimensional GloVe 6B vectors).
glove_path = '../glove/glove.6B.100d.txt'

# The result is indexed by token: later cells filter it with `.index.isin(...)`.
glove_unk_df = glove_loader.run_glove_loader(glove_path)

# Load Friends Scripts

In [None]:
# Upper bound on the number of script lines kept for training.
MAX_DIALOGUE_LINES = 10000

# Collect the transcript files. os.listdir returns entries in arbitrary order,
# so sort them: otherwise the rows that survive the head() cut-off below could
# differ from one machine or run to the next.
files = sorted(name for name in os.listdir('data/') if name.endswith('.txt'))

# Read every transcript with one script line per row, in a single 'dialogue' column.
# NOTE(review): sep="\n" may be rejected by recent pandas releases - verify
# against the pandas version this notebook targets.
frames = [
    pd.read_table('data/' + name, sep="\n", header=None).rename(columns={0: 'dialogue'})
    for name in files
]

# Concatenate once instead of growing the frame with DataFrame.append inside the
# loop: append re-copies the accumulated rows on every iteration and is no
# longer available in recent pandas.
if frames:
    df = pd.concat(frames, ignore_index=True)
else:
    # No transcripts found: keep the same empty, single-column frame as before.
    df = pd.DataFrame(columns=['dialogue'])

df = df.head(MAX_DIALOGUE_LINES)

# Clean the data

In [None]:
# Instantiate the project's DialogueCleaner; the following cell applies its
# run_dialogue_cleaner method to the raw script lines.
dialogue_cleaner = DialogueCleaner()


In [None]:
# Clean the raw script lines with the project's DialogueCleaner.
dialogue = dialogue_cleaner.run_dialogue_cleaner(df.dialogue)

# The answer to each line is the line that follows it.
answer = dialogue.shift(-1)

# shift(-1) leaves the final row without an answer (NaN). Casting that NaN with
# astype(str) would produce the literal text 'nan' and train the model on it,
# so drop the unpaired last row from both series before casting.
dialogue = dialogue.iloc[:-1].astype(str)
answer = answer.iloc[:-1].astype(str)

# Prepare the dictionaries to tokenise the data

#### Input dialogue

In [None]:
# All input utterances as a flat list of strings (.values replaces the
# as_matrix() accessor, which newer pandas no longer provides).
script_string = list(dialogue.values.flatten())

# Join with a space before splitting into words: joining with '' glued the last
# word of every utterance to the first word of the next one, which dropped real
# words from the vocabulary and invented fused ones.
input_words = pd.Series(' '.join(script_string).split())
vocab_friends = pd.Series(input_words.value_counts().index)

# Prepend the special tokens (pd.concat replaces the removed Series.append).
vocab_friends = pd.concat([pd.Series(['<PAD>', '<UNK>', '<BEGIN>', '<END>']), vocab_friends])

# Keep only the embedding rows whose token occurs in the script vocabulary.
glove_unk_friends_df = glove_unk_df[glove_unk_df.index.isin(vocab_friends)]

# Map each (lower-cased) token to its row position in the filtered embedding table.
voc_df = pd.DataFrame(glove_unk_friends_df.index, columns=['voc']).reset_index()
voc_df.voc = voc_df.voc.str.lower()
voc_df = voc_df.set_index('voc')
voc_dic = voc_df.to_dict()['index']



#### Answers: Keep only the top 200 words

In [None]:

# Size of the answer vocabulary (most frequent words only).
top_words = 200

# All answer utterances as a flat list of strings (.values replaces the
# as_matrix() accessor, which newer pandas no longer provides).
script_string = list(answer.values.flatten())

# Join with a space before splitting into words: joining with '' glued the last
# word of every utterance to the first word of the next one, which distorted
# the frequency ranking used to pick the top words.
answer_words = pd.Series(' '.join(script_string).split())
most_common_words = answer_words.value_counts().head(top_words)
vocab_friends = pd.Series(most_common_words.index)

# Prepend the special tokens (pd.concat replaces the removed Series.append).
vocab_friends = pd.concat([pd.Series(['<PAD>', '<UNK>', '<BEGIN>', '<END>']), vocab_friends])

# Keep only the embedding rows whose token is in the answer vocabulary.
glove_unk_friends_ans_df = glove_unk_df[glove_unk_df.index.isin(vocab_friends)]

# Map each (lower-cased) token to its row position in the filtered table.
voc_df = pd.DataFrame(glove_unk_friends_ans_df.index, columns=['voc']).reset_index()
voc_df.voc = voc_df.voc.str.lower()
voc_df = voc_df.set_index('voc')

voc_dic_ans = voc_df.to_dict()['index']
# Reverse lookup: id -> token.
voc_dic_inv = {idx: token for token, idx in voc_dic_ans.items()}
voc_dic_inv = {voc_dic_ans[x]:x for x in voc_dic_ans}

# Tokenize the data

In [None]:
# One tokenizer per vocabulary: inputs use the full script dictionary,
# answers use the restricted answer dictionary.
custom_tok = TokenizerCustom(voc_dic)
custom_tok_ans = TokenizerCustom(voc_dic_ans)

In [None]:
# Encode each utterance with its tokenizer and stack the results into arrays.
# Presumably every encoding has the same length, since later cells treat X and
# y as 2-D arrays - TODO confirm against TokenizerCustom.
X = np.array(dialogue.map(custom_tok.tokenize_string).tolist())
y = np.array(answer.map(custom_tok_ans.tokenize_string).tolist())

# Remap most common words into different ids

In [None]:
# Keep a copy of the id -> token table as it was before the loop below
# reassigns voc_dic_inv.
voc_dic_inv_copy = voc_dic_inv.copy()

# Frequency of every id in the targets, most frequent first. Computed once;
# y does not change before the loop, so recomputing it gave the same result.
counts_ser = pd.Series(y.flatten()).value_counts()
data_words = counts_ser.index

# Frequency of the rarest id, floored at 10; used as the divisor below.
min_count = max(counts_ser.iloc[-1], 10)

# Walk the ids from most to least frequent. Iterating the underlying arrays
# yields the same numpy scalars as the former positional lookups.
# NOTE: this cell reassigns y and voc_dic_inv, so re-running it remaps again.
for word, count in zip(data_words.values, counts_ser.values):
    y, voc_dic_inv = remap_words_overall(y, int(count / min_count), word, voc_dic_inv)


In [None]:
# Sequence length, taken from the first encoded input.
max_seq_len = len(X[0])

# Rows and columns of the filtered embedding table.
vocab_dim, emb_dim = glove_unk_friends_df.shape

# Number of output classes: largest id present in the targets, plus one.
vocab_out_dim = y.max() + 1

# Build and train the model

In [None]:
# Build the network with the project's define_model helper, passing the filtered
# embedding table as emb_weights. The result is unpacked as (inp, out) -
# presumably the model's input and output tensors; confirm in model_trainer.
inp, out = define_model(max_seq_len, vocab_dim, vocab_out_dim, emb_weights=glove_unk_friends_df)

In [None]:
# Turn the (inp, out) pair into the model object that is trained below;
# the loss and optimizer are chosen inside compile_model.
model = compile_model(inp, out)

In [None]:
from keras.callbacks import ReduceLROnPlateau, EarlyStopping

# NOTE(review): earlystop is constructed but not passed to fit() below, so the
# run always uses the full epoch budget - confirm whether that is intended.
earlystop = EarlyStopping(monitor='acc', min_delta=0.00001, patience=5)

# Multiply the learning rate by 0.4 when 'acc' has not improved for 5 epochs,
# never going below 0.0005.
reduce_lr = ReduceLROnPlateau(monitor='acc', factor=0.4, patience=5, min_lr=0.0005, verbose=1)

# Targets get a trailing axis of length 1: (samples, sequence, 1).
y_targets = y.reshape(y.shape[0], y.shape[1], 1)

model.fit(
    [X],
    [y_targets],
    nb_epoch=1000,
    batch_size=32,
    shuffle=True,
    callbacks=[reduce_lr],
)