In [None]:
# for when this is in google colab

from google.colab import drive
drive.mount('/content/drive')
%cd //content/drive/My\ Drive/Colab\ Notebooks/

In [9]:
# Necessary imports
import numpy as np
import pandas as pd
# Show full cell contents. None means "no truncation"; the old -1 sentinel
# has been deprecated since pandas 1.0.
pd.set_option('display.max_colwidth', None)

from keras.models import Sequential
from keras.layers import Dense
from keras.layers import Dropout
from keras.layers import LSTM
from keras.callbacks import ModelCheckpoint
from keras.utils import np_utils

In [42]:
# Load pickled data (presumably a DataFrame of dish titles; later cells read
# its `cleaned_title` column).
# NOTE(review): unpickling can execute arbitrary code embedded in the file,
# so only load pickles that come from a trusted source.
df = pd.read_pickle('../data/kaggle_and_reddit_dishes.pkl')

In [43]:
# Preview five random rows; the fixed seed keeps the displayed sample
# identical across "Restart & Run All".
df.sample(5, random_state=42)

Unnamed: 0,cleaned_title
40101,fried egg hollondaise sauce potato bacon hash breakfast dinner ftw
100022,chicory
5692,old micro radish
350403,chateau camponac chateau bottling
39516,ole cooking


## Prep dataset

In [45]:
# Mean number of characters per cleaned title across the dataframe
mean_len = (
    df['cleaned_title']
    .str.len()
    .mean()
)
mean_len

36.02925479938063

### Firstly, let's get a list of all unique characters in the corpus:

In [46]:
# Concatenate every cleaned title into one string so the character vocabulary
# can be derived from it. str.join builds the result in a single pass, whereas
# an iterrows() loop with += re-copies the growing string on every row.
all_text = ''.join(df['cleaned_title'])
    

In [47]:
# Character vocabulary (sorted for a stable ordering) and the lookup table
# from each character to its integer index
chars = sorted(set(all_text))
char_to_int = {char: idx for idx, char in enumerate(chars)}

In [None]:
# Inverse lookup table: integer index -> character
int_to_char = dict(enumerate(chars))

In [48]:
# Corpus size in characters and number of distinct characters
n_chars, n_vocab = len(all_text), len(chars)
print("Total Characters: ", n_chars)
print("Total Vocab: ", n_vocab)

Total Characters:  2396630
Total Vocab:  47


### Secondly, let's generate our X inputs and Y labels

In [65]:
# I'm going to use 35 as my window size. We can change this later if we want

# Number of characters in each training window
window_size = 35

# Record every title's character count, then flag the titles that are longer
# than the window. The column name mirrors the window size chosen above.
df['str_len'] = df['cleaned_title'].str.len()
df['over_35_chars'] = df['str_len'].gt(window_size)

In [50]:
# Preview five random rows with the new length columns; the fixed seed keeps
# the displayed sample identical across "Restart & Run All".
df.sample(5, random_state=42)

Unnamed: 0,cleaned_title,str_len,over_25_chars
15192,big boy burrito,15,False
27554,cherry lime mojito,18,False
28471,cannabis chocolate pudding,26,True
22763,late dessert,12,False
212079,broiled chopped sirloin steak smothered onions french fried idaho potatoes sliced tomato onion,94,True


In [51]:
# Count the rows on each side of the window-size flag (False / True).
df.groupby('over_35_chars').count()

Unnamed: 0_level_0,cleaned_title,str_len
over_25_chars,Unnamed: 1_level_1,Unnamed: 2_level_1
False,27234,27234
True,39285,39285


So we'll be able to use 39,285 of our entries (those longer than the window size) to train.

In [56]:
# Let's select those specific data points:
df_eligible = df.loc[df['over_35_chars'] == True]['cleaned_title']
len(df_eligible)

39285

In [79]:
# prepare the dataset of input to output pairs encoded as integers
seq_length = window_size
step_size = 3

dataX = []  # integer-encoded input windows
dataY = []  # integer-encoded character that follows each window

dataX_text = []  # the same windows as raw text
dataY_text = []  # the same targets as raw characters

# Series.items() replaces Series.iteritems(), which was removed in pandas 2.0.
for _, entry in df_eligible.items():
    chars_in_entry = len(entry)
    # Slide a seq_length-character window over the title, advancing step_size
    # characters each time; the character right after the window is the target.
    for i in range(0, chars_in_entry - seq_length, step_size):
        seq_in = entry[i:i + seq_length]
        seq_out = entry[i + seq_length]

        dataX_text.append(seq_in)
        dataY_text.append(seq_out)

        dataX.append([char_to_int[char] for char in seq_in])
        dataY.append(char_to_int[seq_out])
n_patterns = len(dataX)
print("Total Patterns: ", n_patterns)


Total Patterns:  930936 930936


In [84]:
# Corpus size, vocabulary size and number of training patterns, one per line
print(n_chars, n_vocab, n_patterns, sep='\n')

930936
930936
25
1


In [None]:
# Set up X and Y for training

# One-hot encode the inputs as (pattern, timestep, character) and the targets
# as (pattern, character). The builtin bool replaces the np.bool alias, which
# was deprecated in NumPy 1.20 and removed in NumPy 1.24.
x = np.zeros((n_patterns, seq_length, n_vocab), dtype=bool)
y = np.zeros((n_patterns, n_vocab), dtype=bool)
for i, sentence in enumerate(dataX_text):
    for t, char in enumerate(sentence):
        x[i, t, char_to_int[char]] = 1
    y[i, char_to_int[dataY_text[i]]] = 1

## Train LSTM Model

In [76]:
# define the LSTM model
model = Sequential()
model.add(LSTM(256, input_shape=(seq_length, n_vocab)))
model.add(Dropout(0.2))
model.add(Dense(n_vocab, activation='softmax'))
model.compile(loss='categorical_crossentropy', optimizer='adam')

Instructions for updating:
Colocations handled automatically by placer.
Instructions for updating:
Please use `rate` instead of `keep_prob`. Rate should be set to `rate = 1 - keep_prob`.


In [None]:
# Print the layer-by-layer summary of the model defined above.
model.summary()

In [77]:
# define the checkpoint - this will only run in the colab
filepath="weights-improvement-{epoch:02d}-{loss:.4f}.hdf5"
checkpoint = ModelCheckpoint(filepath, monitor='loss', verbose=1, save_best_only=True, mode='min')
callbacks_list = [checkpoint]

In [78]:
# Train for one epoch on the one-hot arrays. The input array is named `x`
# (lowercase); `X` is not defined anywhere in this notebook and raises
# NameError on a fresh kernel.
model.fit(x, y, epochs=1, batch_size=128, callbacks=callbacks_list)

Instructions for updating:
Use tf.cast instead.
Epoch 1/1

Epoch 00001: loss improved from inf to 2.68139, saving model to weights-improvement-01-2.6814.hdf5


<keras.callbacks.History at 0xb2768d438>