In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from keras.models import Sequential
from keras.layers import Embedding, LSTM, Dense, Conv1D, MaxPooling1D, Dropout, Activation
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences

  from ._conv import register_converters as _register_converters
Using TensorFlow backend.


In [2]:
# Read in data

In [3]:
df = pd.read_csv("../data/recipe_phrases.csv", index_col=None,dtype={'phrase': np.str, 'is_ingredient': np.int})
df['phrase'] = df['phrase'].str.strip().str.lower().str.replace('?', '/')
df = df.fillna('')

In [4]:
X = df.phrase.values
y = df.is_ingredient.values

In [5]:
# Tokenize data

In [6]:
tokenizer = Tokenizer(num_words=20000)
tokenizer.fit_on_texts(X)
sequences = tokenizer.texts_to_sequences(X)
data_X = pad_sequences(sequences, maxlen=100)

In [7]:
print(data_X)

[[  0   0   0 ... 631   2 813]
 [  0   0   0 ...  19   1  32]
 [  0   0   0 ...   8  72  59]
 ...
 [  0   0   0 ...  39  41 153]
 [  0   0   0 ... 134 109 807]
 [  0   0   0 ...  22   4  23]]


In [8]:
# CNN with LSTM

In [9]:
model = Sequential()
model.add(Embedding(20000, 128, input_length=100))
model.add(Dropout(0.2))
model.add(Conv1D(64, 5, activation='relu'))
model.add(Dropout(0.2))
model.add(MaxPooling1D(pool_size=4))
model.add(LSTM(128))
model.add(Dropout(0.2))
model.add(Dense(1, activation='sigmoid'))
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

Instructions for updating:
`NHWC` for data_format is deprecated, use `NWC` instead


In [10]:
# Train

In [11]:
model.fit(data_X, y, validation_split=0.5, epochs=3)

Train on 4642 samples, validate on 4643 samples
Epoch 1/3
Epoch 2/3
Epoch 3/3


<keras.callbacks.History at 0x1a19b3d320>

In [12]:
# Test Recipe

In [16]:
newtexts = ["1/3 t salt", "1 c sugar","dump sugar into bowl","this is not 10 an ingredient","1 23 is a lucky number", "I had 1 cup that broke yesterday."]
sequences = tokenizer.texts_to_sequences(newtexts)
data = pad_sequences(sequences, maxlen=100)
predictions = model.predict(data)
print(predictions)

[[9.9664646e-01]
 [9.9202311e-01]
 [6.8667252e-04]
 [8.7784918e-04]
 [1.4500754e-01]
 [6.4747065e-02]]
