In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from keras.utils.np_utils import to_categorical
from keras import models
from keras import layers
from keras.callbacks import EarlyStopping
from collections import Counter

# 1. Load in Data and Train-Validation Split

In [None]:
df=pd.read_json('../input/train.json')

In [None]:
df.cuisine = pd.Categorical(df.cuisine)

In [None]:
df['cuisinecode']=df.cuisine.cat.codes

In [None]:
test=pd.read_json('../input/test.json')

In [None]:
df.shape

In [None]:
test.shape

In [None]:
df.columns

In [None]:
test.columns

In [None]:
df['length']=df['ingredients'].map(len)

In [None]:
test['length']=test['ingredients'].map(len)

In [None]:
train, valid = train_test_split(df, 
                                stratify=df['cuisine'], 
                                test_size=.2,
                                random_state=42)

In [None]:
train=train.copy()

In [None]:
valid=valid.copy()

# 2. Make vocabulary from the ingredients and the words in the ingredients

In [None]:
train.head()

In [None]:
cuisine_list=list(train.cuisine.value_counts().index)

In [None]:
mu=train['length'].mean()

In [None]:
sigma=train['length'].std()

In [None]:
train['length']=(train['length']-mu)/sigma
valid['length']=(valid['length']-mu)/sigma
test['length']=(test['length']-mu)/sigma

In [None]:
train.head()

In [None]:
def breakout_ingredients(df):
    """Add word-level ingredient token columns to ``df`` in place.

    Three columns are added (and overwritten if they already exist):

    * ``ingredients2`` - all ingredient names of the row joined by single spaces.
    * ``ingredients3`` - the individual whitespace-separated words of
      ``ingredients2``.
    * ``ingredients4`` - the union of the full ingredient names and their
      individual words, de-duplicated and sorted.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with an ``ingredients`` column holding a list of strings per row.

    Returns
    -------
    None
        ``df`` is modified in place.
    """
    joined = [" ".join(entry) for entry in df['ingredients']]
    # split() without an argument collapses runs of whitespace, so a doubled,
    # leading or trailing space inside an ingredient name cannot produce an
    # empty-string token the way split(" ") does.
    words = [text.split() for text in joined]

    df['ingredients2'] = joined
    df['ingredients3'] = words
    # A plain comprehension avoids the per-row overhead of DataFrame.apply and
    # does not depend on apply's result-shape inference. Sorting makes the
    # token order reproducible (iteration order of a set of strings can differ
    # between interpreter runs).
    df['ingredients4'] = [
        sorted(set(names).union(tokens))
        for names, tokens in zip(df['ingredients'], words)
    ]
    

In [None]:
# Add the token columns to every split; each frame is modified in place.
for _split in (train, valid, test):
    breakout_ingredients(_split)

In [None]:
# Code courtesy of http://www.davidsbatista.net/blog/2018/02/28/TfidfVectorizer/

def dummy_fun(doc):
    """Identity function: hand ``doc`` back untouched.

    Given to the vectorizer as both tokenizer and preprocessor, so documents
    that are already lists of tokens are used exactly as they are.
    """
    return doc

# Each document is already a list of tokens, so preprocessing and tokenising
# are both the identity function and no regex token pattern is used.
cv = CountVectorizer(
    analyzer='word',
    preprocessor=dummy_fun,
    tokenizer=dummy_fun,
    token_pattern=None,
    ngram_range=(1, 1),  # single tokens only
    min_df=1,            # keep tokens seen in at least one document
    max_df=.99,          # drop tokens present in more than 99% of documents
)

In [None]:
cv.fit(train.ingredients4)

In [None]:
vocab_size=len(cv.vocabulary_)

In [None]:
print("The vocabulary size is {}".format(vocab_size))

In [None]:
train_vectorized=cv.fit_transform(train['ingredients4'])

In [None]:
valid_vectorized=cv.transform(valid['ingredients4'])

In [None]:
test_vectorized=cv.transform(test['ingredients4'])

In [None]:
one_hot_train_labels=to_categorical(train['cuisinecode'])

In [None]:
one_hot_valid_labels=to_categorical(valid['cuisinecode'])

In [None]:
def add_column(source_matrix, df, source_column):
    """Append one DataFrame column to a feature matrix as its last column.

    Parameters
    ----------
    source_matrix : sparse matrix or 2-D array-like, shape (n_rows, n_features)
        Feature matrix. Anything exposing ``todense()`` (e.g. a SciPy sparse
        matrix) is densified first; any other input is used as it is.
    df : pandas.DataFrame
        Frame holding the extra column; it must have ``n_rows`` rows.
    source_column : str
        Name of the column of ``df`` to append.

    Returns
    -------
    numpy.ndarray or numpy.matrix
        ``source_matrix`` with the column stacked on the right, i.e. shape
        ``(n_rows, n_features + 1)``. The result is a ``numpy.matrix`` when the
        input had to be densified with ``todense()``.

    Notes
    -----
    The three shapes involved are printed as a quick sanity check.
    """
    # Explicit capability check instead of a bare ``except:``, which would also
    # hide genuine failures (e.g. a MemoryError while densifying) behind the
    # "Already dense" message.
    if hasattr(source_matrix, "todense"):
        source_matrix = source_matrix.todense()
    else:
        print("Already dense")

    # Series.as_matrix() was deprecated in pandas 0.23 and removed in 1.0;
    # np.asarray works on every pandas version.
    length_matrix = np.asarray(df[source_column])
    # Turn the 1-D column into an (n_rows, 1) array so it can be hstack-ed.
    length_matrix = length_matrix.reshape((length_matrix.shape[0], 1))

    print("Source shape", source_matrix.shape)
    print("Column shape", length_matrix.shape)
    result = np.hstack([source_matrix, length_matrix])
    print("Result shape", result.shape)
    return result
    

In [None]:
# Append the standardised 'length' column to each feature matrix.
# NOTE: every assignment overwrites its own input, so running this cell a
# second time appends the column again; re-run the vectorising cells first.
train_vectorized = add_column(train_vectorized, df=train, source_column='length')
print ("Train done")

valid_vectorized = add_column(valid_vectorized, df=valid, source_column='length')
print ("Valid done")

test_vectorized = add_column(test_vectorized, df=test, source_column='length')
print ("Test done")

In [None]:
# Shape of the one-hot training label matrix.
one_hot_train_labels.shape

In [None]:
# Shape of the one-hot validation label matrix.
one_hot_valid_labels.shape

In [None]:
# Number of feature columns appended to the token-count matrix on top of the
# vocabulary (the single 'length' column); added to vocab_size to get the
# network's input width.
extra_cols=1

In [None]:
model=models.Sequential()

In [None]:
model.add(layers.Dense(6000, 
                       activation='relu', 
                       input_shape=(vocab_size+extra_cols,)))

In [None]:
model.add(layers.Dropout(0.9)) # refers to nodes in the first hidden layer

In [None]:
model.add(layers.Dense(1024, 
                       activation='relu', 
                       ))

In [None]:
model.add(layers.Dropout(0.8)) # refers to nodes in the second hidden layer

In [None]:
# One softmax unit per cuisine; the count is taken from the label categories
# rather than hard-coded, so it follows the data if the label set changes.
model.add(layers.Dense(len(df.cuisine.cat.categories), activation='softmax'))

In [None]:
model.compile(optimizer='Adam',
             loss='categorical_crossentropy',
             metrics=['accuracy'])

In [None]:
model.summary()

In [None]:
# Stop training once the validation loss stops improving; patience=0 means
# not a single non-improving epoch is tolerated.
early_stop = EarlyStopping(
    monitor='val_loss',
    mode='auto',
    min_delta=0,
    patience=0,
    verbose=1,
)

callbacks_list = [early_stop]


In [None]:
# Train for at most 20 epochs; the early-stopping callback may end it sooner.
history = model.fit(
    train_vectorized,
    one_hot_train_labels,
    batch_size=512,
    epochs=20,
    callbacks=callbacks_list,
    validation_data=(valid_vectorized, one_hot_valid_labels),
)

In [None]:
# Per-epoch training vs. validation loss recorded by fit().
history_dict = history.history
loss_values = history_dict['loss']
val_loss_values = history_dict['val_loss']
epochs = range(1, 1 + len(loss_values))

plt.figure(1, figsize=(10, 5))
plt.plot(epochs, loss_values, 'bo', label='Training loss')
plt.plot(epochs, val_loss_values, 'b', label='Validation loss')

plt.title('Training and validation loss')
plt.xlabel('Epochs')
plt.ylabel('Loss')
plt.legend()

In [None]:
plt.figure(1, figsize=(10, 5))

history_dict = history.history
# Older Keras releases record the accuracy metric as 'acc'/'val_acc', newer
# ones as 'accuracy'/'val_accuracy'; use whichever key this History holds so
# the cell does not fail with a KeyError after a library upgrade.
acc_key = 'acc' if 'acc' in history_dict else 'accuracy'
acc_values = history_dict[acc_key]
val_acc_values = history_dict['val_' + acc_key]

epochs = range(1, len(acc_values) + 1)

plt.plot(epochs, acc_values, 'bo', label='Training accuracy')
plt.plot(epochs, val_acc_values, 'b', label='Validation accuracy')

plt.title('Training and validation accuracy')

plt.xlabel('Epochs')
plt.ylabel('Accuracy')
plt.legend()

In [None]:
# Sequential.predict_classes() no longer exists in recent Keras/TensorFlow
# releases; the arg-max over the softmax outputs yields the same class ids.
answer = np.argmax(model.predict(test_vectorized), axis=-1)

In [None]:
answer.shape

In [None]:
test['guess']=answer

In [None]:
test['cuisine']=test['guess'].map(lambda x: df.cuisine.cat.categories[x])

In [None]:
submission=test[['id','cuisine']]

In [None]:
submission.to_csv('submission.csv', index=False)

In [None]:
submission.head()