Neural network using Keras for Kaggle's "What's Cooking" competition

### Get the kaggle files

In [1]:
# !kaggle competitions download -c whats-cooking
# !unzip test.json.zip
# !unzip train.json.zip
# !unzip sample_submission.csv.zip
# !rm test.json.zip
# !rm train.json.zip
# !rm sample_submission.csv.zip

### Imports

In [23]:
# Imports
import os

# Scientific Python
import pandas as pd
import numpy as np

# NLTK
import nltk
from nltk.stem import WordNetLemmatizer

# Keras
from tensorflow.keras.utils import to_categorical
from tensorflow.keras.preprocessing.text import Tokenizer, one_hot
from tensorflow.keras.preprocessing.sequence import pad_sequences

from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import MaxPooling1D, Conv1D, Dense, LSTM, Embedding, Flatten, Dropout

%matplotlib inline
# One-time setup: uncomment to download the WordNet corpus
# (presumably required by WordNetLemmatizer on a fresh machine — confirm)
# nltk.download('wordnet')
# NOTE: despite its name, `stemmer` is a WordNet lemmatizer, not a stemmer
stemmer = WordNetLemmatizer()

### Read data

In [3]:
# Load the train and test recipes from JSON and index both by recipe id
train = pd.read_json('train.json')
train = train.set_index('id')
test = pd.read_json('test.json')
test = test.set_index('id')

In [4]:
# Rename the columns by name instead of by position, so the mapping does not
# depend on the column order that pd.read_json happens to produce.
# Assumes the raw files use the What's Cooking schema ('cuisine', 'ingredients').
# rename() ignores missing labels, so re-running this cell is a no-op.
train = train.rename(columns={'cuisine': 'target', 'ingredients': 'values'})
test = test.rename(columns={'ingredients': 'values'})

# Fail loudly here (rather than further down) if the schema was not as expected
assert {'target', 'values'} <= set(train.columns), train.columns
assert 'values' in test.columns, test.columns

In [5]:
# Preview the first rows of the renamed training frame
train.head()

Unnamed: 0_level_0,target,values
id,Unnamed: 1_level_1,Unnamed: 2_level_1
10259,greek,"[romaine lettuce, black olives, grape tomatoes..."
25693,southern_us,"[plain flour, ground pepper, salt, tomatoes, g..."
20130,filipino,"[eggs, pepper, salt, mayonaise, cooking oil, g..."
22213,indian,"[water, vegetable oil, wheat, salt]"
13162,indian,"[black pepper, shallots, cornflour, cayenne pe..."


### Preprocess

In [6]:
# Flatten every recipe's ingredient list into one long list.
# Duplicates are kept here; the figure of 6714 unique ingredients noted for
# this dataset is not computed by this cell.
ingredients = train['values'].tolist()
all_ingredients = [ingredient for recipe in ingredients for ingredient in recipe]

In [7]:
# Number of distinct cuisines, i.e. the number of output classes
num_classes = train['target'].nunique()
print(num_classes)

20


In [8]:
def split_str(ls):
    """Split every ingredient phrase into its individual words.

    Parameters
    ----------
    ls : iterable of str
        Ingredient phrases of one recipe, e.g. ``['olive oil', 'salt']``.

    Returns
    -------
    list of str
        All words of all phrases, flattened in their original order,
        e.g. ``['olive', 'oil', 'salt']``.
    """
    # str.split() without an argument splits on any run of whitespace and
    # ignores leading/trailing whitespace, so doubled or stray spaces never
    # produce empty-string tokens (split(' ') would emit '' for each of them).
    return [word for phrase in ls for word in phrase.split()]

def lemmatize(ls):
    """Return a new list with each word of ``ls`` passed through ``stemmer.lemmatize``.

    ``stemmer`` is the module-level WordNetLemmatizer instance; the input
    list is not modified.
    """
    return [stemmer.lemmatize(word) for word in ls]

In [9]:
# Break each ingredient phrase into single words, then lemmatize every word.
# The same two steps are applied to the train and the test recipes.
train['values'] = train['values'].apply(split_str).apply(lemmatize)
test['values'] = test['values'].apply(split_str).apply(lemmatize)

In [10]:
def preprocess_x(df):
    """Fit a Tokenizer on ``df['values']`` and encode the rows as count vectors.

    Parameters
    ----------
    df : DataFrame
        Must contain a ``'values'`` column holding one list of words per row.

    Returns
    -------
    tuple
        ``(matrix, tokenizer)``: the result of ``texts_to_matrix`` in
        ``'count'`` mode for ``df['values']``, and the fitted Tokenizer so
        the same encoding can be applied to other data.
    """
    recipes = list(df['values'])
    tokenizer = Tokenizer(filters=', ')
    tokenizer.fit_on_texts(recipes)
    counts = tokenizer.texts_to_matrix(df['values'], mode='count')
    return counts, tokenizer

def preprocess_y(df):
    """One-hot encode the labels stored in ``df['target']``.

    A Tokenizer is fitted on the labels and used to turn each label into an
    integer id; the ids are then expanded into indicator columns with
    ``pd.get_dummies`` (one column per distinct id, in ascending id order).

    Returns
    -------
    tuple
        ``(y, tokenizer)``: the indicator array and the fitted Tokenizer,
        which is needed later to map ids back to label text.
    """
    labels = list(df['target'])
    tokenizer = Tokenizer(filters=', ')
    tokenizer.fit_on_texts(labels)
    # Each label is expected to yield exactly one id, giving an (n, 1) array
    label_ids = np.array(tokenizer.texts_to_sequences(df['target']))
    label_ids = label_ids.reshape(len(label_ids), 1)
    y = pd.get_dummies(label_ids.ravel()).values
    return y, tokenizer

In [11]:
# Preprocess
# The tokenizer is fitted inside preprocess_x on the training recipes only and
# then reused for the test recipes, so both are encoded with the same tokenizer
X_train, token_x = preprocess_x(train)
X_test = token_x.texts_to_matrix(test['values'], mode='count')

In [12]:
# Preprocess output
# token_y is kept so predicted class ids can be mapped back to cuisine names
y_train, token_y = preprocess_y(train)

### Train

In [13]:
def create_model(input_lenght, num_classes):
    """Build an uncompiled fully connected classifier.

    Parameters
    ----------
    input_lenght : int
        Number of input features per sample.
    num_classes : int
        Number of units in the softmax output layer.

    Returns
    -------
    Sequential
        Three ReLU hidden layers (240, 120, 60 units) followed by a softmax
        layer with ``num_classes`` units.
    """
    layers = [
        Dense(240, input_shape=(input_lenght,), activation='relu'),
        Dense(120, activation='relu'),
        Dense(60, activation='relu'),
        Dense(num_classes, activation='softmax'),
    ]
    return Sequential(layers)

In [14]:
# Build the network sized to the width of the feature matrix, then compile it
model = create_model(X_train.shape[1], num_classes)
model.compile(
    optimizer="adam",
    loss="categorical_crossentropy",
    metrics=["accuracy"],
)
model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
dense (Dense)                (None, 240)               723120    
_________________________________________________________________
dense_1 (Dense)              (None, 120)               28920     
_________________________________________________________________
dense_2 (Dense)              (None, 60)                7260      
_________________________________________________________________
dense_3 (Dense)              (None, 20)                1220      
Total params: 760,520
Trainable params: 760,520
Non-trainable params: 0
_________________________________________________________________


In [15]:
# Fit
# Train for 5 epochs, holding out 15% of the training rows for validation
model.fit(X_train, y_train, epochs=5, validation_split=0.15)

Train on 33807 samples, validate on 5967 samples
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


<tensorflow.python.keras.callbacks.History at 0x13498d668>

In [16]:
# Recover classes
# Sequential.predict_classes was removed in TensorFlow 2.6; taking the argmax
# over the softmax output is the equivalent for a multi-class model.
# The + 1 converts the 0-based output column back to the tokenizer's label id:
# y_train's columns come from get_dummies over ids, and those ids start at 1.
y_test = np.argmax(model.predict(X_test), axis=-1) + 1
# sequences_to_texts expects one sequence per sample, hence the (n, 1) shape
y_test = np.array(y_test).reshape(len(y_test), 1)
y_test = token_y.sequences_to_texts(y_test)

In [17]:
# Attach the predicted cuisine names to the test frame
test['cuisine'] = y_test

In [18]:
# Prepare output
# Reduce the frame to the two submission columns, 'id' and 'cuisine'.
# Safe to re-run: the index is only reset while 'id' is still the index, and
# selecting columns (instead of `del`) does not fail once 'values' is gone.
if 'id' not in test.columns:
    test = test.reset_index()  # turn the 'id' index into a regular column
test = test[['id', 'cuisine']]

In [19]:
# Preview the first rows of the submission frame
test.head()

Unnamed: 0,id,cuisine
0,18009,british
1,28583,southern_us
2,41580,italian
3,29752,cajun_creole
4,35687,italian


In [20]:
# Save to csv
# test.to_csv('test.csv', index=False)

In [21]:
# Push to kaggle
# !kaggle competitions submit -f test.csv -m neural_network whats-cooking

In [22]:
# View submissions
# !kaggle competitions submissions whats-cooking