### Kaggle: What's Cooking?

In [1]:
import os

import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import SVC
from sklearn.linear_model import SGDClassifier

In [2]:
# Load the training recipes into a DataFrame.
# The data directory defaults to the original local location, but can be
# redirected without editing the notebook by setting the COOKING_DATA_DIR
# environment variable.
# os.path.join(..., '') guarantees a trailing separator, because `path` is
# concatenated directly with file names (here and when loading test data).
path = os.path.join(
    os.environ.get('COOKING_DATA_DIR',
                   '/Users/dominicdebiaso/Development/datasets/'),
    '')
cooking_train_data = path + 'kaggle_cooking_train.json'
df = pd.read_json(cooking_train_data)

In [3]:
# EDA
# df.cuisine.unique()
# Number of unique ingredients
# ingredients_unique = [x for sublist in df['ingredients'] for x in sublist]
# len(set(ingredients_unique))

In [4]:
## Bag of Words representation
# Text has to be turned into numeric feature vectors before a classifier
# can use it. CountVectorizer assigns an integer id to every token in its
# vocabulary and, for document i, stores the number of occurrences of
# token j at X[i, j]. The vocabulary is capped at the 2200 most frequent
# tokens to keep the feature space small. The fitted vocabulary lives
# inside the vectorizer object.
vectorizer = CountVectorizer(max_features=2200)

# Each recipe is a list of ingredient strings; collapse it into a single
# whitespace-separated string, since the vectorizer expects one string
# per document.
ingredients = df['ingredients']
ingredients_list = list(map(' '.join, ingredients))

# Learn the vocabulary from the training data only. fit() and transform()
# are kept as separate steps so the same fitted vocabulary can later be
# applied to the test set; calling fit_transform on each dataset
# separately would produce two unrelated column layouts whose results
# cannot be compared or computed together.
# Note: fit() returns the vectorizer itself, so `bag_of_words` is another
# reference to the fitted vectorizer rather than a feature matrix.
# vectorizer.fit(ingredients_list)
bag_of_words = vectorizer.fit(ingredients_list)

# Encode the training recipes as (document id, word id) -> word frequency
# and expand the scipy sparse matrix into a dense numpy array; words that
# do not occur in a given document show up as 0.
bag_of_words_asArray = vectorizer.transform(ingredients_list).toarray()

# Inspection snippets:
# print(bag_of_words_asArray.shape)        # shape of the count matrix
# print(bag_of_words_asArray[1999])        # count vector of one document
# vectorizer.vocabulary_.get(u'parsley')   # column index of an ingredient word

In [5]:
# Load the test recipes from the same data directory as the training set.
cooking_test_data = path + 'kaggle_cooking_test.json'
df_test = pd.read_json(cooking_test_data)

# Apply the same preprocessing as for training: join each ingredient list
# into one string, then encode it with the already-fitted vectorizer
# (transform only, no refit) so train and test columns line up.
ingredients_test = df_test['ingredients']
ingredients_list_test = list(map(' '.join, ingredients_test))
ingredients_asArray_test = vectorizer.transform(ingredients_list_test).toarray()

#### Supervised Text Classification

In [136]:
# Random Forest fits a number of decision trees on various sub-samples of
# dataset and use averaging to improve predictive accuracy and control
# over-fitting. A form of ensemble learning that relies on an ensemble
# of decision trees.

# Initialize random forest classifier with 800 trees and fit with bag of words
# Only the hyperparameter 'n_estimators' was used
# random_forest = RandomForestClassifier(n_estimators = 800)
# random_forest = random_forest.fit(bag_of_words_asArray, df['cuisine'])
# print(random_forest.score(bag_of_words_asArray, df.cuisine))
# random_forest_predictions = random_forest.predict(ingredients_asArray_test)

# .76559

In [141]:
# Multinomial Naive Bayes is a classification technique with discrete 
# features (ie. words counts for text classification). Given prior 
# probabilities of a given word(s) in a document, calculate the 
# posterior probability that a set of words will be in some class.

# multi_naive_bayes = MultinomialNB()
# multi_naive_bayes = multi_naive_bayes.fit(bag_of_words_asArray, df['cuisine'])
# multi_naive_bayes_predictions = multi_naive_bayes.predict(ingredients_asArray_test)

# .72084

In [21]:
# Support Vector Machines create a hyperplane (or decision boundary) that 
# maximizes the distance to the nearest points or margin. Correct 
# classification of a point is more important than maximizing the margin.

# Kernel Trick implicitly maps points that are not linearly separable into a
# higher-dimensional space where a separating hyperplane can be found.

# Parameters: kernel - linear, or rbf (the default, non-linear); C - controls
# the tradeoff between a smooth decision boundary and classifying training
# points correctly (a larger C gets more points correct, but the more
# non-linear, wiggly boundary has a higher chance of overfitting);
# gamma - how non-linear the boundary is.

# Use a linear kernel when no. features is larger than no. observations,
# use a Gaussian kernel when no. observations is larger than no. features,
# use a linear kernel if no. observations is greater than 50k for speed.

# supp_vec = SVC()
# supp_vec = supp_vec.fit(bag_of_words_asArray, df['cuisine'])
# supp_vec_predictions = supp_vec.predict(ingredients_asArray_test)

# .57683

In [24]:
# Stochastic Gradient Descent fits a linear model by looking at the error
# on one training sample at a time and stepping the weights against the
# gradient (the slope says which way to go) until the error stops
# getting smaller.

# SGD shuffles the training data between passes, so an unseeded
# classifier gives slightly different predictions on every run. A fixed
# random_state makes the result reproducible on a fresh kernel.
stoch_grad = SGDClassifier(random_state=42)
stoch_grad = stoch_grad.fit(bag_of_words_asArray, df['cuisine'])
stoch_grad_predictions = stoch_grad.predict(ingredients_asArray_test)

# .74356  (score noted by the author from an earlier, unseeded run)

In [None]:
# k-Nearest Neighbors

In [25]:
# Write the selected model's predictions to a CSV file.
# Swap the active assignment below to export a different model's output:
# model_predictions = random_forest_predictions
# model_predictions = multi_naive_bayes_predictions
# model_predictions = supp_vec_predictions
model_predictions = stoch_grad_predictions
d = {'id': df_test['id'], 'cuisine': model_predictions}
# Pin the column order explicitly: on older Python/pandas versions a
# DataFrame built from a plain dict does not keep insertion order, which
# would put 'cuisine' before 'id' in the written file.
output = pd.DataFrame(data=d, columns=['id', 'cuisine'])
output.to_csv("/Users/dominicdebiaso/Desktop/kaggle_whats_cooking_model.csv", index=False)

In [None]:
# Sources
# http://scikit-learn.org/stable/tutorial/text_analytics/working_with_text_data.html
# https://www.kaggle.com/normallydistrbuted/whats-cooking/simple-bag-of-words-with-rf/code
# https://www.kaggle.com/c/word2vec-nlp-tutorial/details/part-2-word-vectors