In [1]:
import numpy as np
import pandas as pd
import sklearn as sk

In [3]:
train = pd.read_json("data/train.json")

In [72]:
train.head()

Unnamed: 0,cuisine,id,ingredients
0,greek,10259,"[romaine lettuce, black olives, grape tomatoes..."
1,southern_us,25693,"[plain flour, ground pepper, salt, tomatoes, g..."
2,filipino,20130,"[eggs, pepper, salt, mayonaise, cooking oil, g..."
3,indian,22213,"[water, vegetable oil, wheat, salt]"
4,indian,13162,"[black pepper, shallots, cornflour, cayenne pe..."


In [73]:
train.shape

(39774, 3)

In [74]:
# 20 different cuisines represented here
pd.unique(train.cuisine.ravel())

array([u'greek', u'southern_us', u'filipino', u'indian', u'jamaican',
       u'spanish', u'italian', u'mexican', u'chinese', u'british', u'thai',
       u'vietnamese', u'cajun_creole', u'brazilian', u'french',
       u'japanese', u'irish', u'korean', u'moroccan', u'russian'], dtype=object)

In [75]:
from sklearn import cross_validation

train_features, test_features, train_target, test_target = \
    cross_validation.train_test_split(
        train.ingredients, 
        train.cuisine, 
        test_size=0.2,  # 80/20 split for training and test data
        random_state=10274)

In [82]:
train_features.shape

(31819,)

In [83]:
test_features.shape

(7955,)

In [76]:
import re

# This turns a list of ingredients into a cleaned, single string.
# ex: ['Chicken', 'Ranch! Dressing'] => 'chicken ranch dressing'
def clean_stringify(ingredients):
    return re.sub(
        "[^- a-zA-Z]","",
        ' '.join(ingredients)).strip()

# This turns a list of ingredients into a clean list of ingredients.
# ex: ['Chicken', 'Ranch! Dressing'] => ['chicken', 'ranch dressing']
def _process(item):
    terms = re.sub("[^- a-zA-Z]", "", item).strip().lower().split()
    return ' '.join(terms)

def clean(ingredients):
    return [_process(x) for x in ingredients]

In [77]:
import sklearn.naive_bayes as bayes
from sklearn import metrics
from sklearn.feature_extraction.text import CountVectorizer

# Get a vanilla count vectorizer
vectorizer = CountVectorizer()

# tm stands for Term Matrix. This is making a terms matrix.
tm_train = vectorizer.fit_transform(train_features.apply(clean_stringify))

# Fit a model using naive bayes and the training term matrix and target labels
model = bayes.MultinomialNB(alpha=0.1).fit(tm_train, train_target)

# We use transform instead of fit_transform here because we just care about the fitting
# (the terms in the terms matrix) that we put together on the training data. This is
# all we can predict on, so this is all we want to project the test data into.
tm_test = vectorizer.transform(test_features.apply(clean_stringify)).toarray()
predicted = model.predict(tm_test)
metrics.accuracy_score(test_target, predicted)

0.72030169704588309

In [81]:
import operator

# Let's take a look at what the most common terms are
counts = zip(vectorizer.get_feature_names(),
            np.asarray(tm_train.sum(axis=0)).ravel())

sorted(counts, key=operator.itemgetter(1), reverse=True)[:10]

[(u'salt', 14442),
 (u'olive oil', 6341),
 (u'onions', 6317),
 (u'water', 6016),
 (u'garlic', 5883),
 (u'sugar', 5185),
 (u'garlic cloves', 5014),
 (u'butter', 3896),
 (u'ground black pepper', 3792),
 (u'all-purpose flour', 3704)]

In [79]:
# Let's introduce some stop words
vectorizer = CountVectorizer(
    stop_words=[
        'added', 'all', 'assorted',
        'cooking', 'oz', 'organic', 'old', 'purpose', 'low'
        'spice'
    ]
)
tm_train = vectorizer.fit_transform(train_features.apply(clean_stringify))
model = bayes.MultinomialNB(alpha=0.1).fit(tm_train, train_target)
tm_test = vectorizer.transform(test_features.apply(clean_stringify))
predicted = model.predict(tm_test)
metrics.accuracy_score(test_target, predicted)

0.72067881835323699

In [80]:
# Let's make the terms for the term matrix not individual words, but recipe lines
# as a whole. So the terms for an ingredient 'zesty italian dressing' in this case wouldn't 
# be 'zesty', 'italian', and 'dressing', but rather a single term 'zesty italian dressing'
#
# This is achieved using lambda x: x for the analyzer, telling it to just take the ingredients
# list and use that directly as the terms that are pulled out in analysis.
vectorizer = CountVectorizer(analyzer=lambda x: x)
tm_train = vectorizer.fit_transform(train_features.apply(clean))
model = bayes.MultinomialNB(alpha=0.1).fit(tm_train, train_target)
tm_test = vectorizer.transform(test_features.apply(clean))
predicted = model.predict(tm_test)
metrics.accuracy_score(test_target, predicted)

0.7561282212445003