In [1]:
# This tells matplotlib not to try opening a new window for each plot.
%matplotlib inline

# General libraries.
import json
import csv
import re
import numpy as np
import matplotlib.pyplot as plt

# SK-learn libraries for learning.
from sklearn.pipeline import Pipeline
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import BernoulliNB
from sklearn.naive_bayes import MultinomialNB
from sklearn.grid_search import GridSearchCV
from sklearn.tree import DecisionTreeClassifier

# SK-learn libraries for evaluation.
from sklearn.metrics import confusion_matrix
from sklearn import metrics
from sklearn.metrics import classification_report

# SK-learn libraries for feature extraction from text.
from sklearn.feature_extraction.text import *

from sklearn.decomposition import PCA
from sklearn.mixture import GMM


In [2]:
submission_ids, submission_data = [], []
test_data, test_labels = [], []
train_data, train_labels = [], []
mini_train_data, mini_train_labels = [], []
full_train_labels, full_train_data = [], []
target_names, target_labels = [], []

In [68]:
def parse_all_data(per_recipe_function=None):
    global submission_ids, submission_data, test_data, test_labels, train_data, train_labels
    global mini_train_data, mini_train_labels, full_train_labels, full_train_data, target_names, target_labels
    def parse_data(key_name, raw_data):
        keys, data = [], []
        for recipe in raw_data:
            keys.append(recipe[key_name])
            ingredient_list = " ".join([x.replace(" ","_") for x in recipe["ingredients"]])
            ingredient_list = re.sub(r'[^A-Za-z\s_]', '', ingredient_list)
            ingredient_list = re.sub(r'_+oz_', '', ingredient_list)
            ingredient_list = re.sub(r' _+', '', ingredient_list)
            # This adds the word *count* to the list of ingredients equal to the number of ingredients
            # this results in a vectorization that includes a last feature that has a count
            # equal to the number of ingredients.
            ingredient_list += (" *count*" * len(ingredient_list.split(" ")))  

            if per_recipe_function is not None:
                ingredient_list = per_recipe_function(ingredient_list)

            data.append(ingredient_list)
        return keys, data

    with open('train.json') as json_train_data:
        train_raw = json.load(json_train_data)

    with open('test.json') as json_test_data:
        test_raw = json.load(json_test_data)

    full_train_labels, full_train_data = parse_data("cuisine", train_raw)

    target_names = list(set(full_train_labels))
    full_train_labels = np.array(full_train_labels)

    submission_ids, submission_data = parse_data("id", test_raw)

    num_test = len(full_train_labels)
    test_data, test_labels = full_train_data[num_test/2:], full_train_labels[num_test/2:]
    train_data, train_labels = full_train_data[:num_test/2], full_train_labels[:num_test/2]

    mini_train_data = train_data[:7000]
    mini_train_labels = train_labels[:7000]


    
parse_all_data()

print target_names
print train_labels.shape

[u'irish', u'mexican', u'chinese', u'filipino', u'vietnamese', u'moroccan', u'brazilian', u'japanese', u'british', u'greek', u'indian', u'jamaican', u'french', u'spanish', u'russian', u'cajun_creole', u'thai', u'southern_us', u'korean', u'italian']
(19887L,)


In [4]:
this_train_data = mini_train_data
this_train_labels = mini_train_labels

vectorizer = CountVectorizer()
train_docterm = vectorizer.fit_transform(this_train_data)

features = vectorizer.get_feature_names()

print sorted(features)[0:10]

[u'_lowfat_cottage_cheese', u'_lowfat_milk', u'_reducedfat_milk', u'a_taste_of_thai_rice_noodles', u'aai', u'abalone', u'abbamele', u'accent', u'accent_seasoning', u'achiote']


In [5]:
this_train_data = mini_train_data
this_train_labels = mini_train_labels

vectorizer = CountVectorizer()
train_docterm = vectorizer.fit_transform(this_train_data)

c = pow(2.0,np.arange(6)-6)
parameters = {'C': c}
lr = LogisticRegression()
lr_clf = GridSearchCV(lr,parameters,scoring='accuracy')
lr_clf.fit(train_docterm, this_train_labels)
print lr_clf.best_score_

#alpha = pow(2.0,np.arange(24)-12)
#parameters = {'alpha': alpha}
#mnb = MultinomialNB()
#mnb_clf = GridSearchCV(mnb,parameters,scoring='accuracy')
#mnb_clf.fit(train_docterm, train_labels)
#print mnb_clf.best_score_

0.698571428571


In [6]:
this_train_data = mini_train_data
this_train_labels = mini_train_labels

vectorizer = CountVectorizer()
train_docterm = vectorizer.fit_transform(this_train_data)
total_vocab = vectorizer.vocabulary_
total_vocab_byidx = dict((v,k) for k,v in vectorizer.vocabulary_.iteritems())

vocab_sizes = []
accuracies = []

for c_l1 in pow(2.0,np.arange(12)-6):

    # Train LogisticRegression with L1 penalty and specific C
    lr_l1 = LogisticRegression(penalty='l1', tol=.01, C=c_l1)
    lr_l1.fit(train_docterm, this_train_labels)

    # Create a pruned vocabulary based on non-zero features from LogisticRegression with L1 penalty
    pruned_vocab = set()
    for f in lr_l1.coef_:
        pruned_vocab.update([total_vocab_byidx[i] for i in np.flatnonzero(f)])

    # Create a pruning vectorizer
    pruning_vectorizer = CountVectorizer(vocabulary=pruned_vocab)
    pruned_train_docterm = pruning_vectorizer.fit_transform(this_train_data)

    # Train LogisticRegression with L2 penalty and pruned vocabulary
    c = pow(2.0,np.arange(12)-12)
    parameters = {'penalty' : ['l2'], 'C': c}
    lr = LogisticRegression()
    lr_clf = GridSearchCV(lr,parameters,scoring='accuracy')
    lr_clf.fit(pruned_train_docterm, this_train_labels)
    c_l2 = lr_clf.best_params_['C']

    print "L1 C: %10.5f  Vocabulary Size: %5d  L2 Accuracy: %.5f  L2 C: %10.5f" % (c_l1, len(pruned_vocab), lr_clf.best_score_, c_l2)

L1 C:    0.01562  Vocabulary Size:    31  L2 Accuracy: 0.45743  L2 C:    0.50000
L1 C:    0.03125  Vocabulary Size:    74  L2 Accuracy: 0.55243  L2 C:    0.50000
L1 C:    0.06250  Vocabulary Size:   167  L2 Accuracy: 0.62957  L2 C:    0.50000
L1 C:    0.12500  Vocabulary Size:   312  L2 Accuracy: 0.67271  L2 C:    0.50000
L1 C:    0.25000  Vocabulary Size:   520  L2 Accuracy: 0.68971  L2 C:    0.50000
L1 C:    0.50000  Vocabulary Size:   832  L2 Accuracy: 0.69957  L2 C:    0.50000
L1 C:    1.00000  Vocabulary Size:  1297  L2 Accuracy: 0.70300  L2 C:    0.50000
L1 C:    2.00000  Vocabulary Size:  2217  L2 Accuracy: 0.70157  L2 C:    0.50000
L1 C:    4.00000  Vocabulary Size:  2517  L2 Accuracy: 0.70057  L2 C:    0.50000
L1 C:    8.00000  Vocabulary Size:  2667  L2 Accuracy: 0.70029  L2 C:    0.50000
L1 C:   16.00000  Vocabulary Size:  2843  L2 Accuracy: 0.69957  L2 C:    0.50000
L1 C:   32.00000  Vocabulary Size:  3071  L2 Accuracy: 0.69943  L2 C:    0.50000


In [57]:
def create_submission_csv(classifier=None, data=None, predictions=None):
    with open('submission.csv', 'wb') as csvfile:
        csvwriter = csv.writer(csvfile)
        csvwriter.writerow(['id', 'cuisine'])
        if predictions is None:
            predictions = classifier.predict(data)
        for i in range(len(submission_data)):
            csvwriter.writerow([submission_ids[i], predictions[i].strip()])   

In [54]:
this_train_data = full_train_data
this_train_labels = full_train_labels

def create_pruned_vectorizer(data, labels):
    vectorizer = CountVectorizer()
    train_docterm = vectorizer.fit_transform(data)
    total_vocab = vectorizer.vocabulary_
    total_vocab_byidx = dict((v,k) for k,v in vectorizer.vocabulary_.iteritems())

    lr_l1 = LogisticRegression(penalty='l1', tol=.01, C=1.0)
    lr_l1.fit(train_docterm, labels)

    pruned_vocab = set()
    for f in lr_l1.coef_:
        pruned_vocab.update([total_vocab_byidx[i] for i in np.flatnonzero(f)])

    pruning_vectorizer = CountVectorizer(vocabulary=pruned_vocab)
    
    return pruning_vectorizer

pruning_vectorizer = create_pruned_vectorizer(this_train_data, this_train_labels)
pruned_train_docterm = pruning_vectorizer.fit_transform(this_train_data)
pruned_test_docterm = pruning_vectorizer.transform(submission_data)

lr = LogisticRegression(penalty='l2', C=0.5)
lr.fit(pruned_train_docterm, this_train_labels)

#create_submission_csv(lr, pruned_test_docterm)

print "Results in 0.78138 accuracy on Kaggle for 710th place."

Results in 0.78138 accuracy on Kaggle for 710th place.


*Experiment - S1.*

Idea: In this experiment we add simple ingredients to the text stream for a recipe and an indicator if a recipe contains meat, seafood or animal product. 

Outcome: Some minor increase in accuracy above the baseline was achieved in our Kaggle submission.

In [69]:
simple_ingredients = ["chicken", "tomatoes", "rice", "garlic", "milk", "water", "cheese", "peanuts",
                      "beef", "mushrooms", "pork" ]  
seafood_ingredients = ["fish", "tuna", "salmon", "crab", "shrimp", "prawn", "calamari", "anchovy"]
meat_ingredients = ["beef", "steak", "chicken", "pork", "bacon", "ham",  "turkey", "meat"]
animal_product_ingredients = ["egg", "butter", "milk", "cheese"]

def add_contains_feature(ingredients, contains_feature_name, recipe):
    for ing in ingredients:
        if ing in recipe:
            recipe += " " + contains_feature_name
    return recipe

def add_simple_ingredients_features(ingredient_list):
    for ing in simple_ingredients:
        if ing in ingredient_list:
            ingredient_list += " " + ing + " "
    return ingredient_list

def add_ingredient_group_features(ingredient_list):
    ingredient_list = add_contains_feature(seafood_ingredients, "contains_seafood", ingredient_list)
    ingredient_list = add_contains_feature(meat_ingredients, "contains_meat", ingredient_list)
    # This next one decreases our accuracy a little.
    # ingredient_list = add_contains_feature(animal_product_ingredients, "contains_animal_product", ingredient_list)
    return ingredient_list

def add_experimentS1_features(ingredient_list):
    ingredient_list = add_simple_ingredients_features(ingredient_list)
    ingredient_list = add_ingredient_group_features(ingredient_list)
    return ingredient_list
    
parse_all_data(add_experimentS1_features)
this_train_data = train_data
this_train_labels = train_labels

lr = LogisticRegression(C=1.0)
vectorizer = CountVectorizer()
# achieved exact same accuracy with the pruned version of our vocabulary
#vectorizer = create_pruned_vectorizer(this_train_data, this_train_labels)
train_docterm = vectorizer.fit_transform(this_train_data)
test_docterm = vectorizer.transform(test_data)

lr.fit(train_docterm, this_train_labels)
predictions = lr.predict(test_docterm)
print np.mean(test_labels == predictions)
#create_submission_csv(lr, vectorizer.transform(submission_data))

print "Results in 0.78490 accuracy on Kaggle for 615th place when using full training set."
#0.770000502841 - highest seen on training data

0.768089706844
Results in 0.78490 accuracy on Kaggle for 615th place when using full training set.


*Experiment - S2.*

Idea: Use PCA to reduce features then train on reduced feature set.

Outcome: Because we vectorize the the ingredients list the data is very sparse. PCA does not run on a sparse matrix. Converting the sparse matrix to a dense array causes python to crash. Probably we run out of memory. Perhaps this process could be tried on a machine with more memory or perhaps PCA isn't appropriate for sparse features.

In [9]:
parse_all_data()
this_train_data = train_data
this_train_labels = train_labels

vectorizer = CountVectorizer()
train_docterm = vectorizer.fit_transform(this_train_data)
test_docterm = vectorizer.transform(test_data)

# THIS WILL TAKE FOREVER TO RUN. I wouldn't try it.
#pca = PCA(n_components=20).fit(train_docterm.toarray())
#for k in range(1,51):
#    explained_variance = sum(pca.explained_variance_ratio_[0:k])
#    print "Variance Explained by first %s components: %s" % (k, explained_variance)

*Experiment - S3.*

Idea: Run multipe classifiers and see if a majority rules can produce better results.

Outcome: KNearest neighbor is pretty bad and slow. Decision Tree results are not great, better than a flip of a coin and much quicker than KNearest neighbors. The end result is with Logistic Regression, Multinomial Bayes and a very poor performing decision tree is worse performance than the baseline. 

In [61]:
def majority_predictions(one, two, three):
    new_predictions = []
    for i, value in enumerate(one):
        if value == two[i]:
            new_predictions.append(value)
        else:
            new_predictions.append(three[i])
    return new_predictions

    
parse_all_data()
this_train_data = train_data
this_train_labels = train_labels

vectorizer = CountVectorizer()
train_docterm = vectorizer.fit_transform(this_train_data)
test_docterm = vectorizer.transform(test_data)

#lr = LogisticRegression(C=1.0)
lr.fit(train_docterm, this_train_labels)
lr_predictions = lr.predict(test_docterm)
print np.mean(test_labels == lr_predictions)


# find optimal alpha for multinomial naive bayes
alphas = {"alpha" : [.00001, .0001, .001, .01, .1, 1, 10, 100, 1000]}
grid_search = GridSearchCV(MultinomialNB(), alphas, scoring='f1_micro')
grid_search.fit(train_docterm, this_train_labels)   

# Calculate f1 score for best alpha
mnb = MultinomialNB(alpha=grid_search.best_params_['alpha'])
mnb.fit(train_docterm, this_train_labels)
mnb_predictions = mnb.predict(test_docterm)
print np.mean(test_labels == mnb_predictions)

#k_values = {"n_neighbors" : range(1, 20)}
#grid_search = GridSearchCV(KNeighborsClassifier(), k_values, scoring='f1_micro')
#grid_search.fit(train_docterm, this_train_labels)
#print "best n", grid_search.best_params_['n_neighbors']
#best_n = grid_search.best_params_['n_neighbors']
#best_n = 16 

# Calculate f1 score for best k
# Two slow and poor accuracy
#knc = KNeighborsClassifier(n_neighbors=best_n)
#knc.fit(train_docterm, this_train_labels)
#predictions = knc.predict(test_docterm)
#print np.mean(test_labels == predictions)


#params = {"min_samples_split" : [2,3,4,5,6,7], "min_samples_leaf" : [1,2,3,4,5]}
#grid_search = GridSearchCV(DecisionTreeClassifier(), params, scoring='f1_micro')
#grid_search.fit(train_docterm, this_train_labels)   
#print grid_search.best_params_

dtc = DecisionTreeClassifier(min_samples_split=4, min_samples_leaf=1)
dtc.fit(train_docterm, this_train_labels)
dtc_predictions = dtc.predict(test_docterm)
print np.mean(test_labels == dtc_predictions)

predictions = majority_predictions(lr_predictions, mnb_predictions, dtc_predictions)
print "Majority rules: ", np.mean(test_labels == predictions)

submission_docterm = vectorizer.transform(submission_data)
lr_predictions = lr.predict(submission_docterm)
mnb_predictions = mnb.predict(submission_docterm)
dtc_predictions = dtc.predict(submission_docterm)
predictions = majority_predictions(lr_predictions, mnb_predictions, dtc_predictions)

create_submission_csv(predictions=predictions)

print "Results in 0.76398 accuracy on Kaggle for 851th place when using full training set."

0.763513853271
0.743400211193
0.578518630261
Majority rules:  0.740282596671
Results in 0.76398 accuracy on Kaggle for 851th place when using full training set.


*Experiment - S4.*

Idea: For many recipes the most "important" ingredient is listed first. We'll try to give extra weight to the first word or first couple of words.

Outcome: Submission accuracy is slightly worse than the baseline.

In [60]:
def duplicate_first_ingredient(ingredients_list):
    first = ingredients_list.split(" ")[0]
    return ingredients_list + " " + first
    
parse_all_data(duplicate_first_ingredient)
this_train_data = train_data
this_train_labels = train_labels

r = LogisticRegression(C=1.0)
vectorizer = create_pruned_vectorizer(this_train_data, this_train_labels)
train_docterm = vectorizer.fit_transform(this_train_data)
test_docterm = vectorizer.transform(test_data)

lr.fit(train_docterm, this_train_labels)
predictions = lr.predict(test_docterm)
print np.mean(test_labels == predictions)
create_submission_csv(lr, vectorizer.transform(submission_data))

print "Results in 0.76398 accuracy on Kaggle for 851th place when using full training set."

0.843918137477


Error analysis to try to determine what is being incorrectly guessed.