In [64]:
# This tells matplotlib not to try opening a new window for each plot.
%matplotlib inline

# General libraries.
import json
import csv
import re
import numpy as np
import matplotlib.pyplot as plt

# SK-learn libraries for learning.
from sklearn.pipeline import Pipeline
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import BernoulliNB
from sklearn.naive_bayes import MultinomialNB
from sklearn.grid_search import GridSearchCV

# SK-learn libraries for evaluation.
from sklearn.metrics import confusion_matrix
from sklearn import metrics
from sklearn.metrics import classification_report

# SK-learn libraries for feature extraction from text.
from sklearn.feature_extraction.text import *

In [56]:
with open('train.json') as json_train_data:
    train_raw = json.load(json_train_data)
    
with open('test.json') as json_test_data:
    test_raw = json.load(json_test_data)
    
train_labels = []
train_data = []

for recipe in train_raw:
    train_labels.append(recipe["cuisine"])
    ingredient_list = " ".join([x.replace(" ","_") for x in recipe["ingredients"]])
    ingredient_list = re.sub(r'[^A-Za-z\s_]', '', ingredient_list)
    ingredient_list = re.sub(r'_+oz_', '', ingredient_list)
    ingredient_list = re.sub(r' _+', '', ingredient_list)
    train_data.append(ingredient_list)

target_names = list(set(train_labels))
train_labels = np.array(train_labels)

test_ids = []
test_data = []

for recipe in test_raw:
    test_ids.append(recipe["id"])
    ingredient_list = " ".join([x.replace(" ","_") for x in recipe["ingredients"]])
    ingredient_list = re.sub(r'[^A-Za-z\s_]', '', ingredient_list)
    ingredient_list = re.sub(r'_+oz_', '', ingredient_list)
    ingredient_list = re.sub(r' _+', '', ingredient_list)
    test_data.append(ingredient_list)

print target_names
print train_labels.shape

mini_train_data = train_data[:7000]
mini_train_labels = train_labels[:7000]

[u'irish', u'mexican', u'chinese', u'filipino', u'vietnamese', u'moroccan', u'brazilian', u'japanese', u'british', u'greek', u'indian', u'jamaican', u'french', u'spanish', u'russian', u'cajun_creole', u'thai', u'southern_us', u'korean', u'italian']
(39774,)


In [50]:
train_data = mini_train_data
train_labels = mini_train_labels

vectorizer = CountVectorizer()
train_docterm = vectorizer.fit_transform(train_data)

features = vectorizer.get_feature_names()

print sorted(features)[0:10]

[u'_lowfat_cottage_cheese', u'_lowfat_milk', u'_reducedfat_milk', u'a_taste_of_thai_rice_noodles', u'aai', u'abalone', u'abbamele', u'accent', u'accent_seasoning', u'achiote']


In [51]:
train_data = mini_train_data
train_labels = mini_train_labels

vectorizer = CountVectorizer()
train_docterm = vectorizer.fit_transform(train_data)

c = pow(2.0,np.arange(6)-6)
parameters = {'C': c}
lr = LogisticRegression()
lr_clf = GridSearchCV(lr,parameters,scoring='accuracy')
lr_clf.fit(train_docterm, train_labels)
print lr_clf.best_score_

#alpha = pow(2.0,np.arange(24)-12)
#parameters = {'alpha': alpha}
#mnb = MultinomialNB()
#mnb_clf = GridSearchCV(mnb,parameters,scoring='accuracy')
#mnb_clf.fit(train_docterm, train_labels)
#print mnb_clf.best_score_

0.696142857143


In [53]:
train_data = mini_train_data
train_labels = mini_train_labels

vectorizer = CountVectorizer()
train_docterm = vectorizer.fit_transform(train_data)
total_vocab = vectorizer.vocabulary_
total_vocab_byidx = dict((v,k) for k,v in vectorizer.vocabulary_.iteritems())

vocab_sizes = []
accuracies = []

for c_l1 in pow(2.0,np.arange(12)-6):

    # Train LogisticRegression with L1 penalty and specific C
    lr_l1 = LogisticRegression(penalty='l1', tol=.01, C=c_l1)
    lr_l1.fit(train_docterm, train_labels)

    # Create a pruned vocabulary based on non-zero features from LogisticRegression with L1 penalty
    pruned_vocab = set()
    for f in lr_l1.coef_:
        pruned_vocab.update([total_vocab_byidx[i] for i in np.flatnonzero(f)])

    # Create a pruning vectorizer
    pruning_vectorizer = CountVectorizer(vocabulary=pruned_vocab)
    pruned_train_docterm = pruning_vectorizer.fit_transform(train_data)

    # Train LogisticRegression with L2 penalty and pruned vocabulary
    c = pow(2.0,np.arange(12)-12)
    parameters = {'penalty' : ['l2'], 'C': c}
    lr = LogisticRegression()
    lr_clf = GridSearchCV(lr,parameters,scoring='accuracy')
    lr_clf.fit(pruned_train_docterm, train_labels)
    c_l2 = lr_clf.best_params_['C']

    print "L1 C: %10.5f  Vocabulary Size: %5d  L2 Accuracy: %.5f  L2 C: %10.5f" % (c_l1, len(pruned_vocab), lr_clf.best_score_, c_l2)

L1 C:    0.01562  Vocabulary Size:    33  L2 Accuracy: 0.45657  L2 C:    0.50000
L1 C:    0.03125  Vocabulary Size:    72  L2 Accuracy: 0.54571  L2 C:    0.50000
L1 C:    0.06250  Vocabulary Size:   159  L2 Accuracy: 0.62743  L2 C:    0.50000
L1 C:    0.12500  Vocabulary Size:   293  L2 Accuracy: 0.66914  L2 C:    0.50000
L1 C:    0.25000  Vocabulary Size:   502  L2 Accuracy: 0.69057  L2 C:    0.50000
L1 C:    0.50000  Vocabulary Size:   800  L2 Accuracy: 0.69886  L2 C:    0.50000
L1 C:    1.00000  Vocabulary Size:  1250  L2 Accuracy: 0.70114  L2 C:    0.50000
L1 C:    2.00000  Vocabulary Size:  2198  L2 Accuracy: 0.69843  L2 C:    0.50000
L1 C:    4.00000  Vocabulary Size:  2505  L2 Accuracy: 0.69643  L2 C:    0.50000
L1 C:    8.00000  Vocabulary Size:  2685  L2 Accuracy: 0.69714  L2 C:    0.50000
L1 C:   16.00000  Vocabulary Size:  2850  L2 Accuracy: 0.69629  L2 C:    0.50000
L1 C:   32.00000  Vocabulary Size:  3030  L2 Accuracy: 0.69614  L2 C:    0.50000


In [76]:
train_data = mini_train_data
train_labels = mini_train_labels

vectorizer = CountVectorizer()
train_docterm = vectorizer.fit_transform(train_data)
total_vocab = vectorizer.vocabulary_
total_vocab_byidx = dict((v,k) for k,v in vectorizer.vocabulary_.iteritems())

lr_l1 = LogisticRegression(penalty='l1', tol=.01, C=1.0)
lr_l1.fit(train_docterm, train_labels)

pruned_vocab = set()
for f in lr_l1.coef_:
    pruned_vocab.update([total_vocab_byidx[i] for i in np.flatnonzero(f)])

pruning_vectorizer = CountVectorizer(vocabulary=pruned_vocab)
pruned_train_docterm = pruning_vectorizer.fit_transform(train_data)
pruned_test_docterm = pruning_vectorizer.transform(test_data)

lr = LogisticRegression(penalty='l2', C=0.5)
lr.fit(pruned_train_docterm, train_labels)

with open('submission.csv', 'wb') as csvfile:
    csvwriter = csv.writer(csvfile)
    csvwriter.writerow(['id', 'cuisine'])
    for i in range(len(test_data)):
        csvwriter.writerow([test_ids[i], " ".join(lr.predict(pruned_test_docterm[i])).strip()])
