Kaggle - What's Cooking
--------

Code adapted from:

1. https://www.kaggle.com/c/whats-cooking/forums/t/16421/kaggle-scripts/92517


In [1]:
import json
import numpy as np
from sklearn import metrics

In [2]:
def _ingredient_text(recipe):
    """Return one recipe's ingredient list joined into a single space-separated string."""
    return ' '.join(recipe['ingredients'])


# Parse the raw recipe records. Every record is read through its
# 'ingredients' key; train records also supply 'cuisine', test records 'id'.
with open('data/train.json') as train_f, open('data/test.json') as test_f:
    train_data, test_data = json.load(train_f), json.load(test_f)

# One text document per recipe (inputs) plus the cuisine labels (targets).
train_X = list(map(_ingredient_text, train_data))
train_y = [recipe['cuisine'] for recipe in train_data]
test_X = list(map(_ingredient_text, test_data))
test_id = [recipe['id'] for recipe in test_data]  # for writing CSV

In [3]:
from sklearn.preprocessing import LabelEncoder
from sklearn.feature_extraction.text import CountVectorizer
try:
    # scikit-learn >= 0.18: sklearn.cross_validation was deprecated in 0.18
    # and removed in 0.20, so prefer its replacement when it is available.
    from sklearn.model_selection import train_test_split
except ImportError:
    # Older scikit-learn releases only ship the original module.
    from sklearn.cross_validation import train_test_split

# Cuisine names -> integer class codes (decoded again when writing submissions).
le = LabelEncoder()
# NOTE: constructed with default settings; despite the variable name no
# n-gram range is configured here.
ngram_vectorizer = CountVectorizer()
train_y = le.fit_transform(train_y)
# The vocabulary is learned on the training text only and re-used for the test
# text. .toarray() turns the sparse count matrices into dense numpy arrays.
train_X = ngram_vectorizer.fit_transform(train_X).toarray()
test_X = ngram_vectorizer.transform(test_X).toarray()

# Empty validation placeholders with a matching column count; a later cell
# stacks them onto the training data before splitting.
# np.empty defaults to float64, so stacking valid_y onto the integer labels
# yields float labels.
valid_X = np.empty((0, train_X.shape[1]))
valid_y = np.empty(0)

In [4]:
# Sanity-check the array shapes: train, validation, then test.
# print(...) with a single argument behaves the same on Python 2 and 3.
for _arr in (train_X, train_y, valid_X, valid_y, test_X):
    print(_arr.shape)

(39774L, 3010L)
(39774L,)
(0L, 3010L)
(0L,)
(9944L, 3010L)


In [5]:
# Pool the current train and validation rows (features are 2-D, labels 1-D;
# both are joined along the first axis). The names are re-bound rather than
# given new ones so the pooled copies do not stay alive after the split.
train_X = np.concatenate((train_X, valid_X), axis=0)
train_y = np.concatenate((train_y, valid_y), axis=0)

# Re-split 70% train / remaining rows validation, with a fixed seed.
(train_X, valid_X,
 train_y, valid_y) = train_test_split(train_X, train_y, train_size=0.7, random_state=42)

In [6]:
# Show the four shapes on one line, separated by single spaces.
print(' '.join(str(_a.shape) for _a in (train_X, valid_X, train_y, valid_y)))

(27841L, 3010L) (11933L, 3010L) (27841L,) (11933L,)


In [7]:
from sklearn.linear_model import LogisticRegression

# One-vs-rest logistic regression baseline; fit() returns the estimator itself.
clf_log_reg = LogisticRegression(C=1, multi_class='ovr').fit(train_X, train_y)

# Accuracy on the held-out validation split.
valid_y_pred = clf_log_reg.predict(valid_X)
print(metrics.accuracy_score(valid_y, valid_y_pred))

0.781614011565


In [8]:
from sklearn.svm import LinearSVC
from sklearn.multiclass import OneVsRestClassifier

# L2-penalised linear SVM (dual=False), wrapped in an explicit
# one-vs-rest meta-classifier.
_base_svm = LinearSVC(C=1, penalty='l2', dual=False)
clf_svm = OneVsRestClassifier(_base_svm)
clf_svm.fit(train_X, train_y)

# Accuracy on the held-out validation split.
valid_y_pred = clf_svm.predict(valid_X)
print(metrics.accuracy_score(valid_y, valid_y_pred))

0.77264728065


In [9]:
from sklearn.linear_model import SGDClassifier

# Logistic-loss SGD with an elastic-net penalty.
# random_state pins the data shuffling (shuffle=True) so the validation score
# is reproducible between runs; 42 matches the seed used for the split.
# NOTE(review): per the scikit-learn docs, `epsilon` only applies to the
# huber / epsilon-insensitive losses and `eta0` is not used by the 'optimal'
# schedule, so both look inert here -- kept unchanged, confirm before removing.
clf_sgd = SGDClassifier(loss='log', penalty='elasticnet', alpha=0.00001, l1_ratio=0.55, n_iter=100, shuffle=True,
                        epsilon=1e-2, n_jobs=4, learning_rate='optimal', eta0=0.0, random_state=42)
clf_sgd.fit(train_X, train_y)

# Accuracy on the held-out validation split.
valid_y_pred = clf_sgd.predict(valid_X)
print(metrics.accuracy_score(valid_y, valid_y_pred))

0.772814883097


In [10]:
# Final submission using LR
from pandas import DataFrame
from collections import OrderedDict

# Refit the logistic regression on train + validation rows together.
clf = LogisticRegression(C=1, multi_class='ovr')
clf.fit(np.concatenate((train_X, valid_X), axis=0),
        np.concatenate((train_y, valid_y), axis=0))

# Predict class codes, cast them to int, and decode them to cuisine names.
test_y = clf.predict(test_X)
test_y = le.inverse_transform(test_y.astype(int))

# Two-column CSV in the order id, cuisine; no index column is written.
d = DataFrame(data=OrderedDict((('id', test_id), ('cuisine', test_y))))
d.to_csv('submission.csv', index=False)

Test XGBoost
--------

In [11]:
import xgboost as xgb

# Wrap the train / validation splits in XGBoost's DMatrix container,
# attaching the encoded cuisine labels.
xgb_train = xgb.DMatrix(data=train_X, label=train_y)
xgb_valid = xgb.DMatrix(data=valid_X, label=valid_y)

In [12]:
# Booster configuration for multi-class classification.
param = {
    'objective': 'multi:softmax',  # predict() returns the class index directly
    'eta': 0.1,  # weight shrinkage
    'max_depth': 36,
    'silent': 1,
    'subsample': 0.75,  # fraction of training rows sampled for each tree
    'colsample_bytree': 0.5,  # fraction of feature columns sampled for each tree
}

param['nthread'] = 8
param['num_class'] = len(le.classes_)

# Per the xgboost docs, early stopping monitors the LAST watchlist entry
# ('valid' here): training stops once it has not improved for 80 rounds.
watchlist = [(xgb_train, 'train'), (xgb_valid, 'valid')]
n_rounds = 150
bst = xgb.train(param, xgb_train, n_rounds, watchlist, early_stopping_rounds=80)
print("XGBoost training done!")

# With early stopping the returned booster is the model from the last round,
# not the best one, so restrict prediction to the best iteration when the
# installed xgboost exposes it. Versions without `best_ntree_limit` fall back
# to the original full-model prediction.
if hasattr(bst, 'best_ntree_limit'):
    valid_y_pred = bst.predict(xgb_valid, ntree_limit=bst.best_ntree_limit)
else:
    valid_y_pred = bst.predict(xgb_valid)
print(metrics.accuracy_score(valid_y, valid_y_pred))

In [13]:
from pandas import DataFrame
from collections import OrderedDict

# `bst` was trained with early stopping, so the booster holds the model from
# the last round rather than the best one. Predict with the best iteration
# when the installed xgboost exposes it; otherwise use the full model.
if hasattr(bst, 'best_ntree_limit'):
    test_y = bst.predict(xgb.DMatrix(test_X), ntree_limit=bst.best_ntree_limit)
else:
    test_y = bst.predict(xgb.DMatrix(test_X))
# Predictions come back as floats; cast to int before decoding to cuisine names.
test_y = le.inverse_transform(test_y.astype(int))

# NOTE(review): this writes to the same 'submission.csv' as the logistic
# regression submission cell, so running both overwrites the LR file.
d = DataFrame(data=OrderedDict([('id', test_id), ('cuisine', test_y)]))
d.to_csv('submission.csv', index=False)