In [1]:
import json
import os

import numpy as np
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.preprocessing import LabelEncoder
try:
    # Current location of train_test_split (scikit-learn >= 0.18).
    from sklearn.model_selection import train_test_split
except ImportError:
    # Legacy module, removed in scikit-learn 0.20; kept as a fallback for old environments.
    from sklearn.cross_validation import train_test_split

from stemmer import porter_stemmer,snowball_stemmer,Lancaster_stemmer
from baseline_rate import Base_accuracy



a.       You'll need to create your test set using stratified sampling, to preserve the distribution of recipes falling into different classes

b.       You may also want to consider the cooking method as a feature. For instance, Chinese cooking rarely (if ever?) requires baking.

c.       You may want to incorporate a cost matrix to assign different penalties for wrongly classifying a recipe as coming from a cuisine that is 'very different' from its actual cuisine.

d.       What would the misclassification rate be if you were to randomly assign a recipe to a cuisine, according to the proportion of recipes in the training data that fell into that cuisine? This would be a good baseline against which to compare the performance of your models.

In [2]:
# Directory that contains train.json. Defaults to the original location but can be
# overridden with the RECIPE_DATA_DIR environment variable, so the notebook is not
# tied to one user's home directory.
DATA_DIR = os.environ.get('RECIPE_DATA_DIR', '/Users/guozhiqi-seven/Downloads')

# Load the labelled recipes and show (rows, columns).
train_set = pd.read_json(os.path.join(DATA_DIR, 'train.json'))
train_set.shape

(39774, 3)

In [3]:
# Directory that contains test.json. Defaults to the original location but can be
# overridden with the RECIPE_DATA_DIR environment variable, so the notebook is not
# tied to one user's home directory.
DATA_DIR = os.environ.get('RECIPE_DATA_DIR', '/Users/guozhiqi-seven/Downloads')

# Load the unlabelled recipes and show (rows, columns).
test_set = pd.read_json(os.path.join(DATA_DIR, 'test.json'))
test_set.shape

(9944, 2)

In [4]:
# Report the container type of the 'ingredients' column, then preview the first five rows.
print(type(train_set.ingredients))
train_set.head(n=5)

<class 'pandas.core.series.Series'>


Unnamed: 0,cuisine,id,ingredients
0,greek,10259,"[romaine lettuce, black olives, grape tomatoes..."
1,southern_us,25693,"[plain flour, ground pepper, salt, tomatoes, g..."
2,filipino,20130,"[eggs, pepper, salt, mayonaise, cooking oil, g..."
3,indian,22213,"[water, vegetable oil, wheat, salt]"
4,indian,13162,"[black pepper, shallots, cornflour, cayenne pe..."


In [5]:
# Number of distinct cuisine labels in the training data (20 in total)
train_set['cuisine'].unique().size

20

In [6]:
# Share of recipes per cuisine over the whole training file, as a one-column frame.
original_dist = (
    train_set['cuisine']
    .value_counts()
    .div(len(train_set))
    .to_frame()
)
original_dist

Unnamed: 0,cuisine
italian,0.197063
mexican,0.161865
southern_us,0.108614
indian,0.075502
chinese,0.067205
french,0.066526
cajun_creole,0.03887
thai,0.038694
japanese,0.035777
greek,0.029542


In [7]:
# Distinct cuisine labels, in order of first appearance.
pd.unique(train_set['cuisine'])

array(['greek', 'southern_us', 'filipino', 'indian', 'jamaican', 'spanish',
       'italian', 'mexican', 'chinese', 'british', 'thai', 'vietnamese',
       'cajun_creole', 'brazilian', 'french', 'japanese', 'irish',
       'korean', 'moroccan', 'russian'], dtype=object)

In [8]:
# First five ingredient lists (each entry is a list of ingredient strings).
train_set['ingredients'].iloc[:5]

0    [romaine lettuce, black olives, grape tomatoes...
1    [plain flour, ground pepper, salt, tomatoes, g...
2    [eggs, pepper, salt, mayonaise, cooking oil, g...
3                  [water, vegetable oil, wheat, salt]
4    [black pepper, shallots, cornflour, cayenne pe...
Name: ingredients, dtype: object

In [9]:
# Flatten every recipe's ingredient list into one space-separated string,
# giving one text document per recipe.
ingredients = train_set['ingredients']
ingredients_list = list(map(' '.join, ingredients))

In [10]:
# Peek at the first five joined ingredient strings
ingredients_list[0:5]

['romaine lettuce black olives grape tomatoes garlic pepper purple onion seasoning garbanzo beans feta cheese crumbles',
 'plain flour ground pepper salt tomatoes ground black pepper thyme eggs green tomatoes yellow corn meal milk vegetable oil',
 'eggs pepper salt mayonaise cooking oil green chilies grilled chicken breasts garlic powder yellow onion soy sauce butter chicken livers',
 'water vegetable oil wheat salt',
 'black pepper shallots cornflour cayenne pepper onions garlic paste milk butter salt lemon juice water chili powder passata oil ground cumin boneless chicken skinless thigh garam masala double cream natural yogurt bay leaf']

## Bag of words
* Different stemming methods can be applied by calling `porter_stemmer`, `snowball_stemmer`, or `Lancaster_stemmer`.

In [14]:
# Stem the joined ingredient strings with two of the project's stemmer helpers.
# Each call receives the full list of recipe strings.
porter_ingredient = porter_stemmer(ingredients_list)
snowball_ingredient = snowball_stemmer(ingredients_list)

In [13]:
# First five recipes after Porter stemming
porter_ingredient[0:5]

['romain lettuc black oliv grape tomato garlic pepper purpl onion season garbanzo bean feta chees crumbl',
 'plain flour ground pepper salt tomato ground black pepper thyme egg green tomato yellow corn meal milk veget oil',
 'egg pepper salt mayonais cook oil green chili grill chicken breast garlic powder yellow onion soy sauc butter chicken liver',
 'water veget oil wheat salt',
 'black pepper shallot cornflour cayenn pepper onion garlic past milk butter salt lemon juic water chili powder passata oil ground cumin boneless chicken skinless thigh garam masala doubl cream natur yogurt bay leaf']

In [16]:
from sklearn.feature_extraction.text import CountVectorizer

# Bag-of-words over the raw ingredient strings: lower-cased, English stop words
# removed, using unigrams, bigrams and trigrams.
count_vect = CountVectorizer(
    ngram_range=(1, 3),
    lowercase=True,
    stop_words='english',
)

# Learn the vocabulary and build the recipe-by-term count matrix in one pass.
X_train_tdm = count_vect.fit_transform(ingredients_list)
X_train_tdm.shape

(39774, 328053)

In [18]:
# Hold out 25% of the recipes for validation with a fixed seed.
# This is a plain random split: no `stratify` argument is passed.
X_train, X_valid, y_train, y_valid = train_test_split(
    X_train_tdm,
    train_set['cuisine'],
    random_state=0,
    test_size=0.25,
)

In [20]:
# Map cuisine names to integer codes. The mapping is learned from the training
# labels only and then applied to both label sets.
labelencoder = LabelEncoder()
labelencoder.fit(y_train)
y_train = labelencoder.transform(y_train)
y_valid = labelencoder.transform(y_valid)

### Class-distribution check: does the random split preserve the distribution of recipes falling into different classes, as stratified sampling would?

In [26]:
# Per-cuisine proportions in the training split (decoded back to cuisine names).
train_distribution = (
    pd.Series(labelencoder.inverse_transform(y_train))
    .value_counts()
    .div(len(y_train))
    .to_frame()
)

# Same proportions for the held-out validation split.
test_distribution = (
    pd.Series(labelencoder.inverse_transform(y_valid))
    .value_counts()
    .div(len(y_valid))
    .to_frame()
)

# Side-by-side comparison with the proportions in the full training file.
cuisine_dist = pd.concat(
    [train_distribution, test_distribution, original_dist],
    axis=1,
)
cuisine_dist.columns = ['train', 'test', 'original']
cuisine_dist.head(10)

Unnamed: 0,train,test,original
brazilian,0.011633,0.012068,0.011741
british,0.020315,0.019912,0.020214
cajun_creole,0.039692,0.036404,0.03887
chinese,0.066276,0.069992,0.067205
filipino,0.019242,0.018202,0.018982
french,0.066644,0.066171,0.066526
greek,0.029266,0.03037,0.029542
indian,0.074422,0.078741,0.075502
irish,0.01703,0.01599,0.01677
italian,0.197989,0.194288,0.197063


In [24]:
from sklearn.linear_model import LogisticRegression

# Sweep the inverse regularisation strength C over 10^-3 ... 10^2 and report
# validation accuracy for each setting (bag-of-words features, no stemming).
C_penalty = [10 ** exponent for exponent in range(-3, 3)]
for C_para in C_penalty:
    LogReg = LogisticRegression(C=C_para).fit(X_train, y_train)
    print('C = ', C_para, 'Score:', LogReg.score(X_valid, y_valid))

C =  0.001 Score: 0.595032180209
C =  0.01 Score: 0.7252614642
C =  0.1 Score: 0.783688656476
C =  1 Score: 0.791130329847
C =  10 Score: 0.786403861625
C =  100 Score: 0.781174577635


### LR performance using porter stemming

In [27]:
# Use a dedicated vectorizer for the Porter-stemmed text. Refitting the shared
# `count_vect` would discard the vocabulary that produced `X_train_tdm`, leaving
# it unable to transform new un-stemmed recipes for the earlier models.
# Copying the parameters keeps both vectorizers configured identically.
count_vect_porter = CountVectorizer(**count_vect.get_params())

# Learn the stemmed vocabulary and build the recipe-by-term count matrix.
X_train_tdm_porter = count_vect_porter.fit_transform(porter_ingredient)
X_train_tdm_porter.shape

(39774, 314884)

In [29]:
# Split the Porter-stemmed features 75/25 with the same seed as the un-stemmed split.
X_train_porter, X_valid_porter, y_train_porter, y_valid_porter = train_test_split(
    X_train_tdm_porter, train_set['cuisine'], test_size=0.25, random_state=0)

# Encode the labels returned by THIS split. Previously the labels of the earlier
# un-stemmed split (`y_train` / `y_valid`) were encoded instead, which only lined
# up with these feature rows because both splits share the same seed and row count.
y_train_porter = labelencoder.fit_transform(y_train_porter)
y_valid_porter = labelencoder.transform(y_valid_porter)

In [30]:
# Same C sweep as before, now on the Porter-stemmed bag-of-words features.
for C_para in C_penalty:
    LogReg = LogisticRegression(C=C_para).fit(X_train_porter, y_train_porter)
    print('C = ', C_para, 'Score:', LogReg.score(X_valid_porter, y_valid_porter))

C =  0.001 Score: 0.595534995977
C =  0.01 Score: 0.726870474658
C =  0.1 Score: 0.784593724859
C =  1 Score: 0.788817377313
C =  10 Score: 0.785800482703
C =  100 Score: 0.781275140788


In [138]:
from sklearn.naive_bayes import MultinomialNB
from sklearn import metrics

# Multinomial naive Bayes on the raw term counts, with light smoothing
# (alpha=0.01) and class priors learned from the training labels.
clf_NB = MultinomialNB(fit_prior=True, alpha=0.01).fit(X_train, y_train)

# Validation accuracy of the naive Bayes model.
predict_nb = clf_NB.predict(X_valid)
print(metrics.accuracy_score(y_valid, predict_nb))

0.730893000805


In [139]:
from sklearn.svm import SVC

# Linear-kernel support vector classifier on the raw term counts;
# the cell's value is its validation accuracy.
svc_clf = SVC(kernel='linear').fit(X_train, y_train)
svc_clf.score(X_valid, y_valid)

0.7712188254223652

## TF-IDF

In [31]:
# Re-weight the un-stemmed term counts with TF-IDF. The IDF weights are learned
# from the full count matrix, which is then transformed.
tfidf_transformer = TfidfTransformer()
tfidf_transformer.fit(X_train_tdm)
X_train_tfidf = tfidf_transformer.transform(X_train_tdm)
X_train_tfidf.shape

(39774, 328053)

In [32]:
# 75/25 split of the TF-IDF matrix with the same seed as the other splits.
# NOTE: `X_train_tfidf` goes in as the full matrix and is rebound to the training
# rows only, so re-running this cell requires re-running the TF-IDF cell first.
X_train_tfidf, X_valid_tfidf, y_train_tfidf, y_valid_tfidf = train_test_split(
    X_train_tfidf,
    train_set['cuisine'],
    random_state=0,
    test_size=0.25,
)

In [33]:
# Encode the labels that came out of the TF-IDF split. Previously `y_train` /
# `y_valid` from the bag-of-words section were encoded instead, so this cell
# depended on another section's variables and ignored the labels actually paired
# with `X_train_tfidf` / `X_valid_tfidf`.
labelencoder = LabelEncoder()
y_train_tfidf = labelencoder.fit_transform(y_train_tfidf)
y_valid_tfidf = labelencoder.transform(y_valid_tfidf)

In [34]:
# Logistic-regression C sweep (10^-3 ... 10^2) on the TF-IDF features.
C_penalty = [10 ** exponent for exponent in range(-3, 3)]
for C_para in C_penalty:
    LogReg = LogisticRegression(C=C_para).fit(X_train_tfidf, y_train_tfidf)
    print('C = ', C_para, 'Score:', LogReg.score(X_valid_tfidf, y_valid_tfidf))

C =  0.001 Score: 0.194288012872
C =  0.01 Score: 0.321198712792
C =  0.1 Score: 0.567880128721
C =  1 Score: 0.731798069187
C =  10 Score: 0.782280772325
C =  100 Score: 0.787208366854


### TF-IDF Porter

In [37]:
# TF-IDF weighting of the Porter-stemmed counts. A dedicated transformer is used
# so the IDF weights learned for the un-stemmed matrix in `tfidf_transformer`
# are not overwritten.
tfidf_transformer_porter = TfidfTransformer()
X_tfidf_porter_all = tfidf_transformer_porter.fit_transform(X_train_tdm_porter)

# 75/25 split with the same seed as the other splits. The full matrix keeps its
# own name, so this cell can be re-run safely.
X_train_tfidf_porter, X_valid_tfidf_porter, y_train_tfidf_porter, y_valid_tfidf_porter = train_test_split(
    X_tfidf_porter_all, train_set['cuisine'], test_size=0.25, random_state=0)

# Store the encoded labels under the porter names. Previously they were written
# to `y_train_tfidf` / `y_valid_tfidf`, clobbering the un-stemmed TF-IDF labels
# and leaving the porter labels unencoded.
y_train_tfidf_porter = labelencoder.fit_transform(y_train_tfidf_porter)
y_valid_tfidf_porter = labelencoder.transform(y_valid_tfidf_porter)

In [38]:
# Logistic-regression C sweep on the TF-IDF features built from Porter-stemmed text.
for C_para in C_penalty:
    LogReg = LogisticRegression(C=C_para).fit(X_train_tfidf_porter, y_train_tfidf_porter)
    print('C = ', C_para, 'Score:', LogReg.score(X_valid_tfidf_porter, y_valid_tfidf_porter))

C =  0.001 Score: 0.194288012872
C =  0.01 Score: 0.324215607401
C =  0.1 Score: 0.570293644409
C =  1 Score: 0.734111021722
C =  10 Score: 0.781777956557
C =  100 Score: 0.786102172164


### Misclassification rate   
* If one were to randomly assign a recipe to a cuisine according to the distribution of recipes in the training data, the misclassification rate would be as high as 0.9 and the baseline accuracy only about 0.1, which is far below the accuracy of the trained models.

In [6]:
# Baseline from the project's `Base_accuracy` helper, computed on the training frame.
# The misclassification rate is reported as its complement.
base_accuracy = Base_accuracy(train_set)
print('Baseline accuracy is:', base_accuracy)
print('Misclassification rate:', 1 - base_accuracy)

Baseline accuracy is: 0.10100065369336753
Misclassification rate: 0.8989993463066325
