In [1]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.model_selection import cross_val_score

from numpy import array
from sklearn.linear_model import LogisticRegression

In [4]:
# Load the training recipes. Each row holds a list of ingredient strings in
# its 'ingredients' column.
cuisine_train_dataframe = pd.read_json('train.json')

feature_cols = ['ingredients']

X = cuisine_train_dataframe[feature_cols]

# One-hot encode the ingredient lists: one column per distinct ingredient,
# one row per recipe, with a 1 wherever the recipe uses that ingredient.

# Collect every distinct ingredient that appears in any recipe.
keys = set()
for ing_list in X['ingredients']:
    keys.update(ing_list)

# Sort the vocabulary so the column order is the same on every run; the
# iteration order of a set of strings can change between kernel restarts
# because string hashing is randomized per process.
key_list = sorted(keys)

# Ingredient -> column lookup built once, replacing a linear
# key_list.index() scan for every ingredient of every recipe.
key_index = {ingredient: column for column, ingredient in enumerate(key_list)}

# Dense (n_recipes, n_unique_ingredients) matrix of zeros (float64).
encoded = np.zeros((len(X), len(key_list)))

for row, ing_list in enumerate(X['ingredients']):
    for ing in ing_list:
        encoded[row, key_index[ing]] = 1

# From here on X is the encoded feature matrix. It is bound directly to
# `encoded` rather than copied, so the large dense array is held only once.
X = encoded

print("the first json object has total ingredients: ")
np.count_nonzero(X[0] == 1)
# 9 columns are set to 1 


the first json object has total ingredients: 


9

# Split into 6% testing and 94% training; the higher the testing percentage, the lower the accuracy.

In [5]:
# Target labels: the cuisine of each recipe.
y = cuisine_train_dataframe['cuisine']

# Hold out 6% of the recipes for testing; the fixed random_state keeps the
# split identical from run to run.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.06,
    random_state=1,
)

# fit() returns the estimator itself, so construction and training can be
# chained into a single statement.
logReg = LogisticRegression().fit(X_train, y_train)

y_predict = logReg.predict(X_test)

# Predicted labels first, then the true labels, each on its own line.
print(y_predict, y_test, sep="\n")

# Fraction of held-out recipes whose cuisine was predicted correctly.
accuracy = accuracy_score(y_test, y_predict)
print("\nAccuracy Logistic Regression: ", accuracy)

# 10-fold cross-validation over the full data set with a fresh, unfitted model.
print("\nAccuracy with cross validation: ")
my_logreg = LogisticRegression()
accuracy_list = cross_val_score(
    my_logreg,
    X,
    y,
    cv=10,
    scoring='accuracy',
)

# The mean of the per-fold accuracies is reported as the final figure.
accuracy_cv = accuracy_list.mean()
print(accuracy_cv)






['mexican' 'indian' 'indian' ..., 'thai' 'indian' 'british']
25474         mexican
19761          indian
32740          indian
34148          french
5218      southern_us
17371         mexican
11160          french
2445          mexican
27584         spanish
4545          mexican
18623        japanese
17605         mexican
34016         british
33127         chinese
23296         chinese
5254            greek
18629        filipino
20863     southern_us
6095           french
25944      vietnamese
11350     southern_us
17796         italian
4274           indian
22648         italian
10586         mexican
11518         italian
28809         british
20713         chinese
7550          italian
14037         chinese
             ...     
22463        japanese
21739        moroccan
14737     southern_us
33252         italian
17184         chinese
33385         mexican
22021         italian
26956         italian
19005         italian
28907         italian
13777         mexican
25863         m

# prediction probabilities

In [9]:
# Show the probabilities the logistic-regression model (logReg, fitted
# earlier in the notebook) assigns to the held-out recipes in X_test.
print("Probabilities of predictions")
# Presumably one row per test recipe and one column per cuisine class, in
# the order of logReg.classes_ — TODO confirm.
y_predict_proba = logReg.predict_proba(X_test)
print(y_predict_proba)

Probabilities of predictions
[[  1.14431737e-03   4.65203069e-03   1.76458699e-03 ...,   8.29683823e-03
    2.28223968e-03   1.51682870e-02]
 [  4.86825298e-03   5.82090315e-04   1.18116979e-04 ...,   1.21459255e-03
    1.75934812e-03   1.11607164e-03]
 [  3.78464448e-02   4.24064323e-02   1.06920420e-02 ...,   5.68477649e-03
    1.08677610e-02   9.85815869e-03]
 ..., 
 [  5.76458549e-03   2.78901286e-03   3.57654118e-03 ...,   2.31651597e-03
    7.41208456e-01   3.81310815e-02]
 [  2.27702388e-03   6.78886036e-03   2.85089419e-03 ...,   4.20931065e-03
    2.72207628e-03   3.01935004e-03]
 [  3.65817596e-03   3.29216960e-01   7.27538543e-03 ...,   4.63288718e-02
    3.88813384e-03   2.19082741e-03]]


# confusion matrix

In [10]:
# NOTE(review): this import sits mid-notebook, while the other sklearn
# imports are in the first cell.
from sklearn import metrics

# NOTE(review): this heading is printed before the classification report,
# not the matrix; the confusion matrix itself is the cell's final expression.
print("confusion matrix")

# Per-class precision / recall / f1-score / support for the hold-out
# predictions (y_predict) against the true labels (y_test).
print(metrics.classification_report(y_test, y_predict))

# Left as the last expression so the notebook displays the resulting array.
# Per scikit-learn's documented convention, rows are true labels and columns
# are predicted labels.
metrics.confusion_matrix(y_test, y_predict)

confusion matrix
              precision    recall  f1-score   support

   brazilian       0.91      0.48      0.62        21
     british       0.58      0.43      0.49        44
cajun_creole       0.80      0.79      0.79        75
     chinese       0.84      0.85      0.84       142
    filipino       0.70      0.68      0.69        31
      french       0.66      0.56      0.61       147
       greek       0.77      0.68      0.72        59
      indian       0.84      0.94      0.89       162
       irish       0.86      0.60      0.70        42
     italian       0.76      0.92      0.83       399
    jamaican       0.91      0.74      0.82        27
    japanese       0.87      0.67      0.76        70
      korean       0.95      0.67      0.78        54
     mexican       0.91      0.93      0.92       349
    moroccan       0.91      0.78      0.84        41
     russian       0.67      0.40      0.50        20
 southern_us       0.72      0.83      0.77       223
     spani

array([[ 10,   0,   0,   0,   1,   0,   0,   1,   0,   4,   0,   0,   0,
          4,   0,   0,   1,   0,   0,   0],
       [  0,  19,   0,   0,   0,   9,   1,   2,   2,   2,   1,   0,   0,
          1,   0,   1,   6,   0,   0,   0],
       [  0,   0,  59,   1,   0,   1,   0,   1,   0,   3,   0,   0,   0,
          1,   0,   0,   9,   0,   0,   0],
       [  0,   0,   0, 120,   0,   0,   0,   2,   0,   3,   0,   3,   2,
          5,   0,   0,   2,   0,   5,   0],
       [  0,   0,   1,   2,  21,   0,   0,   0,   0,   0,   0,   0,   0,
          1,   0,   0,   3,   1,   0,   2],
       [  0,   3,   2,   1,   0,  83,   2,   0,   0,  36,   1,   0,   0,
          0,   0,   0,  18,   1,   0,   0],
       [  0,   0,   0,   0,   0,   3,  40,   0,   0,  15,   0,   0,   0,
          0,   0,   0,   1,   0,   0,   0],
       [  0,   1,   0,   0,   2,   1,   0, 152,   0,   0,   0,   0,   0,
          4,   1,   0,   1,   0,   0,   0],
       [  0,   5,   0,   0,   0,   4,   0,   1,  25,   6,   0,  