In [1]:
import json
f = open('train.json')
data = json.load(f)

print(type(data))
print(data[0])

<class 'list'>
{'id': 10259, 'cuisine': 'greek', 'ingredients': ['romaine lettuce', 'black olives', 'grape tomatoes', 'garlic', 'pepper', 'purple onion', 'seasoning', 'garbanzo beans', 'feta cheese crumbles']}


In [2]:
# Get ingredient list
ingredients = []
for recipe in data:
    # Remove spaces in list entries to preserve difference between 'grape tomatoes' and 'grape' and 'tomatoes'
    ingredients_str = ' '.join([item.replace(' ','') for item in recipe['ingredients']])
    ingredients.append(ingredients_str)
        
for i in range(5):
    print(data[i]['cuisine'])
    print("Ingredient List: %s\n" % ingredients[i])

greek
Ingredient List: romainelettuce blackolives grapetomatoes garlic pepper purpleonion seasoning garbanzobeans fetacheesecrumbles

southern_us
Ingredient List: plainflour groundpepper salt tomatoes groundblackpepper thyme eggs greentomatoes yellowcornmeal milk vegetableoil

filipino
Ingredient List: eggs pepper salt mayonaise cookingoil greenchilies grilledchickenbreasts garlicpowder yellowonion soysauce butter chickenlivers

indian
Ingredient List: water vegetableoil wheat salt

indian
Ingredient List: blackpepper shallots cornflour cayennepepper onions garlicpaste milk butter salt lemonjuice water chilipowder passata oil groundcumin bonelesschickenskinlessthigh garammasala doublecream naturalyogurt bayleaf



In [3]:
import pandas as pd

df = pd.DataFrame(data)
df

Unnamed: 0,id,cuisine,ingredients
0,10259,greek,"[romaine lettuce, black olives, grape tomatoes..."
1,25693,southern_us,"[plain flour, ground pepper, salt, tomatoes, g..."
2,20130,filipino,"[eggs, pepper, salt, mayonaise, cooking oil, g..."
3,22213,indian,"[water, vegetable oil, wheat, salt]"
4,13162,indian,"[black pepper, shallots, cornflour, cayenne pe..."
...,...,...,...
39769,29109,irish,"[light brown sugar, granulated sugar, butter, ..."
39770,11462,italian,"[KRAFT Zesty Italian Dressing, purple onion, b..."
39771,2238,irish,"[eggs, citrus fruit, raisins, sourdough starte..."
39772,41882,chinese,"[boneless chicken skinless thigh, minced garli..."


In [4]:
# Find unique classes and check for imbalance
value_counts = df['cuisine'].value_counts()

print("Total classes: %d" % (len(value_counts)))

num_examples = len(df.index)
print("Total examples: %d" % (num_examples))

for label, content in value_counts.items():
    print("%-*s%d\t%1.4f" % (16, label, content, (content / num_examples)))
    
# 14 out of the 20 features contain less than 5% of the features each
# We'll need to account for imbalance

Total classes: 20
Total examples: 39774
italian         7838	0.1971
mexican         6438	0.1619
southern_us     4320	0.1086
indian          3003	0.0755
chinese         2673	0.0672
french          2646	0.0665
cajun_creole    1546	0.0389
thai            1539	0.0387
japanese        1423	0.0358
greek           1175	0.0295
spanish         989	0.0249
korean          830	0.0209
vietnamese      825	0.0207
moroccan        821	0.0206
british         804	0.0202
filipino        755	0.0190
irish           667	0.0168
jamaican        526	0.0132
russian         489	0.0123
brazilian       467	0.0117


In [5]:
from sklearn.feature_extraction.text import CountVectorizer
vectorizer = CountVectorizer(dtype = 'uint8')

X = vectorizer.fit_transform(ingredients)

features = vectorizer.get_feature_names()

print(len(features))
print(features[0:50])

# Conversion to sparse matrix
vectorizer_data = X.toarray()

6782
['10oz', '14', '14oz', '15oz', '1inchthick', '21', '25', '2to3lb', '33', '40', '5oz', '7up', '8ounczitipasta', '95', 'abalone', 'abbamele', 'absinthe', 'aburaage', 'acaijuice', 'accent', 'accentseasoning', 'accompaniment', 'achiote', 'achiotepaste', 'achiotepowder', 'acinidipepe', 'ackee', 'acornsquash', 'actingbakingpowder', 'activ', 'activedryyeast', 'addedblackbeans', 'addeddicedtomatoes', 'adobo', 'adoboallpurposeseasoning', 'adobosauce', 'adoboseasoning', 'adobostyleseasoning', 'adzukibeans', 'agar', 'agaragarflakes', 'agavenectar', 'agavetequila', 'agedbalsamicvinegar', 'agedcheddarcheese', 'agedgouda', 'agedmanchegocheese', 'ahi', 'ahitunasteaks', 'aioli']


In [6]:
import numpy as np

# Get total counts for each ingredient across recipes
feature_counts = vectorizer_data.sum(axis = 0)
print("Feature count: %d" % (len(feature_counts)))

# Percentile of ingredients that appear in 50 or more recipes
percentile = round(sum(feature_counts < 50) / len(feature_counts), 4)
print("Percentile: %1.4f" % (percentile))

Feature count: 6782
Percentile: 0.8505


In [7]:
# Grab mask of where ingredient is used 50 times or more in data set
# Choose 50 because in the best case the ingredient shows up in ~10% of a cuisine's recipes
states = feature_counts > 50

remaining_indices = list(np.where(states == True)[0])
deleted_indices = list(np.where(states == False)[0])

features = np.array(features)
features = features[remaining_indices]

vectorizer_data = np.delete(vectorizer_data, deleted_indices, axis = 1)
print(vectorizer_data.shape)
vectorized_df = pd.DataFrame(data = vectorizer_data, columns = features)

(39774, 996)


In [8]:
df = df.join(vectorized_df)
df.head()

Unnamed: 0,id,cuisine,ingredients,activedryyeast,adobosauce,agavenectar,alfredosauce,all,allpurposeunbleachedflour,allspice,...,worcestershiresauce,yeast,yellowbellpepper,yellowcornmeal,yellowonion,yellowpeppers,yellowsquash,yoghurt,yukongoldpotatoes,zucchini
0,10259,greek,"[romaine lettuce, black olives, grape tomatoes...",0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,25693,southern_us,"[plain flour, ground pepper, salt, tomatoes, g...",0,0,0,0,0,0,0,...,0,0,0,1,0,0,0,0,0,0
2,20130,filipino,"[eggs, pepper, salt, mayonaise, cooking oil, g...",0,0,0,0,0,0,0,...,0,0,0,0,1,0,0,0,0,0
3,22213,indian,"[water, vegetable oil, wheat, salt]",0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,13162,indian,"[black pepper, shallots, cornflour, cayenne pe...",0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [9]:
del df['id']
del df['ingredients']
df.shape

(39774, 997)

In [10]:
y = df['cuisine']
del df['cuisine']
X = df

In [11]:
# Logistic Regression Implementation

from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_val_score

log_clf = LogisticRegression(max_iter = 1000)

In [12]:
# Cross validation
# Commented out because it takes a while to run

# scores_log = cross_val_score(log_clf, X, y, cv = 5)
# print("Cross Validations Score for Logistic Regression\n%s\nmean: %1.4f standard deviation: %1.4f\n" % (str(scores_log), np.mean(scores_log), np.std(scores_log)))

# Overall accuracy in cross validation is 75%
# This seems like a good start, but the data is imbalanced so this number can be misleading
# We will need to review precision and recall statistics

In [13]:
# Train Test Split
from sklearn.model_selection import train_test_split

# setting up testing and training sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=27)

log_fit = log_clf.fit(X_train, y_train)
log_pred = log_fit.predict(X_test)

In [14]:
# The model performs well on Mexican and Indian food
# Performance is poor on cuisine with fewer examples
# Performance on Italian and southern US food can be improved though they are among the most represented in the data set

from sklearn.metrics import classification_report
print(classification_report(y_test, log_pred, zero_division=0))

              precision    recall  f1-score   support

   brazilian       0.58      0.38      0.46       115
     british       0.35      0.26      0.29       180
cajun_creole       0.76      0.66      0.71       397
     chinese       0.79      0.81      0.80       655
    filipino       0.67      0.54      0.60       199
      french       0.58      0.56      0.57       684
       greek       0.79      0.66      0.72       297
      indian       0.87      0.88      0.87       774
       irish       0.53      0.33      0.41       174
     italian       0.75      0.87      0.81      1903
    jamaican       0.80      0.55      0.66       141
    japanese       0.77      0.65      0.70       351
      korean       0.81      0.80      0.81       203
     mexican       0.88      0.91      0.89      1644
    moroccan       0.77      0.71      0.74       217
     russian       0.59      0.46      0.52       119
 southern_us       0.65      0.77      0.71      1038
     spanish       0.61    

In [15]:
# Similar trend to logistic regression
# Slightly worse overall
from sklearn.tree import DecisionTreeClassifier

dt_clf = DecisionTreeClassifier()
dt_fit = dt_clf.fit(X_train, y_train)
dt_pred = dt_clf.predict(X_test)
print(classification_report(y_test, dt_pred, zero_division=0))

              precision    recall  f1-score   support

   brazilian       0.42      0.42      0.42       115
     british       0.21      0.22      0.21       180
cajun_creole       0.57      0.50      0.54       397
     chinese       0.60      0.65      0.63       655
    filipino       0.42      0.38      0.40       199
      french       0.39      0.40      0.40       684
       greek       0.45      0.46      0.46       297
      indian       0.72      0.72      0.72       774
       irish       0.26      0.21      0.23       174
     italian       0.68      0.71      0.69      1903
    jamaican       0.42      0.32      0.36       141
    japanese       0.52      0.52      0.52       351
      korean       0.49      0.49      0.49       203
     mexican       0.75      0.78      0.77      1644
    moroccan       0.49      0.44      0.46       217
     russian       0.33      0.29      0.31       119
 southern_us       0.53      0.55      0.54      1038
     spanish       0.30    

In [16]:
# Similar trends and performance to logistic regression
# Reduced n_estimators because precision was high but recall was low, implying overfit
from sklearn.ensemble import RandomForestClassifier

rf_clf = RandomForestClassifier(n_estimators = 50)
rf_fit = rf_clf.fit(X_train, y_train)
rf_pred = rf_clf.predict(X_test)
print(classification_report(y_test, rf_pred, zero_division=0))

              precision    recall  f1-score   support

   brazilian       0.67      0.35      0.46       115
     british       0.41      0.16      0.23       180
cajun_creole       0.77      0.59      0.67       397
     chinese       0.69      0.82      0.75       655
    filipino       0.60      0.44      0.51       199
      french       0.50      0.47      0.49       684
       greek       0.71      0.54      0.61       297
      indian       0.78      0.87      0.82       774
       irish       0.42      0.20      0.27       174
     italian       0.69      0.86      0.76      1903
    jamaican       0.85      0.33      0.48       141
    japanese       0.71      0.59      0.64       351
      korean       0.68      0.62      0.65       203
     mexican       0.79      0.89      0.84      1644
    moroccan       0.70      0.53      0.60       217
     russian       0.65      0.20      0.31       119
 southern_us       0.58      0.70      0.63      1038
     spanish       0.59    

In [17]:
# Potential Overfit
# Model could be looking at a highly specific set of ingredients
# Solutions: Reduce feature count, find alternative methods of feature engineering
# Next step: TF-IDF

from sklearn.feature_extraction.text import TfidfVectorizer

vectorizer = TfidfVectorizer()
X = vectorizer.fit_transform(ingredients)

features = vectorizer.get_feature_names()

print(len(features))
print(features[0:50])

# Conversion to sparse matrix
vectorizer_data = X.toarray()

6782
['10oz', '14', '14oz', '15oz', '1inchthick', '21', '25', '2to3lb', '33', '40', '5oz', '7up', '8ounczitipasta', '95', 'abalone', 'abbamele', 'absinthe', 'aburaage', 'acaijuice', 'accent', 'accentseasoning', 'accompaniment', 'achiote', 'achiotepaste', 'achiotepowder', 'acinidipepe', 'ackee', 'acornsquash', 'actingbakingpowder', 'activ', 'activedryyeast', 'addedblackbeans', 'addeddicedtomatoes', 'adobo', 'adoboallpurposeseasoning', 'adobosauce', 'adoboseasoning', 'adobostyleseasoning', 'adzukibeans', 'agar', 'agaragarflakes', 'agavenectar', 'agavetequila', 'agedbalsamicvinegar', 'agedcheddarcheese', 'agedgouda', 'agedmanchegocheese', 'ahi', 'ahitunasteaks', 'aioli']


In [18]:
features = np.array(features)
X = pd.DataFrame(data = vectorizer_data, columns = features)

In [19]:
# setting up testing and training sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=27)

log_clf = LogisticRegression(max_iter = 1000)

log_fit = log_clf.fit(X_train, y_train)
log_pred = log_fit.predict(X_test)

In [20]:
# Overall, slightly better scores than count vectorization
# Precision is decent but recall is bad, implying overfit
# We can try dropping features with low TFIDF, this can help drop ingredients that are common across all cuisines

print(classification_report(y_test, log_pred, zero_division=0))

              precision    recall  f1-score   support

   brazilian       0.78      0.35      0.48       115
     british       0.47      0.24      0.32       180
cajun_creole       0.81      0.66      0.73       397
     chinese       0.77      0.86      0.81       655
    filipino       0.75      0.52      0.62       199
      french       0.60      0.60      0.60       684
       greek       0.80      0.61      0.69       297
      indian       0.87      0.92      0.89       774
       irish       0.78      0.34      0.48       174
     italian       0.74      0.91      0.81      1903
    jamaican       0.92      0.57      0.70       141
    japanese       0.87      0.66      0.75       351
      korean       0.86      0.72      0.78       203
     mexican       0.88      0.92      0.90      1644
    moroccan       0.89      0.73      0.80       217
     russian       0.73      0.36      0.48       119
 southern_us       0.64      0.82      0.72      1038
     spanish       0.72    