# Predicting Cuisines Using Ingredient List

Use recipe data from the What's Cooking? data set [https://www.kaggle.com/c/whats-cooking/overview] from Yummly and Kaggle to predict the cuisine of a recipe.

## Hypothesis

There exist staple ingredients that can predict the cuisine of a recipe.

Examples: Rice, butter, soy sauce, flour, feta cheese.

We don't want to overfit our model by predicting on rare ingredients, so we might want to look into the most common ingredients in the data.

In [9]:
import json

# Load the labelled recipes. The context manager closes the file handle as
# soon as parsing is done instead of leaking it for the kernel's lifetime,
# and the explicit encoding avoids platform-dependent decoding of the JSON.
with open('train.json', encoding='utf-8') as f:
    data = json.load(f)

# TODO: Preprocess test data separately
# test.json is deliberately not opened here: the handle used to be opened
# without ever being read or closed. When the test set is needed:
#     with open('test.json', encoding='utf-8') as f:
#         test_data = json.load(f)

print(type(data))
print(data[0])

<class 'list'>
{'id': 10259, 'cuisine': 'greek', 'ingredients': ['romaine lettuce', 'black olives', 'grape tomatoes', 'garlic', 'pepper', 'purple onion', 'seasoning', 'garbanzo beans', 'feta cheese crumbles']}


In [10]:
import pandas as pd

# One row per recipe; the columns come from the keys of each JSON record
df = pd.DataFrame(data=data)
df

Unnamed: 0,id,cuisine,ingredients
0,10259,greek,"[romaine lettuce, black olives, grape tomatoes..."
1,25693,southern_us,"[plain flour, ground pepper, salt, tomatoes, g..."
2,20130,filipino,"[eggs, pepper, salt, mayonaise, cooking oil, g..."
3,22213,indian,"[water, vegetable oil, wheat, salt]"
4,13162,indian,"[black pepper, shallots, cornflour, cayenne pe..."
...,...,...,...
39769,29109,irish,"[light brown sugar, granulated sugar, butter, ..."
39770,11462,italian,"[KRAFT Zesty Italian Dressing, purple onion, b..."
39771,2238,irish,"[eggs, citrus fruit, raisins, sourdough starte..."
39772,41882,chinese,"[boneless chicken skinless thigh, minced garli..."


In [None]:
# Class balance check: how many recipes does each cuisine have?
value_counts = df['cuisine'].value_counts()
num_examples = len(df.index)

print("Total classes: %d" % (len(value_counts)))
print("Total examples: %d" % (num_examples))

# Parallel lists (cuisine name, recipe count) reused by the bar graph
labels = value_counts.index.tolist()
counts = value_counts.tolist()

# One line per cuisine: name, absolute count, share of all recipes
for cuisine_name, recipe_count in zip(labels, counts):
    print("%-*s%d\t%1.4f" % (16, cuisine_name, recipe_count, (recipe_count / num_examples)))

# Author's observation: 14 of the 20 classes (cuisines) each hold less than 5% of the examples
# We'll need to account for imbalance

In [None]:
# Bar graph of recipe counts per cuisine

import matplotlib.pyplot as plt

fig = plt.figure(figsize=(30, 8))
# Single axes spanning the whole figure; draw on it explicitly
ax = fig.add_axes([0, 0, 1, 1])
ax.bar(labels, counts, width=.6)

# Large fonts because the figure itself is very wide
ax.set_title('Recipe Counts', fontsize=48)
ax.set_xlabel('Cuisines', fontsize=32)
ax.set_ylabel('Counts', fontsize=32)
ax.tick_params(axis="x", labelsize=18)
ax.tick_params(axis="y", labelsize=24)

plt.show()

In [None]:
# Natural language processing techniques
# Porter stemming turns out to be of limited use for reducing ingredients to a root:
# e.g. it can't reduce Kraft cheese to cheese

from nltk import PorterStemmer

ps = PorterStemmer()

# Sample ingredient spellings to eyeball the stemmer's behaviour
words = [
    "cheese", "cheeses", "cheesy", "Kraft Cheese",
    "boneless", "bone-in", "cream", "low-fat cream",
]

for sample in words:
    stemmed = ps.stem(sample)
    print(sample, " : ", stemmed)

In [None]:
# Stop words

# Measurement units and symbols that should not count as ingredients
measurement_terms = (
    "gram", " g", "kilogram", "kg", "milliliter", "ml",
    "ounce", "oz", "pound", "lb", "cup", "tablespoon", "tbsp", "teaspoon", "tsp",
    "inch", "-", "%", "&",
)
stop_words = set(measurement_terms)

# Numerical values 0 through 1000, stored as strings
stop_words.update(str(number) for number in range(1001))

In [None]:
# Get ingredient list
import re

# Stop words are matched against whole tokens. The previous substring test on
# the raw (non-lowercased) text discarded an entire ingredient whenever a stop
# word occurred anywhere inside it: "oz" in "mozzarella cheese", "-" in
# "all-purpose flour", " g" in "minced garlic", "lb" in "elbow macaroni".
# Entries are stripped so that " g" is treated as the unit "g".
stop_tokens = {stop_word.strip().lower() for stop_word in stop_words}

# Runs of letters/digits (Unicode-aware); punctuation such as "-", "%", "&"
# only separates tokens and therefore never reaches the output.
token_pattern = re.compile(r"[^\W_]+")


def translate_ingredient(item):
    """Collapse one raw ingredient string into a single vocabulary token.

    The text is lower-cased and split into alphanumeric tokens; tokens that
    are stop words (measurement units, numbers) are removed and the rest are
    concatenated without spaces, so 'grape tomatoes' stays distinct from
    'grape' and 'tomatoes'.

    Returns an empty string when nothing but stop words/symbols remains.
    """
    tokens = token_pattern.findall(item.lower())
    return ''.join(token for token in tokens if token not in stop_tokens)


ingredients_corpus = []
translated_ingredients_list = []
for recipe in data:
    translated = (translate_ingredient(item) for item in recipe['ingredients'])
    # Skip entries that consisted solely of measurements or symbols
    ingredients_list = [ingredient for ingredient in translated if ingredient]
    translated_ingredients_list.append(ingredients_list)
    # One space-separated "document" per recipe for the vectorizers
    ingredients_corpus.append(' '.join(ingredients_list))

df['translated_ingredients'] = translated_ingredients_list

df.head()

In [None]:
# Most common ingredients per cuisine

from collections import Counter

# cuisine -> Counter of translated ingredient occurrences
counter = {}

for cuisine in df['cuisine'].unique():
    counter[cuisine] = Counter()
    # Find rows belonging to the cuisine
    indices = (df['cuisine'] == cuisine)

    # For each row belonging to the cuisine, update counter with list of translated ingredients
    for ingredients in df.loc[indices, 'translated_ingredients']:
        counter[cuisine].update(ingredients)

# One record per cuisine with columns "Top 1" .. "Top 50"
topData = []
# Union of every cuisine's top ingredients, used later for feature selection
topSet = set()

for cuisine, ingredient_counts in counter.items():
    cuisineTop = {
        'cuisine':cuisine
    }

    # most_common() sorts the counter, so call it once per cuisine rather than
    # once per rank. Iterating its result also copes with a cuisine that has
    # fewer than 50 distinct ingredients (the missing ranks become NaN in
    # topDf) instead of raising IndexError.
    for rank, (topIngredient, occurrences) in enumerate(ingredient_counts.most_common(50), start=1):
        headerVal = "Top " + str(rank)
        cuisineTop[headerVal] = topIngredient
        topSet.add(topIngredient)

    topData.append(cuisineTop)

topDf = pd.DataFrame(topData)
topDf

In [None]:
# Count words in each recipe

from sklearn.feature_extraction.text import CountVectorizer

# uint8 keeps the count matrix compact
# NOTE(review): assumes no token occurs more than 255 times in one recipe
vectorizer = CountVectorizer(dtype = 'uint8')

X = vectorizer.fit_transform(ingredients_corpus)

# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2;
# fall back to it only on releases that predate get_feature_names_out().
if hasattr(vectorizer, 'get_feature_names_out'):
    features = list(vectorizer.get_feature_names_out())
else:
    features = vectorizer.get_feature_names()

print(len(features))
print(features[0:50])

print(X.shape)

# fit_transform returns a sparse matrix; expand it to a dense ndarray
vectorizer_data = X.toarray()

In [None]:
import numpy as np

# Total occurrences of each vocabulary term, summed over all recipes
feature_counts = vectorizer_data.sum(axis=0)
print("Feature count: %d" % (len(feature_counts)))

# Fraction of terms that occur fewer than 50 times in total
rare_mask = feature_counts < 50
percentile = round(rare_mask.sum() / len(feature_counts), 4)
print("Percentile: %1.4f" % (percentile))

In [None]:
# Grab mask of where ingredient is used 50 times or more in data set
# Choose 50 because in the best case the ingredient shows up in ~10% of a cuisine's recipes
# ">=" (not ">") so the mask matches "50 times or more" and is the exact
# complement of the "< 50" test used when reporting the rare-term fraction.
states = feature_counts >= 50

# Column positions that are kept / dropped
remaining_indices = list(np.flatnonzero(states))
deleted_indices = list(np.flatnonzero(~states))

features = np.array(features)
features_min50 = features[states]

# Boolean-mask column selection; same result as deleting the rare columns
vectorizer_data_min50 = vectorizer_data[:, states]

print(vectorizer_data_min50.shape)
print(vectorizer_data.shape)

# Labelled frames: the frequent-term subset and the full vocabulary
vectorized_df_min50 = pd.DataFrame(data = vectorizer_data_min50, columns = features_min50)
vectorized_df = pd.DataFrame(data = vectorizer_data, columns = features)

In [None]:
# Attach the frequent-term count columns to the recipe frame (index-aligned left join)
df = df.join(other=vectorized_df_min50, how='left')
df.head()

In [None]:
# Drop the columns that are neither the label nor a count feature
df = df.drop(columns=['id', 'ingredients', 'translated_ingredients'])
df.shape

In [None]:
# Split off the label column; what remains in df is the feature matrix
y = df.pop('cuisine')
X = df

In [None]:
# Logistic Regression Implementation
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_val_score

# Iteration cap raised to 1000 for the solver
log_clf = LogisticRegression(max_iter=1000)

In [None]:
# Train Test Split
from sklearn.model_selection import train_test_split

# Hold out 25% of the recipes for testing; the fixed seed keeps the split repeatable
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.25, random_state=27
)

# Fit on the training portion, then predict cuisines for the held-out recipes
log_fit = log_clf.fit(X_train, y_train)
log_pred = log_fit.predict(X_test)

In [None]:
# Observations on the logistic regression report:
# - The model performs well on Mexican and Indian food
# - Performance is poor on cuisines with fewer examples
# - Italian and southern US food could do better, even though they are among the most represented in the data set

from sklearn.metrics import classification_report

# zero_division=0 reports 0 for metrics that would otherwise divide by zero
print(classification_report(y_true=y_test, y_pred=log_pred, zero_division=0))

In [None]:
# Similar trend to logistic regression
# Worse overall
from sklearn.tree import DecisionTreeClassifier

# Tree construction is randomised (feature order and tie-breaking between
# equally good splits), so pin the seed to make the report below reproducible.
dt_clf = DecisionTreeClassifier(random_state=27)
dt_fit = dt_clf.fit(X_train, y_train)
dt_pred = dt_clf.predict(X_test)
print(classification_report(y_test, dt_pred, zero_division=0))

In [None]:
# Similar trends and performance to logistic regression
# Reduced n_estimators because precision was high but recall was low, implying overfit
from sklearn.ensemble import RandomForestClassifier

# Bootstrap sampling and per-split feature sampling are random; pin the seed
# so the scores discussed above can be reproduced on a re-run.
rf_clf = RandomForestClassifier(n_estimators = 50, random_state = 27)
rf_fit = rf_clf.fit(X_train, y_train)
rf_pred = rf_clf.predict(X_test)
print(classification_report(y_test, rf_pred, zero_division=0))

In [None]:
# Adding support vector machine for high dimensional problem
# LinearSVC due to large data set

from sklearn.svm import LinearSVC

# The dual coordinate-descent solver shuffles the data with a pseudo random
# generator; pin the seed so repeated runs give the same report.
svc_clf = LinearSVC(random_state = 27)
svc_fit = svc_clf.fit(X_train, y_train)
svc_pred = svc_clf.predict(X_test)
print(classification_report(y_test, svc_pred, zero_division=0))

In [None]:
# Potential Overfit
# Model could be looking at a highly specific set of ingredients
# Solutions: Reduce feature count, find alternative methods of feature engineering
# Next step: TF-IDF

from sklearn.feature_extraction.text import TfidfVectorizer

vectorizer = TfidfVectorizer()
X = vectorizer.fit_transform(ingredients_corpus)

# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2;
# fall back to it only on releases that predate get_feature_names_out().
if hasattr(vectorizer, 'get_feature_names_out'):
    features = list(vectorizer.get_feature_names_out())
else:
    features = vectorizer.get_feature_names()

# fit_transform returns a sparse matrix; expand it to a dense ndarray
vectorizer_data = X.toarray()

In [None]:
# Wrap the dense TF-IDF matrix in a DataFrame whose columns are the vocabulary terms
features = np.array(features)
X = pd.DataFrame(vectorizer_data, columns=features)

In [None]:
# Same 75/25 split and seed as before, now on the TF-IDF features
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.25, random_state=27
)

# Fresh logistic regression fitted on the TF-IDF training portion
log_clf = LogisticRegression(max_iter=1000)
log_fit = log_clf.fit(X_train, y_train)

log_pred = log_fit.predict(X_test)

In [None]:
# Observations on the TF-IDF logistic regression report:
# - Overall, slightly better scores than count vectorization
# - Precision is decent but recall is bad, potential overfit?
# - We can try dropping features with low TFIDF, this can help drop ingredients that are common across all cuisines

print(classification_report(y_true=y_test, y_pred=log_pred, zero_division=0))

In [None]:
# TF-IDF restricted to 1000 features
# (max_features keeps the terms with the highest term frequency across the
# corpus, not the terms with the highest TF-IDF weight)
vectorizer = TfidfVectorizer(max_features = 1000)
X = vectorizer.fit_transform(ingredients_corpus)

# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2;
# fall back to it only on releases that predate get_feature_names_out().
if hasattr(vectorizer, 'get_feature_names_out'):
    features = list(vectorizer.get_feature_names_out())
else:
    features = vectorizer.get_feature_names()

# fit_transform returns a sparse matrix; expand it to a dense ndarray
vectorizer_data = X.toarray()

features = np.array(features)
X = pd.DataFrame(data = vectorizer_data, columns = features)

In [None]:
# 75/25 split of the reduced TF-IDF features, kept under *_tfidf names for later reuse
X_train_tfidf, X_test_tfidf, y_train_tfidf, y_test_tfidf = train_test_split(
    X, y, test_size=0.25, random_state=27
)

In [None]:
# Logistic regression on the reduced TF-IDF split
log_clf = LogisticRegression(max_iter=1000)
log_fit = log_clf.fit(X_train_tfidf, y_train_tfidf)

log_pred = log_fit.predict(X_test_tfidf)

# Slightly lower scores, we might be throwing away useful data
print(classification_report(y_true=y_test_tfidf, y_pred=log_pred, zero_division=0))

In [None]:
# Adding support vector machine for high dimensional problem
# LinearSVC due to large data set

# The dual coordinate-descent solver shuffles the data with a pseudo random
# generator; pin the seed so repeated runs give the same report.
svc_clf = LinearSVC(random_state = 27)
svc_fit = svc_clf.fit(X_train_tfidf, y_train_tfidf)
svc_pred = svc_clf.predict(X_test_tfidf)
print(classification_report(y_test_tfidf, svc_pred, zero_division=0))

In [None]:
# Train on 50 most common features per cuisine
# sorted() fixes the column order: iterating a set of strings can differ
# between kernel sessions, which would reorder the feature matrix.
# Ingredients without a matching vectorizer column are skipped, because the
# vectorizer's own tokenisation can split or drop an ingredient string and
# plain indexing would then raise KeyError.
available_columns = set(vectorized_df.columns)
topList = [ingredient for ingredient in sorted(topSet) if ingredient in available_columns]

print(vectorized_df.shape)
vectorized_df = vectorized_df[topList]

print(vectorized_df.shape)

In [None]:
# Same 75/25 split and seed, now on the top-ingredient count columns
X_train, X_test, y_train, y_test = train_test_split(
    vectorized_df, y, test_size=0.25, random_state=27
)

# Fresh logistic regression on this feature set
log_clf = LogisticRegression(max_iter=1000)
log_fit = log_clf.fit(X_train, y_train)

log_pred = log_fit.predict(X_test)

print(classification_report(y_true=y_test, y_pred=log_pred, zero_division=0))

In [None]:
# Adding support vector machine for high dimensional problem
# LinearSVC due to large data set

# The dual coordinate-descent solver shuffles the data with a pseudo random
# generator; pin the seed so repeated runs give the same report.
svc_clf = LinearSVC(random_state = 27)
svc_fit = svc_clf.fit(X_train, y_train)
svc_pred = svc_clf.predict(X_test)
print(classification_report(y_test, svc_pred, zero_division=0))

In [None]:
# Performance is decent for some cuisines, but poor for others
# Implement XGBoost to better classify cuisines that the model is struggling with
# Compare between TF IDF and most common features

# XGBoost with most common features
# Similar performance to non-boosted models
import xgboost as xgb
from sklearn.preprocessing import LabelEncoder

# Recent XGBoost releases require integer class ids 0..n_classes-1 and reject
# string labels, so encode the cuisine names for training.
xgb_label_encoder = LabelEncoder()
y_train_encoded = xgb_label_encoder.fit_transform(y_train)

# eval_metric is configured on the estimator: passing it to fit() was
# deprecated in XGBoost 1.6 and removed in later releases.
xgb_model = xgb.XGBClassifier(n_estimators = 100, max_depth = 3, eval_metric = 'mlogloss')
xgb_model.fit(X_train, y_train_encoded)

# Map predicted class ids back to cuisine names so the report matches y_test
xgb_pred = xgb_label_encoder.inverse_transform(xgb_model.predict(X_test))

print(classification_report(y_test, xgb_pred, zero_division = 0))

In [None]:
# XGBoost on TFIDF
# Call fit again, overwrite model weights, just like sklearn API [https://xgboost.readthedocs.io/en/latest/python/python_api.html]
# Similar performance
from sklearn.preprocessing import LabelEncoder

# Recent XGBoost releases require integer class ids 0..n_classes-1 and reject
# string labels, so encode the cuisine names for training.
xgb_tfidf_label_encoder = LabelEncoder()
y_train_tfidf_encoded = xgb_tfidf_label_encoder.fit_transform(y_train_tfidf)

# eval_metric is set on the estimator: passing it to fit() was deprecated in
# XGBoost 1.6 and removed in later releases.
xgb_model.set_params(eval_metric = 'mlogloss')
xgb_model.fit(X_train_tfidf, y_train_tfidf_encoded)

# Map predicted class ids back to cuisine names so the report matches y_test_tfidf
xgb_pred = xgb_tfidf_label_encoder.inverse_transform(xgb_model.predict(X_test_tfidf))

print(classification_report(y_test_tfidf, xgb_pred, zero_division = 0))