In [1]:
import json
f = open('train.json')
data = json.load(f)

print(type(data))
print(data[0])

<class 'list'>
{'id': 10259, 'cuisine': 'greek', 'ingredients': ['romaine lettuce', 'black olives', 'grape tomatoes', 'garlic', 'pepper', 'purple onion', 'seasoning', 'garbanzo beans', 'feta cheese crumbles']}


In [2]:
# Find total number of ingredients

ingredients = []
for recipe in data:
    # Remove spaces in list entries to preserve difference between 'grape tomatoes' and 'grape' and 'tomatoes'
    ingredients_str = ' '.join([item.replace(' ','') for item in recipe['ingredients']])
    ingredients.append(ingredients_str)
    
for i in range(5):
    print(ingredients[i])

romainelettuce blackolives grapetomatoes garlic pepper purpleonion seasoning garbanzobeans fetacheesecrumbles
plainflour groundpepper salt tomatoes groundblackpepper thyme eggs greentomatoes yellowcornmeal milk vegetableoil
eggs pepper salt mayonaise cookingoil greenchilies grilledchickenbreasts garlicpowder yellowonion soysauce butter chickenlivers
water vegetableoil wheat salt
blackpepper shallots cornflour cayennepepper onions garlicpaste milk butter salt lemonjuice water chilipowder passata oil groundcumin bonelesschickenskinlessthigh garammasala doublecream naturalyogurt bayleaf


In [3]:
from sklearn.feature_extraction.text import CountVectorizer
vectorizer = CountVectorizer(dtype = 'uint8')

X = vectorizer.fit_transform(ingredients)

features = vectorizer.get_feature_names()

print(len(features))
print(features[0:50])

6782
['10oz', '14', '14oz', '15oz', '1inchthick', '21', '25', '2to3lb', '33', '40', '5oz', '7up', '8ounczitipasta', '95', 'abalone', 'abbamele', 'absinthe', 'aburaage', 'acaijuice', 'accent', 'accentseasoning', 'accompaniment', 'achiote', 'achiotepaste', 'achiotepowder', 'acinidipepe', 'ackee', 'acornsquash', 'actingbakingpowder', 'activ', 'activedryyeast', 'addedblackbeans', 'addeddicedtomatoes', 'adobo', 'adoboallpurposeseasoning', 'adobosauce', 'adoboseasoning', 'adobostyleseasoning', 'adzukibeans', 'agar', 'agaragarflakes', 'agavenectar', 'agavetequila', 'agedbalsamicvinegar', 'agedcheddarcheese', 'agedgouda', 'agedmanchegocheese', 'ahi', 'ahitunasteaks', 'aioli']


In [4]:
# Conversion to sparse matrix
vectorizer_data = X.toarray()

In [5]:
import pandas as pd

df = pd.DataFrame(data)
df

Unnamed: 0,id,cuisine,ingredients
0,10259,greek,"[romaine lettuce, black olives, grape tomatoes..."
1,25693,southern_us,"[plain flour, ground pepper, salt, tomatoes, g..."
2,20130,filipino,"[eggs, pepper, salt, mayonaise, cooking oil, g..."
3,22213,indian,"[water, vegetable oil, wheat, salt]"
4,13162,indian,"[black pepper, shallots, cornflour, cayenne pe..."
...,...,...,...
39769,29109,irish,"[light brown sugar, granulated sugar, butter, ..."
39770,11462,italian,"[KRAFT Zesty Italian Dressing, purple onion, b..."
39771,2238,irish,"[eggs, citrus fruit, raisins, sourdough starte..."
39772,41882,chinese,"[boneless chicken skinless thigh, minced garli..."


In [6]:
# Find unique classes and check for imbalance
print("Total examples: %d" % (len(df.index)))
value_counts = df['cuisine'].value_counts()
print(value_counts)

Total examples: 39774
italian         7838
mexican         6438
southern_us     4320
indian          3003
chinese         2673
french          2646
cajun_creole    1546
thai            1539
japanese        1423
greek           1175
spanish          989
korean           830
vietnamese       825
moroccan         821
british          804
filipino         755
irish            667
jamaican         526
russian          489
brazilian        467
Name: cuisine, dtype: int64


In [7]:
import numpy as np
print(vectorizer_data.shape)

# Get total counts for each ingredient across recipes
total_counts = vectorizer_data.sum(axis = 0)
print(len(total_counts))
percentile = round(sum(total_counts < 30) / len(total_counts), 4)
print(percentile)

(39774, 6782)
6782
0.7956


In [8]:
# Grab mask of where ingredient is used 30 times or more in data set
states = total_counts > 30

remaining_indices = list(np.where(states == True)[0])
deleted_indices = list(np.where(states == False)[0])

features = np.array(features)
features = features[remaining_indices]
print(features)

vectorizer_data = np.delete(vectorizer_data, deleted_indices, axis = 1)
print(vectorizer_data.shape)
vectorized_df = pd.DataFrame(data = vectorizer_data, columns = features)
vectorized_df

['activedryyeast' 'adobosauce' 'agavenectar' ... 'yoghurt'
 'yukongoldpotatoes' 'zucchini']
(39774, 1358)


Unnamed: 0,activedryyeast,adobosauce,agavenectar,alfredosauce,all,allpurposeunbleachedflour,allspice,allspiceberries,almondextract,almondflour,...,yellowbellpepper,yellowcornmeal,yellowmustard,yellowmustardseeds,yellowonion,yellowpeppers,yellowsquash,yoghurt,yukongoldpotatoes,zucchini
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,1,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,1,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
39769,0,0,0,0,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
39770,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
39771,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
39772,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [9]:
df = df.join(vectorized_df)
df.head()

Unnamed: 0,id,cuisine,ingredients,activedryyeast,adobosauce,agavenectar,alfredosauce,all,allpurposeunbleachedflour,allspice,...,yellowbellpepper,yellowcornmeal,yellowmustard,yellowmustardseeds,yellowonion,yellowpeppers,yellowsquash,yoghurt,yukongoldpotatoes,zucchini
0,10259,greek,"[romaine lettuce, black olives, grape tomatoes...",0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,25693,southern_us,"[plain flour, ground pepper, salt, tomatoes, g...",0,0,0,0,0,0,0,...,0,1,0,0,0,0,0,0,0,0
2,20130,filipino,"[eggs, pepper, salt, mayonaise, cooking oil, g...",0,0,0,0,0,0,0,...,0,0,0,0,1,0,0,0,0,0
3,22213,indian,"[water, vegetable oil, wheat, salt]",0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,13162,indian,"[black pepper, shallots, cornflour, cayenne pe...",0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [10]:
del df['id']
del df['ingredients']
df.shape

(39774, 1359)

In [11]:
y = df['cuisine']
del df['cuisine']
X = df

# from sklearn.model_selection import train_test_split
# X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=42)

In [12]:
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_val_score

log_clf = LogisticRegression(max_iter = 1000)
scores_log = cross_val_score(log_clf, X, y, cv = 5)
print("Cross Validations Score for Logistic Regression\n%s\nmean: %1.4f standard deviation: %1.4f\n" % (str(scores_log), np.mean(scores_log), np.std(scores_log)))

Cross Validations Score for Logistic Regression
[0.75587681 0.76002514 0.76115651 0.75600251 0.76602967]
mean: 0.7598 standard deviation: 0.0038

