# What's Cooking

**In this project I predict which cuisine a dish belongs to from its list of ingredients.**

# Data Setup

In [2]:
import re

import pandas as pd

In [5]:
# Load the training recipes. The context manager closes the file handle as
# soon as pandas has finished parsing it, instead of leaving it open for the
# lifetime of the kernel.
with open("train.json", "r", encoding="utf8") as train_file:
    df = pd.read_json(train_file)

In [6]:
# Show the loaded frame; its columns are id, cuisine and ingredients.
df

Unnamed: 0,id,cuisine,ingredients
0,10259,greek,"[romaine lettuce, black olives, grape tomatoes..."
1,25693,southern_us,"[plain flour, ground pepper, salt, tomatoes, g..."
2,20130,filipino,"[eggs, pepper, salt, mayonaise, cooking oil, g..."
3,22213,indian,"[water, vegetable oil, wheat, salt]"
4,13162,indian,"[black pepper, shallots, cornflour, cayenne pe..."
...,...,...,...
39769,29109,irish,"[light brown sugar, granulated sugar, butter, ..."
39770,11462,italian,"[KRAFT Zesty Italian Dressing, purple onion, b..."
39771,2238,irish,"[eggs, citrus fruit, raisins, sourdough starte..."
39772,41882,chinese,"[boneless chicken skinless thigh, minced garli..."


In [7]:
# Inspect the raw feature column: each entry is a list of ingredient strings.
df['ingredients']

0        [romaine lettuce, black olives, grape tomatoes...
1        [plain flour, ground pepper, salt, tomatoes, g...
2        [eggs, pepper, salt, mayonaise, cooking oil, g...
3                      [water, vegetable oil, wheat, salt]
4        [black pepper, shallots, cornflour, cayenne pe...
                               ...                        
39769    [light brown sugar, granulated sugar, butter, ...
39770    [KRAFT Zesty Italian Dressing, purple onion, b...
39771    [eggs, citrus fruit, raisins, sourdough starte...
39772    [boneless chicken skinless thigh, minced garli...
39773    [green chile, jalapeno chilies, onions, ground...
Name: ingredients, Length: 39774, dtype: object

In [None]:
# NOTE(review): this assigns the column to itself, so it leaves the data
# unchanged, and the cell was never executed (no execution count).
# Looks like a leftover placeholder -- confirm and delete.
df['ingredients'] = df['ingredients']

In [20]:
# Flatten each recipe's ingredient list into one space-separated string so
# that it can be handled as ordinary text further down.
setim = set()  # not referenced again in the visible cells; kept so the namespace is unchanged
listem = [" ".join(ingredient_list) for ingredient_list in df['ingredients']]
df['processed'] = listem

In [26]:
# Normalise the joined ingredient text:
#   1. lower-case everything,
#   2. drop hyphens so that e.g. "sun-dried" collapses to "sundried",
#   3. strip every character that is neither a word character nor whitespace,
#   4. strip digits.
#
# `regex=` is passed explicitly on every call: pandas 2.0 changed the default
# of Series.str.replace from regex=True to regex=False, under which the two
# patterns below would be searched for as literal text and remove nothing.
# Raw strings avoid the invalid "\w" / "\d" escape-sequence warnings.
df["processed"] = (
    df["processed"]
    .str.lower()
    .str.replace("-", "", regex=False)
    .str.replace(r"[^\w\s]", "", regex=True)
    .str.replace(r"\d+", "", regex=True)
)

In [27]:
# Inspect the cleaned text column produced by the previous cell.
df['processed']

0        romaine lettuce black olives grape tomatoes ga...
1        plain flour ground pepper salt tomatoes ground...
2        eggs pepper salt mayonaise cooking oil green c...
3                           water vegetable oil wheat salt
4        black pepper shallots cornflour cayenne pepper...
                               ...                        
39769    light brown sugar granulated sugar butter warm...
39770    kraft zesty italian dressing purple onion broc...
39771    eggs citrus fruit raisins sourdough starter fl...
39772    boneless chicken skinless thigh minced garlic ...
39773    green chile jalapeno chilies onions ground bla...
Name: processed, Length: 39774, dtype: object

# Modelling

In [28]:
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from textblob import TextBlob
from nltk.stem.snowball import SnowballStemmer

# One shared English Snowball stemmer, reused for every token.
stemmer = SnowballStemmer('english')


def split_into_lemmas(text):
    """Tokenise ``text`` and return the stem of every token.

    The input is coerced to ``str`` and lower-cased, split into words with
    TextBlob, and each word is run through the module-level Snowball
    ``stemmer``. Despite the function name this is stemming, not
    lemmatisation.

    Args:
        text: the document to analyse (anything accepted by ``str()``).

    Returns:
        list: one stemmed token per word found by TextBlob, in order.
    """
    lowered = str(text).lower()
    stems = []
    for token in TextBlob(lowered).words:
        stems.append(stemmer.stem(token))
    return stems

In [36]:
# Inspect the target column: one cuisine label per recipe.
df['cuisine']

0              greek
1        southern_us
2           filipino
3             indian
4             indian
            ...     
39769          irish
39770        italian
39771          irish
39772        chinese
39773        mexican
Name: cuisine, Length: 39774, dtype: object

In [38]:
# Features are the cleaned ingredient strings; targets are the cuisine labels.
x = df['processed']
y = df['cuisine']

In [39]:
# Split features and labels into train and test partitions. No test_size is
# given, so scikit-learn's default applies; the fixed random_state keeps the
# split reproducible between runs.
x_train, x_test, y_train, y_test = train_test_split(
    x,
    y,
    random_state=80,
)

In [40]:
# Learn the bag-of-stems vocabulary on the training split only, then reuse it
# for the test split so that both matrices share the same columns.
#
# NOTE: because `analyzer` is a callable, scikit-learn bypasses its own
# preprocessing and tokenising pipeline. The `lowercase`, `stop_words` and
# `ngram_range` arguments that used to be passed here were therefore ignored
# (and produced "will not be used" warnings). They were removed so the call
# states what actually runs: unigram stems from split_into_lemmas, which
# lower-cases the text itself. The resulting features are unchanged.
# To really get stop-word removal and bigrams, pass
# `tokenizer=split_into_lemmas` with the default analyzer='word' instead;
# that changes the feature set and therefore the reported score.
vect = CountVectorizer(analyzer=split_into_lemmas)
x_train_dtm = vect.fit_transform(x_train)  # labels play no role when fitting a vectoriser
x_test_dtm = vect.transform(x_test)

In [41]:
from sklearn.naive_bayes import MultinomialNB
from sklearn.naive_bayes import BernoulliNB
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import accuracy_score

In [42]:
# Fit a multinomial naive Bayes classifier on the training term counts.
# `fit` returns the estimator itself, so `model` and `b` are the same object.
b = MultinomialNB()
model = b.fit(x_train_dtm, y_train)

# Predicted cuisine for every document in the test split.
b_predict = model.predict(x_test_dtm)

In [43]:
# Share of test-split recipes whose predicted cuisine matches the true label.
accuracy_score(y_test,b_predict)

0.7251609010458568

In [51]:
def clearingconverting(text):
    """Turn a list of ingredient strings into one cleaned text document.

    Applies the same normalisation that was used on the training column, so
    that text seen at prediction time matches what the vectoriser was fitted
    on: join with spaces, lower-case, strip punctuation (hyphens included)
    and digits, and flatten line breaks.

    The previous version used ``str.replace`` with regex patterns. That
    method only replaces literal substrings, so punctuation and digits were
    never removed.

    Args:
        text: iterable of ingredient strings, e.g. ``["plain flour", "2% milk"]``.
            Passing a single ``str`` would join its individual characters, so
            wrap a lone ingredient in a list.

    Returns:
        str: the cleaned, space-separated document.
    """
    text = " ".join(text)

    text = text.lower()
    # Everything that is neither a word character nor whitespace is dropped;
    # this also covers hyphens ("sun-dried" -> "sundried").
    text = re.sub(r"[^\w\s]", "", text)
    text = re.sub(r"\d+", "", text)
    text = text.replace("\n", " ").replace("\r", "")

    return text

def vectorizing(text):
    """Encode one cleaned document with the already-fitted ``vect``.

    Args:
        text: a single document string.

    Returns:
        The result of ``vect.transform`` on a one-element list holding ``text``.
    """
    single_document = [text]
    return vect.transform(single_document)


In [53]:
# Sanity check: run the full inference path (clean -> vectorise -> predict)
# on the ingredient list stored under index label 5.
model.predict(vectorizing(clearingconverting(df['ingredients'][5])))

array(['british'], dtype='<U12')

In [50]:
" ".join(df['ingredients'][5])

'plain flour sugar butter eggs fresh ginger root salt ground cinnamon milk vanilla extract ground ginger powdered sugar baking powder'

In [47]:
# The raw ingredient list that was fed to the prediction example.
df['ingredients'][5]

['plain flour',
 'sugar',
 'butter',
 'eggs',
 'fresh ginger root',
 'salt',
 'ground cinnamon',
 'milk',
 'vanilla extract',
 'ground ginger',
 'powdered sugar',
 'baking powder']