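Cuisine prediction from recipe ingredients: exploratory analysis of the ingredient lists, followed by a TF-IDF + random forest baseline that writes a submission file.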

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
import pandas as pd
import re
from collections import Counter
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import LabelEncoder

Load Data

In [None]:
# Read the train and test JSON files into DataFrames (paths are relative to the notebook).
df_train = pd.read_json("../input/train.json")
df_test = pd.read_json("../input/test.json")

In [None]:
# Report (rows, columns) of each frame as a quick sanity check on the load.
print('Shape of train data',df_train.shape)
print('Shape of test data',df_test.shape)

In [None]:
# Store the length of each recipe's ingredient list as a numeric column.
df_train['number_ingredients'] = df_train['ingredients'].map(len)
df_test['number_ingredients'] = df_test['ingredients'].map(len)

In [None]:
# Display the first rows of the training frame.
df_train.head()

In [None]:
# Display the last rows of the training frame.
df_train.tail()

In [None]:
# Per-column missing-value summary: absolute count and fraction of rows.
null_mask = df_train.isnull()
null_counts = null_mask.sum()
total = null_counts.sort_values(ascending=False)
percent = (null_counts / null_mask.count()).sort_values(ascending=False)
missing_data = pd.concat(
    [total, percent],
    axis=1,
    keys=['Missing Total', 'Missing %'],
)
missing_data.head(20)

In [None]:
# Count recipes that list at most two ingredients.
short_recipe_mask = df_train['number_ingredients'] <= 2
print("Recipes with 2 or less ingredients:", int(short_recipe_mask.sum()))

In [None]:
# Keep only recipes with more than two ingredients.
# .copy() makes the result an independent frame, so the column assignments made
# to df_train further down do not raise SettingWithCopyWarning.
df_train = df_train[df_train['number_ingredients'] > 2].copy()

In [None]:
# Number of training recipes per cuisine label; displayed as the cell output.
cuisine_count = df_train['cuisine'].value_counts()
cuisine_count

In [None]:
# Bar chart of the number of recipes per cuisine.
sns.set(style="whitegrid")
fig, ax = plt.subplots(figsize=(25, 15))
# x/y are passed by keyword: recent seaborn releases reject positional data vectors.
sns.barplot(x=cuisine_count.index, y=cuisine_count.values, ax=ax)
ax.set(title='Recipes per Cuisine', xlabel='cuisine', ylabel='number of recipes');

In [None]:
# Tally every ingredient occurrence across all training recipes, then order
# the (ingredient, count) pairs from most to least common.
ingredient_tally = Counter()
for recipe_ingredients in df_train.ingredients:
    ingredient_tally.update(recipe_ingredients)
ingredient_count = ingredient_tally.most_common()

In [None]:
# Bar chart of the 25 most frequent entries in ingredient_count.
sns.set(style="whitegrid")
fig, ax = plt.subplots(figsize=(50, 15))
df = pd.DataFrame(ingredient_count[:25], columns=['ingredient', 'frequency'])
# x/y are passed by keyword: recent seaborn releases reject them positionally.
sns.barplot(x='ingredient', y='frequency', data=df, ax=ax)
ax.set_title('Top Ingredients');


In [None]:
# Top 25 ingredients among recipes labelled 'italian'.
df_ingr_by_cuisine = df_train[df_train['cuisine'] == 'italian']
ingredient_count = Counter(
    item for sublist in df_ingr_by_cuisine.ingredients for item in sublist
).most_common()
sns.set(style="whitegrid")
fig, ax = plt.subplots(figsize=(50, 15))
df = pd.DataFrame(ingredient_count[:25], columns=['ingredient', 'frequency'])
# x/y are passed by keyword: recent seaborn releases reject them positionally.
sns.barplot(x='ingredient', y='frequency', data=df, ax=ax)
ax.set_title('Top Italian Ingredients');

In [None]:
# Top 25 ingredients among recipes labelled 'chinese'.
df_ingr_by_cuisine = df_train[df_train['cuisine'] == 'chinese']
ingredient_count = Counter(
    item for sublist in df_ingr_by_cuisine.ingredients for item in sublist
).most_common()
sns.set(style="whitegrid")
fig, ax = plt.subplots(figsize=(50, 15))
df = pd.DataFrame(ingredient_count[:25], columns=['ingredient', 'frequency'])
# x/y are passed by keyword: recent seaborn releases reject them positionally.
sns.barplot(x='ingredient', y='frequency', data=df, ax=ax)
ax.set_title('Top Chinese Ingredients');

In [None]:
# Top 25 ingredients among recipes labelled 'mexican'.
df_ingr_by_cuisine = df_train[df_train['cuisine'] == 'mexican']
ingredient_count = Counter(
    item for sublist in df_ingr_by_cuisine.ingredients for item in sublist
).most_common()
sns.set(style="whitegrid")
fig, ax = plt.subplots(figsize=(50, 15))
df = pd.DataFrame(ingredient_count[:25], columns=['ingredient', 'frequency'])
# x/y are passed by keyword: recent seaborn releases reject them positionally.
sns.barplot(x='ingredient', y='frequency', data=df, ax=ax)
ax.set_title('Top Mexican Ingredients');

In [None]:
# Top 25 ingredients among recipes labelled 'indian'.
df_ingr_by_cuisine = df_train[df_train['cuisine'] == 'indian']
ingredient_count = Counter(
    item for sublist in df_ingr_by_cuisine.ingredients for item in sublist
).most_common()
sns.set(style="whitegrid")
fig, ax = plt.subplots(figsize=(50, 15))
df = pd.DataFrame(ingredient_count[:25], columns=['ingredient', 'frequency'])
# x/y are passed by keyword: recent seaborn releases reject them positionally.
sns.barplot(x='ingredient', y='frequency', data=df, ax=ax)
ax.set_title('Top Indian Ingredients');

In [None]:
def transform(ingredients):
    """Flatten a recipe's ingredient list into one normalized string.

    The ingredients are joined with single spaces, lower-cased, and every
    hyphen is removed (so "low-fat" becomes "lowfat").
    """
    return ' '.join(ingredients).lower().replace('-', '')

In [None]:
# Column 'x' holds each recipe's ingredients as a single normalized string.
df_train['x'] = df_train['ingredients'].map(transform)
df_test['x'] = df_test['ingredients'].map(transform)

In [None]:
# Vectorizer shared by the train and test calls below. binary=True clips each
# term count to 0/1 before the IDF weighting is applied.
tfidf = TfidfVectorizer(binary=True)


def tfidf_features(txt, flag):
    """Convert ingredient strings into a sparse TF-IDF feature matrix.

    Parameters
    ----------
    txt : iterable of str
        One ingredient string per recipe.
    flag : str
        "train" fits the shared vectorizer on `txt` and transforms it; any
        other value only transforms, reusing the already fitted vocabulary.

    Returns
    -------
    Sparse matrix of float32 TF-IDF weights, one row per entry of `txt`.
    """
    if flag == "train":
        x = tfidf.fit_transform(txt)
    else:
        x = tfidf.transform(txt)
    # float32 rather than float16: the random forest converts its input to
    # float32 internally, so a half-precision cast only throws away precision,
    # and float16 does not appear to be a supported scipy.sparse dtype.
    return x.astype('float32')


x_train = tfidf_features(df_train['x'], flag="train")
x_test = tfidf_features(df_test['x'], flag="test")


In [None]:
# Label-encode the target: replace the cuisine values with integer class ids.
# `lb` is kept so predictions can be mapped back with inverse_transform.
# NOTE: this overwrites df_train['cuisine'] in place, so re-running this cell
# refits the encoder on the already encoded integers.
lb = LabelEncoder()
df_train['cuisine'] = lb.fit_transform(df_train.cuisine.values)

In [None]:
# Random forest on the TF-IDF features. random_state pins the bootstrap and
# feature sub-sampling so repeated runs give the same model and predictions.
m = RandomForestClassifier(oob_score=True, random_state=42)
m.fit(x_train, df_train['cuisine'])

In [None]:
# y_test holds the model's predictions for the test features (encoded class ids);
# y_pred maps them back through the label encoder to the original cuisine values.
y_test = m.predict(x_test)
y_pred = lb.inverse_transform(y_test)

In [None]:
# Submission frame: one row per test recipe, indexed by id, with the predicted
# cuisine. Building it from named columns keeps each column's own dtype instead
# of stacking ids and labels into a single mixed-type array.
df_sub = pd.DataFrame(
    {'id': df_test['id'].values, 'cuisine': y_pred}
).set_index('id')

df_sub.head()

In [None]:
# Write the predictions to submission.csv in the working directory; the 'id' index is written as the first column.
df_sub.to_csv('submission.csv')