# Predict flavor profile based on ingredients

Here we try to predict the flavor profile of a dish from its ingredients. We consider two flavor profiles:
* spicy
* sweet
With only two classes, this is a binary classification problem.

In [None]:
from IPython.core.interactiveshell import InteractiveShell

# Display the value of every top-level expression in a cell, not only the
# last one, so several inspection expressions can share a cell.
InteractiveShell.ast_node_interactivity = "all"

In [None]:
import numpy as np
import pandas as pd
import seaborn as sns  # noqa
from matplotlib import pyplot as plt  # noqa

In [None]:
# Load the dataset; na_values=-1 asks pandas to read -1 entries as missing (NaN).
# NOTE(review): presumably the CSV uses -1 as its missing-value marker — confirm.
df = pd.read_csv("dataset/indian_food.csv", na_values=-1)
# Print a summary of the loaded frame (columns, dtypes, non-null counts).
df.info()

In [None]:
# Keep only the rows whose flavor profile is present and is one of the two
# target classes, and only the columns used in the rest of the notebook.
flavor = df["flavor_profile"]
filter_cond = ~pd.isna(flavor) & flavor.isin(["spicy", "sweet"])
df = df.loc[filter_cond, ["name", "ingredients", "flavor_profile"]]
df.shape

In [None]:
# Inspect the frame: remaining columns, the first five rows, and the
# distinct flavor labels.
df.columns
df.head(5)
df.flavor_profile.unique()

In [None]:
# Build the ingredient vocabulary: lower-case each ingredients string, split
# it on commas, flatten to one ingredient per row, trim whitespace, and keep
# the distinct names.
#
# Known limitation: much more preprocessing of the ingredients is needed.
# For instance, "green chilli" and "green chillies" are currently treated as
# two different ingredients.
ingredient_lists = df["ingredients"].str.lower().str.split(",")
ingredient_names = ingredient_lists.explode().str.strip()
all_ingredients = ingredient_names.unique()
all_ingredients.shape

In [None]:
from sklearn.model_selection import train_test_split

# Features are [name, ingredients] rows; the target is the flavor label.
feature_columns = ["name", "ingredients"]
X = df[feature_columns].values
y = df["flavor_profile"].values

X.shape, y.shape
# Hold out 20% for testing; stratify on the label so both splits keep the
# same class proportions, and fix the seed so the split is repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    stratify=y,
    test_size=0.2,
    random_state=42,
)
X_train.shape, X_test.shape

In [None]:
from sklearn.base import BaseEstimator, TransformerMixin


class IngredientsEncoder(BaseEstimator, TransformerMixin):
    """Multi-hot encode comma-separated ingredient strings.

    Every sample is a row whose element at index 1 is a single string of
    comma-separated ingredients; other elements of the row are ignored.
    The output has one column per entry of ``categories``: a cell is 1 when
    the sample lists that ingredient and 0 otherwise.

    Parameters
    ----------
    categories : sequence of str
        Known ingredient names. Ingredients are lower-cased and stripped of
        surrounding whitespace before lookup, so the names given here must
        already be in that form. Column ``i`` of the output corresponds to
        ``categories[i]``.
    handle_unknown : {"error", "ignore"}, default="error"
        What to do with an ingredient that is not in ``categories``:
        ``"error"`` raises ``KeyError``; ``"ignore"`` skips the ingredient,
        leaving no trace of it in the encoded row.
    """

    def __init__(self, categories, handle_unknown="error"):
        self.categories = categories
        self.handle_unknown = handle_unknown
        # Built eagerly so transform() keeps working without a prior fit().
        self._category_indices = self._index_categories(categories)

    @staticmethod
    def _index_categories(categories):
        """Map each category name to its output column index."""
        return {name: position for position, name in enumerate(categories)}

    def fit(self, X, y=None):
        """Refresh the category index; nothing is learned from ``X`` or ``y``.

        ``y`` is optional so that ``fit(X)`` and ``fit_transform(X)`` work
        without a target.

        Returns
        -------
        self
        """
        # Rebuild in case ``categories`` was replaced via set_params().
        self._category_indices = self._index_categories(self.categories)
        return self

    def transform(self, X):
        """Encode each row of ``X`` as a 0/1 vector over ``categories``.

        Parameters
        ----------
        X : sequence of rows
            Each row must hold the ingredients string at index 1.

        Returns
        -------
        numpy.ndarray of shape (len(X), len(categories))
            Float array of zeros and ones.

        Raises
        ------
        ValueError
            If ``handle_unknown`` is neither ``"error"`` nor ``"ignore"``.
        KeyError
            If an ingredient is not in ``categories`` and
            ``handle_unknown="error"``.
        """
        if self.handle_unknown not in ("error", "ignore"):
            raise ValueError(
                "handle_unknown must be 'error' or 'ignore', "
                f"got {self.handle_unknown!r}"
            )

        result = np.zeros(shape=(len(X), len(self.categories)))

        for row_idx, row in enumerate(X):
            for raw_name in row[1].lower().split(","):
                name = raw_name.strip()
                col_idx = self._category_indices.get(name)
                # Compare with None explicitly: column 0 is a valid index.
                if col_idx is None:
                    if self.handle_unknown == "ignore":
                        continue
                    raise KeyError(f"unknown ingredient: {name!r}")
                result[row_idx, col_idx] = 1
        return result

In [None]:
# Build the encoder over the ingredient vocabulary in all_ingredients, then
# turn each training row into a 0/1 vector with one column per ingredient.
ing_encoder = IngredientsEncoder(all_ingredients)
ing_encoder.fit(X_train, y_train)
X_train_trans = ing_encoder.transform(X_train)
# Compare raw vs. encoded shapes.
X_train.shape, X_train_trans.shape

In [None]:
from sklearn.preprocessing import LabelEncoder

# Encode the string flavor labels as integer class ids.
flavor_profile_encoder = LabelEncoder()
y_train_trans = flavor_profile_encoder.fit_transform(y_train)
# classes_[i] is the original label that was mapped to id i.
flavor_profile_encoder.classes_

In [None]:
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.model_selection import GridSearchCV, cross_val_predict

# Seed the estimator: the grid below includes subsample < 1.0, which draws a
# random subset of rows for every tree. Without a fixed seed the search can
# pick different "best" parameters on each run, even though the train/test
# split itself is seeded.
gb_clf = GradientBoostingClassifier(random_state=42)

# I did a separate search for n_estimators with learning_rate.
# Once I found the best params, I then used different values for
# max_depth and subsample.
params = [
    {
        "n_estimators": [300],
        "learning_rate": [0.01],
        "max_depth": [2, 3, 4, 5],
        "subsample": [0.8, 0.9, 1.0],
    }
]

# 5-fold cross-validated search over the grid, using all available cores.
grid_search = GridSearchCV(gb_clf, params, cv=5, return_train_score=True, n_jobs=-1)
grid_search.fit(X_train_trans, y_train_trans)
grid_search.best_params_

In [None]:
# rf_clf = RandomForestClassifier(n_estimators=1000, random_state=42)
# Hyper-parameters are hard-coded here.
# NOTE(review): presumably these are the winners of the grid search — confirm
# against grid_search.best_params_.
# random_state is fixed so that repeated runs build the same model.
gb_clf = GradientBoostingClassifier(
    n_estimators=300,
    learning_rate=0.01,
    max_depth=3,
    subsample=1.0,
    random_state=42,
)

# Out-of-fold predictions: each training sample is predicted by a model that
# was fitted on the other folds, so they can be scored without touching the
# test set.
y_train_pred = cross_val_predict(gb_clf, X_train_trans, y_train_trans, cv=5)

In [None]:
from sklearn.metrics import confusion_matrix, f1_score, precision_score, recall_score

# Score the cross-validated predictions on the training set.
# NOTE(review): precision/recall/F1 are computed with scikit-learn's default
# positive class; check flavor_profile_encoder.classes_ to see which flavor
# that corresponds to.
confusion_matrix(y_train_trans, y_train_pred)
precision_score(y_train_trans, y_train_pred)
recall_score(y_train_trans, y_train_pred)
f1_score(y_train_trans, y_train_pred)

In [None]:
# Train the final model on the full (encoded) training set.
gb_clf.fit(X_train_trans, y_train_trans)

In [None]:
from sklearn.metrics import accuracy_score

# Encode the held-out features and labels with the already-fitted encoders,
# then predict with the trained model.
X_test_trans = ing_encoder.transform(X_test)
y_test_trans = flavor_profile_encoder.transform(y_test)
y_test_pred = gb_clf.predict(X_test_trans)

# Each metric stays a separate top-level expression so the notebook
# displays all of them.
confusion_matrix(y_test_trans, y_test_pred)
precision_score(y_test_trans, y_test_pred)
recall_score(y_test_trans, y_test_pred)
f1_score(y_test_trans, y_test_pred)
accuracy_score(y_test_trans, y_test_pred)

## Error analysis

In [None]:
# Decode the predictions back to label strings so that the actual and the
# predicted flavor are printed in the same form (e.g. "sweet spicy" instead
# of "sweet 0").
y_test_pred_labels = flavor_profile_encoder.inverse_transform(y_test_pred)

# Print every misclassified test dish: [name, ingredients], actual, predicted.
for dish, actual, predicted in zip(X_test, y_test, y_test_pred_labels):
    if actual != predicted:
        print(dish, actual, predicted)

In [None]:
# Spot check on a hand-written dish, given as a single [name, ingredients] row.
# The original author's note records the expected flavor as "sweet".
X_test_one = np.array(
    [
        [
            "kadala parupu payasam",
            "chana daal, yellow moong daal, jaggery, milk",
        ]
    ]
)

# Encode, predict, and map the predicted class id back to its label.
X_test_one_trans = ing_encoder.transform(X_test_one)
y_test_one_pred = gb_clf.predict(X_test_one_trans)
flavor_profile_encoder.inverse_transform(y_test_one_pred)[0]

In [None]:
# Second spot check: an unnamed dish described only by its ingredients
# (the name slot is left empty).
unnamed_dish_ingredients = (
    "yellow moong daal, ginger, green chillies, kala jeera, salt, sesame oil, mustard seeds"
)
X_test_one = np.array([["", unnamed_dish_ingredients]])

# Encode, predict, and map the predicted class id back to its label.
X_test_one_trans = ing_encoder.transform(X_test_one)
y_test_one_pred = gb_clf.predict(X_test_one_trans)
flavor_profile_encoder.inverse_transform(y_test_one_pred)[0]