In [None]:
import functools
import json
import shutil
import tempfile
import time

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from sklearn.decomposition import TruncatedSVD
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.linear_model import PassiveAggressiveClassifier
from sklearn.model_selection import (train_test_split,
                                     GridSearchCV, StratifiedShuffleSplit)
from sklearn.pipeline import make_pipeline, Pipeline
from sklearn.preprocessing import LabelEncoder

In [None]:
def load_json(filepath, **kwargs):
    """Read and deserialize a JSON document from disk.

    The file is opened as UTF-8 explicitly so that non-ASCII text decodes
    the same way on every platform instead of depending on the locale's
    default encoding.

    Parameters
    ----------
    filepath : str or path-like
        Location of the JSON file.
    **kwargs
        Forwarded unchanged to ``json.load``.

    Returns
    -------
    object
        Whatever ``json.load`` produces for the file's content.
    """
    with open(filepath, encoding="utf-8") as fh:
        return json.load(fh, **kwargs)


def load_dataframe(deserialized, is_test=False):
    """Turn deserialized recipe records into pandas Series.

    Each record's ingredient list is flattened into one string: spaces
    inside an ingredient are replaced by underscores and the ingredients
    are then joined with single spaces.

    Parameters
    ----------
    deserialized : iterable of dict
        Records with ``"id"`` and ``"ingredients"`` keys, plus a
        ``"cuisine"`` key unless ``is_test`` is true.
    is_test : bool, default False
        When true, no ``"cuisine"`` key is read and only the ingredient
        Series is returned.

    Returns
    -------
    pandas.Series or tuple of (pandas.Series, pandas.Series)
        The ingredient strings indexed by record id; when ``is_test`` is
        false, also the cuisine labels, indexed by the same ids so the two
        Series align on index-based operations such as boolean masking.
    """
    # Materialize once: the records are traversed several times below, which
    # would silently yield nothing on the later passes for a one-shot iterator.
    records = list(deserialized)

    ingredients = [" ".join(item.replace(" ", "_")
                            for item in entry["ingredients"])
                   for entry in records]
    index = [entry["id"] for entry in records]
    data = pd.Series(data=ingredients, index=index)
    if is_test:
        return data

    # Share the id index with `data`; a default RangeIndex here would make
    # the two Series misalign.
    target = pd.Series([entry["cuisine"] for entry in records], index=index)
    return data, target

In [None]:
# Parse the training JSON file and display its last record.
deserialized_train = load_json("../input/train.json")
deserialized_train[-1]

In [None]:
# Split the training records into ingredient strings (indexed by record id)
# and cuisine labels, then preview the first three ingredient strings.
data_train, target_train = load_dataframe(deserialized_train)
data_train[:3]

In [None]:
# Display the distinct cuisine labels present in the training target.
target_train.unique()

In [None]:
# Bar chart of how often each label occurs in the training target,
# one bar per distinct label, in the order returned by value_counts().
value_counts = target_train.value_counts()
bar_positions = range(len(value_counts))

plt.figure(figsize=(20, 7))
plt.bar(bar_positions, value_counts, color="SkyBlue")
plt.xticks(bar_positions, value_counts.index, rotation=60, ha="right")
plt.xlabel("Labels")
plt.ylabel("Total")
plt.show()

In [None]:
# Fit a CountVectorizer vocabulary on the ingredient strings and transform
# them in the same step; the bare name on the last line displays the result.
vectorizer = CountVectorizer()
bag_of_words = vectorizer.fit_transform(data_train)
bag_of_words

In [None]:
# Encode the cuisine labels as integers for the baseline classifier.
encoder = LabelEncoder()
target_train_encoded = encoder.fit_transform(target_train)

# Hold out part of the training data (default split size, fixed seed).
x_train, x_test, y_train, y_test = train_test_split(
    bag_of_words, target_train_encoded, random_state=0)

# Baseline: passive-aggressive classifier on raw token counts.
estimator = PassiveAggressiveClassifier(
    loss="hinge", tol=1e-3, early_stopping=True, average=True,
    class_weight="balanced", random_state=0, n_jobs=-1, verbose=False)
estimator.fit(x_train, y_train)

holdout_score = estimator.score(x_test, y_test)
print(f"Score: {holdout_score:.3f}")

In [None]:
def visualize_coefficients(feature_names, estimator_coefficients,
                           class_=0, n_top=20, **kwargs):
    """Plot the most negative and most positive coefficients of one class.

    Parameters
    ----------
    feature_names : numpy.ndarray
        Names indexable by an integer index array; entry ``i`` labels
        coefficient ``i``.
    estimator_coefficients : array-like
        Indexed by ``class_``; the selected entry is flattened to 1-D.
    class_ : int, default 0
        Which entry of ``estimator_coefficients`` to plot.
    n_top : int, default 20
        How many coefficients to take from each end of the ascending sort
        order. Values above the number of coefficients are clamped to it.
    **kwargs
        ``target_value``, if given, is used as the plot title.

    Raises
    ------
    ValueError
        If ``n_top`` is negative.
    """
    if n_top < 0:
        raise ValueError(f"n_top must be non-negative, got {n_top}")

    coefficients = np.ravel(estimator_coefficients[class_])
    order = np.argsort(coefficients)  # ascending; sort once, slice twice
    n_top = min(n_top, order.size)

    lowest = order[:n_top]
    # Explicit start offset: order[-n_top:] would return every index
    # when n_top == 0.
    highest = order[order.size - n_top:]
    selected = np.hstack([lowest, highest])
    values = coefficients[selected]
    # One position per bar actually selected, so positions, heights and tick
    # labels always have matching lengths.
    positions = np.arange(selected.size)

    plt.figure(figsize=(22, 6))
    plt.bar(positions, values,
            color=["b" if c < 0 else "r" for c in values])
    plt.xticks(positions, feature_names[selected],
               rotation=60, ha="right")
    plt.subplots_adjust(bottom=0.3)
    plt.xlabel("Feature values")
    plt.ylabel("Coefficient magnitude")
    plt.title(kwargs.get("target_value"))
    plt.show()


# CountVectorizer.get_feature_names() was deprecated in scikit-learn 1.0 and
# removed in 1.2; prefer get_feature_names_out() and fall back to the old
# method only on releases that predate it.
if hasattr(vectorizer, "get_feature_names_out"):
    feature_names = np.asarray(vectorizer.get_feature_names_out())
else:
    feature_names = np.array(vectorizer.get_feature_names())
coefficients = estimator.coef_
# Pre-bind the vocabulary and the fitted coefficients so later cells only
# pass the class, the number of bars and the title.
visualize_coefficients_default = functools.partial(visualize_coefficients,
                                                   feature_names, coefficients)

In [None]:
# Integer class to inspect. Note that `label_encoded` holds the value that
# inverse_transform returns for it, which is used as the plot title.
label = 15
(label_encoded,) = encoder.inverse_transform([label])

visualize_coefficients_default(class_=label,
                               n_top=15,
                               target_value=label_encoded)

In [None]:
# Hyper-parameter search over a TF-IDF -> truncated SVD -> passive-aggressive
# pipeline. The fit call is left commented out; the output of a recorded run
# (about 95 minutes) is kept in the comments below.
cachedir = tempfile.mkdtemp()

try:
    pipe = Pipeline([("tfidf", TfidfVectorizer()),
                     ("tsvd", TruncatedSVD(random_state=0)),
                     ("pa", PassiveAggressiveClassifier(fit_intercept=False,
                                                        max_iter=10000,
                                                        early_stopping=True,
                                                        validation_fraction=.3,
                                                        n_jobs=-1,
                                                        random_state=0,
                                                        class_weight="balanced",
                                                        average=True))],
                    memory=cachedir)

    # 2 * 3 * 4 * 3 * 2 = 144 parameter combinations.
    param_grid = {"tfidf__min_df": [1, 5],
                  "tsvd__n_components": [100, 500, 1000],
                  "pa__C": [1e-4, 1e-2, 1., 100],
                  "pa__tol": 10. ** -np.arange(4, 7),
                  "pa__loss": ["hinge", "squared_hinge"]}

    cv = StratifiedShuffleSplit(n_splits=5, test_size=.3, random_state=0)

    grid = GridSearchCV(estimator=pipe, param_grid=param_grid, cv=cv,
                        scoring="f1_weighted", n_jobs=-1, verbose=True)
    # grid.fit(data_train, target_train)
    # Fitting 5 folds for each of 144 candidates, totalling 720 fits
    # [Parallel(n_jobs=-1)]: Done  42 tasks      | elapsed:  8.9min
    # [Parallel(n_jobs=-1)]: Done 192 tasks      | elapsed: 29.2min
    # [Parallel(n_jobs=-1)]: Done 442 tasks      | elapsed: 60.3min
    # [Parallel(n_jobs=-1)]: Done 720 out of 720 | elapsed: 95.3min finished
finally:
    # Remove the pipeline cache directory even if the search fails or is
    # interrupted, so the temporary directory is never left behind.
    shutil.rmtree(cachedir)

# print(f"Best cross-validation score: {grid.best_score_:.3f}")
# Best cross-validation score: 0.742

# print(f"Best estimator:\n{grid.best_estimator_}")
# TfidfVectorizer(min_df=1)
# TruncatedSVD(n_components=1000, algorithm="randomized", random_state=0)
# PassiveAggressiveClassifier(C=1, tol=1e-5, loss="hinge",
#                             average=True, early_stopping=True,
#                             max_iter=10000, class_weight="balanced")

In [None]:
# Fit a pipeline with the hyper-parameters recorded as best for the grid
# search on the full training set, and report how long it takes.
# perf_counter() is monotonic, so the measured duration is not affected by
# system clock adjustments the way time.time() differences are.
start_time = time.perf_counter()

best_pipe = make_pipeline(TfidfVectorizer(),
                          TruncatedSVD(n_components=1000, random_state=0),
                          PassiveAggressiveClassifier(C=1.,
                                                      tol=1e-5,
                                                      loss="hinge",
                                                      fit_intercept=False,
                                                      max_iter=10000,
                                                      early_stopping=True,
                                                      validation_fraction=.3,
                                                      n_jobs=-1,
                                                      random_state=0,
                                                      class_weight="balanced",
                                                      average=True))
best_pipe.fit(data_train, target_train)

duration = time.perf_counter() - start_time
print(f"Done in {duration:.2f} sec.")

In [None]:
# Load the test records as ingredient strings only (is_test=True skips the
# cuisine labels) and preview the first rows.
data_test = load_dataframe(load_json("../input/test.json"), is_test=True)
data_test.head()

In [None]:
# Predict a cuisine for every test recipe and pair it with the recipe id
# taken from the index of the test Series.
target_test = best_pipe.predict(data_test)

submission = pd.DataFrame(data={"id": data_test.index, "cuisine": target_test})
# `index` is documented as a bool; pass False explicitly instead of relying
# on None being treated as falsy.
submission.to_csv("submission.csv", index=False)
submission.head(n=15).T