Basic imports

In [1]:
!pip install -r requirements.txt

import re

import numpy as np
import pandas as pd

import nltk
from nltk.stem import WordNetLemmatizer

# Scikit Imports
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import SGDClassifier
from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV

# Save model checkpoints
from joblib import dump

# Seed NumPy's global random number generator so NumPy-based randomness is
# repeatable from one run of the notebook to the next.
np.random.seed(42)



Load JSON data

In [2]:
# Read the training recipes, coercing `cuisine` and `ingredients` to strings,
# then keep only the rows whose `ingredients` value is longer than one.
# NOTE(review): because `ingredients` is read as a string, `.str.len()` counts
# characters rather than list items; in the recorded run every row passes the
# filter. Confirm whether "more than one ingredient" was the intent.
train_df = (
    pd.read_json(
        'data/train.json',
        dtype={'id': 'int64', 'cuisine': 'str', 'ingredients': 'str'},
    )
    .loc[lambda frame: frame.ingredients.str.len() > 1]
)

print(train_df.shape)

(39774, 3)


Convert to lowercase, remove punctuation

In [3]:
# Matches every character that is NOT a word character, whitespace or a comma.
# Written as a raw string: '\w' and '\s' are invalid escape sequences in a
# normal string literal and trigger a warning on recent Python versions.
reg = re.compile(r'[^\w\s,]')

# Lowercase the ingredient text and strip punctuation, then assign the result
# back to the column. The previous form called `.replace(..., inplace=True)` on
# the column accessor, which is a chained in-place update: it no longer
# modifies `train_df` once pandas Copy-on-Write is active.
train_df['seperated_ingredients'] = (
    train_df.ingredients
    .str.lower()
    .str.replace(reg, '', regex=True)
)

print(train_df.shape)

(39774, 4)


In [4]:
# Preview the first rows, including the new `seperated_ingredients` column.
train_df.head()

Unnamed: 0,id,cuisine,ingredients,seperated_ingredients
0,10259,greek,"['romaine lettuce', 'black olives', 'grape tom...","romaine lettuce, black olives, grape tomatoes,..."
1,25693,southern_us,"['plain flour', 'ground pepper', 'salt', 'toma...","plain flour, ground pepper, salt, tomatoes, gr..."
2,20130,filipino,"['eggs', 'pepper', 'salt', 'mayonaise', 'cooki...","eggs, pepper, salt, mayonaise, cooking oil, gr..."
3,22213,indian,"['water', 'vegetable oil', 'wheat', 'salt']","water, vegetable oil, wheat, salt"
4,13162,indian,"['black pepper', 'shallots', 'cornflour', 'cay...","black pepper, shallots, cornflour, cayenne pep..."


Lemmatize

In [5]:
lemmatizer = WordNetLemmatizer()


def _lemmatize_text(text):
    """Tokenize ``text`` with NLTK, lemmatize every token and glue the lemmas together.

    The lemmas are concatenated with an empty separator, so the words of a
    multi-word ingredient end up fused (e.g. "black olives" -> "blackolive")
    while the commas between ingredients are kept.

    NOTE(review): confirm that fusing the words is intended and not a missing
    space in the separator.
    """
    lemmas = [lemmatizer.lemmatize(token) for token in nltk.word_tokenize(text)]
    return ''.join(lemmas)


train_df['seperated_ingredients'] = train_df.seperated_ingredients.apply(_lemmatize_text)
print(train_df.shape)

(39774, 4)


In [6]:
# Preview the first rows again to inspect the lemmatized `seperated_ingredients` column.
train_df.head()

Unnamed: 0,id,cuisine,ingredients,seperated_ingredients
0,10259,greek,"['romaine lettuce', 'black olives', 'grape tom...","romainelettuce,blackolive,grapetomato,garlic,p..."
1,25693,southern_us,"['plain flour', 'ground pepper', 'salt', 'toma...","plainflour,groundpepper,salt,tomato,groundblac..."
2,20130,filipino,"['eggs', 'pepper', 'salt', 'mayonaise', 'cooki...","egg,pepper,salt,mayonaise,cookingoil,greenchil..."
3,22213,indian,"['water', 'vegetable oil', 'wheat', 'salt']","water,vegetableoil,wheat,salt"
4,13162,indian,"['black pepper', 'shallots', 'cornflour', 'cay...","blackpepper,shallot,cornflour,cayennepepper,on..."


In [7]:
# Collect the distinct cuisine labels (the classification targets) and report
# how many there are.
uniq = train_df['cuisine'].unique()

print(f'{len(uniq)} unique labels: {uniq}')

20 unique labels: ['greek' 'southern_us' 'filipino' 'indian' 'jamaican' 'spanish' 'italian'
 'mexican' 'chinese' 'british' 'thai' 'vietnamese' 'cajun_creole'
 'brazilian' 'french' 'japanese' 'irish' 'korean' 'moroccan' 'russian']


## Grid search using Stochastic Gradient Descent

In [8]:
# Work on a copy so the preprocessed frame stays untouched.
df = train_df.copy()

# Hold out 20% of the recipes for testing; the fixed random_state keeps the
# split reproducible.
# NOTE(review): the features come from the raw `ingredients` column, so the
# `seperated_ingredients` preprocessing above is not used by the model.
# Confirm which column was meant to be fed to the pipeline.
X_train, X_test, y_train, y_test = train_test_split(
    df['ingredients'],
    df['cuisine'],
    test_size=0.2,
    random_state=42,
)


def tokenizer(text):
    """Split ``text`` into a list of tokens using NLTK's TweetTokenizer with default settings.

    A fresh tokenizer instance is created on every call.
    """
    tweet_tokenizer = nltk.tokenize.casual.TweetTokenizer()
    return tweet_tokenizer.tokenize(text)


# Chain token counting, TF-IDF re-weighting and a linear SGD classifier into a
# single estimator so the three stages are fitted and tuned together.
# Text is not lowercased by the vectorizer (lowercase=False) and is split by
# the custom `tokenizer` function. The classifier's `penalty` set here is only
# a starting value: `clf__penalty` is also part of the search grid below.
pipeline = Pipeline(steps=[
    (
        'vect',
        CountVectorizer(
            encoding='utf-8',
            lowercase=False,
            tokenizer=tokenizer,
            max_features=None,
        ),
    ),
    ('tfidf', TfidfTransformer(use_idf=True)),
    (
        'clf',
        SGDClassifier(penalty='elasticnet', loss='hinge', random_state=42),
    ),
])

# Hyper-parameter grid for the search. Keys use the `<step>__<param>` form so
# each value is routed to the matching pipeline step.
# 3 * 3 * 2 * 2 * 2 * 3 = 216 candidate combinations.
# To try other classifiers, add a `clf__loss` entry (see the commented line).
parameters = {
    # Vectorizer: document-frequency cut-offs and n-gram sizes.
    'vect__max_df': [0.5, 0.75, 1.0],
    'vect__min_df': [1, 5, 10],
    'vect__ngram_range': [(1, 1), (1, 2)],
    # TF-IDF: row normalisation.
    'tfidf__norm': ['l1', 'l2'],
    # Classifier: regularisation strength and type.
    'clf__alpha': [0.0001, 0.00001],
    'clf__penalty': ['l1', 'l2', 'elasticnet'],
    # 'clf__loss': ('log', 'modified_huber', 'epsilon_insensitive', 'perceptron')
}


if __name__ == '__main__':
    # Exhaustive search over `parameters` with 5-fold cross-validation,
    # running on all available cores.
    grid_search = GridSearchCV(
        estimator=pipeline,
        param_grid=parameters,
        cv=5,
        n_jobs=-1,
        verbose=1,
    )

    print(f'Training ...')

    grid_search.fit(X_train, y_train)
    print(f'\nK-fold Score: {grid_search.best_score_:.2f}.')

    # Report only the tuned hyper-parameters out of the best estimator's
    # full parameter set.
    best_parameters = grid_search.best_estimator_.get_params()
    print('Best Parameters:\n')
    for param_name in sorted(parameters):
        print(f'{param_name}: {best_parameters[param_name]}')

    # Uncomment to persist the best pipeline to disk.
    # dump(grid_search.best_estimator_, '../trained_pipeline.pkl', compress=0)

    # Accuracy on the held-out split: fraction of predictions equal to the
    # true label.
    y_pred = grid_search.predict(X_test)
    acc = (y_pred == y_test).mean()
    print(f'Accuracy: {acc*100:0.2f} %.')

Training ...
Fitting 5 folds for each of 216 candidates, totalling 1080 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 12 concurrent workers.
[Parallel(n_jobs=-1)]: Done  26 tasks      | elapsed:   19.3s
[Parallel(n_jobs=-1)]: Done 176 tasks      | elapsed:  1.7min
[Parallel(n_jobs=-1)]: Done 426 tasks      | elapsed:  3.9min
[Parallel(n_jobs=-1)]: Done 776 tasks      | elapsed:  7.6min
[Parallel(n_jobs=-1)]: Done 1080 out of 1080 | elapsed: 10.8min finished



K-fold Score: 0.79.
Best Parameters:

clf__alpha: 1e-05
clf__penalty: elasticnet
tfidf__norm: l2
vect__max_df: 1.0
vect__min_df: 1
vect__ngram_range: (1, 2)
Accuracy: 78.57 %.
