https://towardsdatascience.com/a-production-ready-multi-class-text-classifier-96490408757

In [54]:
import os
import re

import pandas as pd
import numpy as np

from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.svm import LinearSVC
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.multiclass import OneVsRestClassifier

import matplotlib.pyplot as plt
%matplotlib inline

In [22]:
data_path = 'data'

rows = []
for root, _, file in os.walk(data_path):
    for filename in file:
        if '.txt' in filename:
            cuisine = os.path.splitext(filename)[0]
            text_file = open(os.path.join(data_path, filename), "r")
            lines = text_file.readlines()
            for line in lines:
                row = {
                    'cuisine': cuisine,
                    'ingredients': line
                }
                rows.append(row)
            text_file.close()

df = pd.DataFrame.from_dict(rows)
df = df.sample(frac=1).reset_index(drop=True)

In [24]:
df.head()

Unnamed: 0,cuisine,ingredients
0,japanese,3 1/2 cups sweet rice flour (mochiko) 2 1/2 cu...
1,african,1 teaspoon vegetable oil 1 pound wild salmon f...
2,french,For the Poolish : 1/2 teaspoon instant yeast 1...
3,middleeastern,Marinade 1 cup plain yogurt 2 tablespoons fres...
4,mexican,3 pounds beef chuck roast 1/4 cup water 1 1/2 ...


In [25]:
df.shape

(1500, 2)

In [26]:
df.groupby('cuisine').count()

Unnamed: 0_level_0,ingredients
cuisine,Unnamed: 1_level_1
african,120
caribbean,180
chinese,180
french,180
indian,120
irish,120
italian,180
japanese,120
mexican,180
middleeastern,120


In [27]:
#pre-processing
import re 
def clean_str(string):
    """
    Tokenization/string cleaning for dataset
    Every dataset is lower cased except
    """
    string = re.sub(r"\n", "", string)    
    string = re.sub(r"\r", "", string) 
    string = re.sub(r"[0-9]", "digit", string)
    string = re.sub(r"\'", "", string)    
    string = re.sub(r"\"", "", string)    
    return string.strip().lower()

In [28]:
X = []
for i in range(df.shape[0]):
    X.append(clean_str(df.iloc[i][1]))
y = np.array(df["cuisine"])

In [29]:
y.size

1500

In [30]:
#train test split
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=5)

In [36]:
#pipeline of feature engineering and model

model = Pipeline([
    ('vectorizer', CountVectorizer()),
    ('tfidf', TfidfTransformer()),
    ('clf', OneVsRestClassifier(LinearSVC(class_weight="balanced")))
])
#the class_weight="balanced" option tries to remove the biasedness of model towards majority sample

In [38]:
#parameter selection
from sklearn.model_selection import GridSearchCV
parameters = {'vectorizer__ngram_range': [(1, 1), (1, 2),(2,2)],
               'tfidf__use_idf': (True, False)}

In [39]:
gs_clf_svm = GridSearchCV(model, parameters, n_jobs=-1)
gs_clf_svm = gs_clf_svm.fit(X, y)
print(gs_clf_svm.best_score_)
print(gs_clf_svm.best_params_)



0.746
{'tfidf__use_idf': True, 'vectorizer__ngram_range': (1, 2)}


In [40]:
#preparing the final pipeline using the selected parameters
model = Pipeline([('vectorizer', CountVectorizer(ngram_range=(1,2))),
    ('tfidf', TfidfTransformer(use_idf=True)),
    ('clf', OneVsRestClassifier(LinearSVC(class_weight="balanced")))])

In [41]:
#fit model with training data
model.fit(X_train, y_train)

Pipeline(memory=None,
     steps=[('vectorizer', CountVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 2), preprocessor=None, stop_words=None,
       ..._class='ovr', penalty='l2', random_state=None, tol=0.0001,
     verbose=0),
          n_jobs=None))])

In [42]:
#evaluation on test data
pred = model.predict(X_test)

In [43]:
model.classes_

array(['african', 'caribbean', 'chinese', 'french', 'indian', 'irish',
       'italian', 'japanese', 'mexican', 'middleeastern'], dtype='<U13')

In [44]:
from sklearn.metrics import confusion_matrix, accuracy_score
confusion_matrix(pred, y_test)

array([[17,  0,  2,  0,  1,  0,  1,  0,  0,  8],
       [ 2, 37,  0,  3,  1,  0,  2,  0,  7,  1],
       [ 0,  1, 42,  1,  0,  0,  1,  4,  0,  0],
       [ 4,  6,  2, 35,  0,  2,  8,  4,  1,  2],
       [ 2,  2,  0,  0, 28,  0,  0,  0,  0,  4],
       [ 1,  2,  0,  4,  0, 36,  0,  1,  0,  1],
       [ 1,  0,  1,  0,  0,  0, 54,  0,  0,  1],
       [ 0,  0,  7,  1,  0,  0,  1, 18,  0,  0],
       [ 1,  0,  0,  0,  1,  0,  2,  0, 50,  0],
       [ 2,  3,  0,  3,  3,  0,  0,  0,  0, 25]])

In [45]:
accuracy_score(y_test, pred)

0.76

In [46]:
#save the model
from sklearn.externals import joblib
joblib.dump(model, 'model_cuisine_ingredients.pkl', compress=1)

['model_cuisine_ingredients.pkl']

In [47]:
from sklearn.externals import joblib
model = joblib.load('model_cuisine_ingredients.pkl')

In [48]:
test_recipe = "1 2 1/2  to 3 pound boneless pork shoulder or butt, trimmed and cut in half 1 small butternut squash (about 1 1/2 pounds)—peeled, seeded, and cut into 1 inch pieces 1 14.5 ounce can diced tomatoes 1 jalapeño pepper, seeded and chopped 2 cloves garlic, chopped 1 tablespoon chili powder kosher salt 4 6 inch corn tortillas, cut into 1/2 inch wide strips 1 tablespoon canola oil sliced radishes, cilantro sprigs, and lime wedges, for serving"

In [51]:
model.predict([test_recipe])[0]

'mexican'

In [55]:
steak_hache = "1 tbsp vegetable oil 4 shallots  , very finely chopped 600g freshly ground beef   (ask the butcher for something with roughly 15% fat - we used chuck) 8 thyme sprigs, leaves picked and chopped 2 tsp Dijon mustard 2 tbsp plain flour 200ml crème fraîche 1 egg yolk 6 tarragon   sprigs, leaves picked and finely chopped dressed green salad, to serve"

In [56]:
model.predict([steak_hache])[0]

'french'

In [57]:
toad_in_the_hole = "140g plain flour 3 eggs  300ml milk  2 tsp Dijon mustard 2 tbsp vegetable oil 8 Cumberland sausages 8 sage   leaves 4 rosemary   sprigs"

In [58]:
model.predict([toad_in_the_hole])[0]

'french'