A Production ready Multi-Class Text Classifier
==

- [Reference :](https://towardsdatascience.com/a-production-ready-multi-class-text-classifier-96490408757)

In [3]:
import warnings
warnings.filterwarnings('ignore')
%matplotlib inline

from pathlib import Path
import numpy as np
import pandas as pd
from time import time
# Visualization
import matplotlib.pyplot as plt
import seaborn as sns




from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.svm import LinearSVC
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.multiclass import OneVsRestClassifier


#urls = 'https://github.com/dragon-library/work_space/raw/main/HS_Code/HS/hs_code.xlsx'
urls = 'data/hs_code.xlsx'
types = 'section'
#types = "chapter"

def get_master(sheets,types = 'section'):
    data = pd.read_excel(urls,sheet_name= sheets)
    data[types] = data[types].map('{:02}'.format)
    data = data[[types,'description']]
    data['description'] = data['description'].str.lower()
 #   data = data.rename(columns={'heading' : 'target', 'product_desc' : 'question_text'})
     

    return data

def manage_data(df):
    df.columns =  ['target', 'data'] 
    
    
    return df



In [4]:
types = "section"
#types = "chapter"

print("Load the dataset: Section")
t0 = time()

sheets = '8_digit'
eights = get_master(sheets,types)
sheets = '6_digit'
sixs = get_master(sheets,types)
sheets = '4_digit'
fours = get_master(sheets,types)
sheets = '2_digit'
twos = get_master(sheets,types)

sheets = 'test_01'
tests = get_master(sheets,types)

sheets = 'Declaration_2019_10'
decl = get_master(sheets,types)

data = pd.concat([eights,sixs,fours,twos,tests,decl], ignore_index=True)
df = manage_data(data)


#twenty_test = manage_data(tests)
load_time = time() - t0
print("Load dataset time:  %0.3fs" % load_time)

df.head()

Load the dataset: Section
Load dataset time:  114.987s


Unnamed: 0,target,data
0,1,"horses; live, purebred breeding animals - pure..."
1,1,"horses; live, other than purebred breeding ani..."
2,1,asses; live - other
3,1,mules and hinnies; live- other
4,1,"cattle; live, purebred breeding animals - pure..."


In [5]:
len(df)

49137

In [6]:
df.tail(20)

Unnamed: 0,target,data
49117,15,aluminium extrusion bar ycbt604
49118,15,aluminium fitting txjbr625173
49119,15,aluminium window frame kb157112asaa
49120,16,automotive batteries wet charged model 75d23r
49121,16,a/s jdmkey fpcb_volume key_a107f_svcsm
49122,17,assy tire 11.224 rot
49123,15,aluminium extrusion bar ycrk601
49124,16,assy elementfuel
49125,15,aluminium window frame t02609abbb
49126,15,aluminium ornament for door cc0010dnje


In [7]:
#save the DataFrame
print("Save DataFrame")
import joblib
joblib.dump(df, 'train_section.pkl', compress=1)
print('success')

Save DataFrame
success


In [69]:
import joblib
print("Load DataFrame")
df = joblib.load('train_section.pkl')
df.head()

Load DataFrame


Unnamed: 0,target,data
0,1,"horses; live, purebred breeding animals - pure..."
1,1,"horses; live, other than purebred breeding ani..."
2,1,asses; live - other
3,1,mules and hinnies; live- other
4,1,"cattle; live, purebred breeding animals - pure..."


In [1]:


data_path = Path('..', 'data')
if not data_path.exists():
    data_path.mkdir(parents=True)

parquet_file = data_path / 'train.parquet'

df.to_parquet(parquet_file)
print('success')

NameError: name 'Path' is not defined

In [70]:
twenty_train = df.copy()

In [71]:
from collections import Counter
Counter(twenty_train["target"])

Counter({'01': 126,
         '02': 253,
         '03': 907,
         '04': 365,
         '05': 70,
         '06': 89,
         '07': 391,
         '08': 310,
         '09': 172,
         '10': 106,
         '11': 117,
         '12': 217,
         '13': 52,
         '14': 33,
         '15': 327,
         '16': 265,
         '17': 141,
         '18': 133,
         '19': 204,
         '20': 432,
         '21': 259,
         '22': 402,
         '23': 247,
         '24': 125,
         '25': 268,
         '26': 156,
         '27': 267,
         '28': 751,
         '29': 1940,
         '30': 297,
         '31': 90,
         '32': 338,
         '33': 477,
         '34': 257,
         '35': 117,
         '36': 49,
         '37': 198,
         '38': 531,
         '39': 1710,
         '40': 1158,
         '41': 225,
         '42': 204,
         '43': 54,
         '44': 602,
         '45': 38,
         '46': 105,
         '47': 71,
         '48': 682,
         '49': 148,
         '50': 45,
       

In [72]:
import re 
def clean_str(string):
    """
    Tokenization/string cleaning for dataset
    Every dataset is lower cased except
    """
    string = re.sub(r"\n", "", string)    
    string = re.sub(r"\r", "", string) 
    string = re.sub(r"[0-9]", "digit", string)
    string = re.sub(r"\'", "", string)    
    string = re.sub(r"\"", "", string)    
    return string.strip().lower()

In [76]:
print("train test split dataset")
#train test split
df = twenty_train.copy()
from sklearn.model_selection import train_test_split
X = []
for i in range(df.shape[0]):
    X.append(clean_str(df.iloc[i][1]))
y = np.array(df["target"])
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=20)

train test split dataset


In [77]:
X_train

['womens or girls slips and petticoats, not knitted or crocheted, of cotton',
 'woven cotton fabric, digitdigit% or more cotton by weight, plain weave, over digitdigitdigit but n/o digitdigitdigit g/mdigit, bleached, of number digitdigit or lower',
 'regulating or controlling instruments and apparatus; automatic, parts and accessories - of goods of subheading digitdigitdigitdigit.digitdigit.digitdigit',
 'absorber asm rr shk',
 'fish; edible offal, other than shark fins, fish heads, tails and maws',
 'automotive disc brake pads digitdigitdigitdigitdigitdigitdigitdigitdigit set tdbdigitdigitdigitdigit metab',
 'aluminium fabrication bar ejhagzdigitdigitdigitdigit',
 'stationery; letter clips, letter corners, paper clips, indexing tags and similar office articles, including parts, of base metal - other',
 'adigitdigitdigitdigitdigitdigitdigitdigit floor nozzle led lights',
 'aquatic plants',
 'accustar life bnkdigitdigitdigitdigitdigitdigitdigitdigitdigit digitdigitdigitdigitdigitdigitdi

In [75]:
y_train

array(['28', '73', '87', ..., '29', '16', '39'], dtype=object)

In [7]:
#feature engineering and model selection
from sklearn.svm import LinearSVC
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier


print("Training: ")

Training: 


In [8]:
#pipeline of feature engineering and model
t0 = time()
model = Pipeline([('vectorizer', CountVectorizer()),
    ('tfidf', TfidfTransformer()),
    ('clf', OneVsRestClassifier(LinearSVC(class_weight="balanced")))])

In [9]:
print("paramater selection")
#paramater selection
from sklearn.model_selection import GridSearchCV
parameters = {'vectorizer__ngram_range': [(1, 1), (1, 2),(2,2)],
               'tfidf__use_idf': (True, False)}

paramater selection


In [10]:
gs_clf_svm = GridSearchCV(model, parameters, n_jobs=-1)
gs_clf_svm = gs_clf_svm.fit(X, y)
print(gs_clf_svm.best_score_)
print(gs_clf_svm.best_params_)



0.8787852808408421
{'tfidf__use_idf': True, 'vectorizer__ngram_range': (1, 2)}


In [11]:
#preparing the final pipeline using the selected parameters
print("preparing the final pipeline using the selected parameters")
model = Pipeline([('vectorizer', CountVectorizer(ngram_range=(1,2))),
    ('tfidf', TfidfTransformer(use_idf=True)),
    ('clf', OneVsRestClassifier(LinearSVC(class_weight="balanced")))])

preparing the final pipeline using the selected parameters


In [12]:
#fit model with training data
print("fit model with training data")
model.fit(X_train, y_train)
train_time = time() - t0
print("train time: %0.3fs" % train_time)

fit model with training data




train time: 1009.945s


In [13]:
#evaluation on test data
t0 = time()
pred = model.predict(X_test)
test_time = time() - t0
print("test time:  %0.3fs" % test_time)

test time:  1.653s


In [14]:
model.classes_

array(['01', '02', '03', '04', '05', '06', '07', '08', '09', '10', '11',
       '12', '13', '14', '15', '16', '17', '18', '19', '20', '21'],
      dtype='<U2')

In [15]:
from sklearn.metrics import confusion_matrix, accuracy_score
confusion_matrix(pred, y_test)

array([[ 519,    0,    0,    1,    0,    3,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0],
       [   0,  463,    0,    5,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    1,    0,    0,    0],
       [   0,    0,   93,    0,    0,    2,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0],
       [   0,    0,    0,  593,    0,   11,    3,    0,    1,    1,    1,
           0,    0,    1,    6,    2,    1,    3,    0,    2,    0],
       [   0,    0,    0,    1,  195,    3,    1,    0,    0,    0,    0,
           0,    1,    0,    1,    0,    1,    0,    0,    1,    0],
       [   1,    1,    1,   15,    6, 1469,    9,    0,    0,    3,    3,
           1,    0,    2,    4,   16,    3,    3,    0,    4,    0],
       [   0,    0,    0,    2,    0,    6,  689,    1,    0,    0,    3,
           0,    2,    1,    8,   21,   43,    4,    0,    8,    0],

In [16]:
accuracy_score(y_test, pred)

0.9310812644145977

In [17]:
from sklearn.metrics import classification_report
print (classification_report(y_test, pred))

              precision    recall  f1-score   support

          01       0.99      0.99      0.99       523
          02       0.99      0.99      0.99       467
          03       0.98      0.99      0.98        94
          04       0.95      0.95      0.95       624
          05       0.96      0.96      0.96       204
          06       0.95      0.94      0.95      1559
          07       0.87      0.81      0.84       849
          08       0.93      0.95      0.94       148
          09       0.96      0.98      0.97       224
          10       0.96      0.94      0.95       279
          11       0.99      0.98      0.98      1237
          12       0.98      0.92      0.95       135
          13       0.91      0.95      0.93       291
          14       0.89      0.89      0.89       185
          15       0.97      0.95      0.96      3121
          16       0.88      0.93      0.91      2525
          17       0.85      0.89      0.87      1481
          18       0.93    

### save the model

In [18]:
#save the model
print("Save Model")
import joblib
joblib.dump(model, 'model_section.pkl', compress=1)

Save Model


['model_section.pkl']

### load Model

In [39]:
import joblib
print("Load Model")
model = joblib.load('model_section.pkl')

Load Model


In [40]:
products = input()  # Video game consoles : 20

Video game consoles


In [41]:
model.predict([products])[0]

'20'

## Chapter Model

In [12]:
#types = "section"
types = "chapter"

print("Load the dataset : chapter")
t0 = time()


sheets = '8_digit'
eights = get_master(sheets,types)
sheets = '6_digit'
sixs = get_master(sheets,types)
sheets = '4_digit'
fours = get_master(sheets,types)
sheets = '2_digit'
twos = get_master(sheets,types)

sheets = 'test_01'
tests = get_master(sheets,types)

sheets = 'Declaration_2019_10'
decl = get_master(sheets,types)

data = pd.concat([eights,sixs,fours,twos,tests,decl], ignore_index=True)
twenty_train = manage_data(data)



#twenty_test = manage_data(tests)

load_time = time() - t0
print("Load dataset time:  %0.3fs" % load_time)
print(len(twenty_train))
twenty_train.head()

Load the dataset : chapter
Load dataset time:  116.764s
49137


Unnamed: 0,target,data
0,1,"horses; live, purebred breeding animals - pure..."
1,1,"horses; live, other than purebred breeding ani..."
2,1,asses; live - other
3,1,mules and hinnies; live- other
4,1,"cattle; live, purebred breeding animals - pure..."


In [13]:
from collections import Counter
Counter(twenty_train["target"])

Counter({'01': 126,
         '02': 253,
         '03': 907,
         '04': 365,
         '05': 70,
         '06': 89,
         '07': 391,
         '08': 310,
         '09': 172,
         '10': 106,
         '11': 117,
         '12': 217,
         '13': 52,
         '14': 33,
         '15': 327,
         '16': 265,
         '17': 141,
         '18': 133,
         '19': 204,
         '20': 432,
         '21': 259,
         '22': 402,
         '23': 247,
         '24': 125,
         '25': 268,
         '26': 156,
         '27': 267,
         '28': 751,
         '29': 1940,
         '30': 297,
         '31': 90,
         '32': 338,
         '33': 477,
         '34': 257,
         '35': 117,
         '36': 49,
         '37': 198,
         '38': 531,
         '39': 1710,
         '40': 1158,
         '41': 225,
         '42': 204,
         '43': 54,
         '44': 602,
         '45': 38,
         '46': 105,
         '47': 71,
         '48': 682,
         '49': 148,
         '50': 45,
       

In [14]:
#save the DataFrame
print("Save DataFrame")
import joblib
joblib.dump(twenty_train, 'train_chapter.pkl', compress=1)
print('success')

Save DataFrame
success


In [5]:
import re 
def clean_str(string):
    """
    Tokenization/string cleaning for dataset
    Every dataset is lower cased except
    """
    string = re.sub(r"\n", "", string)    
    string = re.sub(r"\r", "", string) 
    string = re.sub(r"[0-9]", "digit", string)
    string = re.sub(r"\'", "", string)    
    string = re.sub(r"\"", "", string)    
    return string.strip().lower()

In [6]:
#train test split
print("rain test split")
df = twenty_train.copy()
from sklearn.model_selection import train_test_split
X = []
for i in range(df.shape[0]):
    X.append(clean_str(df.iloc[i][1]))
y = np.array(df["target"])
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=5)

rain test split


In [7]:
#feature engineering and model selection
print("feature engineering and model selection")
from sklearn.svm import LinearSVC
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier

feature engineering and model selection


In [8]:
#pipeline of feature engineering and model
print("pipeline of feature engineering and model")
model = Pipeline([('vectorizer', CountVectorizer()),
    ('tfidf', TfidfTransformer()),
    ('clf', OneVsRestClassifier(LinearSVC(class_weight="balanced")))])

pipeline of feature engineering and model


In [9]:
#paramater selection
print("paramater selection")
from sklearn.model_selection import GridSearchCV

parameters = {'vectorizer__ngram_range': [(1, 1), (1, 2),(2,2)],
               'tfidf__use_idf': (True, False)}

paramater selection


In [10]:
print("Training: ")
   
t0 = time()
gs_clf_svm = GridSearchCV(model, parameters, n_jobs=-1)
gs_clf_svm = gs_clf_svm.fit(X, y)
print(gs_clf_svm.best_score_)
print(gs_clf_svm.best_params_)

Training: 




0.8671036298745671
{'tfidf__use_idf': True, 'vectorizer__ngram_range': (1, 2)}


In [11]:
#preparing the final pipeline using the selected parameters
model = Pipeline([('vectorizer', CountVectorizer(ngram_range=(1,2))),
    ('tfidf', TfidfTransformer(use_idf=True)),
    ('clf', OneVsRestClassifier(LinearSVC(class_weight="balanced")))])

In [12]:
#fit model with training data
model.fit(X_train, y_train)
train_time = time() - t0
print("train time: %0.3fs" % train_time)



train time: 3068.481s


In [13]:
#evaluation on test data
t0 = time()
pred = model.predict(X_test)

test_time = time() - t0
print("test time:  %0.3fs" % test_time)

test time:  0.684s


In [14]:
model.classes_

array(['01', '02', '03', '04', '05', '06', '07', '08', '09', '10', '11',
       '12', '13', '14', '15', '16', '17', '18', '19', '20', '21', '22',
       '23', '24', '25', '26', '27', '28', '29', '30', '31', '32', '33',
       '34', '35', '36', '37', '38', '39', '40', '41', '42', '43', '44',
       '45', '46', '47', '48', '49', '50', '51', '52', '53', '54', '55',
       '56', '57', '58', '59', '60', '61', '62', '63', '64', '65', '66',
       '67', '68', '69', '70', '71', '72', '73', '74', '75', '76', '78',
       '79', '80', '81', '82', '83', '84', '85', '86', '87', '88', '89',
       '90', '91', '92', '93', '94', '95', '96', '97'], dtype='<U2')

In [15]:
from sklearn.metrics import confusion_matrix, accuracy_score
confusion_matrix(pred, y_test)

array([[ 37,   0,   2, ...,   0,   0,   0],
       [  0,  72,   0, ...,   0,   0,   0],
       [  0,   0, 280, ...,   0,   0,   0],
       ...,
       [  0,   0,   0, ...,  49,   0,   0],
       [  0,   0,   0, ...,   0, 103,   0],
       [  0,   0,   0, ...,   0,   0,  10]], dtype=int64)

In [16]:
print(accuracy_score(y_test, pred))



0.9134445801112467


In [17]:
from sklearn.metrics import classification_report
print (classification_report(y_test, pred))

              precision    recall  f1-score   support

          01       0.95      1.00      0.97        37
          02       1.00      1.00      1.00        72
          03       0.99      0.99      0.99       282
          04       0.97      0.99      0.98       115
          05       1.00      0.88      0.94        17
          06       1.00      1.00      1.00        25
          07       0.97      0.97      0.97       129
          08       0.99      0.94      0.96        95
          09       0.95      1.00      0.98        59
          10       1.00      0.94      0.97        31
          11       0.89      1.00      0.94        32
          12       0.99      0.97      0.98        69
          13       0.89      0.94      0.92        18
          14       0.90      1.00      0.95         9
          15       0.98      1.00      0.99        94
          16       1.00      0.97      0.99        71
          17       0.92      0.89      0.91        38
          18       0.95    

### save the model

In [18]:
#save the model
print("Save Model")
import joblib
joblib.dump(model, 'model_chapter.pkl', compress=1)

Save Model


['model_chapter.pkl']

### load Model

In [19]:
import joblib
print("Load Model")
model = joblib.load('model_chapter.pkl')

Load Model


In [21]:
products = input()  # Video game consoles : 95

Video game consoles


In [22]:
model.predict([products])[0]

'95'