A Production ready Multi-Class Text Classifier
==

- [Reference :](https://towardsdatascience.com/a-production-ready-multi-class-text-classifier-96490408757)

In [3]:
import warnings
warnings.filterwarnings('ignore')
%matplotlib inline

from pathlib import Path
import numpy as np
import pandas as pd
from time import time
# Visualization
import matplotlib.pyplot as plt
import seaborn as sns
import joblib



from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.svm import LinearSVC
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.multiclass import OneVsRestClassifier


#urls = 'https://github.com/dragon-library/work_space/raw/main/HS_Code/HS/hs_code.xlsx'
urls = 'data/hs_code.xlsx'
types = 'heading'
#types = "chapter"

def get_master(sheets,types = 'section'):
    data = pd.read_excel(urls,sheet_name= sheets)
    #data[types] = data[types].map('{:04}'.format)
    data = data[[types,'description']]
    data['description'] = data['description'].str.lower()
 #   data = data.rename(columns={'heading' : 'target', 'product_desc' : 'question_text'})
     

    return data

def manage_data(df):
    df.columns =  ['target', 'data'] 
    
    
    return df



In [4]:
types = "heading"
#types = "chapter"

print("Load the dataset: Section")
t0 = time()

sheets = '8_digit'
eights = get_master(sheets,types)
sheets = '6_digit'
sixs = get_master(sheets,types)
sheets = '4_digit'
fours = get_master(sheets,types)
#sheets = '2_digit'
#twos = get_master(sheets,types)

sheets = 'test_01'
tests = get_master(sheets,types)

sheets = 'Declaration_2019_10'
decl = get_master(sheets,types)

data = pd.concat([eights,sixs,fours,tests,decl], ignore_index=True)
df = manage_data(data)


#twenty_test = manage_data(tests)
load_time = time() - t0
print("Load dataset time:  %0.3fs" % load_time)

df.head()

Load the dataset: Section
Load dataset time:  90.864s


Unnamed: 0,target,data
0,101,"horses; live, purebred breeding animals - pure..."
1,101,"horses; live, other than purebred breeding ani..."
2,101,asses; live - other
3,101,mules and hinnies; live- other
4,102,"cattle; live, purebred breeding animals - pure..."


In [5]:
len(df)

49041

In [6]:
df.tail(20)

Unnamed: 0,target,data
49021,7604,aluminium extrusion bar ycbt604
49022,7610,aluminium fitting txjbr625173
49023,7610,aluminium window frame kb157112asaa
49024,8506,automotive batteries wet charged model 75d23r
49025,8517,a/s jdmkey fpcb_volume key_a107f_svcsm
49026,8708,assy tire 11.224 rot
49027,7604,aluminium extrusion bar ycrk601
49028,8421,assy elementfuel
49029,7610,aluminium window frame t02609abbb
49030,7610,aluminium ornament for door cc0010dnje


In [35]:
df['target'] = df['target'].apply(str)
df

Unnamed: 0,target,data
0,0101,"horses; live, purebred breeding animals - pure..."
1,0101,"horses; live, other than purebred breeding ani..."
2,0101,asses; live - other
3,0101,mules and hinnies; live- other
4,0102,"cattle; live, purebred breeding animals - pure..."
...,...,...
49036,8302,adjust gear set jfkz658b
49037,7610,aluminium window frame kf057072avat
49038,3926,acrylic plate 4mm. size24x24 cm.
49039,8544,ac power cord p/n 141102240p6


In [36]:
#save the DataFrame
print("Save DataFrame")
import joblib
joblib.dump(df, 'train_4_digit.pkl', compress=1)
print('success')

Save DataFrame
success


In [69]:
import joblib
print("Load DataFrame")
df = joblib.load("train_4_digit.pkl")
df.head()

Load DataFrame


Unnamed: 0,target,data
0,1,"horses; live, purebred breeding animals - pure..."
1,1,"horses; live, other than purebred breeding ani..."
2,1,asses; live - other
3,1,mules and hinnies; live- other
4,1,"cattle; live, purebred breeding animals - pure..."


In [37]:
twenty_train = df.copy()

In [38]:
from collections import Counter
Counter(twenty_train["target"])

Counter({'0101': 9,
         '0102': 14,
         '0103': 7,
         '0104': 7,
         '0105': 24,
         '0106': 27,
         '0201': 7,
         '0202': 7,
         '0203': 13,
         '0204': 19,
         '0206': 19,
         '0207': 41,
         '0208': 10,
         '0209': 5,
         '0210': 20,
         '0301': 39,
         '0302': 104,
         '0303': 105,
         '0304': 95,
         '0305': 59,
         '0306': 82,
         '0307': 82,
         '0308': 29,
         '0401': 14,
         '0402': 20,
         '0403': 9,
         '0404': 6,
         '0405': 10,
         '0406': 13,
         '0407': 18,
         '0408': 9,
         '0409': 3,
         '0410': 4,
         '0501': 3,
         '0502': 5,
         '0504': 3,
         '0505': 7,
         '0506': 5,
         '0507': 5,
         '0508': 4,
         '0510': 3,
         '0511': 13,
         '0601': 7,
         '0602': 18,
         '0603': 15,
         '0604': 7,
         '0701': 6,
         '0702': 3,
         '070

In [39]:
import re 
def clean_str(string):
    """
    Tokenization/string cleaning for dataset
    Every dataset is lower cased except
    """
    string = re.sub(r"\n", "", string)    
    string = re.sub(r"\r", "", string) 
    string = re.sub(r"[0-9]", "digit", string)
    string = re.sub(r"\'", "", string)    
    string = re.sub(r"\"", "", string)    
    string = re.sub(r"<", "", string)  
    return string.strip().lower()

In [40]:
print("train test split dataset")
#train test split
df = twenty_train.copy()
from sklearn.model_selection import train_test_split
X = []
for i in range(df.shape[0]):
    X.append(clean_str(df.iloc[i][1]))
y = np.array(df["target"])
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=20)

train test split dataset


In [43]:
#feature engineering and model selection
from sklearn.svm import LinearSVC
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier


print("Training: ")

Training: 


In [44]:
#pipeline of feature engineering and model
t0 = time()
model = Pipeline([('vectorizer', CountVectorizer()),
    ('tfidf', TfidfTransformer()),
    ('clf', OneVsRestClassifier(LinearSVC(class_weight="balanced")))])

In [45]:
print("paramater selection")
#paramater selection
from sklearn.model_selection import GridSearchCV
parameters = {'vectorizer__ngram_range': [(1, 1), (1, 2),(2,2)],
               'tfidf__use_idf': (True, False)}

paramater selection


In [46]:
gs_clf_svm = GridSearchCV(model, parameters, n_jobs=-1)
gs_clf_svm = gs_clf_svm.fit(X, y)
print(gs_clf_svm.best_score_)
print(gs_clf_svm.best_params_)

KeyboardInterrupt: 

In [None]:
#preparing the final pipeline using the selected parameters
print("preparing the final pipeline using the selected parameters")
model = Pipeline([('vectorizer', CountVectorizer(ngram_range=(1,2))),
    ('tfidf', TfidfTransformer(use_idf=True)),
    ('clf', OneVsRestClassifier(LinearSVC(class_weight="balanced")))])

In [None]:
#fit model with training data
print("fit model with training data")
model.fit(X_train, y_train)
train_time = time() - t0
print("train time: %0.3fs" % train_time)

In [None]:
#evaluation on test data
t0 = time()
pred = model.predict(X_test)
test_time = time() - t0
print("test time:  %0.3fs" % test_time)

In [None]:
model.classes_

In [None]:
from sklearn.metrics import confusion_matrix, accuracy_score
confusion_matrix(pred, y_test)

In [None]:
accuracy_score(y_test, pred)

In [None]:
from sklearn.metrics import classification_report
print (classification_report(y_test, pred))

### save the model

In [18]:
#save the model
print("Save Model")
import joblib
joblib.dump(model, 'model_4_digit.pkl', compress=1)

Save Model


['model_section.pkl']

### load Model

In [39]:
import joblib
print("Load Model")
model = joblib.load('model_section.pkl')

Load Model


In [40]:
products = input()  # Video game consoles : 20

Video game consoles


In [41]:
model.predict([products])[0]

'20'

## Chapter Model

In [12]:
#types = "section"
types = "chapter"

print("Load the dataset : chapter")
t0 = time()


sheets = '8_digit'
eights = get_master(sheets,types)
sheets = '6_digit'
sixs = get_master(sheets,types)
sheets = '4_digit'
fours = get_master(sheets,types)
sheets = '2_digit'
twos = get_master(sheets,types)

sheets = 'test_01'
tests = get_master(sheets,types)

sheets = 'Declaration_2019_10'
decl = get_master(sheets,types)

data = pd.concat([eights,sixs,fours,twos,tests,decl], ignore_index=True)
twenty_train = manage_data(data)



#twenty_test = manage_data(tests)

load_time = time() - t0
print("Load dataset time:  %0.3fs" % load_time)
print(len(twenty_train))
twenty_train.head()

Load the dataset : chapter
Load dataset time:  116.764s
49137


Unnamed: 0,target,data
0,1,"horses; live, purebred breeding animals - pure..."
1,1,"horses; live, other than purebred breeding ani..."
2,1,asses; live - other
3,1,mules and hinnies; live- other
4,1,"cattle; live, purebred breeding animals - pure..."


In [13]:
from collections import Counter
Counter(twenty_train["target"])

Counter({'01': 126,
         '02': 253,
         '03': 907,
         '04': 365,
         '05': 70,
         '06': 89,
         '07': 391,
         '08': 310,
         '09': 172,
         '10': 106,
         '11': 117,
         '12': 217,
         '13': 52,
         '14': 33,
         '15': 327,
         '16': 265,
         '17': 141,
         '18': 133,
         '19': 204,
         '20': 432,
         '21': 259,
         '22': 402,
         '23': 247,
         '24': 125,
         '25': 268,
         '26': 156,
         '27': 267,
         '28': 751,
         '29': 1940,
         '30': 297,
         '31': 90,
         '32': 338,
         '33': 477,
         '34': 257,
         '35': 117,
         '36': 49,
         '37': 198,
         '38': 531,
         '39': 1710,
         '40': 1158,
         '41': 225,
         '42': 204,
         '43': 54,
         '44': 602,
         '45': 38,
         '46': 105,
         '47': 71,
         '48': 682,
         '49': 148,
         '50': 45,
       

In [14]:
#save the DataFrame
print("Save DataFrame")
import joblib
joblib.dump(twenty_train, 'train_chapter.pkl', compress=1)
print('success')

Save DataFrame
success


In [5]:
import re 
def clean_str(string):
    """
    Tokenization/string cleaning for dataset
    Every dataset is lower cased except
    """
    string = re.sub(r"\n", "", string)    
    string = re.sub(r"\r", "", string) 
    string = re.sub(r"[0-9]", "digit", string)
    string = re.sub(r"\'", "", string)    
    string = re.sub(r"\"", "", string)    
    return string.strip().lower()

In [6]:
#train test split
print("rain test split")
df = twenty_train.copy()
from sklearn.model_selection import train_test_split
X = []
for i in range(df.shape[0]):
    X.append(clean_str(df.iloc[i][1]))
y = np.array(df["target"])
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=5)

rain test split


In [7]:
#feature engineering and model selection
print("feature engineering and model selection")
from sklearn.svm import LinearSVC
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier

feature engineering and model selection


In [8]:
#pipeline of feature engineering and model
print("pipeline of feature engineering and model")
model = Pipeline([('vectorizer', CountVectorizer()),
    ('tfidf', TfidfTransformer()),
    ('clf', OneVsRestClassifier(LinearSVC(class_weight="balanced")))])

pipeline of feature engineering and model


In [9]:
#paramater selection
print("paramater selection")
from sklearn.model_selection import GridSearchCV

parameters = {'vectorizer__ngram_range': [(1, 1), (1, 2),(2,2)],
               'tfidf__use_idf': (True, False)}

paramater selection


In [10]:
print("Training: ")
   
t0 = time()
gs_clf_svm = GridSearchCV(model, parameters, n_jobs=-1)
gs_clf_svm = gs_clf_svm.fit(X, y)
print(gs_clf_svm.best_score_)
print(gs_clf_svm.best_params_)

Training: 




0.8671036298745671
{'tfidf__use_idf': True, 'vectorizer__ngram_range': (1, 2)}


In [11]:
#preparing the final pipeline using the selected parameters
model = Pipeline([('vectorizer', CountVectorizer(ngram_range=(1,2))),
    ('tfidf', TfidfTransformer(use_idf=True)),
    ('clf', OneVsRestClassifier(LinearSVC(class_weight="balanced")))])

In [12]:
#fit model with training data
model.fit(X_train, y_train)
train_time = time() - t0
print("train time: %0.3fs" % train_time)



train time: 3068.481s


In [13]:
#evaluation on test data
t0 = time()
pred = model.predict(X_test)

test_time = time() - t0
print("test time:  %0.3fs" % test_time)

test time:  0.684s


In [14]:
model.classes_

array(['01', '02', '03', '04', '05', '06', '07', '08', '09', '10', '11',
       '12', '13', '14', '15', '16', '17', '18', '19', '20', '21', '22',
       '23', '24', '25', '26', '27', '28', '29', '30', '31', '32', '33',
       '34', '35', '36', '37', '38', '39', '40', '41', '42', '43', '44',
       '45', '46', '47', '48', '49', '50', '51', '52', '53', '54', '55',
       '56', '57', '58', '59', '60', '61', '62', '63', '64', '65', '66',
       '67', '68', '69', '70', '71', '72', '73', '74', '75', '76', '78',
       '79', '80', '81', '82', '83', '84', '85', '86', '87', '88', '89',
       '90', '91', '92', '93', '94', '95', '96', '97'], dtype='<U2')

In [15]:
from sklearn.metrics import confusion_matrix, accuracy_score
confusion_matrix(pred, y_test)

array([[ 37,   0,   2, ...,   0,   0,   0],
       [  0,  72,   0, ...,   0,   0,   0],
       [  0,   0, 280, ...,   0,   0,   0],
       ...,
       [  0,   0,   0, ...,  49,   0,   0],
       [  0,   0,   0, ...,   0, 103,   0],
       [  0,   0,   0, ...,   0,   0,  10]], dtype=int64)

In [16]:
print(accuracy_score(y_test, pred))



0.9134445801112467


In [17]:
from sklearn.metrics import classification_report
print (classification_report(y_test, pred))

              precision    recall  f1-score   support

          01       0.95      1.00      0.97        37
          02       1.00      1.00      1.00        72
          03       0.99      0.99      0.99       282
          04       0.97      0.99      0.98       115
          05       1.00      0.88      0.94        17
          06       1.00      1.00      1.00        25
          07       0.97      0.97      0.97       129
          08       0.99      0.94      0.96        95
          09       0.95      1.00      0.98        59
          10       1.00      0.94      0.97        31
          11       0.89      1.00      0.94        32
          12       0.99      0.97      0.98        69
          13       0.89      0.94      0.92        18
          14       0.90      1.00      0.95         9
          15       0.98      1.00      0.99        94
          16       1.00      0.97      0.99        71
          17       0.92      0.89      0.91        38
          18       0.95    

### save the model

In [18]:
#save the model
print("Save Model")
import joblib
joblib.dump(model, 'model_chapter.pkl', compress=1)

Save Model


['model_chapter.pkl']

### load Model

In [19]:
import joblib
print("Load Model")
model = joblib.load('model_chapter.pkl')

Load Model


In [21]:
products = input()  # Video game consoles : 95

Video game consoles


In [22]:
model.predict([products])[0]

'95'