A Production ready Multi-Class Text Classifier
==

- [Reference :](https://towardsdatascience.com/a-production-ready-multi-class-text-classifier-96490408757)

In [5]:
import pandas as pd
import numpy as np
from time import time
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.svm import LinearSVC
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.multiclass import OneVsRestClassifier

import joblib
pd.set_option('display.max_rows', 20, 
              'display.max_columns', 100)

#urls = 'https://github.com/dragon-library/work_space/raw/main/HS_Code/HS/hs_code.xlsx'
urls = 'data/hs_code.xlsx'
types = 'section'
#types = "chapter"

def get_master(sheets,types = 'section'):
    data = pd.read_excel(urls,sheet_name= sheets)
    data[types] = data[types].map('{:02}'.format)
    data = data[[types,'description']]
    data['description'] = data['description'].str.lower()
 #   data = data.rename(columns={'heading' : 'target', 'product_desc' : 'question_text'})
     

    return data

def manage_data(df):
    df.columns = ['target', 'data']   
    
    
    return df


In [3]:
types = "section"
#types = "chapter"

print("Load the dataset: Section")
t0 = time()

sheets = '8_digit'
eights = get_master(sheets,types)
sheets = '6_digit'
sixs = get_master(sheets,types)
sheets = '4_digit'
fours = get_master(sheets,types)
sheets = '2_digit'
twos = get_master(sheets,types)

sheets = 'test_01'
tests = get_master(sheets,types)

sheets = 'Declaration_2019_10'
decl = get_master(sheets,types)

data = pd.concat([eights,sixs,fours,twos,tests,decl], ignore_index=True)
twenty_train = manage_data(data)


#twenty_test = manage_data(tests)
load_time = time() - t0
print("Load dataset time:  %0.3fs" % load_time)

twenty_train.head()

Load the dataset: Section
Load dataset time:  132.963s


Unnamed: 0,target,data
0,1,"horses; live, purebred breeding animals - pure..."
1,1,"horses; live, other than purebred breeding ani..."
2,1,asses; live - other
3,1,mules and hinnies; live- other
4,1,"cattle; live, purebred breeding animals - pure..."


In [3]:
len(twenty_train)

28910

In [4]:
from collections import Counter
Counter(twenty_train["target"])

Counter({'01': 1721,
         '02': 1471,
         '03': 314,
         '04': 1632,
         '05': 639,
         '06': 4113,
         '07': 1227,
         '08': 414,
         '09': 720,
         '10': 759,
         '11': 3838,
         '12': 360,
         '13': 785,
         '14': 285,
         '15': 2802,
         '16': 4524,
         '17': 1279,
         '18': 1145,
         '19': 89,
         '20': 763,
         '21': 29,
         '00': 1})

In [5]:
import re 
def clean_str(string):
    """
    Tokenization/string cleaning for dataset
    Every dataset is lower cased except
    """
    string = re.sub(r"\n", "", string)    
    string = re.sub(r"\r", "", string) 
    string = re.sub(r"[0-9]", "digit", string)
    string = re.sub(r"\'", "", string)    
    string = re.sub(r"\"", "", string)    
    return string.strip().lower()

In [6]:
print("train test split dataset")
#train test split
df = twenty_train.copy()
from sklearn.model_selection import train_test_split
X = []
for i in range(df.shape[0]):
    X.append(clean_str(df.iloc[i][1]))
y = np.array(df["target"])
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=5)

train test split dataset


In [7]:
#feature engineering and model selection
from sklearn.svm import LinearSVC
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier


print("Training: ")

Training: 


In [8]:
#pipeline of feature engineering and model
t0 = time()
model = Pipeline([('vectorizer', CountVectorizer()),
    ('tfidf', TfidfTransformer()),
    ('clf', OneVsRestClassifier(LinearSVC(class_weight="balanced")))])

In [9]:
print("paramater selection")
#paramater selection
from sklearn.model_selection import GridSearchCV
parameters = {'vectorizer__ngram_range': [(1, 1), (1, 2),(2,2)],
               'tfidf__use_idf': (True, False)}

paramater selection


In [10]:
gs_clf_svm = GridSearchCV(model, parameters, n_jobs=-1)
gs_clf_svm = gs_clf_svm.fit(X, y)
print(gs_clf_svm.best_score_)
print(gs_clf_svm.best_params_)



0.961604980975441
{'tfidf__use_idf': True, 'vectorizer__ngram_range': (1, 2)}


In [11]:
#preparing the final pipeline using the selected parameters
print("preparing the final pipeline using the selected parameters")
model = Pipeline([('vectorizer', CountVectorizer(ngram_range=(1,2))),
    ('tfidf', TfidfTransformer(use_idf=True)),
    ('clf', OneVsRestClassifier(LinearSVC(class_weight="balanced")))])

preparing the final pipeline using the selected parameters


In [12]:
#fit model with training data
print("fit model with training data")
model.fit(X_train, y_train)
train_time = time() - t0
print("train time: %0.3fs" % train_time)

fit model with training data




train time: 208.141s


In [13]:
#evaluation on test data
t0 = time()
pred = model.predict(X_test)
test_time = time() - t0
print("test time:  %0.3fs" % test_time)

test time:  0.467s


In [14]:
model.classes_

array(['01', '02', '03', '04', '05', '06', '07', '08', '09', '10', '11',
       '12', '13', '14', '15', '16', '17', '18', '19', '20', '21'],
      dtype='<U2')

In [15]:
from sklearn.metrics import confusion_matrix, accuracy_score
confusion_matrix(pred, y_test)

array([[   0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0],
       [   0,  521,    1,    0,    1,    0,    3,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0],
       [   0,    0,  417,    0,    1,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    1,    0,    0,    0,    0,    0],
       [   0,    0,    0,   85,    0,    0,    1,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0],
       [   0,    1,    0,    0,  464,    0,    3,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0],
       [   0,    0,    0,    0,    0,  191,    6,    0,    0,    0,    0,
           0,    0,    1,    0,    0,    0,    0,    0,    0,    0,    0],
       [   1,    1,    3,    1,    3,    1, 1253,    5,    0,    0,    1,
           0,    0,    0,    0, 

In [16]:
accuracy_score(y_test, pred)

0.9890464660440448

In [17]:
from sklearn.metrics import classification_report
print (classification_report(y_test, pred))

              precision    recall  f1-score   support

          00       0.00      0.00      0.00         1
          01       0.99      0.99      0.99       524
          02       1.00      0.99      0.99       421
          03       0.99      0.99      0.99        86
          04       0.99      0.99      0.99       469
          05       0.96      0.98      0.97       194
          06       0.98      0.98      0.98      1277
          07       0.99      0.98      0.99       350
          08       1.00      1.00      1.00       122
          09       0.98      1.00      0.99       222
          10       1.00      1.00      1.00       217
          11       0.99      1.00      1.00      1140
          12       0.98      0.99      0.99       114
          13       0.99      0.98      0.98       241
          14       1.00      0.99      0.99        78
          15       0.99      0.99      0.99       862
          16       0.99      0.99      0.99      1351
          17       0.99    

  _warn_prf(average, modifier, msg_start, len(result))


### save the model

In [18]:
#save the model
print("Save Model")
import joblib
joblib.dump(model, 'model_section.pkl', compress=1)

Save Model


['model_section.pkl']

### load Model

In [6]:
import joblib
print("Load Model")
df = joblib.load('data/train/df_master_train.pkl')
print("download data...") 
df = df[[types, 'description']]   
df.columns = ['target', 'data']
n = df.isnull().sum()
print("missing values : ", n )
df.dropna(inplace=True)
df['target'] = df['target'].apply(int)
print("Load Data.. "+types+" success")
        

Load Model


FileNotFoundError: [Errno 2] No such file or directory: 'data/train/df_master_train.pkl'

In [40]:
products = input()  # Video game consoles : 20

Video game consoles


In [41]:
model.predict([products])[0]

'20'

## Chapter Model

In [19]:
#types = "section"
types = "chapter"

print("Load the dataset : chapter")
t0 = time()
sheets = '8_digit'
eights = get_master(sheets,types)
sheets = '6_digit'
sixs = get_master(sheets,types)
sheets = '4_digit'
fours = get_master(sheets,types)
sheets = '2_digit'
twos = get_master(sheets,types)

sheets = 'test_01'
tests = get_master(sheets,types)

data = pd.concat([eights,sixs,fours,twos,tests], ignore_index=True)
twenty_train = manage_data(data)


#twenty_test = manage_data(tests)

load_time = time() - t0
print("Load dataset time:  %0.3fs" % load_time)

twenty_train.head()

Load the dataset : chapter
Load dataset time:  85.046s


Unnamed: 0,target,data
0,1,"horses; live, purebred breeding animals - pure..."
1,1,"horses; live, other than purebred breeding ani..."
2,1,asses; live - other
3,1,mules and hinnies; live- other
4,1,"cattle; live, purebred breeding animals - pure..."


In [20]:
len(twenty_train)

28910

In [21]:
from collections import Counter
Counter(twenty_train["target"])

Counter({'01': 126,
         '02': 253,
         '03': 907,
         '04': 365,
         '05': 70,
         '06': 89,
         '07': 391,
         '08': 310,
         '09': 172,
         '10': 100,
         '11': 115,
         '12': 212,
         '13': 49,
         '14': 33,
         '15': 314,
         '16': 259,
         '17': 129,
         '18': 117,
         '19': 170,
         '20': 372,
         '21': 194,
         '22': 180,
         '23': 108,
         '24': 103,
         '25': 265,
         '26': 148,
         '27': 226,
         '28': 715,
         '29': 1906,
         '30': 214,
         '31': 85,
         '32': 250,
         '33': 127,
         '34': 131,
         '35': 68,
         '36': 48,
         '37': 127,
         '38': 442,
         '39': 771,
         '40': 456,
         '41': 209,
         '42': 151,
         '43': 54,
         '44': 582,
         '45': 38,
         '46': 100,
         '47': 71,
         '48': 595,
         '49': 93,
         '50': 45,
         '5

In [22]:
import re 
def clean_str(string):
    """
    Tokenization/string cleaning for dataset
    Every dataset is lower cased except
    """
    string = re.sub(r"\n", "", string)    
    string = re.sub(r"\r", "", string) 
    string = re.sub(r"[0-9]", "digit", string)
    string = re.sub(r"\'", "", string)    
    string = re.sub(r"\"", "", string)    
    return string.strip().lower()

In [23]:
#train test split
print("rain test split")
df = twenty_train.copy()
from sklearn.model_selection import train_test_split
X = []
for i in range(df.shape[0]):
    X.append(clean_str(df.iloc[i][1]))
y = np.array(df["target"])
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=5)

rain test split


In [24]:
#feature engineering and model selection
print("feature engineering and model selection")
from sklearn.svm import LinearSVC
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier

feature engineering and model selection


In [25]:
#pipeline of feature engineering and model
print("pipeline of feature engineering and model")
model = Pipeline([('vectorizer', CountVectorizer()),
    ('tfidf', TfidfTransformer()),
    ('clf', OneVsRestClassifier(LinearSVC(class_weight="balanced")))])

pipeline of feature engineering and model


In [26]:
#paramater selection
print("paramater selection")
from sklearn.model_selection import GridSearchCV

parameters = {'vectorizer__ngram_range': [(1, 1), (1, 2),(2,2)],
               'tfidf__use_idf': (True, False)}

paramater selection


In [27]:
print("Training: ")
   
t0 = time()
gs_clf_svm = GridSearchCV(model, parameters, n_jobs=-1)
gs_clf_svm = gs_clf_svm.fit(X, y)
print(gs_clf_svm.best_score_)
print(gs_clf_svm.best_params_)

Training: 




0.9574195780006919
{'tfidf__use_idf': True, 'vectorizer__ngram_range': (1, 2)}


In [28]:
#preparing the final pipeline using the selected parameters
model = Pipeline([('vectorizer', CountVectorizer(ngram_range=(1,2))),
    ('tfidf', TfidfTransformer(use_idf=True)),
    ('clf', OneVsRestClassifier(LinearSVC(class_weight="balanced")))])

In [29]:
#fit model with training data
model.fit(X_train, y_train)
train_time = time() - t0
print("train time: %0.3fs" % train_time)



train time: 976.911s


In [30]:
#evaluation on test data
t0 = time()
pred = model.predict(X_test)

test_time = time() - t0
print("test time:  %0.3fs" % test_time)

test time:  0.513s


In [31]:
model.classes_

array(['01', '02', '03', '04', '05', '06', '07', '08', '09', '10', '11',
       '12', '13', '14', '15', '16', '17', '18', '19', '20', '21', '22',
       '23', '24', '25', '26', '27', '28', '29', '30', '31', '32', '33',
       '34', '35', '36', '37', '38', '39', '40', '41', '42', '43', '44',
       '45', '46', '47', '48', '49', '50', '51', '52', '53', '54', '55',
       '56', '57', '58', '59', '60', '61', '62', '63', '64', '65', '66',
       '67', '68', '69', '70', '71', '72', '73', '74', '75', '76', '78',
       '79', '80', '81', '82', '83', '84', '85', '86', '87', '88', '89',
       '90', '91', '92', '93', '94', '95', '96', '97'], dtype='<U2')

In [32]:
from sklearn.metrics import confusion_matrix, accuracy_score
confusion_matrix(pred, y_test)

array([[ 38,   0,   1, ...,   0,   0,   0],
       [  0,  81,   0, ...,   0,   0,   0],
       [  0,   0, 277, ...,   0,   0,   0],
       ...,
       [  0,   0,   0, ..., 101,   0,   0],
       [  0,   0,   0, ...,   0,   6,   0],
       [  0,   0,   0, ...,   0,   0,   0]], dtype=int64)

In [33]:
print(accuracy_score(y_test, pred))



0.9823590453130405


In [34]:
from sklearn.metrics import classification_report
print (classification_report(y_test, pred))

              precision    recall  f1-score   support

          01       0.95      1.00      0.97        38
          02       1.00      1.00      1.00        81
          03       1.00      0.99      1.00       279
          04       0.96      1.00      0.98       107
          05       0.94      0.89      0.92        19
          06       1.00      0.96      0.98        25
          07       0.98      0.98      0.98       126
          08       0.96      0.96      0.96        81
          09       0.92      0.94      0.93        51
          10       1.00      1.00      1.00        23
          11       0.97      0.97      0.97        36
          12       1.00      1.00      1.00        59
          13       1.00      1.00      1.00        13
          14       1.00      1.00      1.00         7
          15       0.99      0.98      0.98        86
          16       0.99      1.00      0.99        72
          17       1.00      1.00      1.00        36
          18       1.00    

  _warn_prf(average, modifier, msg_start, len(result))


### save the model

In [35]:
#save the model
print("Save Model")
import joblib
joblib.dump(model, 'model_chapter.pkl', compress=1)

Save Model


['model_chapter.pkl']

### load Model

In [36]:
import joblib
print("Load Model")
model = joblib.load('model_chapter.pkl')

Load Model


In [37]:
products = input()  # Video game consoles : 95

Video game consoles


In [38]:
model.predict([products])[0]

'95'