In [3]:
import pandas as pd
import numpy as np
from time import time
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.svm import LinearSVC
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.multiclass import OneVsRestClassifier

import joblib
pd.set_option('display.max_rows', 20, 
              'display.max_columns', 100)

In [4]:

#urls = 'https://github.com/dragon-library/work_space/raw/main/HS_Code/HS/hs_code.xlsx'
urls = 'data/hs_code.xlsx'
types = 'section'


def get_master(urls,sheets,types = 'section'):
    data = pd.read_excel(urls,sheet_name= sheets)
    data[types] = data[types].map('{:02}'.format)
    data = data[[types,'description']]
    data['description'] = data['description'].str.lower()
 #   data = data.rename(columns={'heading' : 'target', 'product_desc' : 'question_text'})
     

    return data

def manage_data(df):
    df.columns = ['target', 'description']   
    
    
    return df

In [5]:
%%time

urls = 'data/random_test.xlsx'
types = "section"

sheets = 'Sheet1'
data = get_master(urls,sheets,types)
#data.dropna(inplace=True)
declarations = data[0:20000]
declarations 

Wall time: 1min


Unnamed: 0,section,description
0,17,"face set,movable"
1,03,"rbd palm olein,packing:5 liters/bottle(=18.240..."
2,16,fuel injection device
3,17,"body(r),front door"
4,16,"cap, wiper arm head"
...,...,...
19995,06,degreasing powder
19996,06,colgate advanced whitening toothpaste 160 g.(l...
19997,14,offspring ring 433 silver 3
19998,03,used cooking oil


In [6]:
%%time
urls = 'data/hs_code.xlsx'
types = "section"

sheets = '8_digit'
inputs_01 = get_master(urls,sheets,types)
sheets = '2_digit'
inputs_02 = get_master(urls,sheets,types)
sheets = '6_digit'
inputs_03 = get_master(urls,sheets,types)
sheets = '4_digit'
inputs_04 = get_master(urls,sheets,types)

data = pd.concat([inputs_01,inputs_02,inputs_03,inputs_04,declarations], ignore_index=True)
data = manage_data(data)

data

Wall time: 1min 27s


Unnamed: 0,target,description
0,01,"horses; live, purebred breeding animals - pure..."
1,01,"horses; live, other than purebred breeding ani..."
2,01,asses; live - other
3,01,mules and hinnies; live- other
4,01,"cattle; live, purebred breeding animals - pure..."
...,...,...
37214,06,degreasing powder
37215,06,colgate advanced whitening toothpaste 160 g.(l...
37216,14,offspring ring 433 silver 3
37217,03,used cooking oil


In [7]:
from collections import Counter

Counter(data["target"])

Counter({'01': 983,
         '02': 1053,
         '03': 265,
         '04': 2047,
         '05': 593,
         '06': 3279,
         '07': 4191,
         '08': 271,
         '09': 463,
         '10': 686,
         '11': 2834,
         '12': 276,
         '13': 772,
         '14': 637,
         '15': 4113,
         '16': 7774,
         '17': 4802,
         '18': 1039,
         '19': 56,
         '20': 1051,
         '21': 34})

In [8]:
import re 
def clean_str(string):
    """
    Tokenization/string cleaning for dataset
    Every dataset is lower cased except
    """
    string = re.sub(r"\n", "", string)    
    string = re.sub(r"\r", "", string) 
    string = re.sub(r"[0-9]", "digit", string)
    string = re.sub(r"\'", "", string)    
    string = re.sub(r"\"", "", string)    
    return string.strip().lower()

In [10]:
%%time

print("train test split dataset")
#train test split
df = data.copy()
from sklearn.model_selection import train_test_split
X = []
for i in range(df.shape[0]):
    X.append(clean_str(df.iloc[i][1]))
y = np.array(df["target"])
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=5)

train test split dataset
Wall time: 3.44 s


In [11]:
#feature engineering and model selection
from sklearn.svm import LinearSVC
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier


print("Training: ")

#pipeline of feature engineering and model
t0 = time()
model = Pipeline([('vectorizer', CountVectorizer()),
    ('tfidf', TfidfTransformer()),
    ('clf', OneVsRestClassifier(LinearSVC(class_weight="balanced")))])

print("paramater selection")
#paramater selection
from sklearn.model_selection import GridSearchCV
parameters = {'vectorizer__ngram_range': [(1, 1), (1, 2),(2,2)],
               'tfidf__use_idf': (True, False)}

gs_clf_svm = GridSearchCV(model, parameters, n_jobs=-1)
gs_clf_svm = gs_clf_svm.fit(X, y)
print(gs_clf_svm.best_score_)
print(gs_clf_svm.best_params_)

Training: 
paramater selection




0.8489189811039631
{'tfidf__use_idf': True, 'vectorizer__ngram_range': (1, 2)}


In [12]:
#preparing the final pipeline using the selected parameters
print("preparing the final pipeline using the selected parameters")
model = Pipeline([('vectorizer', CountVectorizer(ngram_range=(1,2))),
    ('tfidf', TfidfTransformer(use_idf=True)),
    ('clf', OneVsRestClassifier(LinearSVC(class_weight="balanced")))])

#fit model with training data
print("fit model with training data")
model.fit(X_train, y_train)
train_time = time() - t0
print("train time: %0.3fs" % train_time)

#evaluation on test data
t0 = time()
pred = model.predict(X_test)
test_time = time() - t0
print("test time:  %0.3fs" % test_time)

preparing the final pipeline using the selected parameters
fit model with training data




train time: 437.551s
test time:  0.378s


In [13]:
from sklearn.metrics import confusion_matrix, accuracy_score

confusion_matrix(pred, y_test)

array([[ 265,    1,    0,    0,    0,    2,    1,    0,    0,    0,    0,
           0,    0,    1,    0,    1,    0,    0,    0,    1,    0],
       [   0,  306,    1,    4,    0,    1,    1,    0,    1,    0,    0,
           0,    2,    0,    4,    0,    0,    0,    0,    0,    0],
       [   0,    0,   74,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0],
       [   1,    9,    0,  537,    3,   48,   29,    0,    1,    0,    6,
           3,    7,    4,   14,   14,    7,    6,    0,    9,    0],
       [   0,    0,    0,    0,  181,    1,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    1,    0],
       [   0,    0,    1,    4,    2,  893,    8,    1,    0,    1,    1,
           0,    3,    1,    6,    3,    1,    1,    0,    4,    0],
       [   0,    0,    0,    5,    5,   13, 1050,    1,    2,    7,    7,
           1,    5,    2,   41,   45,   67,    5,    0,   16,    0],

In [14]:
accuracy_score(y_test, pred)

0.8951280673473043

In [15]:
from sklearn.metrics import classification_report

print (classification_report(y_test, pred))

              precision    recall  f1-score   support

          01       0.97      1.00      0.99       266
          02       0.96      0.96      0.96       318
          03       1.00      0.95      0.97        78
          04       0.77      0.97      0.86       556
          05       0.99      0.92      0.96       196
          06       0.96      0.90      0.93       989
          07       0.83      0.83      0.83      1258
          08       0.97      0.92      0.95        78
          09       0.99      0.97      0.98       150
          10       0.94      0.92      0.93       202
          11       0.98      0.96      0.97       839
          12       0.92      0.91      0.91        85
          13       0.88      0.87      0.87       253
          14       0.96      0.94      0.95       202
          15       0.90      0.85      0.87      1272
          16       0.91      0.92      0.92      2300
          17       0.82      0.86      0.84      1453
          18       0.91    