In [1]:
import pandas as pd
import numpy as np
from time import time
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.svm import LinearSVC
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.multiclass import OneVsRestClassifier

import joblib
pd.set_option('display.max_rows', 20, 
              'display.max_columns', 100)

def save_df(df,files = 'data/temp.pkl'):
    print("Save DataFrame...")

    joblib.dump(df,files, compress=1)
    
    print('success')

def load_df(files = 'data/temp.pkl'):
    print("Load DataFrame...")

    df = joblib.load(files)
    
    print('Load data success')
    return df

In [2]:
files =  'data/train/df_master_section.pkl'
df = load_df(files)
df

Load DataFrame...
Load data success


Unnamed: 0,description,section,chapter,heading,sub-heading,tariff
0,"horses; live, purebred breeding animals - pure...",01,01,0101,010121,01012100
1,"horses; live, other than purebred breeding ani...",01,01,0101,010129,01012900
2,asses; live - other,01,01,0101,010130,01013090
3,mules and hinnies; live- other,01,01,0101,010190,01019000
4,"cattle; live, purebred breeding animals - pure...",01,01,0102,010221,01022100
...,...,...,...,...,...,...
49131,adjust gear set jfkz658b,15,83,8302,830241,83024190
49132,aluminium window frame kf057072avat,15,76,7610,761010,76101090
49133,acrylic plate 4mm. size24x24 cm.,07,39,3926,392690,39269099
49134,ac power cord p/n 141102240p6,16,85,8544,854411,85441190


In [3]:
df = df[['section', 'description']]
df.columns = ['target', 'data']
df

Unnamed: 0,target,data
0,01,"horses; live, purebred breeding animals - pure..."
1,01,"horses; live, other than purebred breeding ani..."
2,01,asses; live - other
3,01,mules and hinnies; live- other
4,01,"cattle; live, purebred breeding animals - pure..."
...,...,...
49131,15,adjust gear set jfkz658b
49132,15,aluminium window frame kf057072avat
49133,07,acrylic plate 4mm. size24x24 cm.
49134,16,ac power cord p/n 141102240p6


In [5]:
files =  'data/declaration.pkl'
decl = load_df(files)
decl['target'] = decl['target'].map('{:02}'.format)
decl['data'] = decl['data'].str.lower()
decl

Load DataFrame...
Load data success


Unnamed: 0,target,data
0,04,meal
1,04,miscellaneous
2,04,meal
3,04,miscellaneous
4,07,radial tire 4011100000
...,...,...
599761,17,"pipe sub-assy, nozzle leakage"
599762,17,"rail assy, common"
599763,17,"pipe, fuel, no.1"
599764,17,"pipe, fuel, no.4"


In [6]:
df = pd.concat([df,decl], ignore_index=True)
df

Unnamed: 0,target,data
0,01,"horses; live, purebred breeding animals - pure..."
1,01,"horses; live, other than purebred breeding ani..."
2,01,asses; live - other
3,01,mules and hinnies; live- other
4,01,"cattle; live, purebred breeding animals - pure..."
...,...,...
641642,17,"pipe sub-assy, nozzle leakage"
641643,17,"rail assy, common"
641644,17,"pipe, fuel, no.1"
641645,17,"pipe, fuel, no.4"


In [7]:
from collections import Counter

Counter(df["target"])

Counter({'01': 1270,
         '02': 4743,
         '03': 760,
         '04': 27267,
         '05': 4257,
         '06': 36916,
         '07': 92922,
         '08': 5227,
         '09': 2602,
         '10': 8611,
         '11': 26891,
         '12': 4463,
         '13': 11713,
         '14': 10121,
         '15': 96359,
         '16': 171928,
         '17': 90593,
         '18': 24319,
         '19': 176,
         '20': 20313,
         '21': 196})

In [9]:
import re 
def clean_str(string):
    """
    Tokenization/string cleaning for dataset
    Every dataset is lower cased except
    """
    string = re.sub(r"\n", "", string)    
    string = re.sub(r"\r", "", string) 
    string = re.sub(r"[0-9]", "digit", string)
    string = re.sub(r"\'", "", string)    
    string = re.sub(r"\"", "", string)    
    return string.strip().lower()


print("train test split dataset")
#train test split

from sklearn.model_selection import train_test_split
X = []
for i in range(df.shape[0]):
    X.append((df.iloc[i][1]))
y = np.array(df["target"])
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=5)


#feature engineering and model selection
from sklearn.svm import LinearSVC
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier


print("Training: ")

#pipeline of feature engineering and model
t0 = time()
model = Pipeline([('vectorizer', CountVectorizer()),
    ('tfidf', TfidfTransformer()),
    ('clf', OneVsRestClassifier(LinearSVC(class_weight="balanced")))])


print("paramater selection")
#paramater selection
from sklearn.model_selection import GridSearchCV
parameters = {'vectorizer__ngram_range': [(1, 1), (1, 2),(2,2)],
               'tfidf__use_idf': (True, False)}


gs_clf_svm = GridSearchCV(model, parameters, n_jobs=-1)
gs_clf_svm = gs_clf_svm.fit(X, y)
print(gs_clf_svm.best_score_)
print(gs_clf_svm.best_params_)




train test split dataset
Training: 
paramater selection


ValueError: np.nan is an invalid document, expected byte or unicode string.

In [None]:
#preparing the final pipeline using the selected parameters
print("preparing the final pipeline using the selected parameters")
model = Pipeline([('vectorizer', CountVectorizer(ngram_range=(1,2))),
    ('tfidf', TfidfTransformer(use_idf=True)),
    ('clf', OneVsRestClassifier(LinearSVC(class_weight="balanced")))])


#fit model with training data
print("fit model with training data")
model.fit(X_train, y_train)
train_time = time() - t0
print("train time: %0.3fs" % train_time)

#evaluation on test data
t0 = time()
pred = model.predict(X_test)
test_time = time() - t0
print("test time:  %0.3fs" % test_time)

In [None]:
from sklearn.metrics import confusion_matrix, accuracy_score
confusion_matrix(pred, y_test)

In [None]:
accuracy_score(y_test, pred)

In [None]:
from sklearn.metrics import classification_report
print (classification_report(y_test, pred))

In [None]:
#save the model
print("Save Model")
import joblib
joblib.dump(model, 'data/model_section.pkl', compress=1)