In [1]:
import pandas as pd
import numpy as np
from time import time
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.svm import LinearSVC
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.multiclass import OneVsRestClassifier

import joblib
# Notebook-wide display limits: rendered DataFrames show at most 20 rows
# and 100 columns before pandas truncates them.
pd.set_option('display.max_rows', 20)
pd.set_option('display.max_columns', 100)


def load_excel(files,sheets):
    """Read a worksheet of an Excel workbook via ``pd.read_excel``.

    Parameters
    ----------
    files
        Workbook location, forwarded unchanged as the first argument of
        ``pd.read_excel``.
    sheets
        Forwarded unchanged as the ``sheet_name`` argument.

    Returns
    -------
    Whatever ``pd.read_excel`` returns for these arguments.

    Example
    -------
    files = "data/master_declaration.xlsx"
    sheets = 'deci'
    df = load_excel(files, sheets)
    """
    return pd.read_excel(files, sheet_name=sheets)

def print_df_info(df):
    """Print a three-part summary of ``df`` to stdout.

    The parts, in order, are: the frame's shape, the number of fully
    duplicated rows, and the count of null values per column. Nothing is
    returned.
    """
    # Each entry pairs a heading with a deferred computation, so every value
    # is only evaluated right before it is printed.
    sections = (
        ("Original Dataframe:", lambda: df.shape),
        ("\nDuplicate rows of DataFrame:", lambda: df.duplicated().sum()),
        ("\nNaN values of DataFrame", lambda: df.isnull().sum()),
    )
    for heading, compute in sections:
        print(heading)
        print(compute())

def drop_df(df,subsets=None, keeps='last'):
    """Return a copy of ``df`` with duplicate rows removed.

    Parameters
    ----------
    df
        Frame to de-duplicate; it is not modified.
    subsets
        Column label(s) used to decide whether two rows are duplicates.
        ``None`` (the default) compares all columns.
    keeps
        Which occurrence of a duplicate group survives; forwarded as the
        ``keep`` argument of ``DataFrame.drop_duplicates``. Defaults to
        ``'last'``.
    """
    return df.drop_duplicates(keep=keeps, subset=subsets)

def drop_nan(df,axiss=0):
    """Return ``df`` without the rows (or columns) that contain missing values.

    ``axiss`` is forwarded as the ``axis`` argument of ``DataFrame.dropna``:
    0 (the default) drops rows, 1 drops columns. The input frame is left
    unchanged.
    """
    return df.dropna(axis=axiss)

def save_df(df,files = 'data/temp.pkl'):
    """Write ``df`` to ``files`` with ``joblib.dump`` (compression level 1).

    A start message and a completion message are printed to stdout around
    the write. Nothing is returned.
    """
    print("Save DataFrame...")
    joblib.dump(value=df, filename=files, compress=1)
    print('success')

def load_df(files = 'data/temp.pkl'):
    """Load and return the object stored at ``files`` via ``joblib.load``.

    A start message and a completion message are printed to stdout around
    the read.

    Note: ``joblib.load`` unpickles the file, so it should only be pointed
    at files from a trusted source.
    """
    print("Load DataFrame...")
    loaded = joblib.load(files)
    print('Load data success')
    return loaded


In [2]:
# Load the pickled frame from the training-data folder into `master` and
# display it.
# NOTE(review): the last cell of this notebook writes to this same path, so
# what is loaded here depends on whether that cell has already been run —
# confirm which version of the file is expected.
files =  'data/train/df_master_section.pkl'

master = load_df(files)
master

Load DataFrame...
Load data success


Unnamed: 0,description,section,chapter,heading,sub-heading,tariff
0,"horses; live, purebred breeding animals - pure...",01,01,0101,010121,01012100
1,"horses; live, other than purebred breeding ani...",01,01,0101,010129,01012900
2,asses; live - other,01,01,0101,010130,01013090
3,mules and hinnies; live- other,01,01,0101,010190,01019000
4,"cattle; live, purebred breeding animals - pure...",01,01,0102,010221,01022100
...,...,...,...,...,...,...
49131,adjust gear set jfkz658b,15,83,8302,830241,83024190
49132,aluminium window frame kf057072avat,15,76,7610,761010,76101090
49133,acrylic plate 4mm. size24x24 cm.,07,39,3926,392690,39269099
49134,ac power cord p/n 141102240p6,16,85,8544,854411,85441190


In [3]:
# Work on a copy so `master` itself is left untouched, then print the copy's
# shape, duplicate-row count and per-column NaN counts.
data = master.copy()
print_df_info(data)

Original Dataframe:
(49134, 6)

Duplicate rows of DataFrame:
579

NaN values of DataFrame
description       0
section           0
chapter           0
heading          96
sub-heading    1317
tariff         6674
dtype: int64


In [6]:
# Remove fully duplicated rows (keeping the last occurrence), then drop every
# row that still has a missing value in any column.
data = drop_df(data)
data = drop_nan(data)
# A bare `data.shape` in the middle of a cell is evaluated and discarded
# (only the cell's final expression is rendered), so print it explicitly.
print(data.shape)
data

Unnamed: 0,description,section,chapter,heading,sub-heading,tariff
0,"horses; live, purebred breeding animals - pure...",01,01,0101,010121,01012100
1,"horses; live, other than purebred breeding ani...",01,01,0101,010129,01012900
2,asses; live - other,01,01,0101,010130,01013090
3,mules and hinnies; live- other,01,01,0101,010190,01019000
4,"cattle; live, purebred breeding animals - pure...",01,01,0102,010221,01022100
...,...,...,...,...,...,...
49131,adjust gear set jfkz658b,15,83,8302,830241,83024190
49132,aluminium window frame kf057072avat,15,76,7610,761010,76101090
49133,acrylic plate 4mm. size24x24 cm.,07,39,3926,392690,39269099
49134,ac power cord p/n 141102240p6,16,85,8544,854411,85441190


In [None]:
# Keep only the label and text columns and give them the generic
# ('target', 'data') names.
data = data[['section', 'description']].rename(
    columns={'section': 'target', 'description': 'data'}
)

In [6]:
%%time
# Read the 'decx' sheet of the 2020 declaration workbook into `df` and
# display it. The recorded run of this cell took roughly 2.5 minutes of wall
# time.
files = "data/declaration_2020.xlsx"
sheets = 'decx'

df = load_excel(files,sheets)
df

Wall time: 2min 23s


Unnamed: 0,no.,section,chapter,heading,sub-heading,tariff,description,description_th
0,1,4,21,2106,210690,21069099,MEAL,อาหารสำเร็จรูป
1,2,4,22,2202,220299,22029990,MISCELLANEOUS,เครื่องดื่มและอื่นๆ
2,3,4,21,2106,210690,21069099,MEAL,อาหารสำเร็จรูป
3,4,4,22,2202,220299,22029990,MISCELLANEOUS,เครื่องดื่มและอื่นๆ
4,5,7,40,4011,401110,40111000,RADIAL TIRE 4011100000,ยางเรเดียล
...,...,...,...,...,...,...,...,...
330661,330662,17,87,8708,870899,87089980,"BRACKET,RR BUMPER SIDE,LH",ขายึดมุมกันชนหลัง ซ้าย
330662,330663,15,83,8302,830230,83023090,"LATCH,RR BODY RR GATE,RH",กลอนฝากระบะท้าย
330663,330664,17,87,8708,870821,87082100,"COVER,2ND SEAT BELT SASH GUIDE",ฝาครอบน๊อต
330664,330665,17,87,8708,870829,87082920,"SEAT BELT,RR SEAT,INR LH",ตัวล็อคเข็มขัดนิรภัย


In [7]:
# Keep only the label and text columns and give them the generic
# ('target', 'data') names, then display the result.
df = df[['section', 'description']].rename(
    columns={'section': 'target', 'description': 'data'}
)
df

Unnamed: 0,target,data
0,4,MEAL
1,4,MISCELLANEOUS
2,4,MEAL
3,4,MISCELLANEOUS
4,7,RADIAL TIRE 4011100000
...,...,...
330661,17,"BRACKET,RR BUMPER SIDE,LH"
330662,15,"LATCH,RR BODY RR GATE,RH"
330663,17,"COVER,2ND SEAT BELT SASH GUIDE"
330664,17,"SEAT BELT,RR SEAT,INR LH"


In [4]:
# NOTE(review): the rows rendered below are not the ones produced by the
# declaration_2020.xlsx cell above — presumably `df` came from an earlier
# session or from a cell that is no longer in the notebook; confirm its
# origin before relying on it.
df

Unnamed: 0,target,data
0,16,APPLE TV HD (32GB)-THA A1625
1,16,APPLE TV 4K (64GB)-THA A1842
2,15,MGLP-B8-6 TUBULAR & BREAK MANDREL BLIND RIVETA...
3,5,SUPER CEMENT 40 KG. IN PRE-SLING
4,5,TIGER PLASTERING CEMENT IN PRE-SLING
...,...,...
269095,17,"PIPE SUB-ASSY, NOZZLE LEAKAGE"
269096,17,"RAIL ASSY, COMMON"
269097,17,"PIPE, FUEL, NO.1"
269098,17,"PIPE, FUEL, NO.4"


In [5]:
# Persist the current `df` to the scratch pickle so it can be reloaded later.
files =  'data/temp.pkl'

save_df(df,files)

Save DataFrame...
success


In [8]:
# Read the scratch pickle back in under the name `deci`.
files =  'data/temp.pkl'
deci = load_df(files)

Load DataFrame...
Load data success


In [9]:
# Stack `df` and `deci` row-wise; ignore_index=True renumbers the combined
# rows from 0 instead of keeping each frame's own index.
data = pd.concat([df,deci], ignore_index=True)
data

Unnamed: 0,target,data
0,4,MEAL
1,4,MISCELLANEOUS
2,4,MEAL
3,4,MISCELLANEOUS
4,7,RADIAL TIRE 4011100000
...,...,...
599761,17,"PIPE SUB-ASSY, NOZZLE LEAKAGE"
599762,17,"RAIL ASSY, COMMON"
599763,17,"PIPE, FUEL, NO.1"
599764,17,"PIPE, FUEL, NO.4"


In [10]:
# Save the current `data` frame to the declaration pickle.
files =  'data/declaration.pkl'
save_df(data,files)

Save DataFrame...
success


In [7]:
# NOTE(review): this writes `data` to the same path that the `master` frame
# is loaded from near the top of the notebook, replacing that file in place.
# Which frame `data` holds at this point depends on the order the cells were
# run in — confirm the overwrite is intended.
files = 'data/train/df_master_section.pkl'

save_df(data,files)

Save DataFrame...
success
