In [1]:
import numpy
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.naive_bayes import MultinomialNB
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics import confusion_matrix, f1_score ,accuracy_score



In [2]:
def get_master(sheets):
    data = pd.read_excel('HS/hs_code.xlsx',sheet_name= sheets)
    data['chapter'] = data['chapter'].map('{:02}'.format)
    data = data[['chapter','description']]
    data['description'] = data['description'].str.lower()
 #   data = data.rename(columns={'heading' : 'target', 'product_desc' : 'question_text'})
    
    return data

In [3]:
sheets = '8_digit'
inputs = get_master(sheets)
sheets = '2_digit'
master = get_master(sheets)
sheets = 'deci'
deci = get_master(sheets)
sheets = 'decx'
decx = get_master(sheets)
train_df = pd.concat([inputs,master,deci,decx], ignore_index=True)

In [4]:
file = train_df.copy()

arr_text=file.iloc[:]['description'].values
arr_class=file.iloc[:]['chapter'].values
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(arr_text, arr_class, test_size=0.33, random_state=42)

In [5]:
vect = CountVectorizer()
vect.fit(arr_text)

CountVectorizer()

In [6]:
train_transformer_x = vect.transform(X_train)
detect_model = MultinomialNB().fit(train_transformer_x,y_train)

In [7]:
test_transformer_x = vect.transform(X_test)
y_pred = detect_model.predict(test_transformer_x)
print('F1 score =',f1_score(y_test, y_pred, average='macro'))
print('Accuracy =',accuracy_score(y_test, y_pred))

F1 score = 0.624658300963496
Accuracy = 0.9110512129380054


In [8]:
df =  train_df.copy()

#แต่จริงแล้วดึงมาเฉพาะ field text

arr_text=df.iloc[:]['description'].values
arr_text

array(['horses; live, purebred breeding animals - purebred breeding animals',
       'horses; live, other than purebred breeding animals - other',
       'asses; live - other', ..., 'compressor', 'washing machine',
       'condenser assembly'], dtype=object)

In [9]:
#CountVectorizer ทำ tokenize

vect = CountVectorizer(stop_words='english', lowercase=True)
vect

CountVectorizer(stop_words='english')

In [10]:
#เริ่มตัดคำ

count_train = vect.fit(arr_text)
count_train

CountVectorizer(stop_words='english')

In [11]:
#เริ่มตัดคำ

count_train = vect.fit(arr_text)

#ดู vocabulary_

vect.vocabulary_

{'horses': 8843,
 'live': 9557,
 'purebred': 11948,
 'breeding': 3943,
 'animals': 3324,
 'asses': 3469,
 'mules': 10754,
 'hinnies': 8800,
 'cattle': 4285,
 'oxen': 11165,
 'buffalo': 4008,
 'bovine': 3898,
 'swine': 13349,
 'weighing': 14300,
 '50kg': 1262,
 '50': 1222,
 'kg': 9270,
 'sheep': 12708,
 'goats': 8488,
 'poultry': 11753,
 'fowls': 8270,
 'species': 12996,
 'gallus': 8362,
 'domesticus': 5484,
 '185g': 315,
 'turkeys': 13951,
 'ducks': 5564,
 'ducklings': 5563,
 'geese': 8404,
 'guinea': 8599,
 'fighting': 8071,
 'cocks': 4606,
 'mammals': 9766,
 'primates': 11819,
 'whales': 14316,
 'dolphins': 5482,
 'porpoises': 11728,
 'order': 11096,
 'cetacea': 4335,
 'manatees': 9782,
 'dugongs': 5566,
 'sirenia': 12826,
 'seals': 12575,
 'sea': 12568,
 'lions': 9530,
 'walruses': 14234,
 'suborder': 13244,
 'pinnipedia': 11533,
 'camels': 4131,
 'camelids': 4130,
 'camelidae': 4129,
 'rabbits': 12044,
 'hares': 8697,
 'reptiles': 12251,
 'including': 9001,
 'snakes': 12907,
 'turt

In [12]:
#ทำ transform ให้อยู่ในรูปแบบ One Hot Encoding

transformer = vect.transform(arr_text)
transformer

<56212x14483 sparse matrix of type '<class 'numpy.int64'>'
	with 253945 stored elements in Compressed Sparse Row format>

In [13]:
#ดูผลลัพท์ matrix ที่ได้โดย array จะประกอบด้วย context ใน 1 paragraph หากมี word ใน context ให้ค่าเป็น 1 และหากไม่มี word ใน context ให้ค่าเป็น 0

print(transformer.toarray())

ValueError: array is too big; `arr.size * arr.dtype.itemsize` is larger than the maximum possible size.

In [14]:
# เตรียม tf-idf

from sklearn.feature_extraction.text import TfidfTransformer
# Config tf-idf

tfidf_transformer = TfidfTransformer(smooth_idf=False,).fit(transformer)

#show ค่า idf

tfidf_transformer.idf_

array([8.80139132, 6.77209956, 9.73966096, ..., 9.63430044, 9.63430044,
       9.63430044])

In [15]:
from sklearn.naive_bayes import MultinomialNB

#เตรียม Class ในรูปแบบ array เหมือน arr_text

arr_class=df.iloc[:]['chapter'].values
arr_class

array(['01', '01', '01', ..., '84', '84', '84'], dtype=object)

In [16]:
#เตรียม train จาก transformer

messages_tfidf = tfidf_transformer.transform(transformer)
print (messages_tfidf.shape)

(56212, 14483)


In [17]:
messages_tfidf.toarray()

ValueError: array is too big; `arr.size * arr.dtype.itemsize` is larger than the maximum possible size.

In [18]:
#สร้าง Model

detect_model = MultinomialNB().fit(messages_tfidf,arr_class)

from sklearn.model_selection import train_test_split
x_train,x_test,y_train, y_test = train_test_split( messages_tfidf,arr_class, test_size = 0.3, random_state=7)

print(x_train.shape, x_test.shape, y_train.shape, y_test.shape)

detect_model =MultinomialNB().fit(x_train,y_train)
#detect_model = MultinomialNB().fit(messages_tfidf,arr_class)

(39348, 14483) (16864, 14483) (39348,) (16864,)


In [19]:
print(detect_model.score(x_test, y_test))

0.8642670777988615


## ทดสอบการทำนาย

In [20]:
products = 'horses; live'

In [38]:
products = 'mango'

In [21]:
t0=vect.transform([products])

#แปลงเป็น vector

txt = t0.toarray()

In [22]:
print ('Predicted: ',detect_model.predict(txt) )

Predicted:  ['03']


In [23]:
#เห็นว่าคำทำนายออกมาเป็น london ตรงกับ Expected
#นำ data ทั้งหมดที่ train ไป test มันก็ควรจะตรง(Expected คือ 100%)

all_predictions = detect_model.predict(messages_tfidf)
print(all_predictions)

['85' '85' '84' ... '84' '84' '84']


In [24]:
#ดูค่า Performance

from sklearn.metrics import classification_report
print (classification_report(arr_class, all_predictions))

  _warn_prf(average, modifier, msg_start, len(result))


              precision    recall  f1-score   support

          01       1.00      0.31      0.47        49
          02       1.00      0.21      0.34        72
          03       0.79      1.00      0.88       365
          04       1.00      0.75      0.86        65
          05       0.00      0.00      0.00        25
          06       0.00      0.00      0.00        28
          07       1.00      0.62      0.77       117
          08       1.00      0.21      0.35        95
          09       1.00      0.46      0.63        63
          10       0.00      0.00      0.00        37
          11       0.00      0.00      0.00        40
          12       1.00      0.23      0.37        80
          13       0.00      0.00      0.00        22
          14       0.00      0.00      0.00        14
          15       0.98      0.89      0.93       175
          16       1.00      0.70      0.82        77
          17       0.00      0.00      0.00        32
          18       0.00    