In [1]:
import numpy
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.naive_bayes import MultinomialNB
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics import confusion_matrix, f1_score ,accuracy_score

pd.set_option('display.max_rows', 20, 
              'display.max_columns', 100)

In [2]:
urls = 'https://github.com/dragon-library/work_space/raw/main/HS_Code/HS/hs_code.xlsx'

def get_master(sheets):
    data = pd.read_excel(urls,sheet_name= sheets)
    data['chapter'] = data['chapter'].map('{:02}'.format)
    data = data[['chapter','description']]
    data['description'] = data['description'].str.lower()
 #   data = data.rename(columns={'heading' : 'target', 'product_desc' : 'question_text'})
    
    return data

In [3]:
sheets = '8_digit'
inputs = get_master(sheets)

sheets = '2_digit'
master = get_master(sheets)

sheets = 'deci'
deci = get_master(sheets)

sheets = 'deci_cti'
deci_cti = get_master(sheets)

sheets = 'decx'
decx = get_master(sheets)

train_df = pd.concat([inputs,master,deci_cti,deci,decx], ignore_index=True)
train_df.sample(10)

Unnamed: 0,chapter,description
37409,62,lamination molding foam cup
70395,40,"gasket assembly,door\r\n4987ja1020e"
41771,58,elastic webbing
19667,87,porsche cayenne e-hybrid 2019 2995cc:462hp:340...
47048,62,lamination molding foam cup
43853,60,simplex knitted fabric
50501,73,bra wire
4939,58,"fabrics; narrow woven fabrics, n.e.c. in headi..."
92733,84,washing machine
30163,62,polyurethane cup with lamination


In [5]:
file = train_df.copy()

arr_text=file.iloc[:]['description'].values
arr_class=file.iloc[:]['chapter'].values
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(arr_text, arr_class, test_size=0.33, random_state=42)

In [6]:
vect = CountVectorizer()
vect.fit(arr_text)

CountVectorizer()

In [7]:
train_transformer_x = vect.transform(X_train)
detect_model = MultinomialNB().fit(train_transformer_x,y_train)

In [8]:
test_transformer_x = vect.transform(X_test)
y_pred = detect_model.predict(test_transformer_x)
print('F1 score =',f1_score(y_test, y_pred, average='macro'))
print('Accuracy =',accuracy_score(y_test, y_pred))

F1 score = 0.6271188955720458
Accuracy = 0.9189248115617019


In [10]:
df =  train_df.copy()



arr_text=df.iloc[:]['description'].values
arr_text

array(['horses; live, purebred breeding animals - purebred breeding animals',
       'horses; live, other than purebred breeding animals - other',
       'asses; live - other', ..., 'compressor', 'washing machine',
       'condenser assembly'], dtype=object)

In [11]:
#CountVectorizer ทำ tokenize

vect = CountVectorizer(stop_words='english', lowercase=True)
vect

CountVectorizer(stop_words='english')

In [12]:
#เริ่มตัดคำ

count_train = vect.fit(arr_text)
count_train

CountVectorizer(stop_words='english')

In [13]:
#เริ่มตัดคำ

count_train = vect.fit(arr_text)

#ดู vocabulary_

vect.vocabulary_

{'horses': 13646,
 'live': 14644,
 'purebred': 17591,
 'breeding': 7594,
 'animals': 6701,
 'asses': 6876,
 'mules': 16074,
 'hinnies': 13580,
 'cattle': 8036,
 'oxen': 16621,
 'buffalo': 7675,
 'bovine': 7537,
 'swine': 19484,
 'weighing': 20759,
 '50kg': 4085,
 '50': 4008,
 'kg': 14263,
 'sheep': 18629,
 'goats': 13135,
 'poultry': 17349,
 'fowls': 12839,
 'species': 19016,
 'gallus': 12968,
 'domesticus': 9745,
 '185g': 1997,
 'turkeys': 20248,
 'ducks': 9853,
 'ducklings': 9852,
 'geese': 13020,
 'guinea': 13284,
 'fighting': 12587,
 'cocks': 8462,
 'mammals': 14969,
 'primates': 17432,
 'whales': 20779,
 'dolphins': 9743,
 'porpoises': 17320,
 'order': 16542,
 'cetacea': 8108,
 'manatees': 14986,
 'dugongs': 9855,
 'sirenia': 18778,
 'seals': 18466,
 'sea': 18459,
 'lions': 14613,
 'walruses': 20681,
 'suborder': 19348,
 'pinnipedia': 17089,
 'camels': 7849,
 'camelids': 7848,
 'camelidae': 7847,
 'rabbits': 17717,
 'hares': 13455,
 'reptiles': 17970,
 'including': 13856,
 'snakes

In [14]:
#ทำ transform ให้อยู่ในรูปแบบ One Hot Encoding

transformer = vect.transform(arr_text)
transformer

<97288x22572 sparse matrix of type '<class 'numpy.int64'>'
	with 411681 stored elements in Compressed Sparse Row format>

In [15]:
#ดูผลลัพท์ matrix ที่ได้โดย array จะประกอบด้วย context ใน 1 paragraph หากมี word ใน context ให้ค่าเป็น 1 และหากไม่มี word ใน context ให้ค่าเป็น 0

print(transformer.toarray())

ValueError: array is too big; `arr.size * arr.dtype.itemsize` is larger than the maximum possible size.

In [17]:
# เตรียม tf-idf

from sklearn.feature_extraction.text import TfidfTransformer
# Config tf-idf

tfidf_transformer = TfidfTransformer(smooth_idf=False,).fit(transformer)

#show ค่า idf

tfidf_transformer.idf_

array([ 8.59361063,  7.26507511, 10.28820635, ..., 10.18284584,
       10.18284584, 10.18284584])

In [18]:
from sklearn.naive_bayes import MultinomialNB

#เตรียม Class ในรูปแบบ array เหมือน arr_text

arr_class=df.iloc[:]['chapter'].values
arr_class

array(['01', '01', '01', ..., '84', '84', '84'], dtype=object)

In [19]:
#เตรียม train จาก transformer

messages_tfidf = tfidf_transformer.transform(transformer)
print (messages_tfidf.shape)

(97288, 22572)


In [20]:
messages_tfidf.toarray()

ValueError: array is too big; `arr.size * arr.dtype.itemsize` is larger than the maximum possible size.

In [21]:
#สร้าง Model

detect_model = MultinomialNB().fit(messages_tfidf,arr_class)

from sklearn.model_selection import train_test_split
x_train,x_test,y_train, y_test = train_test_split( messages_tfidf,arr_class, test_size = 0.2, random_state=7)

print(x_train.shape, x_test.shape, y_train.shape, y_test.shape)

detect_model =MultinomialNB().fit(x_train,y_train)
#detect_model = MultinomialNB().fit(messages_tfidf,arr_class)

(77830, 22572) (19458, 22572) (77830,) (19458,)


In [22]:
print(detect_model.score(x_test, y_test))

0.8902251002158496


## ทดสอบการทำนาย

In [23]:
products = 'horses; live'

In [34]:
products = 'pencil'

In [35]:
t0=vect.transform([products])

#แปลงเป็น vector

txt = t0.toarray()

In [36]:
print ('Predicted: ',detect_model.predict(txt) )

Predicted:  ['84']


In [26]:
#เห็นว่าคำทำนายออกมาเป็น london ตรงกับ Expected
#นำ data ทั้งหมดที่ train ไป test มันก็ควรจะตรง(Expected คือ 100%)

all_predictions = detect_model.predict(messages_tfidf)
print(all_predictions)

['85' '85' '84' ... '84' '84' '84']


In [27]:
#ดูค่า Performance

from sklearn.metrics import classification_report
print (classification_report(arr_class, all_predictions))

  _warn_prf(average, modifier, msg_start, len(result))


              precision    recall  f1-score   support

          01       1.00      0.18      0.31        49
          02       0.98      0.94      0.96       315
          03       0.86      0.98      0.92       410
          04       0.95      0.65      0.77        65
          05       0.00      0.00      0.00        27
          06       0.00      0.00      0.00        28
          07       1.00      0.61      0.76       117
          08       1.00      0.22      0.36        95
          09       1.00      0.63      0.77       110
          10       1.00      0.25      0.41       110
          11       0.00      0.00      0.00        41
          12       1.00      0.21      0.34        82
          13       1.00      0.45      0.62        66
          14       0.00      0.00      0.00        14
          15       1.00      0.88      0.94       199
          16       1.00      0.48      0.65        83
          17       0.00      0.00      0.00        43
          18       0.00    