In [1]:
import numpy
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.naive_bayes import MultinomialNB
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics import confusion_matrix, f1_score ,accuracy_score

pd.set_option('display.max_rows', 20, 
              'display.max_columns', 100)

In [2]:
def get_master(sheets):
    data = pd.read_excel('data/master.xlsx',sheet_name= sheets)
    data['chapter'] = data['chapter'].map('{:02}'.format)
    data = data[['chapter','product_desc']]
    data['product_desc'] = data['product_desc'].str.lower()
 #   data = data.rename(columns={'heading' : 'target', 'product_desc' : 'question_text'})
    
    return data

In [3]:
sheets = 'master'
master = get_master(sheets)
sheets = 'deci'
deci = get_master(sheets)
sheets = 'decx'
decx = get_master(sheets)
train_df = pd.concat([master,deci,decx], ignore_index=True)

train_df

Unnamed: 0,chapter,product_desc
0,01,"horses; live, pure-bred breeding animals"
1,01,"horses; live, other than pure-bred breeding an..."
2,01,"asses, mules and hinnies; live"
3,01,"bovine animals; live, pure-bred breeding animals"
4,01,"bovine animals; live, other than pure-bred bre..."
...,...,...
76612,84,compressor
76613,84,compressor
76614,84,compressor
76615,84,washing machine


In [4]:
file = train_df.copy()

arr_text=file.iloc[:]['product_desc'].values
arr_class=file.iloc[:]['chapter'].values
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(arr_text, arr_class, test_size=0.33, random_state=42)

In [5]:
vect = CountVectorizer()
vect.fit(arr_text)

CountVectorizer()

In [6]:
train_transformer_x = vect.transform(X_train)
detect_model = MultinomialNB().fit(train_transformer_x,y_train)

In [7]:
test_transformer_x = vect.transform(X_test)
y_pred = detect_model.predict(test_transformer_x)
print('F1 score =',f1_score(y_test, y_pred, average='macro'))
print('Accuracy =',accuracy_score(y_test, y_pred))

F1 score = 0.8424668517085825
Accuracy = 0.9231134314190793


In [8]:
df =  train_df.copy()

#แต่จริงแล้วดึงมาเฉพาะ field text

arr_text=df.iloc[:]['product_desc'].values
arr_text

array(['horses; live, pure-bred breeding animals',
       'horses; live, other than pure-bred breeding animals',
       'asses, mules and hinnies; live', ..., 'compressor',
       'washing machine', 'condenser assembly'], dtype=object)

In [9]:
#CountVectorizer ทำ tokenize

vect = CountVectorizer(stop_words='english', lowercase=True)
vect

CountVectorizer(stop_words='english')

In [10]:
#เริ่มตัดคำ

count_train = vect.fit(arr_text)
count_train

CountVectorizer(stop_words='english')

In [11]:
#เริ่มตัดคำ

count_train = vect.fit(arr_text)

#ดู vocabulary_

vect.vocabulary_

{'horses': 8212,
 'live': 8820,
 'pure': 10864,
 'bred': 3767,
 'breeding': 3769,
 'animals': 3255,
 'asses': 3363,
 'mules': 9941,
 'hinnies': 8175,
 'bovine': 3732,
 'swine': 11999,
 'weighing': 12854,
 '50kg': 1237,
 'sheep': 11490,
 'goats': 7921,
 'poultry': 10724,
 'fowls': 7734,
 'species': 11717,
 'gallus': 7813,
 'domesticus': 5083,
 '185g': 316,
 'ducks': 5148,
 'geese': 7847,
 'turkeys': 12545,
 'guinea': 8011,
 'chapter': 4124,
 'meat': 9406,
 'carcasses': 3990,
 'half': 8057,
 'fresh': 7754,
 'chilled': 4161,
 'cuts': 4774,
 'bone': 3705,
 'excluding': 7431,
 'boneless': 3706,
 'frozen': 7763,
 'hams': 8069,
 'shoulders': 11523,
 'thereof': 12281,
 'item': 8495,
 '0203': 36,
 'lamb': 8663,
 'including': 8345,
 'offal': 10117,
 'edible': 7250,
 'tongues': 12376,
 'livers': 8822,
 'cut': 4771,
 'pieces': 10511,
 'fatty': 7530,
 'rabbits': 10956,
 'hares': 8089,
 'frogs': 7759,
 'legs': 8727,
 'fat': 7528,
 'pig': 10514,
 'free': 7746,
 'lean': 8715,
 'rendered': 11106,
 'sal

In [12]:
#ทำ transform ให้อยู่ในรูปแบบ One Hot Encoding

transformer = vect.transform(arr_text)
transformer

<76617x13004 sparse matrix of type '<class 'numpy.int64'>'
	with 417072 stored elements in Compressed Sparse Row format>

In [13]:
#ดูผลลัพท์ matrix ที่ได้โดย array จะประกอบด้วย context ใน 1 paragraph หากมี word ใน context ให้ค่าเป็น 1 และหากไม่มี word ใน context ให้ค่าเป็น 0

print(transformer.toarray())

[[0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 ...
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]]


In [14]:
# เตรียม tf-idf

from sklearn.feature_extraction.text import TfidfTransformer
# Config tf-idf

tfidf_transformer = TfidfTransformer(smooth_idf=False,).fit(transformer)

#show ค่า idf

tfidf_transformer.idf_

array([10.16713272, 11.55342708, 10.8602799 , ...,  9.94398917,
        9.94398917,  9.94398917])

In [15]:
from sklearn.naive_bayes import MultinomialNB

#เตรียม Class ในรูปแบบ array เหมือน arr_text

arr_class=df.iloc[:]['chapter'].values
arr_class

array(['01', '01', '01', ..., '84', '84', '84'], dtype=object)

In [16]:
#เตรียม train จาก transformer

messages_tfidf = tfidf_transformer.transform(transformer)
print (messages_tfidf.shape)

(76617, 13004)


In [17]:
messages_tfidf.toarray()

array([[0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.]])

In [18]:
#สร้าง Model

detect_model = MultinomialNB().fit(messages_tfidf,arr_class)

from sklearn.model_selection import train_test_split
x_train,x_test,y_train, y_test = train_test_split( messages_tfidf,arr_class, test_size = 0.2, random_state=7)

print(x_train.shape, x_test.shape, y_train.shape, y_test.shape)

detect_model =MultinomialNB().fit(x_train,y_train)
#detect_model = MultinomialNB().fit(messages_tfidf,arr_class)

(61293, 13004) (15324, 13004) (61293,) (15324,)


In [19]:
print(detect_model.score(x_test, y_test))

0.8983294179065519


## ทดสอบการทำนาย

In [20]:
products = 'horses; live'

In [25]:
products = 'mango'

In [26]:
t0=vect.transform([products])

#แปลงเป็น vector

txt = t0.toarray()

In [27]:
print ('Predicted: ',detect_model.predict(txt) )

Predicted:  ['84']


In [23]:
#เห็นว่าคำทำนายออกมาเป็น london ตรงกับ Expected
#นำ data ทั้งหมดที่ train ไป test มันก็ควรจะตรง(Expected คือ 100%)

all_predictions = detect_model.predict(messages_tfidf)
print(all_predictions)

['01' '01' '02' ... '84' '84' '84']


In [24]:
#ดูค่า Performance

from sklearn.metrics import classification_report
print (classification_report(arr_class, all_predictions))

  _warn_prf(average, modifier, msg_start, len(result))


              precision    recall  f1-score   support

          01       1.00      0.94      0.97       145
          02       0.96      0.99      0.98       352
          03       0.93      1.00      0.97       787
          04       1.00      0.93      0.96       170
          05       1.00      0.57      0.73        96
          06       1.00      0.87      0.93        85
          07       0.97      1.00      0.98       366
          08       0.99      0.98      0.99       350
          09       1.00      0.90      0.95       206
          10       1.00      0.57      0.73       116
          11       0.97      0.74      0.84       179
          12       0.95      0.95      0.95       266
          13       1.00      0.27      0.42        67
          14       0.00      0.00      0.00        44
          15       0.97      0.92      0.94       287
          16       1.00      0.88      0.93       186
          17       1.00      0.98      0.99        97
          18       1.00    