In [1]:
import numpy
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.naive_bayes import MultinomialNB
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics import confusion_matrix, f1_score ,accuracy_score

pd.set_option('display.max_rows', 20, 
              'display.max_columns', 100)

In [2]:
def get_master(sheets):
    data = pd.read_excel('data/master.xlsx',sheet_name= sheets)
    data['heading'] = data['heading'].map('{:04}'.format)
    data = data[['heading','product_desc']]
    data['product_desc'] = data['product_desc'].str.lower()
 #   data = data.rename(columns={'heading' : 'target', 'product_desc' : 'question_text'})
    
    return data

In [3]:
sheets = 'master'
master = get_master(sheets)
sheets = 'deci'
deci = get_master(sheets)
train_df = pd.concat([master,deci], ignore_index=True)

train_df

Unnamed: 0,heading,product_desc
0,0101,"horses; live, pure-bred breeding animals"
1,0101,"horses; live, other than pure-bred breeding an..."
2,0101,"asses, mules and hinnies; live"
3,0102,"bovine animals; live, pure-bred breeding animals"
4,0102,"bovine animals; live, other than pure-bred bre..."
...,...,...
54387,5407,textile piece goods\r\n95% polyester 5% spande...
54388,5407,textile piece goods\r\n95% polyester 5% spande...
54389,5407,textile piece goods\r\n95% polyester 5% spande...
54390,5407,textile piece goods\r\n100% polyester\r\n75d*7...


In [4]:
sheets = 'decx'
test_df = get_master(sheets)

test_df

Unnamed: 0,heading,product_desc
0,5007,100% silk fabrics\nrobin silk plaid/roman ston...
1,5007,"100% silk fabrics\nsilk twist/watermelon #312 48"""
2,5211,55%cotton 45%silk fabrics\nmetro/grey flannel ...
3,5211,55%cotton 45%silk fabrics\npalazzo/grey flanne...
4,5211,55%cotton 45%silk fabrics\nmetro/rouge chinois...
...,...,...
22220,8414,compressor
22221,8414,compressor
22222,8414,compressor
22223,8450,washing machine


In [6]:
file = train_df.copy()

arr_text=file.iloc[:]['product_desc'].values
arr_class=file.iloc[:]['heading'].values
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(arr_text, arr_class, test_size=0.33, random_state=42)

In [7]:
vect = CountVectorizer()
vect.fit(arr_text)

CountVectorizer()

In [8]:
train_transformer_x = vect.transform(X_train)
detect_model = MultinomialNB().fit(train_transformer_x,y_train)

In [9]:
test_transformer_x = vect.transform(X_test)
y_pred = detect_model.predict(test_transformer_x)
print('F1 score =',f1_score(y_test, y_pred, average='macro'))
print('Accuracy =',accuracy_score(y_test, y_pred))

F1 score = 0.5750728536648858
Accuracy = 0.8021169916434541


In [10]:
df =  train_df.copy()

#แต่จริงแล้วดึงมาเฉพาะ field text

arr_text=df.iloc[:]['product_desc'].values
arr_text

array(['horses; live, pure-bred breeding animals',
       'horses; live, other than pure-bred breeding animals',
       'asses, mules and hinnies; live', ...,
       'textile piece goods\r\n95% polyester 5% spandex \r\n150dx150d+40d 58/60"',
       'textile piece goods\r\n100% polyester\r\n75d*75d 58/60"',
       'textile piece goods\r\n95% polyester 5% spandex\r\n175d+40d*200d+40d 168*78 58/60"'],
      dtype=object)

In [15]:
#CountVectorizer ทำ tokenize

vect = CountVectorizer(stop_words='english', lowercase=True)
vect

CountVectorizer(stop_words='english')

In [16]:
#เริ่มตัดคำ

count_train = vect.fit(arr_text)
count_train

CountVectorizer(stop_words='english')

In [17]:
#เริ่มตัดคำ

count_train = vect.fit(arr_text)

#ดู vocabulary_

vect.vocabulary_

{'horses': 7862,
 'live': 8435,
 'pure': 10406,
 'bred': 3543,
 'breeding': 3545,
 'animals': 3062,
 'asses': 3159,
 'mules': 9523,
 'hinnies': 7825,
 'bovine': 3508,
 'swine': 11499,
 'weighing': 12316,
 '50kg': 1125,
 'sheep': 11007,
 'goats': 7591,
 'poultry': 10270,
 'fowls': 7408,
 'species': 11227,
 'gallus': 7484,
 'domesticus': 4803,
 '185g': 240,
 'ducks': 4861,
 'geese': 7517,
 'turkeys': 12031,
 'guinea': 7676,
 'chapter': 3880,
 'meat': 9000,
 'carcasses': 3755,
 'half': 7713,
 'fresh': 7428,
 'chilled': 3914,
 'cuts': 4502,
 'bone': 3482,
 'excluding': 7119,
 'boneless': 3483,
 'frozen': 7437,
 'hams': 7723,
 'shoulders': 11036,
 'thereof': 11773,
 'item': 8127,
 '0203': 28,
 'lamb': 8282,
 'including': 7982,
 'offal': 9696,
 'edible': 6953,
 'tongues': 11867,
 'livers': 8437,
 'cut': 4499,
 'pieces': 10066,
 'fatty': 7214,
 'rabbits': 10497,
 'hares': 7743,
 'frogs': 7433,
 'legs': 8345,
 'fat': 7212,
 'pig': 10069,
 'free': 7420,
 'lean': 8333,
 'rendered': 10646,
 'salt

In [18]:
#ทำ transform ให้อยู่ในรูปแบบ One Hot Encoding

transformer = vect.transform(arr_text)
transformer

<54392x12451 sparse matrix of type '<class 'numpy.int64'>'
	with 356255 stored elements in Compressed Sparse Row format>

In [19]:
#ดูผลลัพท์ matrix ที่ได้โดย array จะประกอบด้วย context ใน 1 paragraph หากมี word ใน context ให้ค่าเป็น 1 และหากไม่มี word ใน context ให้ค่าเป็น 0

print(transformer.toarray())

[[0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 ...
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]]


In [20]:
# เตรียม tf-idf

from sklearn.feature_extraction.text import TfidfTransformer
# Config tf-idf

tfidf_transformer = TfidfTransformer(smooth_idf=False,).fit(transformer)

#show ค่า idf

tfidf_transformer.idf_

array([ 9.82453082, 11.21082518, 10.517678  , ..., 10.517678  ,
       10.11221289,  9.13138364])

In [21]:
from sklearn.naive_bayes import MultinomialNB

#เตรียม Class ในรูปแบบ array เหมือน arr_text

arr_class=df.iloc[:]['heading'].values
arr_class

array(['0101', '0101', '0101', ..., '5407', '5407', '5407'], dtype=object)

In [22]:
#เตรียม train จาก transformer

messages_tfidf = tfidf_transformer.transform(transformer)
print (messages_tfidf.shape)

(54392, 12451)


In [23]:
messages_tfidf.toarray()

array([[0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.]])

In [24]:
#สร้าง Model

detect_model = MultinomialNB().fit(messages_tfidf,arr_class)

from sklearn.model_selection import train_test_split
x_train,x_test,y_train, y_test = train_test_split( messages_tfidf,arr_class, test_size=0.1 , random_state=0)

print(x_train.shape, x_test.shape, y_train.shape, y_test.shape)

detect_model =MultinomialNB().fit(x_train,y_train)
#detect_model = MultinomialNB().fit(messages_tfidf,arr_class)

(48952, 12451) (5440, 12451) (48952,) (5440,)


In [25]:
print(detect_model.score(x_test, y_test))

0.6959558823529411


In [36]:
products = 'pure-bred breeding animals'

In [37]:
t0=vect.transform([products])

#แปลงเป็น vector

txt = t0.toarray()

In [38]:
print ('Predicted: ',detect_model.predict(txt) )

Predicted:  ['0103']


In [31]:
#เห็นว่าคำทำนายออกมาเป็น london ตรงกับ Expected
#นำ data ทั้งหมดที่ train ไป test มันก็ควรจะตรง(Expected คือ 100%)

all_predictions = detect_model.predict(messages_tfidf)
print(all_predictions)

['8537' '8537' '8537' ... '5407' '8537' '5407']


In [32]:
#ดูค่า Performance

from sklearn.metrics import classification_report
print (classification_report(arr_class, all_predictions))

  _warn_prf(average, modifier, msg_start, len(result))


              precision    recall  f1-score   support

        0101       1.00      0.22      0.36        18
        0102       1.00      0.89      0.94        18
        0103       1.00      1.00      1.00        18
        0104       0.00      0.00      0.00        12
        0105       1.00      1.00      1.00        35
        0106       0.91      0.45      0.61        44
        0201       0.00      0.00      0.00        18
        0202       0.00      0.00      0.00        18
        0203       1.00      0.50      0.67        36
        0204       0.57      0.89      0.70        54
        0205       0.00      0.00      0.00         6
        0206       0.90      1.00      0.95        54
        0207       0.82      1.00      0.90        87
        0208       1.00      0.07      0.13        29
        0209       0.00      0.00      0.00         8
        0210       1.00      1.00      1.00        42
        0301       0.00      0.00      0.00        38
        0302       0.65    