In [1]:
import numpy as np
import pandas as pd
import gdown
import jieba

from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC

In [2]:
# Load the labelled training titles into a DataFrame and display it.
# NOTE(review): the path is absolute and machine-specific — the notebook will
# only run where this exact folder exists.
df_train = pd.read_csv(
    filepath_or_buffer="C:/Users/CHARLIE/Desktop/文字探勘/文本分類練習/ptt_data_10k_train.csv"
)
df_train

Unnamed: 0,id,board,title
0,0,Baseball,[討論] 龍隊2019真選的好
1,1,Baseball,[分享] 中職33年失誤塔 ~07/03
2,2,Baseball,[分享] 今日龍喵戰 8上 喵喵瘦邱滿壘連3K拆彈
3,3,Baseball,[分享] 鄭凱文本週未上場
4,4,Baseball,[分享] 中職33年HR塔 ~07/03 Week14
...,...,...,...
9995,9995,Stock,[新聞] 5年吸金7.4億！富二代+精算師組「東吳幫5
9996,9996,Stock,[閒聊] 2022/05/11 盤後閒聊
9997,9997,Stock,[新聞] 台股最壞已過？ 16檔營收創高股 找買點
9998,9998,Stock,[情報] 1731 美吾華 Q1


In [7]:
# Encode each board name as an integer class label (1-5) in a new 'label' column.
# NOTE(review): Series.map leaves NaN for any board not listed here — confirm
# that the five boards below cover the whole training file.
df_train['label'] = df_train['board'].map(
    {
        'Stock': 1,
        'Gossiping': 2,
        'Baseball': 3,
        'Lifeismoney': 4,
        'C_Chat': 5,
    }
)
df_train

Unnamed: 0,id,board,title,label
0,0,Baseball,[討論] 龍隊2019真選的好,3
1,1,Baseball,[分享] 中職33年失誤塔 ~07/03,3
2,2,Baseball,[分享] 今日龍喵戰 8上 喵喵瘦邱滿壘連3K拆彈,3
3,3,Baseball,[分享] 鄭凱文本週未上場,3
4,4,Baseball,[分享] 中職33年HR塔 ~07/03 Week14,3
...,...,...,...,...
9995,9995,Stock,[新聞] 5年吸金7.4億！富二代+精算師組「東吳幫5,1
9996,9996,Stock,[閒聊] 2022/05/11 盤後閒聊,1
9997,9997,Stock,[新聞] 台股最壞已過？ 16檔營收創高股 找買點,1
9998,9998,Stock,[情報] 1731 美吾華 Q1,1


In [9]:
# Tokenise every title with jieba and keep the tokens as one space-separated
# string per row; this string column is what the vectorizers are fed later.
df_train['title_segment'] = df_train['title'].map(
    lambda title: ' '.join(jieba.lcut(title))
)
df_train.head(5)

Unnamed: 0,id,board,title,label,title_segment
0,0,Baseball,[討論] 龍隊2019真選的好,3,[ 討論 ] 龍隊 2019 真選 的 好
1,1,Baseball,[分享] 中職33年失誤塔 ~07/03,3,[ 分享 ] 中職 33 年 失誤 塔 ~ 07 / 03
2,2,Baseball,[分享] 今日龍喵戰 8上 喵喵瘦邱滿壘連3K拆彈,3,[ 分享 ] 今日 龍 喵 戰 8 上 喵 喵 瘦 邱滿壘連 3K 拆彈
3,3,Baseball,[分享] 鄭凱文本週未上場,3,[ 分享 ] 鄭凱 文本 週未 上場
4,4,Baseball,[分享] 中職33年HR塔 ~07/03 Week14,3,[ 分享 ] 中職 33 年 HR 塔 ~ 07 / 03 Week14


In [11]:
# Hold out 33% of the labelled titles for evaluation; the fixed random_state
# keeps the split reproducible between runs.
X_train, X_test, y_train, y_test = train_test_split(
    df_train['title_segment'],
    df_train['label'],
    test_size=0.33,
    random_state=42,
)
# Show how many samples landed in each split.
print(y_train.shape, y_test.shape, sep='\n')

(6700,)
(3300,)


In [12]:
def show_result(predicted, predicted_proba, target):
    """Print a short evaluation report for one classifier.

    The report shows the first three predicted labels, the first three rows of
    predicted class probabilities, and the overall accuracy.

    Parameters
    ----------
    predicted : array-like
        Predicted class labels, one per sample.
    predicted_proba : array-like
        Predicted class probabilities; only the first three rows are printed.
    target : array-like
        True class labels, compared element-wise with ``predicted``.

    Returns
    -------
    None
        The report is written to stdout only.
    """
    separator = '*' * 50

    print(separator)
    print('predicted class of first 3 test data')
    print(predicted[:3])

    print(separator)
    print('predicted class proba. of first 3 test data')
    print(predicted_proba[:3])

    # Accuracy is the fraction of predictions equal to the true label.
    # It is computed once and reused, rather than evaluated and discarded.
    accuracy = np.mean(predicted == target)
    print(separator)
    print('accuracy performance on test data')
    print(accuracy)

# **Feature vectorization**

**Using CountVectorizer**

In [13]:
# Build raw term-count features: the vocabulary is learned from the training
# split only (capped at 1000 terms) and then reused to encode the test split.
# NOTE(review): with the default token_pattern, scikit-learn keeps only tokens
# of two or more word characters, so single-character jieba tokens are
# presumably dropped — confirm this is intended for Chinese text.
count_vect = CountVectorizer(max_features=1000)
X_train_counts = count_vect.fit_transform(X_train)
X_test_counts = count_vect.transform(X_test)

# Report the resulting matrix shapes (rows = titles, columns = vocabulary terms).
print('train data shape using CountVectorizer', X_train_counts.shape, sep='\n')
print('test data shape using CountVectorizer', X_test_counts.shape, sep='\n')

train data shape using CountVectorizer
(6700, 1000)
test data shape using CountVectorizer
(3300, 1000)


**Using TfidfVectorizer**

In [14]:
# Build tf-idf features: vocabulary and idf weights are learned from the
# training split only (capped at 1000 terms) and reused for the test split.
tfidf_vect = TfidfVectorizer(max_features=1000)
X_train_tfidf = tfidf_vect.fit_transform(X_train)
X_test_tfidf = tfidf_vect.transform(X_test)

# Report the resulting matrix shapes (rows = titles, columns = vocabulary terms).
print('train data shape using TfidfVectorizer', X_train_tfidf.shape, sep='\n')
print('test data shape using TfidfVectorizer', X_test_tfidf.shape, sep='\n')

train data shape using TfidfVectorizer
(6700, 1000)
test data shape using TfidfVectorizer
(3300, 1000)


# **Create classifier**

**Naive Bayes classifier with CountVectorizer**

In [15]:
# Multinomial Naive Bayes trained on the raw term-count features.
MultinomialNB_clf = MultinomialNB()
print('*' * 50, 'MultinomialNB classifier with CountVectorizer', MultinomialNB_clf, sep='\n')

# Train on the training split.
MultinomialNB_clf.fit(X_train_counts, y_train)

# Hard labels and per-class probabilities for the held-out split.
predicted = MultinomialNB_clf.predict(X_test_counts)
predicted_proba = MultinomialNB_clf.predict_proba(X_test_counts)

# Print sample predictions and the accuracy against the true labels.
show_result(predicted, predicted_proba, y_test)

**************************************************
MultinomialNB classifier with CountVectorizer
MultinomialNB()
**************************************************
predicted class of first 3 test data
[4 2 3]
**************************************************
predicted class proba. of first 3 test data
[[6.36606174e-04 1.26315715e-06 2.50758415e-04 9.98969726e-01
  1.41646654e-04]
 [1.58002593e-01 6.54539316e-01 1.41144175e-01 3.69069383e-02
  9.40697794e-03]
 [1.42567536e-03 7.32995846e-04 9.16048219e-01 6.52838109e-05
  8.17278262e-02]]
**************************************************
accuracy performance on test data
0.8978787878787878


**Naive Bayes classifier with TfidfVectorizer**

In [16]:
# Create classifier and use tf-idf vectors
MultinomialNB_clf = MultinomialNB()
print('*'*50)
print('MultinomialNB classifier with TfidfVectorizer')
print(MultinomialNB_clf)

# fit train data
MultinomialNB_clf.fit(X_train_tfidf, y_train)

# predict the class and class proba.
predicted = MultinomialNB_clf.predict(X_test_tfidf)
predicted_proba = MultinomialNB_clf.predict_proba(X_test_tfidf)

show_result(predicted, predicted_proba, y_test)

**************************************************
MultinomialNB classifier with TfidfVectorizer
MultinomialNB()
**************************************************
predicted class of first 3 test data
[4 2 3]
**************************************************
predicted class proba. of first 3 test data
[[0.01835198 0.0054261  0.0153065  0.94730248 0.01361294]
 [0.1377165  0.50853325 0.12590549 0.16756342 0.06028135]
 [0.03923814 0.04964457 0.65587961 0.01586171 0.23937597]]
**************************************************
accuracy performance on test data
0.9006060606060606


**KNN classifier with CountVectorizer**

In [17]:
# 5-nearest-neighbours classifier (neighbours weighted by distance) trained on
# the raw term-count features.
KNeighborsClassifier_clf = KNeighborsClassifier(n_neighbors=5, weights='distance')
print('*' * 50, 'KNeighbors classifier with CountVectorizer', KNeighborsClassifier_clf, sep='\n')

# Train on the training split.
KNeighborsClassifier_clf.fit(X_train_counts, y_train)

# Hard labels and per-class probabilities for the held-out split.
predicted = KNeighborsClassifier_clf.predict(X_test_counts)
predicted_proba = KNeighborsClassifier_clf.predict_proba(X_test_counts)

# Print sample predictions and the accuracy against the true labels.
show_result(predicted, predicted_proba, y_test)

**************************************************
KNeighbors classifier with CountVectorizer
KNeighborsClassifier(weights='distance')
**************************************************
predicted class of first 3 test data
[4 2 3]
**************************************************
predicted class proba. of first 3 test data
[[0.  0.  0.  1.  0. ]
 [0.  0.5 0.  0.5 0. ]
 [0.  0.  1.  0.  0. ]]
**************************************************
accuracy performance on test data
0.8575757575757575


**KNN classifier with TfidfVectorizer**

In [18]:
# 5-nearest-neighbours classifier (neighbours weighted by distance) trained on
# the tf-idf features.
KNeighborsClassifier_clf = KNeighborsClassifier(n_neighbors=5, weights='distance')
print('*' * 50, 'KNeighbors classifier with TfidfVectorizer', KNeighborsClassifier_clf, sep='\n')

# Train on the training split.
KNeighborsClassifier_clf.fit(X_train_tfidf, y_train)

# Hard labels and per-class probabilities for the held-out split.
predicted = KNeighborsClassifier_clf.predict(X_test_tfidf)
predicted_proba = KNeighborsClassifier_clf.predict_proba(X_test_tfidf)

# Print sample predictions and the accuracy against the true labels.
show_result(predicted, predicted_proba, y_test)

**************************************************
KNeighbors classifier with TfidfVectorizer
KNeighborsClassifier(weights='distance')
**************************************************
predicted class of first 3 test data
[4 2 3]
**************************************************
predicted class proba. of first 3 test data
[[0.         0.         0.         1.         0.        ]
 [0.         0.5        0.         0.5        0.        ]
 [0.         0.         0.78953298 0.21046702 0.        ]]
**************************************************
accuracy performance on test data
0.8306060606060606


**SVM classifier with CountVectorizer**

In [19]:
# Support-vector classifier trained on the raw term-count features.
# probability=True is required so that predict_proba is available.
SVC_clf = SVC(probability=True)
print('*'*50)
print('SVM classifier with CountVectorizer')
print(SVC_clf)

# fit train data
SVC_clf.fit(X_train_counts, y_train)

# predict the class and class proba.
predicted = SVC_clf.predict(X_test_counts)
predicted_proba = SVC_clf.predict_proba(X_test_counts)

# Report through the shared show_result helper, as the Naive Bayes and KNN
# cells do, instead of an inline copy of the same printing code. This also
# drops the accuracy expression whose value was computed and then discarded.
show_result(predicted, predicted_proba, y_test)

**************************************************
SVM classifier with CountVectorizer
SVC(probability=True)
**************************************************
predicted class of first 3 test data
[4 2 3]
**************************************************
predicted class proba. of first 3 test data
[[5.18583655e-03 7.09946276e-05 5.84816675e-03 9.80991401e-01
  7.90360088e-03]
 [5.46473691e-02 9.23386149e-01 1.22091871e-02 5.78123300e-03
  3.97606228e-03]
 [3.91428241e-03 7.21823394e-04 9.33599821e-01 2.09819663e-02
  4.07821069e-02]]
**************************************************
accuracy performance on test data
0.8930303030303031


**SVM classifier with TfidfVectorizer**

In [20]:
# Support-vector classifier trained on the tf-idf features.
# probability=True is required so that predict_proba is available.
SVC_clf = SVC(probability=True)
print('*'*50)
print('SVM classifier with TfidfVectorizer')
print(SVC_clf)

# fit train data
SVC_clf.fit(X_train_tfidf, y_train)

# predict the class and class proba.
predicted = SVC_clf.predict(X_test_tfidf)
predicted_proba = SVC_clf.predict_proba(X_test_tfidf)

# Report through the shared show_result helper, as the Naive Bayes and KNN
# cells do, instead of an inline copy of the same printing code. This also
# drops the accuracy expression whose value was computed and then discarded.
show_result(predicted, predicted_proba, y_test)

**************************************************
SVM classifier with TfidfVectorizer
SVC(probability=True)
**************************************************
predicted class of first 3 test data
[4 2 3]
**************************************************
predicted class proba. of first 3 test data
[[4.07941589e-03 2.00945939e-04 3.37291782e-03 9.86783231e-01
  5.56348941e-03]
 [4.34397575e-02 9.16255665e-01 1.98493494e-02 1.70596329e-02
  3.39559537e-03]
 [7.75485249e-03 2.22760783e-03 9.40030202e-01 1.19049515e-02
  3.80823865e-02]]
**************************************************
accuracy performance on test data
0.9112121212121213


# **作業**

In [24]:
# Load the titles to be classified for the assignment and display them.
# NOTE(review): the path is absolute and machine-specific — the notebook will
# only run where this exact folder exists.
df_test = pd.read_csv(
    filepath_or_buffer="C:/Users/CHARLIE/Desktop/文字探勘/文本分類練習/ptt_data_50k_test.csv"
)
df_test

Unnamed: 0,id,title
0,0,[閒聊] 西門來演《小美人魚》會怎樣？
1,1,[新聞] 長榮現金減資六成 9/7~16舊股票停買
2,2,[情報] 大國34% off 優惠 日本直送免國際運費
3,3,[新聞] 外資狠降目標價 台積電、聯發科抱頭哭
4,4,[情報] GoShare騎乘金$20
...,...,...
49995,49995,[討論] 巔峰期的Trout跟Pujols兩人你會選誰？
49996,49996,[holo] 3D小劇場 第172話 今天開始蔬菜生活!
49997,49997,[情報] 0527上市外資買賣超排行
49998,49998,[討論] 柏融還有機會把數據刷好看一點嗎


In [25]:
# Tokenise the assignment titles the same way as the training titles: jieba
# tokens joined into one space-separated string per row.
df_test['title_segment'] = df_test['title'].map(
    lambda title: ' '.join(jieba.lcut(title))
)
df_test.head(5)

Unnamed: 0,id,title,title_segment
0,0,[閒聊] 西門來演《小美人魚》會怎樣？,[ 閒聊 ] 西門 來演 《 小美人 魚 》 會 怎樣 ？
1,1,[新聞] 長榮現金減資六成 9/7~16舊股票停買,[ 新聞 ] 長 榮現 金 減資 六成 9 / 7 ~ 16 舊 股票 停買
2,2,[情報] 大國34% off 優惠 日本直送免國際運費,[ 情報 ] 大國 34% off 優惠 日本 直送 免國際 運費
3,3,[新聞] 外資狠降目標價 台積電、聯發科抱頭哭,[ 新聞 ] 外資狠 降目 標價 台積電 、 聯發科 抱頭 哭
4,4,[情報] GoShare騎乘金$20,[ 情報 ] GoShare 騎乘金 $ 20


In [27]:
# Use the segmented assignment titles as the prediction input.
# NOTE(review): this rebinds X_test, which previously held the held-out part of
# the training data; cells that evaluate against y_test should not be re-run
# after this point without re-running the train/test split first.
X_test = df_test.loc[:, 'title_segment']

In [28]:
# Encode the assignment titles with the CountVectorizer that was already fitted
# on the training split. Only transform() is called here, so the feature
# columns stay aligned with the ones the classifiers were trained on.
X_test_counts = count_vect.transform(X_test)
print('test data shape using CountVectorizer', X_test_counts.shape, sep='\n')

test data shape using CountVectorizer
(50000, 1000)


In [29]:
# Encode the assignment titles with the TfidfVectorizer that was already fitted
# on the training split. Only transform() is called here, so the feature
# columns stay aligned with the ones the classifiers were trained on.
X_test_tfidf = tfidf_vect.transform(X_test)
print('test data shape using TfidfVectorizer', X_test_tfidf.shape, sep='\n')

test data shape using TfidfVectorizer
(50000, 1000)


In [30]:
# Support-vector classifier on term-count features, applied to the assignment
# titles. No accuracy is printed because this cell has no true labels to use.
SVC_clf = SVC(probability=True)
print('*' * 50, 'SVM classifier with CountVectorizer', SVC_clf, sep='\n')

# Train on the training split.
SVC_clf.fit(X_train_counts, y_train)

# Hard labels and per-class probabilities for the assignment titles.
predicted = SVC_clf.predict(X_test_counts)
predicted_proba = SVC_clf.predict_proba(X_test_counts)

# Show a small sample of the predictions.
print('*' * 50, 'predicted class of first 3 test data', predicted[:3], sep='\n')
print('*' * 50, 'predicted class proba. of first 3 test data', predicted_proba[:3], sep='\n')

**************************************************
SVM classifier with CountVectorizer
SVC(probability=True)
**************************************************
predicted class of first 3 test data
[5 1 4]
**************************************************
predicted class proba. of first 3 test data
[[1.15042928e-06 3.38764125e-07 8.85663550e-07 4.13160004e-07
  9.99997212e-01]
 [9.69866940e-01 1.76645291e-02 8.85519790e-03 1.18249510e-03
  2.43083743e-03]
 [3.67141587e-03 1.16797037e-04 5.71843439e-03 9.79986758e-01
  1.05065943e-02]]


In [31]:
# Support-vector classifier on tf-idf features, applied to the assignment
# titles. No accuracy is printed because this cell has no true labels to use.
SVC_clf = SVC(probability=True)
print('*' * 50, 'SVM classifier with TfidfVectorizer', SVC_clf, sep='\n')

# Train on the training split.
SVC_clf.fit(X_train_tfidf, y_train)

# Hard labels and per-class probabilities for the assignment titles.
predicted = SVC_clf.predict(X_test_tfidf)
predicted_proba = SVC_clf.predict_proba(X_test_tfidf)

# Show a small sample of the predictions.
print('*' * 50, 'predicted class of first 3 test data', predicted[:3], sep='\n')
print('*' * 50, 'predicted class proba. of first 3 test data', predicted_proba[:3], sep='\n')

**************************************************
SVM classifier with TfidfVectorizer
SVC(probability=True)
**************************************************
predicted class of first 3 test data
[5 1 4]
**************************************************
predicted class proba. of first 3 test data
[[1.44368060e-06 8.94466878e-07 1.64160215e-06 7.37114392e-07
  9.99995283e-01]
 [9.46039614e-01 3.17201641e-02 1.51580723e-02 2.40796068e-03
  4.67418901e-03]
 [1.43381479e-03 1.28361881e-04 2.06861139e-03 9.89003885e-01
  7.36532714e-03]]


In [34]:
# Start the submission from a fresh copy of the assignment file and attach the
# predicted integer labels as a 'board' column.
# NOTE(review): `predicted` is whatever the most recently executed prediction
# cell assigned — presumably the tf-idf SVM; confirm before submitting.
df_submission = pd.read_csv(
    filepath_or_buffer='C:/Users/CHARLIE/Desktop/文字探勘/文本分類練習/ptt_data_50k_test.csv'
).assign(board=predicted)
df_submission

Unnamed: 0,id,title,board
0,0,[閒聊] 西門來演《小美人魚》會怎樣？,5
1,1,[新聞] 長榮現金減資六成 9/7~16舊股票停買,1
2,2,[情報] 大國34% off 優惠 日本直送免國際運費,4
3,3,[新聞] 外資狠降目標價 台積電、聯發科抱頭哭,1
4,4,[情報] GoShare騎乘金$20,4
...,...,...,...
49995,49995,[討論] 巔峰期的Trout跟Pujols兩人你會選誰？,3
49996,49996,[holo] 3D小劇場 第172話 今天開始蔬菜生活!,5
49997,49997,[情報] 0527上市外資買賣超排行,1
49998,49998,[討論] 柏融還有機會把數據刷好看一點嗎,3


In [35]:
# Translate the integer class labels back into board names, using the inverse
# of the mapping applied to the training data.
df_submission['board'] = df_submission['board'].map(
    {
        1: 'Stock',
        2: 'Gossiping',
        3: 'Baseball',
        4: 'Lifeismoney',
        5: 'C_Chat',
    }
)
df_submission

Unnamed: 0,id,title,board
0,0,[閒聊] 西門來演《小美人魚》會怎樣？,C_Chat
1,1,[新聞] 長榮現金減資六成 9/7~16舊股票停買,Stock
2,2,[情報] 大國34% off 優惠 日本直送免國際運費,Lifeismoney
3,3,[新聞] 外資狠降目標價 台積電、聯發科抱頭哭,Stock
4,4,[情報] GoShare騎乘金$20,Lifeismoney
...,...,...,...
49995,49995,[討論] 巔峰期的Trout跟Pujols兩人你會選誰？,Baseball
49996,49996,[holo] 3D小劇場 第172話 今天開始蔬菜生活!,C_Chat
49997,49997,[情報] 0527上市外資買賣超排行,Stock
49998,49998,[討論] 柏融還有機會把數據刷好看一點嗎,Baseball


In [38]:
# Write the submission file without the DataFrame's row index.
# NOTE(review): the 'Unnamed: 0' column read from the source CSV is still
# written out — confirm the expected submission format.
df_submission.to_csv(
    path_or_buf='C:/Users/CHARLIE/Desktop/文字探勘/文本分類練習/ptt_data_50k_submission.csv',
    index=False,
)