In [1]:
from sklearn.datasets import load_files
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import MultinomialNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import classification_report
from sklearn.metrics import accuracy_score
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import KFold
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import AdaBoostClassifier
from sklearn.ensemble import RandomForestClassifier
from matplotlib import pyplot as plt
import pandas as pd

In [2]:
df = pd.read_csv('./data/merged.csv')
df = df[['Singer', 'Name', 'Lyric', 'Category']].astype(str)
df.drop(df[df.Singer == 'nan'].index, inplace=True)
df.drop(df[df.Name == 'Name'].index, inplace=True)
print("\033[32mTotal number of data: %d\033[0m" %len(df))

[32mTotal number of data: 991[0m


In [3]:
d = {'Category': df['Category'].value_counts().index, 'count': df['Category'].value_counts()}
df_cat = pd.DataFrame(data = d).reset_index(drop = True)
df_cat

Unnamed: 0,Category,count
0,愛情,442
1,想法表達,163
2,反抗反駁反諷,87
3,勵志,83
4,悲傷,70
5,嗨歌,63
6,離別,30
7,友情,18
8,特殊主題,17
9,親情,13


In [4]:
df.loc[df.Category == '勵志','Category'] = '勵志友情'
df.loc[df.Category == '友情','Category'] = '勵志友情'
df.loc[df.Category == '悲傷','Category'] = '悲傷離別'
df.loc[df.Category == '離別','Category'] = '悲傷離別'
df

Unnamed: 0,Singer,Name,Lyric,Category
0,A-Lin,One Life,白雲撞進藍天 激起昨天畫面我想起你的臉好久不見 在遠方的My Friend不論晴天雨天 也會...,勵志友情
1,A-Lin,最好的朋友在身邊,一起笑一起哭一起鬧久別的回憶又浮現一點點一天天一年年最好的朋友在身邊一起走一起跑一起跳相聚到...,勵志友情
2,A-Lin,愛的可能,你出現我身邊 像個奇蹟發生沒想到會是你 讓我如此失魂我心中的感覺是這樣陌生快樂的牽掛在相聚的...,勵志友情
3,A-Lin,太太太耐斯,孩子還沒睡 拜託別鬧手遊聲太吵 狗狗在叫碗盤在洗碗槽 主婦的煩惱惱公不瞭 他先洗澡奶瓶把香水...,反抗反駁反諷
4,A-Lin,抱歉 我不抱歉,恨 什麼叫恨 這個單字 好陌生我只記得 一路走來 愛的人就算他們 多可恨等 雖然溫柔 還打不...,反抗反駁反諷
...,...,...,...,...
992,鄧紫棋,回憶的沙漏,拼圖一片片失落　像楓葉的冷漠牆上的鐘　默默數著寂寞咖啡飄散過香味　剩苦澀陪著我想念的心　埋葬...,悲傷離別
993,鄧紫棋,多遠都要在一起,想聽你聽過的音樂 想看你看過的小說我想收集每一刻 我想看到你眼裡的世界想到你到過的地方 和你...,悲傷離別
994,鄧紫棋,兩個自己,像 從不認識你像全部的回憶 都已被你拋棄無法看透你直覺你心裡有 太多的秘密被你蒙住了眼...,悲傷離別
995,鄧紫棋,兩個你(粵),像 未曾認識你像全部憶記 你早拋棄太異樣 想要望穿你直覺這刻你有 太多可揭秘這場遊戲...,悲傷離別


In [5]:
d = {'Category': df['Category'].value_counts().index, 'count': df['Category'].value_counts()}
df_cat = pd.DataFrame(data = d).reset_index(drop = True)
df_cat

Unnamed: 0,Category,count
0,愛情,442
1,想法表達,163
2,勵志友情,101
3,悲傷離別,100
4,反抗反駁反諷,87
5,嗨歌,63
6,特殊主題,17
7,親情,13
8,感謝,5


In [7]:
condition = df['Category'] == '愛情'
df_select = df[condition].sample(100)
condition = df['Category'] == '想法表達'
df_tmp = df[condition].sample(100)
df_select = pd.concat([df_select, df_tmp])
condition = df['Category'] == '勵志友情'
df_tmp = df[condition].sample(100)
df_select = pd.concat([df_select, df_tmp])
condition = df['Category'] == '悲傷離別'
df_tmp = df[condition].sample(100)
df_select = pd.concat([df_select, df_tmp])
condition = df['Category'] == '反抗反駁反諷'
df_tmp = df[condition].sample(87)
df_select = pd.concat([df_select, df_tmp])

df_select = df_select.reset_index(drop=True)

In [8]:
d = {'Category': df_select['Category'].value_counts().index, 'count': df_select['Category'].value_counts()}
df_cat = pd.DataFrame(data = d).reset_index(drop = True)
df_cat

Unnamed: 0,Category,count
0,愛情,100
1,想法表達,100
2,勵志友情,100
3,悲傷離別,100
4,反抗反駁反諷,87


In [9]:
list(df_select['Category'].value_counts().index)

['愛情', '想法表達', '勵志友情', '悲傷離別', '反抗反駁反諷']

In [10]:
def remove_punctuation(line):
    line = str(line)
    if line.strip() == '':
        return ''
    re_han = re.compile(u"[^a-zA-Z0-9\u4E00-\u9FA5]")
    line = re_han.sub('', line)
    return line

In [11]:
import re
import jieba
stopwords = [line.strip() for line in open('./data/stopwords.txt', 'r', encoding = 'utf-8').readlines()]
df_select['clean_text'] = df_select['Text'].apply(remove_punctuation)
df_select['cut_text'] = df_select['clean_text'].apply(lambda x: [w for w in list(jieba.cut(x)) if (w not in stopwords and w != ' ')])
df_select['cut_text_string'] = [' '.join(map(str, l)) for l in df_select['cut_text']]
df_select

Building prefix dict from the default dictionary ...
Loading model from cache /tmp/jieba.cache
Loading model cost 0.926 seconds.
Prefix dict has been built successfully.


Unnamed: 0,Text,Label,clean_text,cut_text,cut_text_string
0,我總該想個辦法，如果哪天他不在了我要如何讓我的負能量有個發洩的出口,正向,我總該想個辦法如果哪天他不在了我要如何讓我的負能量有個發洩的出口,"[總該, 想個, 辦法, 我的負, 能量, 有個, 發洩, 出口]",總該 想個 辦法 我的負 能量 有個 發洩 出口
1,我以為我會死 但我真的捨不得我愛的人難過 我好害怕看到他們失望的眼神,正向,我以為我會死但我真的捨不得我愛的人難過我好害怕看到他們失望的眼神,"[死, 真的, 捨, 我愛的, 人, 難過, 好, 害怕, 看到, 失望, 眼神]",死 真的 捨 我愛的 人 難過 好 害怕 看到 失望 眼神
2,此時此刻 我只想有人聽聽我說的話 我還不會死,正向,此時此刻我只想有人聽聽我說的話我還不會死,"[此時, 此刻, 只, 想, 有人, 聽, 聽, 說, 話, 還不會, 死]",此時 此刻 只 想 有人 聽 聽 說 話 還不會 死
3,和她講惡夢的那天，我在她懷裡猶豫了很久才開口，以前一直認為我可以很理智的講出這段故事，但真正...,正向,和她講惡夢的那天我在她懷裡猶豫了很久才開口以前一直認為我可以很理智的講出這段故事但真正講出來...,"[講惡, 夢, 那天, 懷裡, 猶豫, 久, 開口, 以前, 一直, 認為, 理智, 講出,...",講惡 夢 那天 懷裡 猶豫 久 開口 以前 一直 認為 理智 講出 這段 故事 真正 講出 ...
4,為什麼他們可以撐得住 我卻不行呢 所以你還可以撐下去的 只要再撐一下下就可以了 再一下下,正向,為什麼他們可以撐得住我卻不行呢所以你還可以撐下去的只要再撐一下下就可以了再一下下,"[撐, 住, 卻, 不行, 你還, 撐, 下去, 再撐, 一下下, 再, 一下下]",撐 住 卻 不行 你還 撐 下去 再撐 一下下 再 一下下
...,...,...,...,...,...
395,我不確定這是不是大家說的情緒障礙還是躁鬱症，但我真的覺得很無助……,非正向,我不確定這是不是大家說的情緒障礙還是躁鬱症但我真的覺得很無助,"[確定, 是不是, 說, 情緒, 障礙, 躁鬱症, 真的, 覺得, 無助]",確定 是不是 說 情緒 障礙 躁鬱症 真的 覺得 無助
396,她就直接衝過來抓住我的手臂 開始拉扯我頭髮 用力的打我的頭 接著她破口大罵 罵了一堆難聽到不行的話,非正向,她就直接衝過來抓住我的手臂開始拉扯我頭髮用力的打我的頭接著她破口大罵罵了一堆難聽到不行的話,"[直接, 衝過, 抓住, 手臂, 拉扯, 我頭, 髮, 用力, 頭接, 破口, 大罵, 罵,...",直接 衝過 抓住 手臂 拉扯 我頭 髮 用力 頭接 破口 大罵 罵 一堆 難 聽 不行 話
397,對很多人來說我可能是無病呻吟，但是我真的覺得生活過的很累，但我又不知道該怎麼改善,非正向,對很多人來說我可能是無病呻吟但是我真的覺得生活過的很累但我又不知道該怎麼改善,"[人來, 說, 無病, 呻吟, 真的, 覺得, 生活, 過的, 累, 知道, 改善]",人來 說 無病 呻吟 真的 覺得 生活 過的 累 知道 改善
398,我身邊一個深受性暴力之苦的朋友上吊走了。 性是真的可以殺人的。我身邊好幾個受過性暴力傷害的人...,非正向,我身邊一個深受性暴力之苦的朋友上吊走了性是真的可以殺人的我身邊好幾個受過性暴力傷害的人都很奮...,"[身邊, 一個, 深受, 性暴力, 之苦, 朋友, 上吊, 走, 性, 真的, 殺人, 身邊...",身邊 一個 深受 性暴力 之苦 朋友 上吊 走 性 真的 殺人 身邊 好 幾個 受過 性暴力...


In [12]:
test = df_select.drop(['clean_text', 'cut_text_string'], axis = 1)
test.head(10)

Unnamed: 0,Text,Label,cut_text
0,我總該想個辦法，如果哪天他不在了我要如何讓我的負能量有個發洩的出口,正向,"[總該, 想個, 辦法, 我的負, 能量, 有個, 發洩, 出口]"
1,我以為我會死 但我真的捨不得我愛的人難過 我好害怕看到他們失望的眼神,正向,"[死, 真的, 捨, 我愛的, 人, 難過, 好, 害怕, 看到, 失望, 眼神]"
2,此時此刻 我只想有人聽聽我說的話 我還不會死,正向,"[此時, 此刻, 只, 想, 有人, 聽, 聽, 說, 話, 還不會, 死]"
3,和她講惡夢的那天，我在她懷裡猶豫了很久才開口，以前一直認為我可以很理智的講出這段故事，但真正...,正向,"[講惡, 夢, 那天, 懷裡, 猶豫, 久, 開口, 以前, 一直, 認為, 理智, 講出,..."
4,為什麼他們可以撐得住 我卻不行呢 所以你還可以撐下去的 只要再撐一下下就可以了 再一下下,正向,"[撐, 住, 卻, 不行, 你還, 撐, 下去, 再撐, 一下下, 再, 一下下]"
5,謝謝所有溫暖的留言，覺得心裡暖暖的。還好，原來真的不是我想的太嚴重(哭,正向,"[謝謝, 所有, 溫暖, 留言, 覺得, 心裡, 暖暖的, 還好, 原來, 真的, 想, 太..."
6,現在的我 過的很快樂 很多事情我也不在繞進死胡同 曾經我也是被憂鬱情緒綁架的人 但現在 ...,正向,"[現在, 我過, 很快, 樂, 事情, 繞, 進, 死胡同, 曾經, 憂鬱, 情緒, 綁, ..."
7,有時也是會很享受一個人的時候,正向,"[有時, 享受, 一個, 人]"
8,如果有天我不幸的去世了，請記得我會永遠用我的方式陪在妳身邊，我會永遠支持妳。,正向,"[有天, 不幸, 去世, 請, 記得, 我會, 永遠用, 方式, 陪, 妳, 身邊, 我會永..."
9,"搬房間讓我覺得很興奮,心情比較好.有事做讓我不會想太多,也比較舒服,也心情比較好.換個房間,...",正向,"[搬, 房間, 覺得, 興奮, 心情, 比較, 好, 有事, 做, 想, 太, 舒服, 心情..."


In [13]:
df_select = df_select.sample(frac = 1).reset_index(drop=True)
df_select

Unnamed: 0,Text,Label,clean_text,cut_text,cut_text_string
0,拜託痊癒 我跟上帝祈禱最後一次,正向,拜託痊癒我跟上帝祈禱最後一次,"[拜, 託, 痊, 癒, 上帝, 祈禱, 最, 後, 一次]",拜 託 痊 癒 上帝 祈禱 最 後 一次
1,我先看了心理諮商，每次去就好像是在聊天一樣，我沒有覺得我變好了，但至少我不那麼糟了,正向,我先看了心理諮商每次去就好像是在聊天一樣我沒有覺得我變好了但至少我不那麼糟了,"[先, 心理, 諮, 商, 每次, 好像, 聊天, 覺得, 變好, 至少, 糟]",先 心理 諮 商 每次 好像 聊天 覺得 變好 至少 糟
2,第一次執行殺了自己這個計畫,非正向,第一次執行殺了自己這個計畫,"[第一次, 執行, 殺, 計畫]",第一次 執行 殺 計畫
3,我從來沒有冒犯過妳，為什麼要這樣攻擊我？ 明明你一點也不了解我、卻愛跟我裝熟，現在還背著我捅...,非正向,我從來沒有冒犯過妳為什麼要這樣攻擊我明明你一點也不了解我卻愛跟我裝熟現在還背著我捅了一刀噁心...,"[冒犯, 妳, 攻擊, 明明, 一點, 了解, 卻, 愛跟, 我裝, 熟現, 還背, 捅, ...",冒犯 妳 攻擊 明明 一點 了解 卻 愛跟 我裝 熟現 還背 捅 一刀 噁 心 真的 太 噁 心
4,當然，我沒準時看醫生，沒好好吃藥 因為我已經自我放棄了⋯。,非正向,當然我沒準時看醫生沒好好吃藥因為我已經自我放棄了,"[沒準, 時, 醫生, 沒好, 好吃, 藥因, 已經, 自我, 放棄]",沒準 時 醫生 沒好 好吃 藥因 已經 自我 放棄
...,...,...,...,...,...
395,爸 我現在還沒有辦法說愛你 但我很珍惜你在家裡照顧我還有在外打拼的每一天,正向,爸我現在還沒有辦法說愛你但我很珍惜你在家裡照顧我還有在外打拼的每一天,"[爸, 我現, 還沒有, 辦法, 說, 愛, 珍惜, 在家, 裡照, 顧, 在外, 打拼, 一天]",爸 我現 還沒有 辦法 說 愛 珍惜 在家 裡照 顧 在外 打拼 一天
396,有幾天我的狀況有好一些，我開始會餓了，也會笑了 也不怕人群和密閉空間了,正向,有幾天我的狀況有好一些我開始會餓了也會笑了也不怕人群和密閉空間了,"[幾天, 狀況, 好, 一些, 餓, 笑, 人群, 密閉, 空間]",幾天 狀況 好 一些 餓 笑 人群 密閉 空間
397,然後他就會莫名其妙的罵我說 ：「安ㄋㄨㄚ！緊嘛是安抓？民西累耖三小？」 （怎樣？現在是怎樣？...,非正向,然後他就會莫名其妙的罵我說安緊嘛是安抓民西累耖三小怎樣現在是怎樣臉是在臭三小邊罵我邊大力的捏...,"[然後他, 莫名其妙, 罵, 說安緊, 安, 抓民西累, 耖, 小, 現在, 臉, 三小邊,...",然後他 莫名其妙 罵 說安緊 安 抓民西累 耖 小 現在 臉 三小邊 罵 我邊 大力 捏 臉 罵
398,失眠到了某個晚上，我突然開始拔頭髮,非正向,失眠到了某個晚上我突然開始拔頭髮,"[失眠, 晚上, 突然, 拔頭, 髮]",失眠 晚上 突然 拔頭 髮


# Build Model

In [14]:
# 計算詞頻
count_vect = CountVectorizer(stop_words='english', decode_error='ignore')
X_train_counts = count_vect.fit_transform(df_select.cut_text_string)
# 計算TF-IDF
tf_transformer = TfidfVectorizer(stop_words='english', decode_error='ignore')
X_train_counts_tf = tf_transformer.fit_transform(df_select.cut_text_string)
labels = df_select.Label

In [15]:
terms = tf_transformer.get_feature_names()

# sum tfidf frequency of each term through documents
sums = X_train_counts_tf.sum(axis=0)

# connecting term to its sums frequency
data = []
for col, term in enumerate(terms):
    data.append( (term, sums[0,col] ))

ranking = pd.DataFrame(data, columns=['term','rank'])



In [16]:
test = ranking.sort_values('rank', ascending = False)
test.tail(20)

Unnamed: 0,term,rank
171,但太快,0.173423
1579,過大,0.173423
791,打擊會,0.173423
855,收進,0.173095
1507,超有,0.173095
1604,還有頭,0.173095
627,很長,0.173095
779,手指,0.173095
1448,角落,0.173095
352,吃醋,0.173095


In [17]:
from sklearn.model_selection import train_test_split
x_train, x_test, y_train, y_test = train_test_split(X_train_counts_tf, labels, test_size = 0.2, random_state = 0)

## Discriminative model

In [18]:
num_folds = 10
scoring = 'accuracy'

In [19]:
'''models = {}
models['LR'] = LogisticRegression() #逻辑回归
models['SVM'] = SVC() #支持向量机
models['CART'] = DecisionTreeClassifier() #分类与回归树
models['MNB'] = MultinomialNB() #朴素贝叶斯分类器
models['KNN'] = KNeighborsClassifier() #K近邻算法
results = []

for key in models:
    kfold = KFold(n_splits = num_folds)
    cv_results = cross_val_score(models[key], x_train, y_train, cv = kfold, scoring = scoring)
    results.append(cv_results)
    print('%s:%f(%f)' %(key,cv_results.mean(),cv_results.std()))'''

"models = {}\nmodels['LR'] = LogisticRegression() #逻辑回归\nmodels['SVM'] = SVC() #支持向量机\nmodels['CART'] = DecisionTreeClassifier() #分类与回归树\nmodels['MNB'] = MultinomialNB() #朴素贝叶斯分类器\nmodels['KNN'] = KNeighborsClassifier() #K近邻算法\nresults = []\n\nfor key in models:\n    kfold = KFold(n_splits = num_folds)\n    cv_results = cross_val_score(models[key], x_train, y_train, cv = kfold, scoring = scoring)\n    results.append(cv_results)\n    print('%s:%f(%f)' %(key,cv_results.mean(),cv_results.std()))"

In [20]:
'''# Logistic Regression
param_grid = {}
param_grid['C'] = list(range(0, 20, 2))
model = LogisticRegression()
kfold = KFold(n_splits = num_folds)
grid = GridSearchCV(estimator = model, param_grid = param_grid, scoring = scoring, cv = kfold)
grid_result = grid.fit(X = x_train, y = y_train)
print('best acc: %s with %s' % (grid_result.best_score_, grid_result.best_params_))'''

"# Logistic Regression\nparam_grid = {}\nparam_grid['C'] = list(range(0, 20, 2))\nmodel = LogisticRegression()\nkfold = KFold(n_splits = num_folds)\ngrid = GridSearchCV(estimator = model, param_grid = param_grid, scoring = scoring, cv = kfold)\ngrid_result = grid.fit(X = x_train, y = y_train)\nprint('best acc: %s with %s' % (grid_result.best_score_, grid_result.best_params_))"

In [21]:
'''# SVM
param_grid = {}
param_grid['C'] = list(range(0, 20, 2))
model = SVC()
kfold = KFold(n_splits = num_folds)
grid = GridSearchCV(estimator = model, param_grid = param_grid, scoring = scoring, cv = kfold)
grid_result = grid.fit(X = x_train, y = y_train)
print('best acc: %s with %s' % (grid_result.best_score_, grid_result.best_params_))'''

"# SVM\nparam_grid = {}\nparam_grid['C'] = list(range(0, 20, 2))\nmodel = SVC()\nkfold = KFold(n_splits = num_folds)\ngrid = GridSearchCV(estimator = model, param_grid = param_grid, scoring = scoring, cv = kfold)\ngrid_result = grid.fit(X = x_train, y = y_train)\nprint('best acc: %s with %s' % (grid_result.best_score_, grid_result.best_params_))"

In [22]:
'''# Naive Bayes
param_grid = {}
param_grid['alpha'] = [0.001, 0.01, 0.1, 1.0, 1.5, 2.0]
model = MultinomialNB()
kfold = KFold(n_splits = num_folds)
grid = GridSearchCV(estimator = model, param_grid = param_grid, scoring = scoring, cv = kfold)
grid_result = grid.fit(X = x_train, y = y_train)
print('best acc : %s with %s' % (grid_result.best_score_, grid_result.best_params_))'''

"# Naive Bayes\nparam_grid = {}\nparam_grid['alpha'] = [0.001, 0.01, 0.1, 1.0, 1.5, 2.0]\nmodel = MultinomialNB()\nkfold = KFold(n_splits = num_folds)\ngrid = GridSearchCV(estimator = model, param_grid = param_grid, scoring = scoring, cv = kfold)\ngrid_result = grid.fit(X = x_train, y = y_train)\nprint('best acc : %s with %s' % (grid_result.best_score_, grid_result.best_params_))"

## Ensemble Learning

In [23]:
'''ensembles = {}
ensembles['RF'] = RandomForestClassifier()   # random forest
ensembles['AB'] = AdaBoostClassifier()       # Adaboost
results = []
for key in ensembles:
    kfold = KFold(n_splits = num_folds)
    cv_results = cross_val_score(ensembles[key], x_train, y_train, cv = kfold, scoring = scoring)
    results.append(cv_results)
    print('%s : %f (%f)' % (key, cv_results.mean(), cv_results.std()))'''

"ensembles = {}\nensembles['RF'] = RandomForestClassifier()   # random forest\nensembles['AB'] = AdaBoostClassifier()       # Adaboost\nresults = []\nfor key in ensembles:\n    kfold = KFold(n_splits = num_folds)\n    cv_results = cross_val_score(ensembles[key], x_train, y_train, cv = kfold, scoring = scoring)\n    results.append(cv_results)\n    print('%s : %f (%f)' % (key, cv_results.mean(), cv_results.std()))"

In [24]:
'''# Random Forest
param_grid = {}
param_grid['n_estimators'] = list(range(20, 200, 20))
param_grid['max_depth'] = list(range(10, 30, 5))
model = RandomForestClassifier()
kfold = KFold(n_splits = num_folds)
grid = GridSearchCV(estimator = model, param_grid = param_grid, scoring = scoring, cv = kfold)
grid_result = grid.fit(X = x_train, y = y_train)
print('best acc : %s with %s' % (grid_result.best_score_, grid_result.best_params_))'''

"# Random Forest\nparam_grid = {}\nparam_grid['n_estimators'] = list(range(20, 200, 20))\nparam_grid['max_depth'] = list(range(10, 30, 5))\nmodel = RandomForestClassifier()\nkfold = KFold(n_splits = num_folds)\ngrid = GridSearchCV(estimator = model, param_grid = param_grid, scoring = scoring, cv = kfold)\ngrid_result = grid.fit(X = x_train, y = y_train)\nprint('best acc : %s with %s' % (grid_result.best_score_, grid_result.best_params_))"

In [25]:
'''# AdaBoost
param_grid = {}
param_grid['n_estimators'] = list(range(10, 150, 20))
param_grid['learning_rate'] = [0.001, 0.01, 0.1, 0.5, 1]
model = AdaBoostClassifier()
kfold = KFold(n_splits = num_folds)
grid = GridSearchCV(estimator = model, param_grid = param_grid, scoring = scoring, cv = kfold)
grid_result = grid.fit(X = x_train, y = y_train)
print('best acc : %s with %s' % (grid_result.best_score_, grid_result.best_params_))'''

"# AdaBoost\nparam_grid = {}\nparam_grid['n_estimators'] = list(range(10, 150, 20))\nparam_grid['learning_rate'] = [0.001, 0.01, 0.1, 0.5, 1]\nmodel = AdaBoostClassifier()\nkfold = KFold(n_splits = num_folds)\ngrid = GridSearchCV(estimator = model, param_grid = param_grid, scoring = scoring, cv = kfold)\ngrid_result = grid.fit(X = x_train, y = y_train)\nprint('best acc : %s with %s' % (grid_result.best_score_, grid_result.best_params_))"

# Test the Final Model

In [26]:
model = LogisticRegression(C = 4)
model.fit(x_train, y_train)
predictions = model.predict(x_test)
print(accuracy_score(list(y_test), list(predictions)))
print(classification_report(list(y_test), list(predictions)))

0.75
              precision    recall  f1-score   support

          正向       0.00      0.00      0.00        20
         非正向       0.75      1.00      0.86        60

    accuracy                           0.75        80
   macro avg       0.38      0.50      0.43        80
weighted avg       0.56      0.75      0.64        80



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [27]:
model = SVC(C = 4)
model.fit(x_train, y_train)
predictions = model.predict(x_test)
print(accuracy_score(list(y_test), list(predictions)))
print(classification_report(list(y_test), list(predictions)))

0.75
              precision    recall  f1-score   support

          正向       0.00      0.00      0.00        20
         非正向       0.75      1.00      0.86        60

    accuracy                           0.75        80
   macro avg       0.38      0.50      0.43        80
weighted avg       0.56      0.75      0.64        80



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [28]:
model = MultinomialNB(alpha = 1.5)
model.fit(x_train, y_train)
predictions = model.predict(x_test)
print(accuracy_score(list(y_test), list(predictions)))
print(classification_report(list(y_test), list(predictions)))

0.75
              precision    recall  f1-score   support

          正向       0.00      0.00      0.00        20
         非正向       0.75      1.00      0.86        60

    accuracy                           0.75        80
   macro avg       0.38      0.50      0.43        80
weighted avg       0.56      0.75      0.64        80



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [29]:
model = RandomForestClassifier(max_depth = 25, n_estimators = 80)
# model = RandomForestClassifier()
model.fit(x_train, y_train)
predictions = model.predict(x_test)
print(accuracy_score(list(y_test), list(predictions)))
print(classification_report(list(y_test), list(predictions)))

0.75
              precision    recall  f1-score   support

          正向       0.00      0.00      0.00        20
         非正向       0.75      1.00      0.86        60

    accuracy                           0.75        80
   macro avg       0.38      0.50      0.43        80
weighted avg       0.56      0.75      0.64        80



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [30]:
model = AdaBoostClassifier(learning_rate = 0.5, n_estimators = 90)
model.fit(x_train, y_train)
predictions = model.predict(x_test)
print(accuracy_score(list(y_test), list(predictions)))
print(classification_report(list(y_test), list(predictions)))

0.775
              precision    recall  f1-score   support

          正向       0.60      0.30      0.40        20
         非正向       0.80      0.93      0.86        60

    accuracy                           0.78        80
   macro avg       0.70      0.62      0.63        80
weighted avg       0.75      0.78      0.75        80



In [31]:
y_pred = list(predictions)
y_test_t = list(y_test)

In [32]:
sent, label, predict = [], [], []
for i in range(len(y_pred)):
    if y_pred[i] != y_test_t[i]:
        sent.append(x_test[i])
        label.append(y_test_t[i])
        predict.append(y_pred[i])