In [14]:
import os
import zipfile
import pandas as pd
import numpy as np
from cassis import load_typesystem, load_cas_from_xmi
import jieba
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.metrics import roc_curve, auc, classification_report,accuracy_score,precision_score,f1_score,recall_score,cohen_kappa_score,multilabel_confusion_matrix,make_scorer
from sklearn.linear_model import LogisticRegression
from sklearn import ensemble
from sklearn.model_selection import GridSearchCV
from sklearn import metrics
from sklearn.multiclass import OneVsRestClassifier
from sklearn.model_selection import cross_val_score,cross_validate


In [2]:
# load cas
def load_cas(text_directory):
    with open(text_directory+'/TypeSystem.xml', 'rb') as f:
        typesystem = load_typesystem(f)
    with open(text_directory+'/zhengwu.xmi', 'rb') as f:
        cas = load_cas_from_xmi(f, typesystem=typesystem)
    return cas

In [3]:
dataframe = pd.read_csv(r'C:\Users\dawns\Desktop\Master thesis\data\With connective error.csv')


In [4]:
dataframe

Unnamed: 0.1,Unnamed: 0,Title
0,0,GreenFood_1.txt
1,1,GreenFood_100.txt
2,2,GreenFood_104.txt
3,3,GreenFood_110.txt
4,4,GreenFood_114.txt
...,...,...
1625,1625,Multi_981.txt
1626,1626,Multi_99.txt
1627,1627,Multi_994.txt
1628,1628,Multi_996.txt


In [5]:

train_list = dataframe['Title'].tolist()
directory = r'C:\Users\dawns\Desktop\annotation\annotation'

In [6]:

sentence_list=[]
location_list=[]
type_list=[]
error_list=[]
for text in train_list:
    text_dir = directory + '/' + text
    cas = load_cas(text_dir)
    for sentence in cas.select('de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Sentence'):
        sentence_list.append(sentence.get_covered_text())
        location_list.append(sentence)
    #print(text)

In [7]:
#print(sentence_list)
df2=pd.DataFrame()
df2['sentence']= pd.DataFrame(sentence_list)
#print(location_list)
df2['location_list']= pd.DataFrame(location_list)


In [8]:
df2

Unnamed: 0,sentence,location_list
0,[BD《]绿色绿色与饥饿[BD》]如果要我在绿色食品与饥饿{CQ中}做出选择的话，我会毫不犹...,"d.t.u.d.c.a.s.t.Sentence(begin=0, end=98)"
1,是的[BQ，]虽然科技在发展，人们的需求也越来越多，但是我认为，与其将时间浪费在减少生产量来...,"d.t.u.d.c.a.s.t.Sentence(begin=98, end=178)"
2,“少而精”，这正是绿色食品的特点，也是那些“吃饱撑着”的人的观点。,"d.t.u.d.c.a.s.t.Sentence(begin=178, end=211)"
3,现在普遍饮用的并不定绿色食品，而是短文里所说的所谓“吃了会有害于人体健康”的食品。,"d.t.u.d.c.a.s.t.Sentence(begin=211, end=252)"
4,但是，究竟有多少人是因为吃了这些食品而死亡（生病）的呢？,"d.t.u.d.c.a.s.t.Sentence(begin=252, end=280)"
...,...,...
25957,对于我来说[F説][BQ，]学习汉语的苦[B告]是经历人为的障碍[F礙]，而不像其他的外国人...,"d.t.u.d.c.a.s.t.Sentence(begin=455, end=531)"
25958,学习汉语的乐处可真多。,"d.t.u.d.c.a.s.t.Sentence(begin=531, end=542)"
25959,在看国语电影时[F時]，能了解电影剧情和对白而不需靠字幕和翻译。,"d.t.u.d.c.a.s.t.Sentence(begin=542, end=574)"
25960,和别人以汉语来交谈的亲切感等等。,"d.t.u.d.c.a.s.t.Sentence(begin=574, end=590)"


In [9]:

sentence_list=[]
location_list=[]
type_list=[]
error_list=[]
for text in train_list:
    text_dir = directory + '/' + text
    cas = load_cas(text_dir)
    for sentence in cas.select('de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Sentence'):
        error = cas.select_covered('webanno.custom.Error', sentence)
        if(len(error)>0):
            type_list.append('1')
            error_list.append(error[0].get_covered_text())
        else:
            type_list.append('0')
            error_list.append('0')


In [10]:
type_list
df2['type_list']= pd.DataFrame(type_list)
error_list
df2['error_list']= pd.DataFrame(error_list)

In [11]:
df2

Unnamed: 0,sentence,location_list,type_list,error_list
0,[BD《]绿色绿色与饥饿[BD》]如果要我在绿色食品与饥饿{CQ中}做出选择的话，我会毫不犹...,"d.t.u.d.c.a.s.t.Sentence(begin=0, end=98)",0,0
1,是的[BQ，]虽然科技在发展，人们的需求也越来越多，但是我认为，与其将时间浪费在减少生产量来...,"d.t.u.d.c.a.s.t.Sentence(begin=98, end=178)",0,0
2,“少而精”，这正是绿色食品的特点，也是那些“吃饱撑着”的人的观点。,"d.t.u.d.c.a.s.t.Sentence(begin=178, end=211)",0,0
3,现在普遍饮用的并不定绿色食品，而是短文里所说的所谓“吃了会有害于人体健康”的食品。,"d.t.u.d.c.a.s.t.Sentence(begin=211, end=252)",0,0
4,但是，究竟有多少人是因为吃了这些食品而死亡（生病）的呢？,"d.t.u.d.c.a.s.t.Sentence(begin=252, end=280)",0,0
...,...,...,...,...
25957,对于我来说[F説][BQ，]学习汉语的苦[B告]是经历人为的障碍[F礙]，而不像其他的外国人...,"d.t.u.d.c.a.s.t.Sentence(begin=455, end=531)",0,0
25958,学习汉语的乐处可真多。,"d.t.u.d.c.a.s.t.Sentence(begin=531, end=542)",0,0
25959,在看国语电影时[F時]，能了解电影剧情和对白而不需靠字幕和翻译。,"d.t.u.d.c.a.s.t.Sentence(begin=542, end=574)",0,0
25960,和别人以汉语来交谈的亲切感等等。,"d.t.u.d.c.a.s.t.Sentence(begin=574, end=590)",0,0


In [12]:
df2.to_csv(r'C:\Users\dawns\Desktop\Master thesis\data\Error annotation dataset.csv')

In [17]:
# if we already build the dataset, then we can start here for loading the dataset directly
df2 = pd.read_csv(r'C:\Users\dawns\Desktop\Master thesis\data\Error annotation dataset.csv')


In [18]:
count = df2.loc[:, 'type_list'].value_counts()
count
print(count)

0    23537
1     2425
Name: type_list, dtype: int64


In [9]:
def preprocessing_sentence(x):
    words = jieba.cut(str(x).strip())
    return ' '.join(words)

In [19]:
x_train, x_test, y_train, y_test = train_test_split(df2['sentence'], df2['type_list'], test_size=0.2)

x_train = x_train.apply(lambda x: preprocessing_sentence(x))
x_test = x_test.apply(lambda x: preprocessing_sentence(x))

#TF-IDF
tf = TfidfVectorizer()
x_train = tf.fit_transform(x_train).toarray()
x_test = tf.transform(x_test).toarray()

In [20]:
x_train.shape

(20769, 17425)

In [21]:
lr = LogisticRegression()
lr.fit(x_train, y_train)
train_score = lr.score(x_train, y_train)
y_pred = lr.predict(x_test)
print("Train Accuracy:", train_score)
print("================================================")
y_pred = lr.predict(x_test)
test_score = accuracy_score(y_test, y_pred)
print("Test Accuracy:", test_score)
print("================================================")
acc_for_each_class = metrics.precision_score(y_test,y_pred,average=None)
print("acc_for_each_class:\n",acc_for_each_class)
print("================================================")
qwkappa = cohen_kappa_score(y_test,y_pred,weights='quadratic')
print("qwkappa:\n",qwkappa)
print("================================================")
print('Weighted precision', precision_score(y_test, y_pred, average='weighted'))
print('Weighted recall', recall_score(y_test, y_pred, average='weighted'))
print('Weighted f1-score', f1_score(y_test, y_pred, average='weighted'))
print("================================================")
classification_rep = classification_report(y_test,y_pred)
print("classification report: \n",classification_rep)
print("================================================")
confusion_mat = metrics.confusion_matrix(y_test, y_pred)
print(confusion_mat)


Train Accuracy: 0.9158361018826136
Test Accuracy: 0.9127671865973426
acc_for_each_class:
 [0.92145254 0.56349206]
qwkappa:
 0.20837535548468578
Weighted precision 0.8891237353983864
Weighted recall 0.9127671865973426
Weighted f1-score 0.889151588448688
classification report: 
               precision    recall  f1-score   support

           0       0.92      0.99      0.95      4724
           1       0.56      0.15      0.24       469

    accuracy                           0.91      5193
   macro avg       0.74      0.57      0.60      5193
weighted avg       0.89      0.91      0.89      5193

[[4669   55]
 [ 398   71]]


In [23]:
scoring = {'recall': 'recall','qwkappa': make_scorer(cohen_kappa_score,weights='quadratic')}
scores = cross_validate(lr, x_train, y_train, cv=5, scoring=scoring)#5-fold cv
recall = scores['test_recall']
qwkappa = scores['test_qwkappa']
print(scores)
print('Mean recall:',recall.mean())
print('Mean qwkappa:',qwkappa.mean())

{'fit_time': array([14.13101983, 14.78310657, 15.27889919, 13.91827226, 13.839077  ]), 'score_time': array([0.1319592 , 0.11776042, 0.11514306, 0.1160512 , 0.12700176]), 'test_recall': array([0.10230179, 0.1202046 , 0.10230179, 0.10714286, 0.10741688]), 'test_qwkappa': array([0.13524033, 0.16815066, 0.14921618, 0.15773269, 0.15119701])}
Mean recall: 0.1078735842162952
Mean qwkappa: 0.15230737362648566


In [19]:
rf = ensemble.RandomForestRegressor()
params = {'n_estimators': [10, 50, 100], 'max_depth':[10, 50, 100], 'max_features':[2, 5, 10]}
grid = GridSearchCV(estimator=rf, param_grid=params)
grid.fit(x_train, y_train)
y_pred = grid.predict(x_test)
print(grid.best_score_)
print(grid.best_estimator_)
print(grid.best_params_)

0.04243176737848662
RandomForestRegressor(max_depth=100, max_features=10)
{'max_depth': 100, 'max_features': 10, 'n_estimators': 100}


In [20]:
rf = ensemble.RandomForestClassifier()
params = {'n_estimators': [100], 'max_depth':[10], 'max_features':[100]}
rf.fit(x_train, y_train)
y_pred = rf.predict(x_test)
train_score = rf.score(x_train, y_train)
test_score = accuracy_score(y_test, y_pred)
print("Train Accuracy:", train_score)
print("================================================")
print("Test Accuracy:", test_score)
print("================================================")
acc_for_each_class = metrics.precision_score(y_test,y_pred,average=None)
print("acc_for_each_class:\n",acc_for_each_class)
print("================================================")
qwkappa = cohen_kappa_score(y_test,y_pred,weights='quadratic')
print("qwkappa:\n",qwkappa)
print("================================================")
print('Weighted precision', precision_score(y_test, y_pred, average='weighted'))
print('Weighted recall', recall_score(y_test, y_pred, average='weighted'))
print('Weighted f1-score', f1_score(y_test, y_pred, average='weighted'))
print("================================================")
classification_rep = classification_report(y_test,y_pred)
print("classification report: \n",classification_rep)
print("================================================")
confusion_mat = metrics.confusion_matrix(y_test, y_pred)
print(confusion_mat)

Train Accuracy: 0.9995185131686648
Test Accuracy: 0.9100712497592913
acc_for_each_class:
 [0.91007542 0.90909091]
qwkappa:
 0.07136864443611368
Weighted precision 0.909983472205731
Weighted recall 0.9100712497592913
Weighted f1-score 0.8711164578741785
classification report: 
               precision    recall  f1-score   support

           0       0.91      1.00      0.95      4708
           1       0.91      0.04      0.08       485

    accuracy                           0.91      5193
   macro avg       0.91      0.52      0.52      5193
weighted avg       0.91      0.91      0.87      5193

[[4706    2]
 [ 465   20]]


In [21]:
df2

Unnamed: 0.1,Unnamed: 0,sentence,location_list,type_list,error_list
0,0,[BD《]绿色绿色与饥饿[BD》]如果要我在绿色食品与饥饿{CQ中}做出选择的话，我会毫不犹...,"d.t.u.d.c.a.s.t.Sentence(begin=0, end=98)",0,0
1,1,是的[BQ，]虽然科技在发展，人们的需求也越来越多，但是我认为，与其将时间浪费在减少生产量来...,"d.t.u.d.c.a.s.t.Sentence(begin=98, end=178)",0,0
2,2,“少而精”，这正是绿色食品的特点，也是那些“吃饱撑着”的人的观点。,"d.t.u.d.c.a.s.t.Sentence(begin=178, end=211)",0,0
3,3,现在普遍饮用的并不定绿色食品，而是短文里所说的所谓“吃了会有害于人体健康”的食品。,"d.t.u.d.c.a.s.t.Sentence(begin=211, end=252)",0,0
4,4,但是，究竟有多少人是因为吃了这些食品而死亡（生病）的呢？,"d.t.u.d.c.a.s.t.Sentence(begin=252, end=280)",0,0
...,...,...,...,...,...
25957,25957,对于我来说[F説][BQ，]学习汉语的苦[B告]是经历人为的障碍[F礙]，而不像其他的外国人...,"d.t.u.d.c.a.s.t.Sentence(begin=455, end=531)",0,0
25958,25958,学习汉语的乐处可真多。,"d.t.u.d.c.a.s.t.Sentence(begin=531, end=542)",0,0
25959,25959,在看国语电影时[F時]，能了解电影剧情和对白而不需靠字幕和翻译。,"d.t.u.d.c.a.s.t.Sentence(begin=542, end=574)",0,0
25960,25960,和别人以汉语来交谈的亲切感等等。,"d.t.u.d.c.a.s.t.Sentence(begin=574, end=590)",0,0


In [48]:
# word cut
import re
import string
df2 = pd.read_csv(r'C:\Users\dawns\Desktop\Master thesis\data\Error annotation dataset.csv')

comp = re.compile('[^A-Z^a-z^0-9^ ]')
regex = re.compile('[%s]' % re.escape(string.punctuation+'”“’�—…°'))
pc = r"\d+\.?\d*"

df2['WORDCUT'] = df2['sentence']
#去除字母数字表情和其它字符
def clear_character(sentence):
    pattern1='[a-zA-Z0-9]'
    pattern2 = '\[.*?\]'
    pattern3 = re.compile(u'[^\s1234567890:：' + '\u4e00-\u9fa5]+')
    pattern4='[’!"#$%&\'()*+,-./:;<=>?@[\\]^_`{|}~]+'
    line1=re.sub(pattern1,'',sentence)   #去除英文字母和数字
    line2=re.sub(pattern2,'',line1)   #去除表情
    line3=re.sub(pattern3,'',line2)   #去除其它字符
    line4=re.sub(pattern4, '', line3) #去掉残留的冒号及其它符号
    new_sentence=''.join(line4.split()) #去除空白
    return new_sentence
    #return ' '.join(cw)
df2['WORDCUT'] = df2['WORDCUT'].apply(clear_character)
df2

Unnamed: 0.1,Unnamed: 0,sentence,location_list,type_list,error_list,WORDCUT
0,0,[BD《]绿色绿色与饥饿[BD》]如果要我在绿色食品与饥饿{CQ中}做出选择的话，我会毫不犹...,"d.t.u.d.c.a.s.t.Sentence(begin=0, end=98)",0,0,绿色绿色与饥饿如果要我在绿色食品与饥饿中做出选择的话我会毫不犹豫地的选择饥饿大家也应该在新闻...
1,1,是的[BQ，]虽然科技在发展，人们的需求也越来越多，但是我认为，与其将时间浪费在减少生产量来...,"d.t.u.d.c.a.s.t.Sentence(begin=98, end=178)",0,0,是的虽然科技在发展人们的需求也越来越多但是我认为与其将时间浪费在减少生产量来制造绿色食品还不...
2,2,“少而精”，这正是绿色食品的特点，也是那些“吃饱撑着”的人的观点。,"d.t.u.d.c.a.s.t.Sentence(begin=178, end=211)",0,0,少而精这正是绿色食品的特点也是那些吃饱撑着的人的观点
3,3,现在普遍饮用的并不定绿色食品，而是短文里所说的所谓“吃了会有害于人体健康”的食品。,"d.t.u.d.c.a.s.t.Sentence(begin=211, end=252)",0,0,现在普遍饮用的并不定绿色食品而是短文里所说的所谓吃了会有害于人体健康的食品
4,4,但是，究竟有多少人是因为吃了这些食品而死亡（生病）的呢？,"d.t.u.d.c.a.s.t.Sentence(begin=252, end=280)",0,0,但是究竟有多少人是因为吃了这些食品而死亡生病的呢
...,...,...,...,...,...,...
25957,25957,对于我来说[F説][BQ，]学习汉语的苦[B告]是经历人为的障碍[F礙]，而不像其他的外国人...,"d.t.u.d.c.a.s.t.Sentence(begin=455, end=531)",0,0,对于我来说学习汉语的苦是经历人为的障碍而不像其他的外国人因不是他们的母语而遇到了语言语文转换的困难
25958,25958,学习汉语的乐处可真多。,"d.t.u.d.c.a.s.t.Sentence(begin=531, end=542)",0,0,学习汉语的乐处可真多
25959,25959,在看国语电影时[F時]，能了解电影剧情和对白而不需靠字幕和翻译。,"d.t.u.d.c.a.s.t.Sentence(begin=542, end=574)",0,0,在看国语电影时能了解电影剧情和对白而不需靠字幕和翻译
25960,25960,和别人以汉语来交谈的亲切感等等。,"d.t.u.d.c.a.s.t.Sentence(begin=574, end=590)",0,0,和别人以汉语来交谈的亲切感等等


In [49]:
df2.to_csv(r'C:\Users\dawns\Desktop\Master thesis\data\Error annotation dataset(new).csv')

In [24]:
#remove sentence without contents
df1 = pd.read_csv(r'C:\Users\dawns\Desktop\Master thesis\data\Error annotation dataset(new).csv')
df1.dropna(axis=0, how='any', inplace=True)
df1
# delete 1432 sentence without any content, those sentence only contain numbers or punctuation

Unnamed: 0.2,Unnamed: 0.1,Unnamed: 0,sentence,location_list,type_list,error_list,WORDCUT
0,0,0,[BD《]绿色绿色与饥饿[BD》]如果要我在绿色食品与饥饿{CQ中}做出选择的话，我会毫不犹...,"d.t.u.d.c.a.s.t.Sentence(begin=0, end=98)",0,0,绿色绿色与饥饿如果要我在绿色食品与饥饿中做出选择的话我会毫不犹豫地的选择饥饿大家也应该在新闻...
1,1,1,是的[BQ，]虽然科技在发展，人们的需求也越来越多，但是我认为，与其将时间浪费在减少生产量来...,"d.t.u.d.c.a.s.t.Sentence(begin=98, end=178)",0,0,是的虽然科技在发展人们的需求也越来越多但是我认为与其将时间浪费在减少生产量来制造绿色食品还不...
2,2,2,“少而精”，这正是绿色食品的特点，也是那些“吃饱撑着”的人的观点。,"d.t.u.d.c.a.s.t.Sentence(begin=178, end=211)",0,0,少而精这正是绿色食品的特点也是那些吃饱撑着的人的观点
3,3,3,现在普遍饮用的并不定绿色食品，而是短文里所说的所谓“吃了会有害于人体健康”的食品。,"d.t.u.d.c.a.s.t.Sentence(begin=211, end=252)",0,0,现在普遍饮用的并不定绿色食品而是短文里所说的所谓吃了会有害于人体健康的食品
4,4,4,但是，究竟有多少人是因为吃了这些食品而死亡（生病）的呢？,"d.t.u.d.c.a.s.t.Sentence(begin=252, end=280)",0,0,但是究竟有多少人是因为吃了这些食品而死亡生病的呢
...,...,...,...,...,...,...,...
25956,25956,25956,庆幸的是我当时没有接受那位老师[F師]的劝[F勸]告。,"d.t.u.d.c.a.s.t.Sentence(begin=428, end=455)",0,0,庆幸的是我当时没有接受那位老师的劝告
25957,25957,25957,对于我来说[F説][BQ，]学习汉语的苦[B告]是经历人为的障碍[F礙]，而不像其他的外国人...,"d.t.u.d.c.a.s.t.Sentence(begin=455, end=531)",0,0,对于我来说学习汉语的苦是经历人为的障碍而不像其他的外国人因不是他们的母语而遇到了语言语文转换的困难
25958,25958,25958,学习汉语的乐处可真多。,"d.t.u.d.c.a.s.t.Sentence(begin=531, end=542)",0,0,学习汉语的乐处可真多
25959,25959,25959,在看国语电影时[F時]，能了解电影剧情和对白而不需靠字幕和翻译。,"d.t.u.d.c.a.s.t.Sentence(begin=542, end=574)",0,0,在看国语电影时能了解电影剧情和对白而不需靠字幕和翻译


In [25]:
count = df1.loc[:, 'type_list'].value_counts()
count
print(count)

0    22105
1     2425
Name: type_list, dtype: int64


In [26]:
# 欠采样

df3=df1[df1["type_list"]==1]#正样本部分
df0=df1[df1["type_list"]==0]#负样本部分
df4=df0.sample(frac=0.2)
df_new=pd.concat([df3,df4])


In [27]:
df_new

Unnamed: 0.2,Unnamed: 0.1,Unnamed: 0,sentence,location_list,type_list,error_list,WORDCUT
12,12,12,例如：任何豪华的建筑，一开始建造时，都先{CJX}需要打地基[BQ，]就是所谓“基础”，只有...,"d.t.u.d.c.a.s.t.Sentence(begin=453, end=510)",1,都先{CJX},例如：任何豪华的建筑一开始建造时都先需要打地基就是所谓基础只有基础打好了才能提高
14,14,14,还有一种就是将科技再进一步提高，研制出既[B即]不影响人体健康，也不会影响生产量的农药，再将...,"d.t.u.d.c.a.s.t.Sentence(begin=582, end=652)",1,[B即],还有一种就是将科技再进一步提高研制出既不影响人体健康也不会影响生产量的农药再将其普及化给人类...
24,24,24,没有挨过饿的人是不会理解挨饿的痛苦的，所[C]以在先进国家也只有很少的人为这些挨饿的人{CD...,"d.t.u.d.c.a.s.t.Sentence(begin=219, end=281)",1,{CD而},没有挨过饿的人是不会理解挨饿的痛苦的所以在先进国家也只有很少的人为这些挨饿的人而去努力活动
33,33,33,现在世界上的饥饿问题也不亚于环境问题，因此有的人对于{CC由于}{CJ-zhuy人们}缺少粮...,"d.t.u.d.c.a.s.t.Sentence(begin=108, end=189)",1,{CC由于}{,现在世界上的饥饿问题也不亚于环境问题因此有的人对于由于人们缺少粮食而挨饿的问题而提出先提高该...
57,57,57,如果每天吃{CD了}用化肥和农药的农作物{CQ就}不能{CC会}健康地生活{CD着}。,"d.t.u.d.c.a.s.t.Sentence(begin=322, end=365)",1,{CQ就},如果每天吃了用化肥和农药的农作物就不能会健康地生活着
...,...,...,...,...,...,...,...
6016,6016,6016,[BC.]没有别的[C]办法了。,"d.t.u.d.c.a.s.t.Sentence(begin=355, end=371)",0,0,没有别的办法了
22302,22302,22302,对于这个问题，我想首先[BD，]不同的国家会有不同的理解。,"d.t.u.d.c.a.s.t.Sentence(begin=13, end=42)",0,0,对于这个问题我想首先不同的国家会有不同的理解
405,405,405,发达国家应该帮助{CJ-by他们}，提供技术。,"d.t.u.d.c.a.s.t.Sentence(begin=563, end=586)",0,0,发达国家应该帮助他们提供技术
13687,13687,13687,论学业，虽然不是个顶尖人物，却也是个不错的人才。,"d.t.u.d.c.a.s.t.Sentence(begin=32, end=56)",0,0,论学业虽然不是个顶尖人物却也是个不错的人才


In [28]:
x_train, x_test, y_train, y_test = train_test_split(df_new['WORDCUT'], df_new['type_list'], test_size=0.2)
x_train = x_train.apply(lambda x: preprocessing_sentence(x))
x_test = x_test.apply(lambda x: preprocessing_sentence(x))



#TF-IDF
tf = TfidfVectorizer()
x_train = tf.fit_transform(x_train).toarray()
x_test = tf.transform(x_test).toarray()

In [29]:
lr = LogisticRegression()
lr.fit(x_train, y_train)
train_score = lr.score(x_train, y_train)
y_pred = lr.predict(x_test)
print("Train Accuracy:", train_score)
print("================================================")
y_pred = lr.predict(x_test)
test_score = accuracy_score(y_test, y_pred)
print("Test Accuracy:", test_score)
print("================================================")
acc_for_each_class = metrics.precision_score(y_test,y_pred,average=None)
print("acc_for_each_class:\n",acc_for_each_class)
print("================================================")
qwkappa = cohen_kappa_score(y_test,y_pred,weights='quadratic')
print("qwkappa:\n",qwkappa)
print("================================================")
print('Weighted precision', precision_score(y_test, y_pred, average='weighted'))
print('Weighted recall', recall_score(y_test, y_pred, average='weighted'))
print('Weighted f1-score', f1_score(y_test, y_pred, average='weighted'))
print("================================================")
classification_rep = classification_report(y_test,y_pred)
print("classification report: \n",classification_rep)
print("================================================")
confusion_mat = metrics.confusion_matrix(y_test, y_pred)
print(confusion_mat)

Train Accuracy: 0.7753834915997078
Test Accuracy: 0.6912408759124088
acc_for_each_class:
 [0.7092511  0.60425532]
qwkappa:
 0.22395714763977237
Weighted precision 0.6730773719891109
Weighted recall 0.6912408759124088
Weighted f1-score 0.6574869041919021
classification report: 
               precision    recall  f1-score   support

           0       0.71      0.90      0.79       898
           1       0.60      0.30      0.40       472

    accuracy                           0.69      1370
   macro avg       0.66      0.60      0.60      1370
weighted avg       0.67      0.69      0.66      1370

[[805  93]
 [330 142]]


In [30]:
scoring = {'recall': 'recall','qwkappa': make_scorer(cohen_kappa_score,weights='quadratic')}
scores = cross_validate(lr, x_train, y_train, cv=5, scoring=scoring)#5-fold cv
recall = scores['test_recall']
qwkappa = scores['test_qwkappa']
print(scores)
print('Mean recall:',recall.mean())
print('Mean qwkappa:',qwkappa.mean())

{'fit_time': array([2.12407684, 1.84209514, 1.94527745, 1.93135524, 1.81014657]), 'score_time': array([0.01854825, 0.01800227, 0.01633382, 0.01760387, 0.01823878]), 'test_recall': array([0.25319693, 0.23529412, 0.27621483, 0.25384615, 0.24871795]), 'test_qwkappa': array([0.19368309, 0.17718562, 0.25820367, 0.18218801, 0.19765018])}
Mean recall: 0.25345399698340876
Mean qwkappa: 0.20178211271444485


In [12]:
from sklearn import svm
model = OneVsRestClassifier(svm.SVC(kernel='rbf',gamma='scale'))#linear
clt = model.fit(x_train,y_train)
y_pred = clt.predict(x_test)
test_score = accuracy_score(y_test, y_pred)
train_score = clt.score(x_train, y_train)
print("Train Accuracy:", train_score)
print("================================================")
print("Test Accuracy:", test_score)
print("================================================")
acc_for_each_class = metrics.precision_score(y_test,y_pred,average=None)
print("acc_for_each_class:\n",acc_for_each_class)
print("================================================")
qwkappa = cohen_kappa_score(y_test,y_pred,weights='quadratic')
print("qwkappa:\n",qwkappa)
print("================================================")
print('Weighted precision', precision_score(y_test, y_pred, average='weighted'))
print('Weighted recall', recall_score(y_test, y_pred, average='weighted'))
print('Weighted f1-score', f1_score(y_test, y_pred, average='weighted'))
print("================================================")

Train Accuracy: 0.9475894813732652
Test Accuracy: 0.6992700729927007
acc_for_each_class:
 [0.70320405 0.67391304]
qwkappa:
 0.22576786752844868
Weighted precision 0.6930270269402418
Weighted recall 0.6992700729927007
Weighted f1-score 0.6538538954962313


In [None]:
print('Weighted precision', precision_score(y_test, y_pred, average='weighted'))
print('Weighted recall', recall_score(y_test, y_pred, average='weighted'))
print('Weighted f1-score', f1_score(y_test, y_pred, average='weighted'))
print("================================================")
classification_rep = classification_report(y_test,y_pred)
print("classification report: \n",classification_rep)
print("================================================")
confusion_mat = metrics.confusion_matrix(y_test, y_pred)
print(confusion_mat)

In [12]:
from sklearn.naive_bayes import BernoulliNB
NB = BernoulliNB()
NB.fit(x_train, y_train)
y_pred = NB.predict(x_test)
test_score = accuracy_score(y_test, y_pred)
train_score = NB.score(x_train, y_train)
print("Train Accuracy:", train_score)
print("================================================")
print("Test Accuracy:", test_score)
print("================================================")
acc_for_each_class = metrics.precision_score(y_test,y_pred,average=None)
print("acc_for_each_class:\n",acc_for_each_class)
print("================================================")
qwkappa = cohen_kappa_score(y_test,y_pred,weights='quadratic')
print("qwkappa:\n",qwkappa)
print("================================================")
print('Weighted precision', precision_score(y_test, y_pred, average='weighted'))
print('Weighted recall', recall_score(y_test, y_pred, average='weighted'))
print('Weighted f1-score', f1_score(y_test, y_pred, average='weighted'))
print("================================================")
classification_rep = classification_report(y_test,y_pred)
print("classification report: \n",classification_rep)
print("================================================")
confusion_mat = metrics.confusion_matrix(y_test, y_pred)
print(confusion_mat)



Train Accuracy: 0.8011322132943755
Test Accuracy: 0.6839416058394161
acc_for_each_class:
 [0.67874794 0.72435897]
qwkappa:
 0.20469932725025275
Weighted precision 0.6954941815200477
Weighted recall 0.6839416058394161
Weighted f1-score 0.6270813368458579
classification report: 
               precision    recall  f1-score   support

           0       0.68      0.95      0.79       867
           1       0.72      0.22      0.34       503

    accuracy                           0.68      1370
   macro avg       0.70      0.59      0.57      1370
weighted avg       0.70      0.68      0.63      1370

[[824  43]
 [390 113]]


In [60]:
rf = ensemble.RandomForestClassifier()
params = {'n_estimators': [100], 'max_depth':[10], 'max_features':[100]}
rf.fit(x_train, y_train)
y_pred = rf.predict(x_test)
train_score = rf.score(x_train, y_train)
test_score = accuracy_score(y_test, y_pred)
print("Train Accuracy:", train_score)
print("================================================")
print("Test Accuracy:", test_score)
print("================================================")
acc_for_each_class = metrics.precision_score(y_test,y_pred,average=None)
print("acc_for_each_class:\n",acc_for_each_class)
print("================================================")
qwkappa = cohen_kappa_score(y_test,y_pred,weights='quadratic')
print("qwkappa:\n",qwkappa)
print("================================================")
print('Weighted precision', precision_score(y_test, y_pred, average='weighted'))
print('Weighted recall', recall_score(y_test, y_pred, average='weighted'))
print('Weighted f1-score', f1_score(y_test, y_pred, average='weighted'))
print("================================================")
classification_rep = classification_report(y_test,y_pred)
print("classification report: \n",classification_rep)
print("================================================")
confusion_mat = metrics.confusion_matrix(y_test, y_pred)
print(confusion_mat)

Train Accuracy: 0.9994521548575602
Test Accuracy: 0.6846715328467153
acc_for_each_class:
 [0.69230769 0.63218391]
qwkappa:
 0.18579824349012797
Weighted precision 0.6713301967769625
Weighted recall 0.6846715328467153
Weighted f1-score 0.6341143207447818
classification report: 
               precision    recall  f1-score   support

           0       0.69      0.93      0.79       892
           1       0.63      0.23      0.34       478

    accuracy                           0.68      1370
   macro avg       0.66      0.58      0.57      1370
weighted avg       0.67      0.68      0.63      1370

[[828  64]
 [368 110]]
