In [1]:
import pandas as pd
import numpy as np
from nltk import ngrams
import re

In [2]:
#https://github.com/duyetdev/truyenkieu-word2vec/blob/master/word2vec.ipynb
def transform_row(row):
    
    row = str(row)
    
    # Remove start-of-line numerations
    row = re.sub(r"^[0-9\.]+", "", row)
    
    # Remove end-of-line punctuations
    row = re.sub(r"[\.,\?]+$", "", row)
    
    # Remove in-line punctuations
    row = row.replace(",", " ").replace(".", " ") \
        .replace(";", " ").replace("“", " ") \
        .replace(":", " ").replace("”", " ") \
        .replace('"', " ").replace("'", " ") \
        .replace("!", " ").replace("?", " ")
    
    row = row.strip()
    return row


In [3]:
df1 = pd.read_csv("XuanDieu.csv").T.reset_index()
df1 = df1[~df1['index'].str.contains("<")].reset_index(drop=True) #drop any lines still containing html tags
df1['index'] = df1['index'].apply(transform_row)
df1['author'] = 'Xuân Diệu'

df2 = pd.read_csv("HoXuanHuong.csv").T.reset_index()
df2 = df2[~df2['index'].str.contains("<")].reset_index(drop=True) #drop any lines still containing html tags
df2['index'] = df2['index'].apply(transform_row)
df2['author'] = 'Hồ Xuân Hương'

df3 = pd.read_csv("./truyen_kieu_data.txt",sep="/", names=["index"], encoding="utf8").dropna()
df3 = df3[~df3['index'].str.contains("<")].reset_index(drop=True) #drop any lines still containing html tags
df3['index'] = df3['index'].apply(transform_row)
df3['author'] = 'Nguyễn Du'

In [42]:
df = df1.append(df2).append(df3)
for k, group in df.groupby('author'):
    print(k)
    print(group.count())

Hồ Xuân Hương
index     692
author    692
dtype: int64
Nguyễn Du
index     3258
author    3258
dtype: int64
Xuân Diệu
index     1174
author    1174
dtype: int64


In [47]:
df['sentence length']=df['index'].apply(lambda x: len(x.split()))

Unnamed: 0,index,author,sentence length
0,Mẹ hiền ơi chúng con như lúa sạ,Xuân Diệu,8
1,Thấy nước lên phải lên kịp với triều,Xuân Diệu,8
2,Đầu lúa nhỏ nhưng mà đông lúa cả,Xuân Diệu,8
3,Lên rào rào như một lũ tầm kêu,Xuân Diệu,8
4,Đêm dưới nước ngạt vô cùng nô lệ,Xuân Diệu,8
5,Đời chìm đắm bóp chẹt hết mầu vui,Xuân Diệu,8
6,Nên bông lúa chạy thi từng thế hệ,Xuân Diệu,8
7,Lên trên trời nở lấy hạt vàng chơi,Xuân Diệu,8
8,Mẹ yêu dấu chúng con đùa thế đó,Xuân Diệu,8
9,Mẹ chúng con đây lòng chúng con đây,Xuân Diệu,8


In [61]:
import warnings
warnings.filterwarnings("ignore", category=DeprecationWarning)
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import CountVectorizer

# TF-IDF
tfidf_vect = TfidfVectorizer()
X_tfidf = tfidf_vect.fit_transform(df['index'])
X_tfidf_feat = pd.concat([df['sentence length'].reset_index(drop=True), pd.DataFrame(X_tfidf.toarray())], axis=1)

# CountVectorizer

count_vect = CountVectorizer()
X_count = count_vect.fit_transform(df['index'])
X_count_feat = pd.concat([df['sentence length'].reset_index(drop=True), pd.DataFrame(X_count.toarray())], axis=1)

X_tfidf_feat.head(10)

Unnamed: 0,sentence length,0,1,2,3,4,5,6,7,8,...,3117,3118,3119,3120,3121,3122,3123,3124,3125,3126
0,8,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,8,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,8,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,8,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,8,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5,8,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
6,8,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
7,8,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
8,8,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
9,8,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [62]:
rf = RandomForestClassifier()
param = {'n_estimators': [10, 150, 300],
        'max_depth': [30, 60, 90, None]}

gs = GridSearchCV(rf, param, cv=5, n_jobs=-1)# n_jobs=-1 for parallelizing search
gs_fit = gs.fit(X_count_feat, df['author'])
pd.DataFrame(gs_fit.cv_results_).sort_values('mean_test_score', ascending=False).head()



Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_max_depth,param_n_estimators,params,split0_test_score,split1_test_score,split2_test_score,...,mean_test_score,std_test_score,rank_test_score,split0_train_score,split1_train_score,split2_train_score,split3_train_score,split4_train_score,mean_train_score,std_train_score
11,31.934727,3.613745,0.343514,0.086763,,300,"{'max_depth': None, 'n_estimators': 300}",0.787524,0.829435,0.827317,...,0.82299,0.018155,1,1.0,0.999756,0.999756,0.999756,0.999756,0.999805,9.8e-05
10,17.566115,0.217325,0.211564,0.009804,,150,"{'max_depth': None, 'n_estimators': 150}",0.794347,0.833333,0.818537,...,0.818696,0.015171,2,1.0,0.999756,0.999756,0.999756,0.999756,0.999805,9.8e-05
8,24.87776,0.08954,0.278139,0.031603,90.0,300,"{'max_depth': 90, 'n_estimators': 300}",0.7846,0.792398,0.795122,...,0.79918,0.010985,3,0.938507,0.939239,0.938522,0.944878,0.936601,0.939549,0.002805
7,13.253245,0.285963,0.167446,0.019812,90.0,150,"{'max_depth': 90, 'n_estimators': 150}",0.789474,0.7846,0.797073,...,0.797424,0.009547,4,0.939483,0.935334,0.937546,0.938049,0.939039,0.93789,0.001451
9,1.566566,0.080495,0.027675,0.001205,,10,"{'max_depth': None, 'n_estimators': 10}",0.765107,0.807018,0.773659,...,0.796838,0.023038,5,0.992191,0.990727,0.989266,0.99122,0.99049,0.990779,0.000956


In [64]:
rf = RandomForestClassifier()
param = {'n_estimators': [10, 150, 300],
        'max_depth': [30, 60, 90, None]}

gs = GridSearchCV(rf, param, cv=5, n_jobs=-1)# n_jobs=-1 for parallelizing search
gs_fit = gs.fit(X_tfidf_feat, df['author'])
pd.DataFrame(gs_fit.cv_results_).sort_values('mean_test_score', ascending=False).head()



Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_max_depth,param_n_estimators,params,split0_test_score,split1_test_score,split2_test_score,...,mean_test_score,std_test_score,rank_test_score,split0_train_score,split1_train_score,split2_train_score,split3_train_score,split4_train_score,mean_train_score,std_train_score
10,18.038972,0.336571,0.231817,0.0128,,150,"{'max_depth': None, 'n_estimators': 150}",0.787524,0.840156,0.82439,...,0.821429,0.019388,1,1.0,0.999756,0.999756,0.999756,0.999756,0.999805,9.8e-05
11,33.216335,3.875906,0.338099,0.083265,,300,"{'max_depth': None, 'n_estimators': 300}",0.788499,0.84308,0.826341,...,0.820843,0.019028,2,1.0,0.999756,0.999756,0.999756,0.999756,0.999805,9.8e-05
7,12.384936,0.192519,0.159223,0.017255,90.0,150,"{'max_depth': 90, 'n_estimators': 150}",0.790448,0.78655,0.796098,...,0.797424,0.008836,3,0.937042,0.928258,0.931935,0.936098,0.933187,0.933304,0.003135
9,1.502395,0.06321,0.046324,0.001944,,10,"{'max_depth': None, 'n_estimators': 10}",0.783626,0.807992,0.793171,...,0.796253,0.008389,4,0.989019,0.988775,0.989998,0.992683,0.987808,0.989656,0.001666
8,24.238259,0.135649,0.280947,0.020834,90.0,300,"{'max_depth': 90, 'n_estimators': 300}",0.785575,0.7846,0.797073,...,0.795667,0.009677,5,0.942167,0.928258,0.93657,0.937805,0.932699,0.9355,0.004716


In [69]:
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.metrics import precision_recall_fscore_support as score
from sklearn.metrics import accuracy_score as acs
import matplotlib.pyplot as plt
import seaborn as sns

In [82]:
from sklearn.model_selection import train_test_split

X=df[['index', 'sentence length']]
y=df['author']

X_train, X_test, y_train, y_test = train_test_split(X,y, test_size=0.3, random_state=42)

In [74]:
count_vect = CountVectorizer()
count_vect_fit = count_vect.fit(X_train['index'])

count_train = count_vect_fit.transform(X_train['index'])
count_test = count_vect_fit.transform(X_test['index'])

X_train_vect = pd.concat([X_train['sentence length'].reset_index(drop=True), pd.DataFrame(count_train.toarray())], axis=1)

X_test_vect = pd.concat([X_test['sentence length'].reset_index(drop=True), pd.DataFrame(count_test.toarray())], axis=1)

X_train_vect.head()

Unnamed: 0,sentence length,0,1,2,3,4,5,6,7,8,...,2834,2835,2836,2837,2838,2839,2840,2841,2842,2843
0,9,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,6,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,5,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,6,0,0,0,0,0,0,0,1,0,...,0,0,0,0,0,0,0,0,0,0
4,8,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [76]:
rf = RandomForestClassifier(n_estimators=300, max_depth=None, n_jobs=-1) 

rf_model = rf.fit(X_train_vect, y_train)

y_pred = rf_model.predict(X_test_vect)

precision, recall, fscore, train_support = score(y_test, y_pred, average='weighted')
print('Precision: {} / Recall: {} / F1-Score: {} / Accuracy: {}'.format(
    round(precision, 3), round(recall, 3), round(fscore,3), round(acs(y_test,y_pred), 3)))

Precision: 0.837 / Recall: 0.841 / F1-Score: 0.832 / Accuracy: 0.841
