In [140]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

import re
import nltk
from nltk.corpus import stopwords 
from nltk.stem.porter import PorterStemmer
from nltk.tokenize import word_tokenize
%matplotlib inline


In [141]:
train = pd.read_csv(r"train\train.csv")
test = pd.read_csv(r"test\test.csv")

In [142]:
train.head()

Unnamed: 0,id,text,author
0,id00001,Idris was well content with this resolve of mine.,MWS
1,id00002,"I was faint, even fainter than the hateful mod...",HPL
2,id00003,"Above all, I burn to know the incidents of you...",EAP
3,id00004,"He might see, perhaps, one or two points with ...",EAP
4,id00005,All obeyed the Lord Protector of dying England...,MWS


In [143]:
train.apply(lambda x: sum(x.isnull()),axis=0)

id        0
text      0
author    0
dtype: int64

In [144]:
import re
import nltk
nltk.download('stopwords')
nltk.download('wordnet')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\ME\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\ME\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [145]:
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
from nltk.stem import WordNetLemmatizer

In [146]:
from nltk.tokenize import RegexpTokenizer
tokenizer = RegexpTokenizer(r'\w+')
stop_words = set(stopwords.words('english'))
lemm = WordNetLemmatizer()
ps=PorterStemmer()
def preprocessing(data):
    txt = data.lower() #1
    words = tokenizer.tokenize(txt) #2
    words = [w for w in words if not w in stop_words] #3
    words = [ps.stem(w) for w in words]
    words = ' '.join(words)#4
    return words

In [147]:
train_df=train
len(train)

19579

In [148]:
for i in range(0,len(train)):
    train_df["text"][i]= preprocessing(train["text"][i])


In [149]:
train_df['author'] = train_df.author.map({'EAP':0, 'HPL':1, 'MWS':2})

In [150]:
train_df.head()

Unnamed: 0,id,text,author
0,id00001,idri well content resolv mine,2
1,id00002,faint even fainter hate modern accurs citi made,1
2,id00003,burn know incid passag dark valley shadow,0
3,id00004,might see perhap one two point unusu clear nec...,0
4,id00005,obey lord protector die england look,2


In [151]:
X = train_df['text']
y = train_df['author']

In [152]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state=0)

In [153]:
# TO check if the authors are evenly distributed
print(y_train.value_counts())
print(y_test.value_counts())

0    6352
2    4815
1    4496
Name: author, dtype: int64
0    1548
2    1229
1    1139
Name: author, dtype: int64


# CountVectorizer :

Countvectorizer gives out the sparse matrix which contains the frequency of each word in respective documents/Data.

In [154]:
from sklearn.feature_extraction.text import CountVectorizer
cv = CountVectorizer(stop_words='english')

In [155]:
X_train_cv = cv.fit_transform(X_train)
X_test_cv = cv.transform(X_test)

# TF-IDF

Occurrence count is a good start but there is an issue: longer documents will have higher average count values than shorter documents, even though they might talk about the same topics. To avoid these potential discrepancies it suffices to divide the number of occurrences of each word in a document by the total number of words in the document: these new features are called tf for Term Frequencies. Another refinement on top of tf is to downscale weights for words that occur in many documents in the corpus and are therefore less informative than those that occur only in a smaller portion of the corpus. This downscaling is called tf–idf for “Term Frequency times Inverse Document Frequency”.

https://manjunathhiremathm.wixsite.com/portfolio/blog-1/countvectorizer-v-s-tfidfvector

In [156]:
from sklearn.feature_extraction.text import TfidfVectorizer
tfidf = TfidfVectorizer()

In [157]:
X_train_tfidf = tfidf.fit_transform(X_train)
X_test_tfidf = tfidf.transform(X_test)

# Using Logistic Regression with CountVectorizer to evaluate the author


In [158]:
from sklearn.linear_model import LogisticRegression
LR_cv = LogisticRegression()
LR_cv.fit(X_train_cv, y_train)
y_test_LR = LR_cv.predict(X_test_cv)



In [159]:
from sklearn import metrics
print(metrics.accuracy_score(y_test, y_test_LR))

0.789581205312


Confusion matrix : A confusion matrix is a matrix where we can see where the predicted values are, and where they should be.

In [160]:
from sklearn.metrics import confusion_matrix
cn = confusion_matrix(y_test, y_test_LR)
cn

array([[1315,  103,  130],
       [ 212,  849,   78],
       [ 223,   78,  928]])

Using Logistic Regression model with the entire dataset from "train.csv"

In [161]:
X_cv = cv.fit_transform(X)

In [162]:
LR_cv.fit(X_cv,y)
y_LR = LR_cv.predict(X_cv)
metrics.accuracy_score(y, y_LR)


0.93692221257469743

Predicting the author of the text given in the test.csv. Predicting the probabilities

In [163]:
test_cv = cv.transform(test['text'])


In [164]:
y_test_LR = LR_cv.predict(test_cv)

In [165]:
y_test_LR

array([2, 0, 0, ..., 0, 2, 0], dtype=int64)

In [166]:
y_test_LR_prob = LR_cv.predict_proba(test_cv)
y_test_LR_prob

array([[ 0.111541  ,  0.05499296,  0.83346603],
       [ 0.74820182,  0.21415462,  0.03764356],
       [ 0.75266949,  0.16106805,  0.08626246],
       ..., 
       [ 0.56147608,  0.10286553,  0.33565839],
       [ 0.47166303,  0.0189892 ,  0.50934777],
       [ 0.50337763,  0.49153645,  0.00508592]])

In [167]:
result_LR_cv = pd.DataFrame(y_test_LR_prob, columns=['EAP_Prob','HPL_Prob','MWS_Prob'])
result_LR_cv.insert(0, 'id', test['id'])
result_LR_cv.insert(1, 'text', test['text'])
result_LR_cv.insert(2, 'Predicted',y_test_LR)
result_LR_cv.head()

Unnamed: 0,id,text,Predicted,EAP_Prob,HPL_Prob,MWS_Prob
0,id02310,"Still, as I urged our leaving Ireland with suc...",2,0.111541,0.054993,0.833466
1,id24541,"If a fire wanted fanning, it could readily be ...",0,0.748202,0.214155,0.037644
2,id00134,And when they had broken down the frail door t...,0,0.752669,0.161068,0.086262
3,id27757,While I was thinking how I should possibly man...,0,0.573506,0.343449,0.083046
4,id04081,I am not sure to what limit his knowledge may ...,0,0.851077,0.117539,0.031384


In [168]:
result_LR_cv['Predicted_Author'] = result_LR_cv.Predicted.map({0:'EAP', 1:'HPL', 2:'MWS'})
result_LR_cv.head()

Unnamed: 0,id,text,Predicted,EAP_Prob,HPL_Prob,MWS_Prob,Predicted_Author
0,id02310,"Still, as I urged our leaving Ireland with suc...",2,0.111541,0.054993,0.833466,MWS
1,id24541,"If a fire wanted fanning, it could readily be ...",0,0.748202,0.214155,0.037644,EAP
2,id00134,And when they had broken down the frail door t...,0,0.752669,0.161068,0.086262,EAP
3,id27757,While I was thinking how I should possibly man...,0,0.573506,0.343449,0.083046,EAP
4,id04081,I am not sure to what limit his knowledge may ...,0,0.851077,0.117539,0.031384,EAP


In [169]:
from pandas import ExcelWriter

writer = ExcelWriter('Predicted Author.xlsx' , engine='xlsxwriter')
result_LR_cv.to_excel(writer,sheet_name='LogisticRegression_CountVec',index=False,  columns=['id','text', 'Predicted_Author','EAP_Prob','HPL_Prob',
'MWS_Prob'])

In [170]:
#NB

In [171]:
from sklearn.naive_bayes import MultinomialNB
NB_cv = MultinomialNB()
NB_cv.fit(X_train_cv, y_train)

MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True)

In [172]:
y_nb_cv = NB_cv.predict(X_test_cv)
print(metrics.accuracy_score(y_test, y_nb_cv))

0.82277834525


In [173]:
X_cv = cv.fit_transform(X)
X_cv

<19579x15322 sparse matrix of type '<class 'numpy.int64'>'
	with 222396 stored elements in Compressed Sparse Row format>

In [174]:
NB_cv.fit(X_cv,y)
y_nb_cv = NB_cv.predict(X_cv)
metrics.accuracy_score(y, y_nb_cv)

0.88227182184994124

In [175]:
test_cv = cv.transform(test['text'])
test_cv

<8392x15322 sparse matrix of type '<class 'numpy.int64'>'
	with 43644 stored elements in Compressed Sparse Row format>

In [176]:
y_pred_nb_cv = NB_cv.predict(test_cv)
y_pred_nb_cv

array([2, 0, 0, ..., 0, 2, 0], dtype=int64)

In [177]:
y_pred_prob_nb_cv = NB_cv.predict_proba(test_cv)
y_pred_prob_nb_cv

array([[  5.78020295e-03,   1.10145784e-02,   9.83205219e-01],
       [  9.87252695e-01,   1.23984564e-02,   3.48848452e-04],
       [  5.15246729e-01,   4.72347945e-01,   1.24053261e-02],
       ..., 
       [  6.98617621e-01,   7.80223543e-02,   2.23360024e-01],
       [  5.89312256e-02,   1.97645131e-03,   9.39092323e-01],
       [  5.79469385e-01,   4.17610417e-01,   2.92019781e-03]])

In [178]:
result = pd.DataFrame(y_pred_prob_nb_cv, columns=['EAP_Prob','HPL_Prob','MWS_Prob'])
result.insert(0, 'id', test['id'])
result.insert(1, 'text', test['text'])
result.insert(2, 'Predicted',y_pred_nb_cv)
result.head()


Unnamed: 0,id,text,Predicted,EAP_Prob,HPL_Prob,MWS_Prob
0,id02310,"Still, as I urged our leaving Ireland with suc...",2,0.00578,0.011015,0.983205
1,id24541,"If a fire wanted fanning, it could readily be ...",0,0.987253,0.012398,0.000349
2,id00134,And when they had broken down the frail door t...,0,0.515247,0.472348,0.012405
3,id27757,While I was thinking how I should possibly man...,1,0.205567,0.782408,0.012026
4,id04081,I am not sure to what limit his knowledge may ...,0,0.941036,0.051086,0.007877


In [179]:
result['Predicted_Author'] = result.Predicted.map({0:'EAP', 1:'HPL', 2:'MWS'})
result.head()

Unnamed: 0,id,text,Predicted,EAP_Prob,HPL_Prob,MWS_Prob,Predicted_Author
0,id02310,"Still, as I urged our leaving Ireland with suc...",2,0.00578,0.011015,0.983205,MWS
1,id24541,"If a fire wanted fanning, it could readily be ...",0,0.987253,0.012398,0.000349,EAP
2,id00134,And when they had broken down the frail door t...,0,0.515247,0.472348,0.012405,EAP
3,id27757,While I was thinking how I should possibly man...,1,0.205567,0.782408,0.012026,HPL
4,id04081,I am not sure to what limit his knowledge may ...,0,0.941036,0.051086,0.007877,EAP


In [180]:
result.to_excel(writer,sheet_name='NavieBayes_CountVec',index=False,  columns=['id','text', 'Predicted_Author','EAP_Prob','HPL_Prob',
'MWS_Prob'])

# SVM with countVectorizer

In [181]:
from sklearn import decomposition
from sklearn import preprocessing
from sklearn.svm import SVC

In [182]:
svd = decomposition.TruncatedSVD(n_components=120)

In [183]:
svd.fit(X_train_cv)

TruncatedSVD(algorithm='randomized', n_components=120, n_iter=5,
       random_state=None, tol=0.0)

In [184]:
xtrain_svd = svd.transform(X_train_cv)
xtest_svd = svd.transform(X_test_cv)

In [185]:
# Scale the data obtained from SVD. Renaming variable to reuse without scaling.
scl = preprocessing.StandardScaler()
scl.fit(xtrain_svd)
xtrain_svd_scl = scl.transform(xtrain_svd)
xvalid_svd_scl = scl.transform(xtest_svd)

In [186]:
clf = SVC(C=1.0, probability=True) # since we need probabilities
clf.fit(xtrain_svd_scl, y_train)

SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape=None, degree=3, gamma='auto', kernel='rbf',
  max_iter=-1, probability=True, random_state=None, shrinking=True,
  tol=0.001, verbose=False)

In [187]:
y_pred_svd = clf.predict(xvalid_svd_scl)

In [188]:
print(metrics.accuracy_score(y_test, y_pred_svd))

0.665219611849


In [189]:
X_cv = cv.fit_transform(X)
X_cv
svd.fit(X_cv)
x_svd = svd.transform(X_cv)

In [None]:
scl.fit(x_svd)
x_svd = scl.transform(x_svd)

In [None]:
clf.fit(x_svd, y)

In [None]:
y_svd = clf.predict(x_svd)

In [None]:
print(metrics.accuracy_score(y, y_svd))

In [194]:
test_cv = cv.transform(test['text'])
test_cv
test_svd = svd.transform(test_cv)
test_svd = scl.transform(test_svd)

In [195]:
y_test_svd = clf.predict(test_svd)

In [196]:
y_test_prob_svd = clf.predict_proba(test_svd)


In [197]:
result_svd = pd.DataFrame(y_test_prob_svd, columns=['EAP_Prob','HPL_Prob','MWS_Prob'])
result_svd.insert(0, 'id', test['id'])
result_svd.insert(1, 'text', test['text'])
result_svd.insert(2, 'Predicted',y_test_svd)
result_svd.head()


Unnamed: 0,id,text,Predicted,EAP_Prob,HPL_Prob,MWS_Prob
0,id02310,"Still, as I urged our leaving Ireland with suc...",2,0.006625,0.0169,0.976476
1,id24541,"If a fire wanted fanning, it could readily be ...",0,0.743018,0.14003,0.116952
2,id00134,And when they had broken down the frail door t...,1,0.369743,0.527637,0.10262
3,id27757,While I was thinking how I should possibly man...,0,0.710094,0.224334,0.065572
4,id04081,I am not sure to what limit his knowledge may ...,0,0.830802,0.129624,0.039574


In [198]:
result_svd['Predicted_Author'] = result_svd.Predicted.map({0:'EAP', 1:'HPL', 2:'MWS'})
result_svd.head()

Unnamed: 0,id,text,Predicted,EAP_Prob,HPL_Prob,MWS_Prob,Predicted_Author
0,id02310,"Still, as I urged our leaving Ireland with suc...",2,0.006625,0.0169,0.976476,MWS
1,id24541,"If a fire wanted fanning, it could readily be ...",0,0.743018,0.14003,0.116952,EAP
2,id00134,And when they had broken down the frail door t...,1,0.369743,0.527637,0.10262,HPL
3,id27757,While I was thinking how I should possibly man...,0,0.710094,0.224334,0.065572,EAP
4,id04081,I am not sure to what limit his knowledge may ...,0,0.830802,0.129624,0.039574,EAP


In [199]:
result_svd.to_excel(writer,sheet_name='SVC_CountVec',index=False,  columns=['id','text', 'Predicted_Author','EAP_Prob','HPL_Prob',
'MWS_Prob'])

In [200]:
#KNN


In [201]:
from sklearn.neighbors import KNeighborsClassifier


In [202]:
KNN = KNeighborsClassifier(n_neighbors = 5,metric="minkowski",p=2)

In [203]:
KNN.fit(X_train_cv,y_train)

KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
           metric_params=None, n_jobs=1, n_neighbors=5, p=2,
           weights='uniform')

In [204]:
y_pred_knn = KNN.predict(X_test_cv)

In [205]:
metrics.accuracy_score(y_test,y_pred_knn)

0.42441266598569971

In [206]:
X_cv = cv.fit_transform(X)
X_cv

<19579x15322 sparse matrix of type '<class 'numpy.int64'>'
	with 222396 stored elements in Compressed Sparse Row format>

In [207]:
KNN.fit(X_cv,y)
y_knn = KNN.predict(X_cv)
metrics.accuracy_score(y, y_knn)

0.61800909137341031

In [208]:
test_cv = cv.transform(test['text'])
test_cv

<8392x15322 sparse matrix of type '<class 'numpy.int64'>'
	with 43644 stored elements in Compressed Sparse Row format>

In [209]:
y_pred_knn = KNN.predict(test_cv)
y_pred_knn

array([0, 0, 0, ..., 1, 2, 0], dtype=int64)

In [210]:
y_pred_prob_knn = KNN.predict_proba(test_cv)
y_pred_prob_knn

array([[ 0.4,  0.2,  0.4],
       [ 0.8,  0.2,  0. ],
       [ 0.8,  0. ,  0.2],
       ..., 
       [ 0.2,  0.4,  0.4],
       [ 0.4,  0. ,  0.6],
       [ 0.8,  0. ,  0.2]])

In [211]:
result_knn = pd.DataFrame(y_pred_prob_knn, columns=['EAP_Prob','HPL_Prob','MWS_Prob'])
result_knn.insert(0, 'id', test['id'])
result_knn.insert(1, 'text', test['text'])
result_knn.insert(2, 'Predicted',y_pred_knn)
result_knn.head()

Unnamed: 0,id,text,Predicted,EAP_Prob,HPL_Prob,MWS_Prob
0,id02310,"Still, as I urged our leaving Ireland with suc...",0,0.4,0.2,0.4
1,id24541,"If a fire wanted fanning, it could readily be ...",0,0.8,0.2,0.0
2,id00134,And when they had broken down the frail door t...,0,0.8,0.0,0.2
3,id27757,While I was thinking how I should possibly man...,2,0.4,0.0,0.6
4,id04081,I am not sure to what limit his knowledge may ...,0,0.4,0.2,0.4


In [212]:
result_knn['Predicted_Author'] = result_knn.Predicted.map({0:'EAP', 1:'HPL', 2:'MWS'})
result_knn.head()

Unnamed: 0,id,text,Predicted,EAP_Prob,HPL_Prob,MWS_Prob,Predicted_Author
0,id02310,"Still, as I urged our leaving Ireland with suc...",0,0.4,0.2,0.4,EAP
1,id24541,"If a fire wanted fanning, it could readily be ...",0,0.8,0.2,0.0,EAP
2,id00134,And when they had broken down the frail door t...,0,0.8,0.0,0.2,EAP
3,id27757,While I was thinking how I should possibly man...,2,0.4,0.0,0.6,MWS
4,id04081,I am not sure to what limit his knowledge may ...,0,0.4,0.2,0.4,EAP


In [213]:
result_knn.to_excel(writer,sheet_name='KNN_CountVec',index=False,  columns=['id','text', 'Predicted_Author','EAP_Prob','HPL_Prob',
'MWS_Prob'])

In [214]:
#Decision Tree

In [215]:
from sklearn.tree import DecisionTreeClassifier

In [216]:
decisiontree = DecisionTreeClassifier(criterion='entropy',random_state=0)

In [217]:
decisiontree.fit(X_train_cv,y_train)

DecisionTreeClassifier(class_weight=None, criterion='entropy', max_depth=None,
            max_features=None, max_leaf_nodes=None,
            min_impurity_split=1e-07, min_samples_leaf=1,
            min_samples_split=2, min_weight_fraction_leaf=0.0,
            presort=False, random_state=0, splitter='best')

In [218]:
y_pred_decision = decisiontree.predict(X_test_cv)

In [219]:
metrics.accuracy_score(y_test,y_pred_decision)

0.59346271705822262

In [220]:
X_cv = cv.fit_transform(X)
X_cv

<19579x15322 sparse matrix of type '<class 'numpy.int64'>'
	with 222396 stored elements in Compressed Sparse Row format>

In [221]:
decisiontree.fit(X_cv,y)
y_decisiontree = decisiontree.predict(X_cv)
metrics.accuracy_score(y, y_decisiontree)

0.99897849736963074

In [222]:
test_cv = cv.transform(test['text'])
test_cv

<8392x15322 sparse matrix of type '<class 'numpy.int64'>'
	with 43644 stored elements in Compressed Sparse Row format>

In [223]:
y_pred_decision = decisiontree.predict(test_cv)
y_pred_decision

array([2, 0, 1, ..., 0, 2, 0], dtype=int64)

In [224]:
y_pred_prob_decision = decisiontree.predict_proba(test_cv)
y_pred_prob_decision

array([[ 0.,  0.,  1.],
       [ 1.,  0.,  0.],
       [ 0.,  1.,  0.],
       ..., 
       [ 1.,  0.,  0.],
       [ 0.,  0.,  1.],
       [ 1.,  0.,  0.]])

In [225]:
result_decision = pd.DataFrame(y_pred_prob_decision, columns=['EAP_Prob','HPL_Prob','MWS_Prob'])
result_decision.insert(0, 'id', test['id'])
result_decision.insert(1, 'text', test['text'])
result_decision.insert(2, 'Predicted',y_pred_decision)
result_decision.head()

Unnamed: 0,id,text,Predicted,EAP_Prob,HPL_Prob,MWS_Prob
0,id02310,"Still, as I urged our leaving Ireland with suc...",2,0.0,0.0,1.0
1,id24541,"If a fire wanted fanning, it could readily be ...",0,1.0,0.0,0.0
2,id00134,And when they had broken down the frail door t...,1,0.0,1.0,0.0
3,id27757,While I was thinking how I should possibly man...,2,0.0,0.0,1.0
4,id04081,I am not sure to what limit his knowledge may ...,0,1.0,0.0,0.0


In [226]:
result_decision['Predicted_Author'] = result_decision.Predicted.map({0:'EAP', 1:'HPL', 2:'MWS'})
result_decision.head()

Unnamed: 0,id,text,Predicted,EAP_Prob,HPL_Prob,MWS_Prob,Predicted_Author
0,id02310,"Still, as I urged our leaving Ireland with suc...",2,0.0,0.0,1.0,MWS
1,id24541,"If a fire wanted fanning, it could readily be ...",0,1.0,0.0,0.0,EAP
2,id00134,And when they had broken down the frail door t...,1,0.0,1.0,0.0,HPL
3,id27757,While I was thinking how I should possibly man...,2,0.0,0.0,1.0,MWS
4,id04081,I am not sure to what limit his knowledge may ...,0,1.0,0.0,0.0,EAP


In [227]:
result_decision.to_excel(writer,sheet_name='DecisionTree_CountVec',index=False,  columns=['id','text', 'Predicted_Author','EAP_Prob','HPL_Prob',
'MWS_Prob'])

In [228]:
#Random Forest

In [229]:
from sklearn.ensemble import RandomForestClassifier

In [230]:
RFC = RandomForestClassifier(n_estimators=15,criterion='entropy',random_state=0)

In [231]:
RFC.fit(X_train_cv,y_train)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='entropy',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_split=1e-07, min_samples_leaf=1,
            min_samples_split=2, min_weight_fraction_leaf=0.0,
            n_estimators=15, n_jobs=1, oob_score=False, random_state=0,
            verbose=0, warm_start=False)

In [232]:
y_test_RFC=RFC.predict(X_test_cv)

In [233]:
metrics.accuracy_score(y_test,y_test_RFC)

0.6657303370786517

In [234]:
X_cv = cv.fit_transform(X)
X_cv

<19579x15322 sparse matrix of type '<class 'numpy.int64'>'
	with 222396 stored elements in Compressed Sparse Row format>

In [235]:
RFC.fit(X_cv,y)
y_RFC = RFC.predict(X_cv)
metrics.accuracy_score(y, y_RFC)

0.9948414117166352

In [236]:
test_cv = cv.transform(test['text'])
test_cv

<8392x15322 sparse matrix of type '<class 'numpy.int64'>'
	with 43644 stored elements in Compressed Sparse Row format>

In [237]:
y_pred_RFC = RFC.predict(test_cv)
y_pred_RFC

array([2, 0, 1, ..., 0, 2, 0], dtype=int64)

In [238]:
y_pred_prob_RFC = RFC.predict_proba(test_cv)
y_pred_prob_RFC

array([[ 0.        ,  0.13333333,  0.86666667],
       [ 0.93333333,  0.        ,  0.06666667],
       [ 0.33333333,  0.6       ,  0.06666667],
       ..., 
       [ 0.78863826,  0.04498747,  0.16637427],
       [ 0.13333333,  0.        ,  0.86666667],
       [ 0.86666667,  0.13333333,  0.        ]])

In [239]:
result_RFC = pd.DataFrame(y_pred_prob_RFC, columns=['EAP_Prob','HPL_Prob','MWS_Prob'])
result_RFC.insert(0, 'id', test['id'])
result_RFC.insert(1, 'text', test['text'])
result_RFC.insert(2, 'Predicted',y_pred_RFC)
result_RFC.head()

Unnamed: 0,id,text,Predicted,EAP_Prob,HPL_Prob,MWS_Prob
0,id02310,"Still, as I urged our leaving Ireland with suc...",2,0.0,0.133333,0.866667
1,id24541,"If a fire wanted fanning, it could readily be ...",0,0.933333,0.0,0.066667
2,id00134,And when they had broken down the frail door t...,1,0.333333,0.6,0.066667
3,id27757,While I was thinking how I should possibly man...,1,0.2,0.533333,0.266667
4,id04081,I am not sure to what limit his knowledge may ...,0,0.855556,0.101389,0.043056


In [240]:
result_RFC['Predicted_Author'] = result_RFC.Predicted.map({0:'EAP', 1:'HPL', 2:'MWS'})
result_RFC.head()

Unnamed: 0,id,text,Predicted,EAP_Prob,HPL_Prob,MWS_Prob,Predicted_Author
0,id02310,"Still, as I urged our leaving Ireland with suc...",2,0.0,0.133333,0.866667,MWS
1,id24541,"If a fire wanted fanning, it could readily be ...",0,0.933333,0.0,0.066667,EAP
2,id00134,And when they had broken down the frail door t...,1,0.333333,0.6,0.066667,HPL
3,id27757,While I was thinking how I should possibly man...,1,0.2,0.533333,0.266667,HPL
4,id04081,I am not sure to what limit his knowledge may ...,0,0.855556,0.101389,0.043056,EAP


In [241]:
result_RFC.to_excel(writer,sheet_name='RandomForest_CountVec',index=False,  columns=['id','text', 'Predicted_Author','EAP_Prob','HPL_Prob',
'MWS_Prob'])

# TF-IDF

In [242]:
#Logistic Regression with Tf-idf

In [243]:
from sklearn.linear_model import LogisticRegression
LR_tfidf = LogisticRegression()
LR_tfidf.fit(X_train_tfidf, y_train)
y_test_LR = LR_tfidf.predict(X_test_tfidf)



In [244]:
from sklearn import metrics
print(metrics.accuracy_score(y_test, y_test_LR))

0.801583248212


In [245]:
from sklearn.metrics import confusion_matrix
cn = confusion_matrix(y_test, y_test_LR)
cn

array([[1334,  101,  113],
       [ 203,  875,   61],
       [ 221,   78,  930]])

In [246]:
X_tfidf = tfidf.fit_transform(X)

In [247]:
LR_tfidf.fit(X_tfidf,y)
y_tfidf = LR_tfidf.predict(X_tfidf)
metrics.accuracy_score(y, y_tfidf)


0.89110781960263552

In [248]:
test_tfidf = tfidf.transform(test['text'])


In [249]:
y_test_LR = LR_tfidf.predict(test_tfidf)

In [250]:
y_test_LR

array([2, 0, 0, ..., 0, 2, 0], dtype=int64)

In [251]:
y_test_LR_prob = LR_tfidf.predict_proba(test_tfidf)
y_test_LR_prob

array([[ 0.20973185,  0.08483839,  0.70542976],
       [ 0.47603747,  0.30873769,  0.21522483],
       [ 0.5305958 ,  0.32520859,  0.14419562],
       ..., 
       [ 0.41648784,  0.22405744,  0.35945471],
       [ 0.24707586,  0.112402  ,  0.64052214],
       [ 0.54314605,  0.34780422,  0.10904974]])

In [252]:
result_LR_tfidf = pd.DataFrame(y_test_LR_prob, columns=['EAP_Prob','HPL_Prob','MWS_Prob'])
result_LR_tfidf.insert(0, 'id', test['id'])
result_LR_tfidf.insert(1, 'text', test['text'])
result_LR_tfidf.insert(2, 'Predicted',y_test_LR)
result_LR_tfidf.head()

Unnamed: 0,id,text,Predicted,EAP_Prob,HPL_Prob,MWS_Prob
0,id02310,"Still, as I urged our leaving Ireland with suc...",2,0.209732,0.084838,0.70543
1,id24541,"If a fire wanted fanning, it could readily be ...",0,0.476037,0.308738,0.215225
2,id00134,And when they had broken down the frail door t...,0,0.530596,0.325209,0.144196
3,id27757,While I was thinking how I should possibly man...,1,0.401243,0.417736,0.181021
4,id04081,I am not sure to what limit his knowledge may ...,0,0.790123,0.122302,0.087575


In [253]:
result_LR_tfidf['Predicted_Author'] = result_LR_tfidf.Predicted.map({0:'EAP', 1:'HPL', 2:'MWS'})
result_LR_tfidf.head()

Unnamed: 0,id,text,Predicted,EAP_Prob,HPL_Prob,MWS_Prob,Predicted_Author
0,id02310,"Still, as I urged our leaving Ireland with suc...",2,0.209732,0.084838,0.70543,MWS
1,id24541,"If a fire wanted fanning, it could readily be ...",0,0.476037,0.308738,0.215225,EAP
2,id00134,And when they had broken down the frail door t...,0,0.530596,0.325209,0.144196,EAP
3,id27757,While I was thinking how I should possibly man...,1,0.401243,0.417736,0.181021,HPL
4,id04081,I am not sure to what limit his knowledge may ...,0,0.790123,0.122302,0.087575,EAP


In [254]:
#from pandas import ExcelWriter

#writer = ExcelWriter('Predicted Author.xlsx' , engine='xlsxwriter')
result_LR_tfidf.to_excel(writer,sheet_name='LogisticRegression_TF-IDF',index=False,  columns=['id','text', 'Predicted_Author','EAP_Prob','HPL_Prob',
'MWS_Prob'])

In [255]:
#NB-TFIDF

In [256]:
from sklearn.naive_bayes import MultinomialNB
NB_tfidf = MultinomialNB()
NB_tfidf.fit(X_train_tfidf, y_train)

MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True)

In [257]:
y_nb_tfidf = NB_tfidf.predict(X_test_tfidf)
print(metrics.accuracy_score(y_test, y_nb_tfidf))

0.813329928498


In [258]:
X_tfidf = tfidf.fit_transform(X)
X_tfidf

<19579x15455 sparse matrix of type '<class 'numpy.float64'>'
	with 248708 stored elements in Compressed Sparse Row format>

In [259]:
NB_tfidf.fit(X_tfidf,y)
y_nb_tfidf = NB_tfidf.predict(X_tfidf)
metrics.accuracy_score(y, y_nb_tfidf)

0.88554063026712293

In [260]:
test_tfidf = tfidf.transform(test['text'])
test_tfidf

<8392x15455 sparse matrix of type '<class 'numpy.float64'>'
	with 65695 stored elements in Compressed Sparse Row format>

In [261]:
y_pred_nb_tfidf = NB_tfidf.predict(test_tfidf)
y_pred_nb_tfidf

array([2, 0, 0, ..., 0, 2, 0], dtype=int64)

In [262]:
y_pred_prob_nb_tfidf = NB_tfidf.predict_proba(test_tfidf)
y_pred_prob_nb_tfidf

array([[ 0.17020005,  0.12610037,  0.70369957],
       [ 0.41183583,  0.32855372,  0.25961044],
       [ 0.41578313,  0.36300928,  0.22120759],
       ..., 
       [ 0.52699082,  0.17569507,  0.2973141 ],
       [ 0.2253899 ,  0.14293414,  0.63167596],
       [ 0.44629432,  0.33572895,  0.21797673]])

In [263]:
result = pd.DataFrame(y_pred_prob_nb_tfidf, columns=['EAP_Prob','HPL_Prob','MWS_Prob'])
result.insert(0, 'id', test['id'])
result.insert(1, 'text', test['text'])
result.insert(2, 'Predicted',y_pred_nb_tfidf)
result.head()

Unnamed: 0,id,text,Predicted,EAP_Prob,HPL_Prob,MWS_Prob
0,id02310,"Still, as I urged our leaving Ireland with suc...",2,0.1702,0.1261,0.7037
1,id24541,"If a fire wanted fanning, it could readily be ...",0,0.411836,0.328554,0.25961
2,id00134,And when they had broken down the frail door t...,0,0.415783,0.363009,0.221208
3,id27757,While I was thinking how I should possibly man...,1,0.26623,0.54647,0.187299
4,id04081,I am not sure to what limit his knowledge may ...,0,0.746673,0.142763,0.110564


In [264]:
result['Predicted_Author'] = result.Predicted.map({0:'EAP', 1:'HPL', 2:'MWS'})
result.head()

Unnamed: 0,id,text,Predicted,EAP_Prob,HPL_Prob,MWS_Prob,Predicted_Author
0,id02310,"Still, as I urged our leaving Ireland with suc...",2,0.1702,0.1261,0.7037,MWS
1,id24541,"If a fire wanted fanning, it could readily be ...",0,0.411836,0.328554,0.25961,EAP
2,id00134,And when they had broken down the frail door t...,0,0.415783,0.363009,0.221208,EAP
3,id27757,While I was thinking how I should possibly man...,1,0.26623,0.54647,0.187299,HPL
4,id04081,I am not sure to what limit his knowledge may ...,0,0.746673,0.142763,0.110564,EAP


In [265]:
result.to_excel(writer,sheet_name='NavieBayes_TF-IDF',index=False,  columns=['id','text', 'Predicted_Author','EAP_Prob','HPL_Prob',
'MWS_Prob'])

# SVM with TFIDF


In [266]:
from sklearn import decomposition
from sklearn import preprocessing
from sklearn.svm import SVC

In [267]:
svd = decomposition.TruncatedSVD(n_components=120)

In [268]:
svd.fit(X_train_tfidf)

TruncatedSVD(algorithm='randomized', n_components=120, n_iter=5,
       random_state=None, tol=0.0)

In [269]:
xtrain_svd = svd.transform(X_train_tfidf)
xtest_svd = svd.transform(X_test_tfidf)

In [270]:
# Scale the data obtained from SVD. Renaming variable to reuse without scaling.
scl = preprocessing.StandardScaler()
scl.fit(xtrain_svd)
xtrain_svd_scl = scl.transform(xtrain_svd)
xvalid_svd_scl = scl.transform(xtest_svd)

In [271]:
clf = SVC(C=1.0, probability=True) # since we need probabilities
clf.fit(xtrain_svd_scl, y_train)

SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape=None, degree=3, gamma='auto', kernel='rbf',
  max_iter=-1, probability=True, random_state=None, shrinking=True,
  tol=0.001, verbose=False)

In [272]:
y_pred_svd = clf.predict(xvalid_svd_scl)

In [273]:
print(metrics.accuracy_score(y_test, y_pred_svd))

0.701736465781


In [274]:
X_tfidf = tfidf.fit_transform(X)
X_tfidf
svd.fit(X_tfidf)
x_svd = svd.transform(X_tfidf)

In [275]:
scl.fit(x_svd)
x_svd = scl.transform(x_svd)

In [276]:
clf.fit(x_svd, y)

SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape=None, degree=3, gamma='auto', kernel='rbf',
  max_iter=-1, probability=True, random_state=None, shrinking=True,
  tol=0.001, verbose=False)

In [277]:
y_svd = clf.predict(x_svd)

In [278]:
print(metrics.accuracy_score(y, y_svd))

0.832064967567


In [279]:
test_tfidf = tfidf.transform(test['text'])
test_tfidf
test_svd = svd.transform(test_tfidf)
test_svd = scl.transform(test_svd)

In [280]:
y_test_svd = clf.predict(test_svd)

In [281]:
y_test_prob_svd = clf.predict_proba(test_svd)

In [282]:
result_svd = pd.DataFrame(y_test_prob_svd, columns=['EAP_Prob','HPL_Prob','MWS_Prob'])
result_svd.insert(0, 'id', test['id'])
result_svd.insert(1, 'text', test['text'])
result_svd.insert(2, 'Predicted',y_test_svd)
result_svd.head()

Unnamed: 0,id,text,Predicted,EAP_Prob,HPL_Prob,MWS_Prob
0,id02310,"Still, as I urged our leaving Ireland with suc...",2,0.070157,0.087195,0.842648
1,id24541,"If a fire wanted fanning, it could readily be ...",0,0.536291,0.40323,0.060479
2,id00134,And when they had broken down the frail door t...,1,0.230425,0.743198,0.026377
3,id27757,While I was thinking how I should possibly man...,0,0.624619,0.35726,0.018121
4,id04081,I am not sure to what limit his knowledge may ...,0,0.775576,0.128096,0.096328


In [283]:
result_svd['Predicted_Author'] = result_svd.Predicted.map({0:'EAP', 1:'HPL', 2:'MWS'})
result_svd.head()

Unnamed: 0,id,text,Predicted,EAP_Prob,HPL_Prob,MWS_Prob,Predicted_Author
0,id02310,"Still, as I urged our leaving Ireland with suc...",2,0.070157,0.087195,0.842648,MWS
1,id24541,"If a fire wanted fanning, it could readily be ...",0,0.536291,0.40323,0.060479,EAP
2,id00134,And when they had broken down the frail door t...,1,0.230425,0.743198,0.026377,HPL
3,id27757,While I was thinking how I should possibly man...,0,0.624619,0.35726,0.018121,EAP
4,id04081,I am not sure to what limit his knowledge may ...,0,0.775576,0.128096,0.096328,EAP


In [284]:
result_svd.to_excel(writer,sheet_name='SVC_TFIDF',index=False,  columns=['id','text', 'Predicted_Author','EAP_Prob','HPL_Prob',
'MWS_Prob'])

In [285]:
#KNN - TFIDF

In [286]:
from sklearn.neighbors import KNeighborsClassifier
KNN = KNeighborsClassifier(n_neighbors = 5,metric="minkowski",p=2)

In [287]:
KNN.fit(X_train_tfidf,y_train)

KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
           metric_params=None, n_jobs=1, n_neighbors=5, p=2,
           weights='uniform')

In [288]:
y_pred_knn = KNN.predict(X_test_tfidf)

In [289]:
metrics.accuracy_score(y_test,y_pred_knn)

0.59346271705822262

In [290]:
X_tfidf = tfidf.fit_transform(X)
X_tfidf

<19579x15455 sparse matrix of type '<class 'numpy.float64'>'
	with 248708 stored elements in Compressed Sparse Row format>

In [291]:
KNN.fit(X_tfidf,y)
y_knn = KNN.predict(X_tfidf)
metrics.accuracy_score(y, y_knn)

0.41411716635170337

In [292]:
test_tfidf = tfidf.transform(test['text'])
test_tfidf

<8392x15455 sparse matrix of type '<class 'numpy.float64'>'
	with 65695 stored elements in Compressed Sparse Row format>

In [293]:
y_pred_knn = KNN.predict(test_tfidf)
y_pred_knn

array([0, 0, 0, ..., 0, 0, 0], dtype=int64)

In [294]:
y_pred_prob_knn = KNN.predict_proba(test_tfidf)
y_pred_prob_knn

array([[ 0.6,  0. ,  0.4],
       [ 0.6,  0.2,  0.2],
       [ 0.6,  0. ,  0.4],
       ..., 
       [ 0.8,  0. ,  0.2],
       [ 0.6,  0. ,  0.4],
       [ 0.6,  0. ,  0.4]])

In [295]:
result_knn = pd.DataFrame(y_pred_prob_knn, columns=['EAP_Prob','HPL_Prob','MWS_Prob'])
result_knn.insert(0, 'id', test['id'])
result_knn.insert(1, 'text', test['text'])
result_knn.insert(2, 'Predicted',y_pred_knn)
result_knn.head()

Unnamed: 0,id,text,Predicted,EAP_Prob,HPL_Prob,MWS_Prob
0,id02310,"Still, as I urged our leaving Ireland with suc...",0,0.6,0.0,0.4
1,id24541,"If a fire wanted fanning, it could readily be ...",0,0.6,0.2,0.2
2,id00134,And when they had broken down the frail door t...,0,0.6,0.0,0.4
3,id27757,While I was thinking how I should possibly man...,0,0.6,0.2,0.2
4,id04081,I am not sure to what limit his knowledge may ...,0,0.8,0.0,0.2


In [296]:
result_knn['Predicted_Author'] = result_knn.Predicted.map({0:'EAP', 1:'HPL', 2:'MWS'})
result_knn.head()

Unnamed: 0,id,text,Predicted,EAP_Prob,HPL_Prob,MWS_Prob,Predicted_Author
0,id02310,"Still, as I urged our leaving Ireland with suc...",0,0.6,0.0,0.4,EAP
1,id24541,"If a fire wanted fanning, it could readily be ...",0,0.6,0.2,0.2,EAP
2,id00134,And when they had broken down the frail door t...,0,0.6,0.0,0.4,EAP
3,id27757,While I was thinking how I should possibly man...,0,0.6,0.2,0.2,EAP
4,id04081,I am not sure to what limit his knowledge may ...,0,0.8,0.0,0.2,EAP


In [297]:
result_knn.to_excel(writer,sheet_name='KNN_TF-IDF',index=False,  columns=['id','text', 'Predicted_Author','EAP_Prob','HPL_Prob',
'MWS_Prob'])

In [298]:
#Decision Tree- TF-IDF

In [299]:
from sklearn.tree import DecisionTreeClassifier
decisiontree = DecisionTreeClassifier(criterion='entropy',random_state=0)

In [300]:
decisiontree.fit(X_train_tfidf,y_train)

DecisionTreeClassifier(class_weight=None, criterion='entropy', max_depth=None,
            max_features=None, max_leaf_nodes=None,
            min_impurity_split=1e-07, min_samples_leaf=1,
            min_samples_split=2, min_weight_fraction_leaf=0.0,
            presort=False, random_state=0, splitter='best')

In [301]:
y_pred_decision = decisiontree.predict(X_test_tfidf)

In [302]:
metrics.accuracy_score(y_test,y_pred_decision)

0.59346271705822262

In [303]:
X_tfidf = tfidf.fit_transform(X)
X_tfidf

<19579x15455 sparse matrix of type '<class 'numpy.float64'>'
	with 248708 stored elements in Compressed Sparse Row format>

In [304]:
decisiontree.fit(X_tfidf,y)
y_decisiontree = decisiontree.predict(X_tfidf)
metrics.accuracy_score(y, y_decisiontree)

0.99969354921088927

In [305]:
test_tfidf = tfidf.transform(test['text'])
test_tfidf

<8392x15455 sparse matrix of type '<class 'numpy.float64'>'
	with 65695 stored elements in Compressed Sparse Row format>

In [306]:
y_pred_decision = decisiontree.predict(test_tfidf)
y_pred_decision

array([2, 0, 1, ..., 2, 2, 0], dtype=int64)

In [307]:
y_pred_prob_decision = decisiontree.predict_proba(test_tfidf)
y_pred_prob_decision

array([[ 0.,  0.,  1.],
       [ 1.,  0.,  0.],
       [ 0.,  1.,  0.],
       ..., 
       [ 0.,  0.,  1.],
       [ 0.,  0.,  1.],
       [ 1.,  0.,  0.]])

In [308]:
result_decision = pd.DataFrame(y_pred_prob_decision, columns=['EAP_Prob','HPL_Prob','MWS_Prob'])
result_decision.insert(0, 'id', test['id'])
result_decision.insert(1, 'text', test['text'])
result_decision.insert(2, 'Predicted',y_pred_decision)
result_decision.head()

Unnamed: 0,id,text,Predicted,EAP_Prob,HPL_Prob,MWS_Prob
0,id02310,"Still, as I urged our leaving Ireland with suc...",2,0.0,0.0,1.0
1,id24541,"If a fire wanted fanning, it could readily be ...",0,1.0,0.0,0.0
2,id00134,And when they had broken down the frail door t...,1,0.0,1.0,0.0
3,id27757,While I was thinking how I should possibly man...,2,0.0,0.0,1.0
4,id04081,I am not sure to what limit his knowledge may ...,0,1.0,0.0,0.0


In [309]:
result_decision['Predicted_Author'] = result_decision.Predicted.map({0:'EAP', 1:'HPL', 2:'MWS'})
result_decision.head()

Unnamed: 0,id,text,Predicted,EAP_Prob,HPL_Prob,MWS_Prob,Predicted_Author
0,id02310,"Still, as I urged our leaving Ireland with suc...",2,0.0,0.0,1.0,MWS
1,id24541,"If a fire wanted fanning, it could readily be ...",0,1.0,0.0,0.0,EAP
2,id00134,And when they had broken down the frail door t...,1,0.0,1.0,0.0,HPL
3,id27757,While I was thinking how I should possibly man...,2,0.0,0.0,1.0,MWS
4,id04081,I am not sure to what limit his knowledge may ...,0,1.0,0.0,0.0,EAP


In [310]:
result_decision.to_excel(writer,sheet_name='DecisionTree_TFIDF',index=False,  columns=['id','text', 'Predicted_Author','EAP_Prob','HPL_Prob',
'MWS_Prob'])

In [311]:
#Random Forest - TF-IDF

In [312]:
from sklearn.ensemble import RandomForestClassifier
RFC = RandomForestClassifier(n_estimators=15,criterion='entropy',random_state=0)

In [313]:
RFC.fit(X_train_tfidf,y_train)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='entropy',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_split=1e-07, min_samples_leaf=1,
            min_samples_split=2, min_weight_fraction_leaf=0.0,
            n_estimators=15, n_jobs=1, oob_score=False, random_state=0,
            verbose=0, warm_start=False)

In [314]:
y_test_RFC=RFC.predict(X_test_tfidf)
metrics.accuracy_score(y_test,y_test_RFC)

0.68335035750766093

In [315]:
X_tfidf = tfidf.fit_transform(X)
X_tfidf

<19579x15455 sparse matrix of type '<class 'numpy.float64'>'
	with 248708 stored elements in Compressed Sparse Row format>

In [316]:
RFC.fit(X_tfidf,y)
y_RFC = RFC.predict(X_tfidf)
metrics.accuracy_score(y, y_RFC)

0.99535216303181984

In [317]:
test_tfidf = tfidf.transform(test['text'])
test_tfidf

<8392x15455 sparse matrix of type '<class 'numpy.float64'>'
	with 65695 stored elements in Compressed Sparse Row format>

In [318]:
y_pred_RFC = RFC.predict(test_tfidf)
y_pred_RFC

array([2, 0, 0, ..., 0, 2, 0], dtype=int64)

In [319]:
y_pred_prob_RFC = RFC.predict_proba(test_tfidf)
y_pred_prob_RFC

array([[ 0.        ,  0.13333333,  0.86666667],
       [ 0.62222222,  0.13333333,  0.24444444],
       [ 0.46666667,  0.4       ,  0.13333333],
       ..., 
       [ 0.53333333,  0.06666667,  0.4       ],
       [ 0.2       ,  0.13333333,  0.66666667],
       [ 0.73333333,  0.13333333,  0.13333333]])

In [320]:
result_RFC = pd.DataFrame(y_pred_prob_RFC, columns=['EAP_Prob','HPL_Prob','MWS_Prob'])
result_RFC.insert(0, 'id', test['id'])
result_RFC.insert(1, 'text', test['text'])
result_RFC.insert(2, 'Predicted',y_pred_RFC)
result_RFC.head()

Unnamed: 0,id,text,Predicted,EAP_Prob,HPL_Prob,MWS_Prob
0,id02310,"Still, as I urged our leaving Ireland with suc...",2,0.0,0.133333,0.866667
1,id24541,"If a fire wanted fanning, it could readily be ...",0,0.622222,0.133333,0.244444
2,id00134,And when they had broken down the frail door t...,0,0.466667,0.4,0.133333
3,id27757,While I was thinking how I should possibly man...,1,0.266667,0.533333,0.2
4,id04081,I am not sure to what limit his knowledge may ...,0,0.866667,0.0,0.133333


In [321]:
result_RFC['Predicted_Author'] = result_RFC.Predicted.map({0:'EAP', 1:'HPL', 2:'MWS'})
result_RFC.head()

Unnamed: 0,id,text,Predicted,EAP_Prob,HPL_Prob,MWS_Prob,Predicted_Author
0,id02310,"Still, as I urged our leaving Ireland with suc...",2,0.0,0.133333,0.866667,MWS
1,id24541,"If a fire wanted fanning, it could readily be ...",0,0.622222,0.133333,0.244444,EAP
2,id00134,And when they had broken down the frail door t...,0,0.466667,0.4,0.133333,EAP
3,id27757,While I was thinking how I should possibly man...,1,0.266667,0.533333,0.2,HPL
4,id04081,I am not sure to what limit his knowledge may ...,0,0.866667,0.0,0.133333,EAP


In [322]:
result_RFC.to_excel(writer,sheet_name='RandomForest_TF-IDF',index=False,  columns=['id','text', 'Predicted_Author','EAP_Prob','HPL_Prob',
'MWS_Prob'])

In [323]:
writer.save()