# Spooky Author Identification


Kaggle project: identify the author (EAP, HPL or MWS) of each text excerpt.

https://www.kaggle.com/c/spooky-author-identification

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [2]:
ls data

sample_submission.csv  test.csv               train.csv


In [3]:
# Load the labelled training data (id, text, author).
df = pd.read_csv('data/train.csv')

In [4]:
# Size of the training frame as (rows, columns).
df.shape

(19579, 3)

In [5]:
# Preview the first few training rows.
df.head()

Unnamed: 0,id,text,author
0,id26305,"This process, however, afforded me no means of...",EAP
1,id17569,It never once occurred to me that the fumbling...,HPL
2,id11008,"In his left hand was a gold snuff box, from wh...",EAP
3,id27763,How lovely is spring As we looked from Windsor...,MWS
4,id12958,"Finding nothing else, not even gold, the Super...",HPL


In [6]:
# Load the test data that author probabilities are predicted for later on.
df_test = pd.read_csv('data/test.csv')

In [7]:
# Size of the test frame as (rows, columns).
df_test.shape

(8392, 2)

In [8]:
# Preview the first few test rows.
df_test.head()

Unnamed: 0,id,text
0,id02310,"Still, as I urged our leaving Ireland with suc..."
1,id24541,"If a fire wanted fanning, it could readily be ..."
2,id00134,And when they had broken down the frail door t...
3,id27757,While I was thinking how I should possibly man...
4,id04081,I am not sure to what limit his knowledge may ...


In [9]:
# Count missing values per column in the training frame.
df.isnull().sum()

id        0
text      0
author    0
dtype: int64

In [10]:
# Number of training rows per author, to see how balanced the classes are.
df.author.value_counts()

EAP    7900
MWS    6044
HPL    5635
Name: author, dtype: int64

## Test TF-IDF Model

In [12]:
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import CountVectorizer
from nltk.classify.scikitlearn import SklearnClassifier
from sklearn.svm import LinearSVC, SVC
from sklearn.metrics import confusion_matrix, classification_report
from sklearn.metrics import accuracy_score

# Features are the raw sentences; the target is the author label.
X, y = df.text, df.author

# Hold out 20% of the labelled rows for evaluation (fixed seed for repeatability).
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# TF-IDF features feeding a linear SVM, trained on the raw training sentences.
text_clf = Pipeline(steps=[
    ('tfidf', TfidfVectorizer()),
    ('clf', LinearSVC()),
])
text_clf.fit(X_train, y_train)

# Per-author precision / recall / F1 on the held-out split.
y_pred = text_clf.predict(X_test)
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

         EAP       0.82      0.85      0.83      1570
         HPL       0.83      0.82      0.82      1071
         MWS       0.85      0.83      0.84      1275

    accuracy                           0.83      3916
   macro avg       0.83      0.83      0.83      3916
weighted avg       0.83      0.83      0.83      3916



## Test CountVectorizer Model

In [13]:
# Same linear SVM as before, but fed raw term counts instead of TF-IDF weights.
count_steps = [
    ('vectorizer', CountVectorizer()),
    ('clf', LinearSVC()),
]
text_clf = Pipeline(steps=count_steps)
text_clf.fit(X_train, y_train)

# Per-author precision / recall / F1 on the held-out split.
y_pred = text_clf.predict(X_test)
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

         EAP       0.79      0.84      0.81      1570
         HPL       0.82      0.76      0.79      1071
         MWS       0.82      0.80      0.81      1275

    accuracy                           0.80      3916
   macro avg       0.81      0.80      0.80      3916
weighted avg       0.80      0.80      0.80      3916





In [None]:
from sklearn.calibration import CalibratedClassifierCV, calibration_curve

# LinearSVC exposes no predict_proba, so it is wrapped in CalibratedClassifierCV
# to obtain per-class probability estimates.
svm = LinearSVC()

# X_train / X_test hold raw sentences at this point, which the SVM cannot
# consume directly; vectorise them with TF-IDF inside the same pipeline so the
# text is converted to numeric features before calibration and fitting.
clf = Pipeline([
    ('tfidf', TfidfVectorizer()),
    ('calibrated', CalibratedClassifierCV(svm)),
])

clf.fit(X_train, y_train)
y_proba = clf.predict_proba(X_test)

In [None]:
# `vector` is not created anywhere earlier in the notebook, so build it here.
# NOTE(review): TF-IDF is assumed from the section that follows — confirm.
vector = TfidfVectorizer()

# Learn the vocabulary from the training text and only transform the test text,
# so the test matrix has the same columns a model trained on `df` would expect.
vector.fit(df.text)
x1 = vector.transform(df_test.text)
X1 = x1.toarray()  # dense copy of the sparse matrix

In [None]:
# Dimensions of the dense matrix built from the test text.
X1.shape

## Combine Train and Test for TF-IDF

In [18]:
# `Series.append` was removed in pandas 2.0; `pd.concat` builds the same stacked
# Series (training text first, then test text) with a fresh 0..n-1 index.
df_combine = pd.concat([df.text, df_test.text], ignore_index=True)

# Create the vectoriser here so the cell runs on a fresh kernel.
# NOTE(review): TF-IDF defaults are assumed from the section title — confirm.
vector = TfidfVectorizer()

# Fit on train + test together so both share one vocabulary / column layout.
x_combine = vector.fit_transform(df_combine)
X_combine = x_combine.toarray()  # dense array; later cells slice it by row

In [19]:
# Length of the stacked train + test text Series.
df_combine.shape

(27971,)

In [20]:
# The first len(df) rows of X_combine are the training sentences (they were
# stacked ahead of the test sentences), so derive the split point from the
# data instead of hard-coding the row count.
n_train = len(df)
X_train = X_combine[:n_train]
y_train = df.author
print(X_train.shape)
print(y_train.shape)

(19579, 28300)
(19579,)


In [21]:
from sklearn.calibration import CalibratedClassifierCV, calibration_curve


# LinearSVC has no predict_proba; CalibratedClassifierCV wraps it to produce
# per-class probability estimates.
svm = LinearSVC()
clf = CalibratedClassifierCV(svm)

clf.fit(X_train, y_train)

# Everything after the training rows in X_combine belongs to the test set.
# Use len(df) rather than a literal row count so the split follows the data.
X_test = X_combine[len(df):]
y_proba = clf.predict_proba(X_test)



In [22]:
# Probability estimates returned by clf.predict_proba for the test rows.
y_proba

array([[0.2963749 , 0.01893783, 0.68468727],
       [0.86554889, 0.12853342, 0.0059177 ],
       [0.13378309, 0.84854874, 0.01766817],
       ...,
       [0.81650439, 0.07109684, 0.11239877],
       [0.36871122, 0.01101319, 0.6202756 ],
       [0.34461246, 0.65397625, 0.00141128]])

In [24]:
# Label the probability columns from the fitted classifier's own class order
# instead of assuming that order is EAP, HPL, MWS.
dataset = pd.DataFrame(y_proba, columns=clf.classes_)
dataset.head()

Unnamed: 0,EAP,HPL,MWS
0,0.296375,0.018938,0.684687
1,0.865549,0.128533,0.005918
2,0.133783,0.848549,0.017668
3,0.801297,0.182296,0.016408
4,0.810728,0.033744,0.155528


In [25]:
# Attach one probability column per author to the test frame, taking the
# y_proba columns in positional order 0, 1, 2.
for col_idx, author in enumerate(('EAP', 'HPL', 'MWS')):
    df_test[author] = y_proba[:, col_idx]

In [27]:
# Write the submission file: every df_test column except the raw text,
# without the row index.
submission = df_test.drop(columns='text')
submission.to_csv('submission.csv', index=False)