# Spooky Author Identification


In [None]:
#Import Libraries
import numpy as np 
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
# Seaborn's current colour palette, kept in `color`.
color = sns.color_palette()
# IPython magic: render matplotlib figures inline in the notebook output.
%matplotlib inline

In [None]:
# Load the training and test datasets from CSV files in ../input.
train = pd.read_csv("../input/train.csv")
test = pd.read_csv("../input/test.csv")


In [None]:
# Preview the first 10 rows of the training data.
train.head(10)

In [None]:
# Row count of the training set (the original author noted 19579 records).
len(train)

In [None]:
# Bar chart of how many training records each author has.
sns.countplot(x="author", data=train, palette = "Greens_d")

In [None]:
# Split the training data into one DataFrame per author code.
train_eap, train_hpl, train_mws = (
    train[train["author"] == author_code]
    for author_code in ("EAP", "HPL", "MWS")
)

In [None]:
# Word cloud built from every EAP training sentence joined into one string.
from wordcloud import WordCloud, STOPWORDS

Alltext = ' '.join(train_eap['text'])
wc = WordCloud(
    width=1000,
    height=500,
    stopwords=STOPWORDS,
).generate(Alltext)

plt.figure(figsize=(15, 5))
plt.imshow(wc)
plt.axis('off')
# Trailing semicolon keeps the notebook from echoing the returned object.
plt.title('Word Cloud for Edgar Allan Poe');

In [None]:
# Word cloud built from every MWS training sentence joined into one string.
Alltext = ' '.join(train_mws['text'])
wc = WordCloud(
    width=1000,
    height=500,
    stopwords=STOPWORDS,
).generate(Alltext)

plt.figure(figsize=(15, 5))
plt.imshow(wc)
plt.axis('off')
# Trailing semicolon keeps the notebook from echoing the returned object.
plt.title('Word Cloud for Mary Shelley');

In [None]:
# Word cloud built from every HPL training sentence joined into one string.
Alltext = ' '.join(train_hpl['text'])
wc = WordCloud(
    width=1000,
    height=500,
    stopwords=STOPWORDS,
).generate(Alltext)

plt.figure(figsize=(15, 5))
plt.imshow(wc)
plt.axis('off')
# Trailing semicolon keeps the notebook from echoing the returned object.
plt.title('Word Cloud for HP Lovecraft');

In [None]:
# Add a "length" feature to both datasets: the number of whitespace-separated
# tokens in each text.
def _count_words(text):
    """Return the number of whitespace-separated tokens in ``str(text)``."""
    return len(str(text).split())


train["length"] = train["text"].apply(_count_words)
test["length"] = test["text"].apply(_count_words)

In [None]:
# Mean word count ("length") of the training texts, grouped by author.
print(train.groupby(by=['author'])['length'].mean())
# Author's observation: the mean text length is almost the same for HPL and MWS.

In [None]:
# Lets start extracting features from NLTK packages
import nltk
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
import string
# NLTK's English stop-word list as a set; used below to filter tokens by membership.
eng_stopwords = set(stopwords.words("english"))
from sklearn.feature_extraction.text import CountVectorizer

In [None]:
# Build the cleaned training corpus. Each text is lowercased, split on
# whitespace, stripped of English stop words, Porter-stemmed and re-joined
# with single spaces. Punctuation is not removed, so it stays attached to
# its token.
#
# The stemmer is created once and reused for every row instead of being
# rebuilt per iteration, and the texts are iterated by value so the loop no
# longer relies on `train` having a default 0..n-1 index.
ps = PorterStemmer()
corpus_text = [
    ' '.join(
        ps.stem(word)
        for word in text.lower().split()
        if word not in eng_stopwords
    )
    for text in train["text"]
]

In [None]:
from sklearn.feature_extraction.text import CountVectorizer
# Bag-of-words vectorizer capped at 1500 features (max_features).
cv = CountVectorizer(max_features = 1500)

In [None]:
# Fit the vectorizer on the cleaned training corpus and materialise the
# bag-of-words counts as a dense array.
X = cv.fit_transform(corpus_text).toarray()
# Target labels. The column is selected by name rather than by position so
# the lookup does not depend on column order in `train`.
y = train["author"].values

In [None]:
# Display the label array.
y

In [None]:
from sklearn.model_selection import train_test_split
# Hold out 20% of the rows as a validation split; random_state fixes the split.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.20, random_state = 0)

In [None]:
# Fit a multinomial Naive Bayes classifier (MultinomialNB) on the training split.
from sklearn.naive_bayes import MultinomialNB
classifier = MultinomialNB()
classifier.fit(X_train, y_train)

In [None]:
# Predicted labels for the held-out validation split.
y_pred = classifier.predict(X_test)

In [None]:
# NOTE(review): roc_curve is imported but not used in the visible cells.
from sklearn.metrics import confusion_matrix, roc_curve
# Confusion matrix of the validation labels against the predictions.
cm = confusion_matrix(y_test, y_pred)

In [None]:
# Display the confusion matrix.
cm

In [None]:
# One-vs-rest counts per class, read off the confusion matrix
# (rows = true labels, columns = predicted labels).
TP = np.diag(cm)
FP = cm.sum(axis=0) - TP
FN = cm.sum(axis=1) - TP
TN = cm.sum() - (FP + FN + TP)

# Per-class (one-vs-rest) accuracy: one value per class, NOT a single overall
# score. Because each class also gets credit for its true negatives, these
# values are never below the overall accuracy computed further down.
ACC = (TP+TN)/(TP+FP+FN+TN)

# Overall accuracy: fraction of validation rows whose label was predicted
# correctly (sum of the diagonal over the total count).
overall_accuracy = np.trace(cm) / cm.sum()
print("Overall accuracy:", overall_accuracy)

In [None]:
# Display the accuracy array (one entry per class).
ACC

Since the accuracy is decent, let me generate predictions for the test set and write them to an output file.

In [None]:
# Build the cleaned test corpus with the same steps used for the training
# text: lowercase, split on whitespace, drop English stop words, Porter-stem
# and re-join with single spaces.
#
# The stemmer is created once and reused for every row instead of being
# rebuilt per iteration, and the texts are iterated by value so the loop no
# longer relies on `test` having a default 0..n-1 index.
ps = PorterStemmer()
test_corpus_text = [
    ' '.join(
        ps.stem(word)
        for word in text.lower().split()
        if word not in eng_stopwords
    )
    for text in test["text"]
]

In [None]:
# Vectorize the test corpus with the already-fitted vectorizer (transform only, no refit).
X_test_output = cv.transform(test_corpus_text).toarray()

In [None]:
# Class probabilities for every test row, from the fitted classifier.
y_prob_output = classifier.predict_proba(X_test_output)

In [None]:
# Display the predicted probability array.
y_prob_output

In [None]:
# Submission table: one row per test id, one probability column per author.
# Column labels come from classifier.classes_, which is the order
# predict_proba uses for its columns, so the labels cannot drift out of step
# with the probabilities the way a hard-coded list could.
submission_df = pd.DataFrame(
    y_prob_output,
    index=test['id'],
    columns=classifier.classes_,
)

In [None]:
# Preview the first rows of the submission table.
submission_df.head()

In [None]:
# Write the submission table (index included, pandas' default) to submission.csv.
submission_df.to_csv("submission.csv")

Let me try a random forest (an ensemble of decision trees) on the same data and compare the accuracy.

In [None]:
from sklearn.ensemble import RandomForestClassifier

In [None]:
# Random forest with 10 trees, entropy split criterion and a fixed seed.
# Note: this rebinds `classifier`, replacing the Naive Bayes model created earlier.
classifier = RandomForestClassifier(n_estimators = 10, criterion = 'entropy', random_state = 1007)

In [None]:
# Fit on the training split, then predict the validation split (overwrites y_pred).
classifier.fit(X_train, y_train)
y_pred = classifier.predict(X_test)

In [None]:
# Recompute the confusion matrix for the new predictions (overwrites cm) and display it.
cm = confusion_matrix(y_test, y_pred)
cm

In [None]:
# One-vs-rest counts per class, read off the confusion matrix
# (rows = true labels, columns = predicted labels).
TP = np.diag(cm)
FP = cm.sum(axis=0) - TP
FN = cm.sum(axis=1) - TP
TN = cm.sum() - (FP + FN + TP)

# Per-class (one-vs-rest) accuracy: one value per class, NOT a single overall
# score. Because each class also gets credit for its true negatives, these
# values are never below the overall accuracy computed further down.
ACC = (TP+TN)/(TP+FP+FN+TN)

# Overall accuracy: fraction of validation rows whose label was predicted
# correctly (sum of the diagonal over the total count).
overall_accuracy = np.trace(cm) / cm.sum()
print("Overall accuracy:", overall_accuracy)

In [None]:
# Display the accuracy array (one entry per class).
ACC

It looks like the Multinomial Naive Bayes classifier is doing much better than the random forest. Stay tuned: I will be adding more classifiers such as SVM, decision tree and XGBoost. Please upvote to encourage!