Spooky Author Identification

# Load data

In [66]:
# import libraries
import pandas as pd
import numpy as np

# to make this notebook's output stable across runs
np.random.seed(42)
import seaborn as sns
import matplotlib
import matplotlib.pyplot as plt

import re

# Gensim
import gensim
import gensim.corpora as corpora
from gensim.utils import simple_preprocess
from gensim.models import CoherenceModel

# Plotting tools
import pyLDAvis
import pyLDAvis.gensim  

# set options for rendering plots
%matplotlib inline

# display multiple outputs within a cell
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all";

import warnings
warnings.filterwarnings('ignore');

In [67]:
train = pd.read_csv("C:/Users/1394852/Desktop/ml_project/spooky_author/train.csv")
test = pd.read_csv("C:/Users/1394852/Desktop/ml_project/spooky_author/test.csv")

# Preprocessing and feature extraction

## stop words and tokens

In [68]:
from nltk.corpus import stopwords 
from nltk.tokenize import word_tokenize 
  
data = train["text"]
  
stop_words = set(stopwords.words('english')) 
  
word_tokens = word_tokenize(data[1]) 
  
filtered_sentence = [w for w in word_tokens if not w in stop_words] 
  
print(word_tokens) 
print(filtered_sentence)

['It', 'never', 'once', 'occurred', 'to', 'me', 'that', 'the', 'fumbling', 'might', 'be', 'a', 'mere', 'mistake', '.']
['It', 'never', 'occurred', 'fumbling', 'might', 'mere', 'mistake', '.']


In [69]:
def sent_to_words(sentences):
    for sentence in sentences:
        yield(gensim.utils.simple_preprocess(str(sentence), deacc=True))  # deacc=True removes punctuations

data_words = list(sent_to_words(data))

print(data_words[:1])

[['this', 'process', 'however', 'afforded', 'me', 'no', 'means', 'of', 'ascertaining', 'the', 'dimensions', 'of', 'my', 'dungeon', 'as', 'might', 'make', 'its', 'circuit', 'and', 'return', 'to', 'the', 'point', 'whence', 'set', 'out', 'without', 'being', 'aware', 'of', 'the', 'fact', 'so', 'perfectly', 'uniform', 'seemed', 'the', 'wall']]


In [70]:
# flatten list and join together as a string
flat_list = [item for sublist in data_words for item in sublist]
str1 = ' '.join(flat_list)

## bigram, trigram

In [71]:
# Build the bigram and trigram models
bigram = gensim.models.Phrases(data_words, min_count=5, threshold=100) # higher threshold fewer phrases.
trigram = gensim.models.Phrases(bigram[data_words], threshold=100)  

# Faster way to get a sentence clubbed as a trigram/bigram
bigram_mod = gensim.models.phrases.Phraser(bigram)
trigram_mod = gensim.models.phrases.Phraser(trigram)

# See trigram example
print(trigram_mod[bigram_mod[data_words[0]]])

['this', 'process', 'however', 'afforded', 'me', 'no', 'means', 'of', 'ascertaining', 'the', 'dimensions', 'of', 'my', 'dungeon', 'as', 'might', 'make', 'its', 'circuit', 'and', 'return', 'to', 'the', 'point', 'whence', 'set', 'out', 'without', 'being', 'aware', 'of', 'the', 'fact', 'so', 'perfectly', 'uniform', 'seemed', 'the', 'wall']


In [72]:
bigrams = []
for phrase in bigram.export_phrases(data_words[:100]):
    bigrams.append(phrase)
bigrams[:10]

[(b'herbert west', 1073.8432406519655),
 (b'von kempelen', 2514.070707070707),
 (b'cut off', 178.57336134884434),
 (b'old bugs', 183.65776269185358),
 (b'few moments', 112.51943942133815),
 (b'no longer', 104.39493879326415),
 (b'hill noises', 279.6550561797753),
 (b'next morning', 196.21048482459597),
 (b'twenty four', 191.9359938307307),
 (b'ex queen', 2154.917748917749)]

In [73]:
# Define functions for stopwords, bigrams, trigrams and lemmatization
def remove_stopwords(texts):
    return [[word for word in simple_preprocess(str(doc)) if word not in stop_words] for doc in texts]

train_text = remove_stopwords(train['text'])
test_text = remove_stopwords(test['text'])

## bag of words

In [74]:
train_text = [' '.join(sent) for sent in train_text]
test_text = [' '.join(sent) for sent in test_text]

In [75]:
from sklearn.feature_extraction.text import CountVectorizer

# Initialize the "CountVectorizer" object, which is scikit-learn's
# bag of words tool.  
vectorizer = CountVectorizer(analyzer = "word",   \
                             tokenizer = None,    \
                             preprocessor = None, \
                             stop_words = None,   \
                             max_features = 6000)

feature_vec = vectorizer.fit_transform(train_text)

## tf-idf

In [76]:
from sklearn.feature_extraction.text import TfidfTransformer
tf_transformer = TfidfTransformer(use_idf=True).fit(feature_vec)
X_train_tf = tf_transformer.transform(feature_vec)
X_train_tf.shape

(19579, 6000)

# Training Classifier Models

In [77]:
# create train/test set
train_data = train_text
train_labels = train["author"]
from sklearn.model_selection import train_test_split
X_train,X_test,y_train,y_test=train_test_split(train_data,train_labels,test_size=0.20,random_state=0)

In [78]:
# feature processing pipeline
from sklearn.pipeline import Pipeline

text_features = Pipeline([
    ('vect', CountVectorizer()),
    ('tfidf', TfidfTransformer()),
])

text_features.fit_transform(X_train)

<15663x22935 sparse matrix of type '<class 'numpy.float64'>'
	with 199628 stored elements in Compressed Sparse Row format>

# Multinomial Naive Bayes

In [86]:
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score, log_loss

pipe = Pipeline([
    ('features', text_features),
    ('clf', MultinomialNB()),
])

pipe.fit(X_train, y_train)

nb_pred = pipe.predict(X_test)
nb_probs = pipe.predict_proba(X_test)

print("Accuracy score: " + str(accuracy_score(y_test, nb_pred)))
print("Log loss: " + str(log_loss(y_test, nb_probs)));

Accuracy score: 0.8138406537282942
Log loss: 0.596960435695908


In [16]:
pipe.get_params().keys()

dict_keys(['memory', 'steps', 'features', 'clf', 'features__memory', 'features__steps', 'features__vect', 'features__tfidf', 'features__vect__analyzer', 'features__vect__binary', 'features__vect__decode_error', 'features__vect__dtype', 'features__vect__encoding', 'features__vect__input', 'features__vect__lowercase', 'features__vect__max_df', 'features__vect__max_features', 'features__vect__min_df', 'features__vect__ngram_range', 'features__vect__preprocessor', 'features__vect__stop_words', 'features__vect__strip_accents', 'features__vect__token_pattern', 'features__vect__tokenizer', 'features__vect__vocabulary', 'features__tfidf__norm', 'features__tfidf__smooth_idf', 'features__tfidf__sublinear_tf', 'features__tfidf__use_idf', 'clf__alpha', 'clf__class_prior', 'clf__fit_prior'])

In [92]:
# Grid Search
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import make_scorer
log_loss_build = make_scorer(log_loss, greater_is_better=False, needs_proba=True)

parameters = {'features__vect__ngram_range': [(1,1), (1,2), (1,3)],
              'features__tfidf__use_idf': [False],
              'clf__alpha': [0.01]
             }

gs = GridSearchCV(pipe, parameters, cv=5, n_jobs=-1, scoring=log_loss_build)
 
# Fit and tune model
gs.fit(X_train, y_train);

In [93]:
gs.best_params_
gs.best_score_
final_model = gs.best_estimator_

final_model.fit(X_train, y_train)
final_pred = final_model.predict(X_test)
final_probs = final_model.predict_proba(X_test);

print("Accuracy score: " + str(accuracy_score(y_test, final_pred)))
print("Log loss: " + str(log_loss(y_test, final_probs)))

{'clf__alpha': 0.01,
 'features__tfidf__use_idf': False,
 'features__vect__ngram_range': (1, 2)}

-0.4127716645707371

Pipeline(memory=None,
     steps=[('features', Pipeline(memory=None,
     steps=[('vect', CountVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 2), pr...,
         use_idf=False))])), ('clf', MultinomialNB(alpha=0.01, class_prior=None, fit_prior=True))])

Accuracy score: 0.8281409601634321
Log loss: 0.414406729221749


# Complement NB

In [84]:
from sklearn.naive_bayes import ComplementNB

pipe = Pipeline([
    ('vect', CountVectorizer(ngram_range=(1,1))),
    ('tfidf', TfidfTransformer(use_idf=False)),
    ('clf', ComplementNB(alpha=0.01))
])

pipe.fit(X_train, y_train)
cnb_pred = pipe.predict(X_test)
cnb_probs = pipe.predict_proba(X_test);

print("Accuracy score: " + str(accuracy_score(y_test, cnb_pred)))
print("Log loss: " + str(log_loss(y_test, cnb_probs)));

Accuracy score: 0.8220122574055159
Log loss: 0.5088275380645537


# Random Forest

In [114]:
from sklearn.ensemble import RandomForestClassifier

pipe = Pipeline([
    ('vect', CountVectorizer()),
    ('tfidf', TfidfTransformer(use_idf=False)),
    ('clf', RandomForestClassifier(n_estimators = 500)) 
])

pipe.fit(X_train, y_train)
rf_pred = pipe.predict(X_test)
rf_probs = pipe.predict_proba(X_test);

print("Accuracy score: " + str(accuracy_score(y_test, rf_pred)))
print("Log loss: " + str(log_loss(y_test, rf_probs)));

Accuracy score: 0.7229315628192032
Log loss: 0.7015590986473873


In [77]:
pipe.get_params().keys()

dict_keys(['memory', 'steps', 'vect', 'tfidf', 'clf', 'vect__analyzer', 'vect__binary', 'vect__decode_error', 'vect__dtype', 'vect__encoding', 'vect__input', 'vect__lowercase', 'vect__max_df', 'vect__max_features', 'vect__min_df', 'vect__ngram_range', 'vect__preprocessor', 'vect__stop_words', 'vect__strip_accents', 'vect__token_pattern', 'vect__tokenizer', 'vect__vocabulary', 'tfidf__norm', 'tfidf__smooth_idf', 'tfidf__sublinear_tf', 'tfidf__use_idf', 'clf__bootstrap', 'clf__class_weight', 'clf__criterion', 'clf__max_depth', 'clf__max_features', 'clf__max_leaf_nodes', 'clf__min_impurity_decrease', 'clf__min_impurity_split', 'clf__min_samples_leaf', 'clf__min_samples_split', 'clf__min_weight_fraction_leaf', 'clf__n_estimators', 'clf__n_jobs', 'clf__oob_score', 'clf__random_state', 'clf__verbose', 'clf__warm_start'])

In [115]:
# Grid Search
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import make_scorer
log_loss_build = make_scorer(log_loss, greater_is_better=False, needs_proba=True)

parameters = {'clf__n_estimators': [500]}

gs = GridSearchCV(pipe, parameters, cv=3, n_jobs=-1, scoring=log_loss_build)
 
# Fit and tune model
gs.fit(X_train, y_train);

In [98]:
gs.best_params_
gs.best_score_
final_rf = gs.best_estimator_

final_rf.fit(X_train, y_train)
rf_pred = final_rf.predict(X_test)
rf_probs = final_rf.predict_proba(X_test);

print("Accuracy score: " + str(accuracy_score(y_test, rf_pred)))
print("Log loss: " + str(log_loss(y_test, rf_probs)))

{'clf__max_depth': 50, 'clf__max_leaf_nodes': 36, 'clf__n_estimators': 500}

-0.9637867737706388

Pipeline(memory=None,
     steps=[('vect', CountVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), preprocessor=None, stop_words=None,
        strip...obs=None,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False))])

Accuracy score: 0.598314606741573
Log loss: 0.966775899713934


In [116]:
final_rf = gs.best_estimator_

# Logistic Regression with Stochastic Gradient Descent

In [102]:
from sklearn.linear_model import SGDClassifier

pipe = Pipeline([
    ('vect', CountVectorizer()),
    ('tfidf', TfidfTransformer(use_idf=False)),
    ('clf', SGDClassifier(loss='log')) 
])

pipe.fit(X_train, y_train)
lr_pred = pipe.predict(X_test)
lr_probs = pipe.predict_proba(X_test);

print("Accuracy score: " + str(accuracy_score(y_test, lr_pred)))
print("Log loss: " + str(log_loss(y_test, lr_probs)));

Accuracy score: 0.7668539325842697
Log loss: 0.678033269518954


In [98]:
pipe.get_params().keys()

dict_keys(['memory', 'steps', 'vect', 'tfidf', 'clf', 'vect__analyzer', 'vect__binary', 'vect__decode_error', 'vect__dtype', 'vect__encoding', 'vect__input', 'vect__lowercase', 'vect__max_df', 'vect__max_features', 'vect__min_df', 'vect__ngram_range', 'vect__preprocessor', 'vect__stop_words', 'vect__strip_accents', 'vect__token_pattern', 'vect__tokenizer', 'vect__vocabulary', 'tfidf__norm', 'tfidf__smooth_idf', 'tfidf__sublinear_tf', 'tfidf__use_idf', 'clf__alpha', 'clf__average', 'clf__class_weight', 'clf__early_stopping', 'clf__epsilon', 'clf__eta0', 'clf__fit_intercept', 'clf__l1_ratio', 'clf__learning_rate', 'clf__loss', 'clf__max_iter', 'clf__n_iter', 'clf__n_iter_no_change', 'clf__n_jobs', 'clf__penalty', 'clf__power_t', 'clf__random_state', 'clf__shuffle', 'clf__tol', 'clf__validation_fraction', 'clf__verbose', 'clf__warm_start'])

In [103]:
# Grid search
alpha_range = 10.0**-np.arange(1,7)

parameters = {'clf__alpha': alpha_range,
              'clf__penalty': ['l1', 'l2'],
              'clf__max_iter': [10, 50]
             }

gs = GridSearchCV(pipe, parameters, cv=5, n_jobs=-1, scoring=log_loss_build)
 
# Fit and tune model
gs.fit(X_train, y_train);

gs.best_params_
lr_model = gs.best_estimator_

GridSearchCV(cv=5, error_score='raise-deprecating',
       estimator=Pipeline(memory=None,
     steps=[('vect', CountVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), preprocessor=None, stop_words=None,
        strip...m_state=None, shuffle=True, tol=None,
       validation_fraction=0.1, verbose=0, warm_start=False))]),
       fit_params=None, iid='warn', n_jobs=-1,
       param_grid={'clf__alpha': array([1.e-01, 1.e-02, 1.e-03, 1.e-04, 1.e-05, 1.e-06]), 'clf__penalty': ['l1', 'l2'], 'clf__max_iter': [10, 50]},
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring=make_scorer(log_loss, greater_is_better=False, needs_proba=True),
       verbose=0)

{'clf__alpha': 1e-05, 'clf__max_iter': 10, 'clf__penalty': 'l2'}

In [104]:
gs.best_params_
gs.best_score_
lr_model = gs.best_estimator_

lr_model.fit(X_train, y_train)
lr_pred = lr_model.predict(X_test)
lr_probs = lr_model.predict_proba(X_test);

print("Accuracy score: " + str(accuracy_score(y_test, lr_pred)))
print("Log loss: " + str(log_loss(y_test, lr_probs)))

{'clf__alpha': 1e-05, 'clf__max_iter': 10, 'clf__penalty': 'l2'}

-0.5122414017968706

Pipeline(memory=None,
     steps=[('vect', CountVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), preprocessor=None, stop_words=None,
        strip...m_state=None, shuffle=True, tol=None,
       validation_fraction=0.1, verbose=0, warm_start=False))])

Accuracy score: 0.8089887640449438
Log loss: 0.5140868408176312


# Ensemble Classifiers

In [106]:
from sklearn.ensemble import AdaBoostClassifier
from sklearn.tree import DecisionTreeClassifier
ada = AdaBoostClassifier(DecisionTreeClassifier(max_depth=2),
                         algorithm="SAMME",
                         n_estimators=600)

pipe = Pipeline([
    ('vect', CountVectorizer()),
    ('tfidf', TfidfTransformer(use_idf=False)),
    ('clf', ada) 
])

pipe.fit(X_train, y_train)
ada_pred = pipe.predict(X_test)
ada_probs = pipe.predict_proba(X_test);

print("Accuracy score: " + str(accuracy_score(y_test, ada_pred)))
print("Log loss: " + str(log_loss(y_test, ada_probs)));

Accuracy score: 0.5730337078651685
Log loss: 1.0958770787929413


In [110]:
from sklearn.ensemble import VotingClassifier
eclf1 = VotingClassifier(estimators=[
        ('rf', RandomForestClassifier(n_estimators=500)),
    ('lr', SGDClassifier(loss='log', alpha=1e-05, max_iter=10)),
     ('nb', MultinomialNB(alpha=0.01))], voting='soft')

pipe = Pipeline([
    ('vect', CountVectorizer(ngram_range=(1,2))),
    ('tfidf', TfidfTransformer(use_idf=False))
])

pipe.fit_transform(X_train)
eclf1.fit(X_train, y_train)
vote_pred = eclf1.predict(X_test)
vote_probs = eclf1.predict_proba(X_test);

print("Accuracy score: " + str(accuracy_score(y_test, vote_pred)))
print("Log loss: " + str(log_loss(y_test, vote_probs)));

ValueError: could not convert string to float: 'beginning season quitted vacant city vienna unable tame haughty mind anything like submission delayed hamburgh last came london many weeks elapsed gave adrian notice arrival'

In [117]:
lr_pred = lr_model.predict_proba(X_test)
rf_pred = final_rf.predict_proba(X_test)
nb_pred = final_model.predict_proba(X_test)

In [122]:
vote_pred = (lr_pred + rf_pred + nb_pred) / 3
print("Log loss: " + str(log_loss(y_test, vote_pred)));

Log loss: 0.48864143523330944


In [130]:
y_test = np.array(y_test).reshape(3916, 1)
new_ft = np.hstack((rf_pred, y_test))
rf_pred = final_rf.predict_proba(X_train)

In [133]:
stack = SGDClassifier(loss='log').fit(rf_pred, y_train)
stack_pred = stack.predict_proba(X_test)
print("Log loss: " + str(log_loss(y_test, stack_pred)));

ValueError: Expected 2D array, got 1D array instead:
array=['seemed void nothing felt childish fear prompted draw hip pocket revolver always carried dark since night held east providence'
 'event caused many sail put foot firm land ready encounter evil rather rush yawning jaws pitiless ocean'
 'hastily gave consent arrangement hastily formed plan perdita forced become companion'
 ... 'copy taken demanded surveying microscope'
 'wish go sleep room alone especially since thought glimpsed evening twilight repellent old woman whose image become horribly transferred dreams'
 'would rash wager wager one thousand one day never passed heads boys without finding least one ensconced umbrageous hall enthroned upon natural throne'].
Reshape your data either using array.reshape(-1, 1) if your data has a single feature or array.reshape(1, -1) if it contains a single sample.

In [59]:
import xgboost as xgb
import lightgbm as lgb

from sklearn.model_selection import cross_val_score
xgb_clf = xgb.XGBClassifier(objective='multi:softprob', n_estimators=200)

pipe = Pipeline([
    ('vect', CountVectorizer(max_features=10000, ngram_range=(1,2))),
    ('tfidf', TfidfTransformer(use_idf=False)),
    ('clf', xgb_clf)
])

scores = cross_val_score(pipe, X_train, y_train, cv=5, n_jobs=5, scoring='neg_log_loss')
print('LogLoss: %.3f +- %.3f' % (-np.mean(scores), 2*np.std(scores)))

LogLoss: 0.853 +- 0.008


In [63]:
# grid search xgboost
xgb_clf = xgb.XGBClassifier(objective='multi:softprob')

pipe = Pipeline([
    ('vect', CountVectorizer(max_features=15000, ngram_range=(1,1))),
    ('tfidf', TfidfTransformer(use_idf=False)),
    ('clf', xgb_clf)
])

parameters = {'clf__n_estimators': [200],
              'clf__max_depth': [5],
              'clf__subsample': [0.5],
              'clf__colsample_bytree': [0.7, 0.825, 0.95]
}

gs = GridSearchCV(pipe, parameters, cv=3, n_jobs=-1, verbose=1, scoring='neg_log_loss', refit=True)
 
# Fit and tune model
gs.fit(X_train, y_train);

gs.best_params_
xgb_model = gs.best_estimator_

Fitting 3 folds for each of 3 candidates, totalling 9 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done   9 out of   9 | elapsed:  3.2min finished


GridSearchCV(cv=3, error_score='raise-deprecating',
       estimator=Pipeline(memory=None,
     steps=[('vect', CountVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=15000, min_df=1,
        ngram_range=(1, 1), preprocessor=None, stop_words=None,
        stri...
       reg_alpha=0, reg_lambda=1, scale_pos_weight=1, seed=None,
       silent=True, subsample=1))]),
       fit_params=None, iid='warn', n_jobs=-1,
       param_grid={'clf__n_estimators': [200], 'clf__max_depth': [5], 'clf__subsample': [0.5], 'clf__colsample_bytree': [0.7, 0.825, 0.95]},
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring='neg_log_loss', verbose=1)

{'clf__colsample_bytree': 0.95,
 'clf__max_depth': 5,
 'clf__n_estimators': 200,
 'clf__subsample': 0.5}

In [64]:
gs.best_score_

-0.7950971699823703

# Final submission

In [134]:
X_test = test_text
predictions = final_model.predict_proba(X_test)

In [135]:
preds = pd.DataFrame(data=predictions, columns = final_model.named_steps['clf'].classes_)

In [136]:
# generating a submission file
result = pd.concat([test[['id']], preds], axis=1)
result.set_index('id', inplace = True)
result.head()

Unnamed: 0_level_0,EAP,HPL,MWS
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
id02310,0.159072,0.021447,0.81948
id24541,0.944975,0.014831,0.040194
id00134,0.299065,0.69745,0.003485
id27757,0.777884,0.221007,0.001108
id04081,0.890424,0.027232,0.082344


In [137]:
result.to_csv("C:/Users/1394852/Desktop/ml_project/spooky_author/spooky_pred_6.csv")