In [1]:
import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

/kaggle/input/spooky-author-identification/train.zip
/kaggle/input/spooky-author-identification/sample_submission.zip
/kaggle/input/spooky-author-identification/test.zip


In [2]:
!unzip '/kaggle/input/spooky-author-identification/train.zip'
!unzip '/kaggle/input/spooky-author-identification/test.zip'
!unzip '/kaggle/input/spooky-author-identification/sample_submission.zip'

Archive:  /kaggle/input/spooky-author-identification/train.zip
  inflating: train.csv               
Archive:  /kaggle/input/spooky-author-identification/test.zip
  inflating: test.csv                
Archive:  /kaggle/input/spooky-author-identification/sample_submission.zip
  inflating: sample_submission.csv   


In [3]:
for dirname, _, filenames in os.walk(os.getcwd()):
    for filename in filenames:
        print(os.path.join(dirname, filename))

/kaggle/working/train.csv
/kaggle/working/sample_submission.csv
/kaggle/working/test.csv
/kaggle/working/__notebook_source__.ipynb


In [4]:
import numpy as np
import pandas as pd
import time
import gc

import matplotlib.pyplot as plt
%matplotlib inline

import warnings
warnings.filterwarnings('ignore')

from sklearn import preprocessing, model_selection, metrics, decomposition

In [5]:
# read data
train = pd.read_csv('/kaggle/working/train.csv')
test = pd.read_csv('/kaggle/working/test.csv')

train.head(10)

Unnamed: 0,id,text,author
0,id26305,"This process, however, afforded me no means of...",EAP
1,id17569,It never once occurred to me that the fumbling...,HPL
2,id11008,"In his left hand was a gold snuff box, from wh...",EAP
3,id27763,How lovely is spring As we looked from Windsor...,MWS
4,id12958,"Finding nothing else, not even gold, the Super...",HPL
5,id22965,"A youth passed in solitude, my best years spen...",MWS
6,id09674,"The astronomer, perhaps, at this point, took r...",EAP
7,id13515,The surcingle hung in ribands from my body.,EAP
8,id19322,I knew that you could not say to yourself 'ste...,EAP
9,id00912,I confess that neither the structure of langua...,MWS


In [6]:
test.head(5)

Unnamed: 0,id,text
0,id02310,"Still, as I urged our leaving Ireland with suc..."
1,id24541,"If a fire wanted fanning, it could readily be ..."
2,id00134,And when they had broken down the frail door t...
3,id27757,While I was thinking how I should possibly man...
4,id04081,I am not sure to what limit his knowledge may ...


Use label encoder to encode the text labels to integers

In [7]:
le = preprocessing.LabelEncoder()
y = le.fit_transform(train['author'])

In [8]:
X = train['text']
X_train, X_valid, y_train, y_valid = model_selection.train_test_split(X, y, test_size=0.1, random_state= 2020, shuffle=True, stratify = train['author'])
print(X_train.shape)
print(X_valid.shape)

(17621,)
(1958,)


### Some base models first

In [9]:
%%time

from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer

tfidf_vec = TfidfVectorizer(min_df = 3, max_df = 0.8, analyzer='word', ngram_range =(1,3), token_pattern = r'\w{1,}',
                        use_idf=True, smooth_idf=True, sublinear_tf=True)

# fit to both train and valid sets
tfidf_vec.fit(X_train.values.tolist() + X_valid.values.tolist())

# transformed
X_train_tfv = tfidf_vec.transform(X_train.values.tolist())
X_valid_tfv = tfidf_vec.transform(X_valid.values.tolist())

CPU times: user 7.16 s, sys: 225 ms, total: 7.39 s
Wall time: 7.41 s


Fit a Logisitic Regression model on tfidf

In [10]:
%%time
from sklearn.linear_model import LogisticRegression
logreg = LogisticRegression(C=0.9)

logreg.fit(X_train_tfv, y_train)
predictions = logreg.predict_proba(X_valid_tfv)

print('Log loss using Logisitic Regression on Tfidf Vectorizer is : ', metrics.log_loss(y_valid, predictions))

Log loss using Logisitic Regression on Tfidf Vectorizer is :  0.5664506284140641
CPU times: user 7.49 s, sys: 119 ms, total: 7.61 s
Wall time: 3.96 s


### Use `wordcount` as features  instead of TFIDF using CountVectorizer

In [11]:
%%time
from sklearn.feature_extraction.text import CountVectorizer

count_vec = CountVectorizer(min_df = 3, max_df = 0.8, ngram_range=(1,3), stop_words='english', analyzer='word', token_pattern = r'\w{1,}')

count_vec.fit(X_train.values.tolist() + X_valid.values.tolist())
X_train_cv = count_vec.transform(X_train.values.tolist())
X_valid_cv = count_vec.transform(X_valid.values.tolist())

CPU times: user 4.04 s, sys: 68.6 ms, total: 4.11 s
Wall time: 3.93 s


Fit a simple Logisitc regression model on Count Vectorizer

In [12]:
%%time
logreg.fit(X_train_cv, y_train)
prediction = logreg.predict_proba(X_valid_cv)
print('Log loss using Logisitic Regression on Count Vectorizer is : ', metrics.log_loss(y_valid, prediction))

Log loss using Logisitic Regression on Count Vectorizer is :  0.47578347792652315
CPU times: user 2.59 s, sys: 31.8 ms, total: 2.62 s
Wall time: 1.34 s


### Naive Bayes model on Count Vectorizer

In [13]:
%%time
from sklearn.naive_bayes import MultinomialNB

mnb = MultinomialNB()
mnb.fit(X_train_cv, y_train)
prediction = mnb.predict_proba(X_valid_cv)
print('Log loss using Multinomial NB on Count Vectorizer is : ', metrics.log_loss(y_valid, prediction))

Log loss using Multinomial NB on Count Vectorizer is :  0.45400575892684447
CPU times: user 25.7 ms, sys: 2.03 ms, total: 27.8 ms
Wall time: 13.8 ms


### Naive Bayes model on Tfidf Vectorizer

In [14]:
%%time
from sklearn.naive_bayes import MultinomialNB

mnb = MultinomialNB()
mnb.fit(X_train_tfv, y_train)
prediction = mnb.predict_proba(X_valid_tfv)
print('Log loss using Multinomial NB on Tfidf Vectorizer is : ', metrics.log_loss(y_valid, prediction))

Log loss using Multinomial NB on Tfidf Vectorizer is :  0.565051212698791
CPU times: user 40.1 ms, sys: 1.04 ms, total: 41.1 ms
Wall time: 20 ms


### SVM(Support Vector Machine) on TFIDF - using SVD
Since SVM takes a lot of time to on this high dimensional dataset, we will reduce the dimension using SVD (Singular Value Decomposition) befor applying SVM

Also it is important to standardise the data prior to applying SVM

In [15]:
%%time

from sklearn.decomposition import TruncatedSVD

n_comp = 20

svd = TruncatedSVD(n_components = n_comp)
# fit the SVD on tf-id vector
svd.fit(X_train_tfv)
X_train_tf_svd = svd.transform(X_train_tfv)
X_valid_tf_svd = svd.transform(X_valid_tfv)

# scale the data prior to applying SVM
scaler = preprocessing.StandardScaler()
scaler.fit(X_train_tf_svd)
X_train_tf_svd_scaled = scaler.transform(X_train_tf_svd)
X_valid_tf_svd_scaled = scaler.transform(X_valid_tf_svd)

CPU times: user 1.19 s, sys: 40.1 ms, total: 1.23 s
Wall time: 657 ms


Apply a simple SVM classifier

In [16]:
%%time
from sklearn.svm import SVC

svc = SVC(C=0.9, probability = True)
svc.fit(X_train_tf_svd_scaled, y_train)
prediction = svc.predict_proba(X_valid_tf_svd_scaled)
print('Log loss using SVC on TF-IDF Vectorizer with SVD is : ', metrics.log_loss(y_valid, prediction))

Log loss using SVC on TF-IDF Vectorizer with SVD is :  0.8541441385552606
CPU times: user 2min, sys: 2.18 s, total: 2min 2s
Wall time: 2min 2s


### Using XGboost on the data

Fit the model on the original high dimension tf-idf vector. The vector will be compressed into 'csc' or 'csr' format before applying the fit method of xgboost

For more details on sparse matrices, refer [here](https://rushter.com/blog/scipy-sparse-matrices/)
 * CSR - Compressed Sparse Row - usually used when the number of rows is less than the number of columns
 * CSC - Compressed Sparse Column - usually when there are lesser number of columns than rows

In [17]:
%%time

import xgboost as xgb
from scipy.sparse import csr_matrix # to convert the input into 

xgb_clf = xgb.XGBClassifier(n_estimators = 200, max_depth =7, learning_rate = 0.1, verbose = 2, colsample_bytree = 0.8, subsample =0.8, n_jobs=-1, nthread=10)

# convert the input into Compressed Sparse Column format
xgb_clf.fit(X_train_tfv.tocsc(), y_train)
predictions = xgb_clf.predict_proba(X_valid_tfv.tocsc())

print('Log loss using Xgboost on the original TF-IDF Vectorizer is : ', metrics.log_loss(y_valid, prediction))

Log loss using Xgboost on the original TF-IDF Vectorizer is :  0.8541441385552606
CPU times: user 6min 17s, sys: 5min 30s, total: 11min 48s
Wall time: 3min 2s


### Using Xgboost on the CountVectorizer

In [18]:
%%time
# convert the input into Compressed Sparse Column format

xgb_clf = xgb.XGBClassifier(n_estimators = 200, max_depth =7, learning_rate = 0.1, verbose = 2, colsample_bytree = 0.8, subsample =0.8, n_jobs=-1, nthread=10)

xgb_clf.fit(X_train_cv.tocsc(), y_train)
predictions = xgb_clf.predict_proba(X_valid_cv.tocsc())

print('Log loss using Xgboost on the original Count Vectorizer is : ', metrics.log_loss(y_valid, prediction))

Log loss using Xgboost on the original Count Vectorizer is :  0.8541441385552606
CPU times: user 1min 27s, sys: 1min 56s, total: 3min 23s
Wall time: 52.1 s


### Using Xgboost on the Tf-idf SVD features

In [26]:
%%time
# convert the input into Compressed Sparse Column format

xgb_clf.fit(X_train_tf_svd, y_train)
predictions = xgb_clf.predict_proba(X_valid_tf_svd)

print('Log loss using Xgboost on the original Count Vectorizer is : %0.3f' % metrics.log_loss(y_valid, prediction))

Log loss using Xgboost on the original Count Vectorizer is : 0.854
CPU times: user 3.72 ms, sys: 1.13 ms, total: 4.85 ms
Wall time: 3.34 ms


### Using GridSearch

In [29]:
# create a scoring function if you using a custom scorer

logloss_scorer = metrics.make_scorer(metrics.log_loss, greater_is_better = False, needs_proba = True)

Create a pipeline

In [21]:
from sklearn.pipeline import Pipeline

# intialize SVD
svd = TruncatedSVD()

# initialize Standard Scaler
scaler = preprocessing.StandardScaler()

# logistic regression
logreg = LogisticRegression()

# Create a pipeline with Logistic Regression as the final estimator

pipe1 = Pipeline([
                    ('svd', svd),
                    ('scaler', scaler),
                    ('logreg', logreg)
                ])

* To get a view of the steps in the pipeline, use **pipe.named_steps()**
* To access the parameters in the pipelines, use **pipe.get_params()**

In [22]:
pipe1.get_params()

{'memory': None,
 'steps': [('svd',
   TruncatedSVD(algorithm='randomized', n_components=2, n_iter=5,
                random_state=None, tol=0.0)),
  ('scaler', StandardScaler(copy=True, with_mean=True, with_std=True)),
  ('logreg',
   LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                      intercept_scaling=1, l1_ratio=None, max_iter=100,
                      multi_class='auto', n_jobs=None, penalty='l2',
                      random_state=None, solver='lbfgs', tol=0.0001, verbose=0,
                      warm_start=False))],
 'verbose': False,
 'svd': TruncatedSVD(algorithm='randomized', n_components=2, n_iter=5,
              random_state=None, tol=0.0),
 'scaler': StandardScaler(copy=True, with_mean=True, with_std=True),
 'logreg': LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                    intercept_scaling=1, l1_ratio=None, max_iter=100,
                    multi_class='auto', n_jobs=None, penalty='l

Now set the values for the parameters in the grid. Define a dictionary for the same

In [23]:
param_grid = {
                'svd__n_components': [120, 180],
                'logreg__C': [0.1, 1.0, 10],
                'logreg__penalty': ['l2', 'l1']
            }

When using Grid Search,
1. To find the best score based on the scoring function: ** model.best_score_**
2. Best set of parameters: ** model.best_estimator_.get_params() **

In [31]:
%%time

# Intialize GridSearch Model

from sklearn.model_selection import GridSearchCV

# setting refit = True, takes the best parameters when traininf from the folds and retrains the model on the entire data using those best parameters
model = GridSearchCV(estimator = pipe1, param_grid = param_grid, scoring = logloss_scorer, cv = 2, refit= True, verbose=10)

# fit the gridsearch model. We can fit on the entire train, but here I will use only X_train with tf idf
model.fit(X_train_tfv, y_train)

print('Best score is : %0.3f' % model.best_score_)
print('Best parameters set:')
best_parameters = model.best_estimator_.get_params()

for param_name in sorted(param_grid.keys()):
    print("\t%s: %r" %(param_name, best_parameters[param_name]))

Fitting 2 folds for each of 12 candidates, totalling 24 fits
[CV] logreg__C=0.1, logreg__penalty=l2, svd__n_components=120 ........


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


[CV]  logreg__C=0.1, logreg__penalty=l2, svd__n_components=120, score=-0.733, total=   2.9s
[CV] logreg__C=0.1, logreg__penalty=l2, svd__n_components=120 ........


[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    2.9s remaining:    0.0s


[CV]  logreg__C=0.1, logreg__penalty=l2, svd__n_components=120, score=-0.736, total=   3.0s
[CV] logreg__C=0.1, logreg__penalty=l2, svd__n_components=180 ........


[Parallel(n_jobs=1)]: Done   2 out of   2 | elapsed:    5.9s remaining:    0.0s


[CV]  logreg__C=0.1, logreg__penalty=l2, svd__n_components=180, score=-0.694, total=   4.6s
[CV] logreg__C=0.1, logreg__penalty=l2, svd__n_components=180 ........


[Parallel(n_jobs=1)]: Done   3 out of   3 | elapsed:   10.5s remaining:    0.0s


[CV]  logreg__C=0.1, logreg__penalty=l2, svd__n_components=180, score=-0.687, total=   4.5s
[CV] logreg__C=0.1, logreg__penalty=l1, svd__n_components=120 ........


[Parallel(n_jobs=1)]: Done   4 out of   4 | elapsed:   15.0s remaining:    0.0s


[CV]  logreg__C=0.1, logreg__penalty=l1, svd__n_components=120, score=nan, total=   2.8s
[CV] logreg__C=0.1, logreg__penalty=l1, svd__n_components=120 ........


[Parallel(n_jobs=1)]: Done   5 out of   5 | elapsed:   17.8s remaining:    0.0s


[CV]  logreg__C=0.1, logreg__penalty=l1, svd__n_components=120, score=nan, total=   2.8s
[CV] logreg__C=0.1, logreg__penalty=l1, svd__n_components=180 ........


[Parallel(n_jobs=1)]: Done   6 out of   6 | elapsed:   20.6s remaining:    0.0s


[CV]  logreg__C=0.1, logreg__penalty=l1, svd__n_components=180, score=nan, total=   4.3s
[CV] logreg__C=0.1, logreg__penalty=l1, svd__n_components=180 ........


[Parallel(n_jobs=1)]: Done   7 out of   7 | elapsed:   25.0s remaining:    0.0s


[CV]  logreg__C=0.1, logreg__penalty=l1, svd__n_components=180, score=nan, total=   4.3s
[CV] logreg__C=1.0, logreg__penalty=l2, svd__n_components=120 ........


[Parallel(n_jobs=1)]: Done   8 out of   8 | elapsed:   29.2s remaining:    0.0s


[CV]  logreg__C=1.0, logreg__penalty=l2, svd__n_components=120, score=-0.736, total=   2.9s
[CV] logreg__C=1.0, logreg__penalty=l2, svd__n_components=120 ........


[Parallel(n_jobs=1)]: Done   9 out of   9 | elapsed:   32.1s remaining:    0.0s


[CV]  logreg__C=1.0, logreg__penalty=l2, svd__n_components=120, score=-0.731, total=   2.9s
[CV] logreg__C=1.0, logreg__penalty=l2, svd__n_components=180 ........
[CV]  logreg__C=1.0, logreg__penalty=l2, svd__n_components=180, score=-0.691, total=   4.5s
[CV] logreg__C=1.0, logreg__penalty=l2, svd__n_components=180 ........
[CV]  logreg__C=1.0, logreg__penalty=l2, svd__n_components=180, score=-0.688, total=   4.5s
[CV] logreg__C=1.0, logreg__penalty=l1, svd__n_components=120 ........
[CV]  logreg__C=1.0, logreg__penalty=l1, svd__n_components=120, score=nan, total=   2.8s
[CV] logreg__C=1.0, logreg__penalty=l1, svd__n_components=120 ........
[CV]  logreg__C=1.0, logreg__penalty=l1, svd__n_components=120, score=nan, total=   2.8s
[CV] logreg__C=1.0, logreg__penalty=l1, svd__n_components=180 ........
[CV]  logreg__C=1.0, logreg__penalty=l1, svd__n_components=180, score=nan, total=   4.3s
[CV] logreg__C=1.0, logreg__penalty=l1, svd__n_components=180 ........
[CV]  logreg__C=1.0, logreg__pe

[Parallel(n_jobs=1)]: Done  24 out of  24 | elapsed:  1.5min finished


Best score is : -0.690
Best parameters set:
	logreg__C: 10
	logreg__penalty: 'l2'
	svd__n_components: 180
CPU times: user 2min 12s, sys: 27.9 s, total: 2min 40s
Wall time: 1min 33s


We will do the same using MultinomialNB model on tf-idf data

In [37]:
%%time

nb_model = MultinomialNB()

# create a pipeline

pipe2 = Pipeline([('nb', nb_model)])

# parameter grid
# try with the alpha parameter
# pipe2.get_params()

param_grid = {'nb__alpha': [0.001, 0.01, 0.1, 1, 10, 100]}

# intialize the Grid SearchCV model
model = GridSearchCV(estimator = pipe2, param_grid = param_grid, cv = 2, refit = True, scoring = logloss_scorer, verbose=10, n_jobs=-1)

model.fit(X_train_tfv, y_train)
print("best score is :",format(model.best_score_))
print('best parameters are: ')
best_parameters = model.best_estimator_.get_params()

for param_name in sorted(param_grid.keys()):
    print('\t%s: %r' %(param_name, best_parameters[param_name]))

Fitting 2 folds for each of 6 candidates, totalling 12 fits
best score is : -0.4518777178062823
best parameters are: 
/tnb__alpha: 0.1
CPU times: user 66 ms, sys: 8.33 ms, total: 74.3 ms
Wall time: 193 ms


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Batch computation too fast (0.0587s.) Setting batch_size=2.
[Parallel(n_jobs=-1)]: Done   5 tasks      | elapsed:    0.1s
[Parallel(n_jobs=-1)]: Done   7 out of  12 | elapsed:    0.1s remaining:    0.1s
[Parallel(n_jobs=-1)]: Done   9 out of  12 | elapsed:    0.2s remaining:    0.1s
[Parallel(n_jobs=-1)]: Done  12 out of  12 | elapsed:    0.2s finished
