In [1]:
import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

/kaggle/input/glove-840b300d-dj/glove.840B.300d.txt
/kaggle/input/pickled-glove840b300d-for-10sec-loading/glove.840B.300d.pkl
/kaggle/input/spooky-author-identification/test.zip
/kaggle/input/spooky-author-identification/train.zip
/kaggle/input/spooky-author-identification/sample_submission.zip


In [2]:
!unzip '/kaggle/input/spooky-author-identification/train.zip'
!unzip '/kaggle/input/spooky-author-identification/test.zip'
!unzip '/kaggle/input/spooky-author-identification/sample_submission.zip'

Archive:  /kaggle/input/spooky-author-identification/train.zip
  inflating: train.csv               
Archive:  /kaggle/input/spooky-author-identification/test.zip
  inflating: test.csv                
Archive:  /kaggle/input/spooky-author-identification/sample_submission.zip
  inflating: sample_submission.csv   


In [3]:
for dirname, _, filenames in os.walk(os.getcwd()):
    for filename in filenames:
        print(os.path.join(dirname, filename))

/kaggle/working/train.csv
/kaggle/working/test.csv
/kaggle/working/__notebook_source__.ipynb
/kaggle/working/sample_submission.csv


In [4]:
import numpy as np
import pandas as pd
import time
import gc
from tqdm import tqdm

import matplotlib.pyplot as plt
%matplotlib inline

import warnings
warnings.filterwarnings('ignore')

from sklearn import preprocessing, model_selection, metrics, decomposition

import logging
logging.basicConfig(level=logging.INFO,
                    format='%(asctime)s %(levelname)s %(message)s')

In [5]:
# read data
train = pd.read_csv('/kaggle/working/train.csv')
test = pd.read_csv('/kaggle/working/test.csv')

train.head(10)

Unnamed: 0,id,text,author
0,id26305,"This process, however, afforded me no means of...",EAP
1,id17569,It never once occurred to me that the fumbling...,HPL
2,id11008,"In his left hand was a gold snuff box, from wh...",EAP
3,id27763,How lovely is spring As we looked from Windsor...,MWS
4,id12958,"Finding nothing else, not even gold, the Super...",HPL
5,id22965,"A youth passed in solitude, my best years spen...",MWS
6,id09674,"The astronomer, perhaps, at this point, took r...",EAP
7,id13515,The surcingle hung in ribands from my body.,EAP
8,id19322,I knew that you could not say to yourself 'ste...,EAP
9,id00912,I confess that neither the structure of langua...,MWS


In [6]:
test.head(5)

Unnamed: 0,id,text
0,id02310,"Still, as I urged our leaving Ireland with suc..."
1,id24541,"If a fire wanted fanning, it could readily be ..."
2,id00134,And when they had broken down the frail door t...
3,id27757,While I was thinking how I should possibly man...
4,id04081,I am not sure to what limit his knowledge may ...


Use label encoder to encode the text labels to integers

In [7]:
le = preprocessing.LabelEncoder()
y = le.fit_transform(train['author'])

In [8]:
X = train['text']
X_train, X_valid, y_train, y_valid = model_selection.train_test_split(X, y, test_size=0.1, random_state= 2020, shuffle=True, stratify = train['author'])
print(X_train.shape)
print(X_valid.shape)

(17621,)
(1958,)


### Some base models first

In [9]:
%%time

from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer

tfidf_vec = TfidfVectorizer(min_df = 3, max_df = 0.8, analyzer='word', ngram_range =(1,3), token_pattern = r'\w{1,}',
                        use_idf=True, smooth_idf=True, sublinear_tf=True)

# fit to both train and valid sets
tfidf_vec.fit(X_train.values.tolist() + X_valid.values.tolist())

# transformed
X_train_tfv = tfidf_vec.transform(X_train.values.tolist())
X_valid_tfv = tfidf_vec.transform(X_valid.values.tolist())

CPU times: user 8.39 s, sys: 207 ms, total: 8.6 s
Wall time: 8.63 s


Fit a Logisitic Regression model on tfidf

In [10]:
%%time
from sklearn.linear_model import LogisticRegression
logreg = LogisticRegression(C=0.9)

logreg.fit(X_train_tfv, y_train)
predictions = logreg.predict_proba(X_valid_tfv)

print('Log loss using Logisitic Regression on Tfidf Vectorizer is : ', metrics.log_loss(y_valid, predictions))

Log loss using Logisitic Regression on Tfidf Vectorizer is :  0.5664610734198464
CPU times: user 8.58 s, sys: 136 ms, total: 8.72 s
Wall time: 4.52 s


### Use `wordcount` as features  instead of TFIDF using CountVectorizer

In [11]:
%%time
from sklearn.feature_extraction.text import CountVectorizer

count_vec = CountVectorizer(min_df = 3, max_df = 0.8, ngram_range=(1,3), stop_words='english', analyzer='word', token_pattern = r'\w{1,}')

count_vec.fit(X_train.values.tolist() + X_valid.values.tolist())
X_train_cv = count_vec.transform(X_train.values.tolist())
X_valid_cv = count_vec.transform(X_valid.values.tolist())

CPU times: user 4.64 s, sys: 70.4 ms, total: 4.71 s
Wall time: 4.53 s


Fit a simple Logisitc regression model on Count Vectorizer

In [12]:
%%time
logreg.fit(X_train_cv, y_train)
prediction = logreg.predict_proba(X_valid_cv)
print('Log loss using Logisitic Regression on Count Vectorizer is : ', metrics.log_loss(y_valid, prediction))

Log loss using Logisitic Regression on Count Vectorizer is :  0.4757834779740475
CPU times: user 3.22 s, sys: 44.7 ms, total: 3.27 s
Wall time: 1.67 s


### Naive Bayes model on Count Vectorizer

In [13]:
%%time
from sklearn.naive_bayes import MultinomialNB

mnb = MultinomialNB()
mnb.fit(X_train_cv, y_train)
prediction = mnb.predict_proba(X_valid_cv)
print('Log loss using Multinomial NB on Count Vectorizer is : ', metrics.log_loss(y_valid, prediction))

Log loss using Multinomial NB on Count Vectorizer is :  0.45400575892684447
CPU times: user 29.8 ms, sys: 3.68 ms, total: 33.4 ms
Wall time: 16.8 ms


### Naive Bayes model on Tfidf Vectorizer

In [14]:
%%time
from sklearn.naive_bayes import MultinomialNB

mnb = MultinomialNB()
mnb.fit(X_train_tfv, y_train)
prediction = mnb.predict_proba(X_valid_tfv)
print('Log loss using Multinomial NB on Tfidf Vectorizer is : ', metrics.log_loss(y_valid, prediction))

Log loss using Multinomial NB on Tfidf Vectorizer is :  0.565051212698791
CPU times: user 45.8 ms, sys: 1.99 ms, total: 47.8 ms
Wall time: 23 ms


### SVM(Support Vector Machine) on TFIDF - using SVD
Since SVM takes a lot of time to on this high dimensional dataset, we will reduce the dimension using SVD (Singular Value Decomposition) befor applying SVM

Also it is important to standardise the data prior to applying SVM

In [15]:
%%time

from sklearn.decomposition import TruncatedSVD

n_comp = 20

svd = TruncatedSVD(n_components = n_comp)
# fit the SVD on tf-id vector
svd.fit(X_train_tfv)
X_train_tf_svd = svd.transform(X_train_tfv)
X_valid_tf_svd = svd.transform(X_valid_tfv)

# scale the data prior to applying SVM
scaler = preprocessing.StandardScaler()
scaler.fit(X_train_tf_svd)
X_train_tf_svd_scaled = scaler.transform(X_train_tf_svd)
X_valid_tf_svd_scaled = scaler.transform(X_valid_tf_svd)

CPU times: user 1.57 s, sys: 40.8 ms, total: 1.61 s
Wall time: 857 ms


Apply a simple SVM classifier

In [16]:
%%time
from sklearn.svm import SVC

svc = SVC(C=0.9, probability = True)
svc.fit(X_train_tf_svd_scaled, y_train)
prediction = svc.predict_proba(X_valid_tf_svd_scaled)
print('Log loss using SVC on TF-IDF Vectorizer with SVD is : ', metrics.log_loss(y_valid, prediction))

Log loss using SVC on TF-IDF Vectorizer with SVD is :  0.8525357164606708
CPU times: user 2min 2s, sys: 1.35 s, total: 2min 3s
Wall time: 2min 3s


### Using XGboost on the data

Fit the model on the original high dimension tf-idf vector. The vector will be compressed into 'csc' or 'csr' format before applying the fit method of xgboost

For more details on sparse matrices, refer [here](https://rushter.com/blog/scipy-sparse-matrices/)
 * CSR - Compressed Sparse Row - usually used when the number of rows is less than the number of columns
 * CSC - Compressed Sparse Column - usually when there are lesser number of columns than rows

In [17]:
%%time

import xgboost as xgb
from scipy.sparse import csr_matrix # to convert the input into 

xgb_clf = xgb.XGBClassifier(n_estimators = 200, max_depth =7, learning_rate = 0.1, verbose = 2, colsample_bytree = 0.8, subsample =0.8, n_jobs=-1, nthread=10)

# convert the input into Compressed Sparse Column format
xgb_clf.fit(X_train_tfv.tocsc(), y_train)
predictions = xgb_clf.predict_proba(X_valid_tfv.tocsc())

print('Log loss using Xgboost on the original TF-IDF Vectorizer is : ', metrics.log_loss(y_valid, prediction))

Log loss using Xgboost on the original TF-IDF Vectorizer is :  0.8525357164606708
CPU times: user 6min 50s, sys: 5min 41s, total: 12min 32s
Wall time: 3min 13s


### Using Xgboost on the CountVectorizer

In [18]:
%%time
# convert the input into Compressed Sparse Column format

xgb_clf = xgb.XGBClassifier(n_estimators = 200, max_depth =7, learning_rate = 0.1, verbose = 2, colsample_bytree = 0.8, subsample =0.8, n_jobs=-1, nthread=10)

xgb_clf.fit(X_train_cv.tocsc(), y_train)
predictions = xgb_clf.predict_proba(X_valid_cv.tocsc())

print('Log loss using Xgboost on the original Count Vectorizer is : ', metrics.log_loss(y_valid, prediction))

Log loss using Xgboost on the original Count Vectorizer is :  0.8525357164606708
CPU times: user 1min 36s, sys: 2min 6s, total: 3min 42s
Wall time: 56.9 s


### Using Xgboost on the Tf-idf SVD features

In [19]:
%%time
# convert the input into Compressed Sparse Column format

xgb_clf.fit(X_train_tf_svd, y_train)
predictions = xgb_clf.predict_proba(X_valid_tf_svd)

print('Log loss using Xgboost on the original Count Vectorizer is : %0.3f' % metrics.log_loss(y_valid, prediction))

Log loss using Xgboost on the original Count Vectorizer is : 0.853
CPU times: user 1min 38s, sys: 1min 24s, total: 3min 3s
Wall time: 47.1 s


### Using GridSearch

In [20]:
# create a scoring function if you using a custom scorer

logloss_scorer = metrics.make_scorer(metrics.log_loss, greater_is_better = False, needs_proba = True)

Create a pipeline

In [21]:
from sklearn.pipeline import Pipeline

# intialize SVD
svd = TruncatedSVD()

# initialize Standard Scaler
scaler = preprocessing.StandardScaler()

# logistic regression
logreg = LogisticRegression()

# Create a pipeline with Logistic Regression as the final estimator

pipe1 = Pipeline([
                    ('svd', svd),
                    ('scaler', scaler),
                    ('logreg', logreg)
                ])

* To get a view of the steps in the pipeline, use **pipe.named_steps()**
* To access the parameters in the pipelines, use **pipe.get_params()**

In [22]:
pipe1.get_params()

{'memory': None,
 'steps': [('svd',
   TruncatedSVD(algorithm='randomized', n_components=2, n_iter=5,
                random_state=None, tol=0.0)),
  ('scaler', StandardScaler(copy=True, with_mean=True, with_std=True)),
  ('logreg',
   LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                      intercept_scaling=1, l1_ratio=None, max_iter=100,
                      multi_class='auto', n_jobs=None, penalty='l2',
                      random_state=None, solver='lbfgs', tol=0.0001, verbose=0,
                      warm_start=False))],
 'verbose': False,
 'svd': TruncatedSVD(algorithm='randomized', n_components=2, n_iter=5,
              random_state=None, tol=0.0),
 'scaler': StandardScaler(copy=True, with_mean=True, with_std=True),
 'logreg': LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                    intercept_scaling=1, l1_ratio=None, max_iter=100,
                    multi_class='auto', n_jobs=None, penalty='l

Now set the values for the parameters in the grid. Define a dictionary for the same

In [23]:
param_grid = {
                'svd__n_components': [120, 180],
                'logreg__C': [0.1, 1.0, 10],
                'logreg__penalty': ['l2', 'l1']
            }

When using Grid Search,
1. To find the best score based on the scoring function: ** model.best_score_**
2. Best set of parameters: ** model.best_estimator_.get_params() **

In [24]:
%%time

# Intialize GridSearch Model

from sklearn.model_selection import GridSearchCV

# setting refit = True, takes the best parameters when traininf from the folds and retrains the model on the entire data using those best parameters
model = GridSearchCV(estimator = pipe1, param_grid = param_grid, scoring = logloss_scorer, cv = 2, refit= True, verbose=10)

# fit the gridsearch model. We can fit on the entire train, but here I will use only X_train with tf idf
model.fit(X_train_tfv, y_train)

print('Best score is : %0.3f' % model.best_score_)
print('Best parameters set:')
best_parameters = model.best_estimator_.get_params()

for param_name in sorted(param_grid.keys()):
    print("\t%s: %r" %(param_name, best_parameters[param_name]))

Fitting 2 folds for each of 12 candidates, totalling 24 fits
[CV] logreg__C=0.1, logreg__penalty=l2, svd__n_components=120 ........


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


[CV]  logreg__C=0.1, logreg__penalty=l2, svd__n_components=120, score=-0.745, total=   3.1s
[CV] logreg__C=0.1, logreg__penalty=l2, svd__n_components=120 ........


[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    3.1s remaining:    0.0s


[CV]  logreg__C=0.1, logreg__penalty=l2, svd__n_components=120, score=-0.739, total=   3.2s
[CV] logreg__C=0.1, logreg__penalty=l2, svd__n_components=180 ........


[Parallel(n_jobs=1)]: Done   2 out of   2 | elapsed:    6.4s remaining:    0.0s


[CV]  logreg__C=0.1, logreg__penalty=l2, svd__n_components=180, score=-0.685, total=   5.0s
[CV] logreg__C=0.1, logreg__penalty=l2, svd__n_components=180 ........


[Parallel(n_jobs=1)]: Done   3 out of   3 | elapsed:   11.3s remaining:    0.0s


[CV]  logreg__C=0.1, logreg__penalty=l2, svd__n_components=180, score=-0.682, total=   4.9s
[CV] logreg__C=0.1, logreg__penalty=l1, svd__n_components=120 ........


[Parallel(n_jobs=1)]: Done   4 out of   4 | elapsed:   16.2s remaining:    0.0s


[CV]  logreg__C=0.1, logreg__penalty=l1, svd__n_components=120, score=nan, total=   3.1s
[CV] logreg__C=0.1, logreg__penalty=l1, svd__n_components=120 ........


[Parallel(n_jobs=1)]: Done   5 out of   5 | elapsed:   19.4s remaining:    0.0s


[CV]  logreg__C=0.1, logreg__penalty=l1, svd__n_components=120, score=nan, total=   3.1s
[CV] logreg__C=0.1, logreg__penalty=l1, svd__n_components=180 ........


[Parallel(n_jobs=1)]: Done   6 out of   6 | elapsed:   22.4s remaining:    0.0s


[CV]  logreg__C=0.1, logreg__penalty=l1, svd__n_components=180, score=nan, total=   4.6s
[CV] logreg__C=0.1, logreg__penalty=l1, svd__n_components=180 ........


[Parallel(n_jobs=1)]: Done   7 out of   7 | elapsed:   27.0s remaining:    0.0s


[CV]  logreg__C=0.1, logreg__penalty=l1, svd__n_components=180, score=nan, total=   4.7s
[CV] logreg__C=1.0, logreg__penalty=l2, svd__n_components=120 ........


[Parallel(n_jobs=1)]: Done   8 out of   8 | elapsed:   31.7s remaining:    0.0s


[CV]  logreg__C=1.0, logreg__penalty=l2, svd__n_components=120, score=-0.732, total=   3.2s
[CV] logreg__C=1.0, logreg__penalty=l2, svd__n_components=120 ........


[Parallel(n_jobs=1)]: Done   9 out of   9 | elapsed:   34.9s remaining:    0.0s


[CV]  logreg__C=1.0, logreg__penalty=l2, svd__n_components=120, score=-0.737, total=   3.2s
[CV] logreg__C=1.0, logreg__penalty=l2, svd__n_components=180 ........
[CV]  logreg__C=1.0, logreg__penalty=l2, svd__n_components=180, score=-0.683, total=   4.9s
[CV] logreg__C=1.0, logreg__penalty=l2, svd__n_components=180 ........
[CV]  logreg__C=1.0, logreg__penalty=l2, svd__n_components=180, score=-0.674, total=   4.9s
[CV] logreg__C=1.0, logreg__penalty=l1, svd__n_components=120 ........
[CV]  logreg__C=1.0, logreg__penalty=l1, svd__n_components=120, score=nan, total=   3.2s
[CV] logreg__C=1.0, logreg__penalty=l1, svd__n_components=120 ........
[CV]  logreg__C=1.0, logreg__penalty=l1, svd__n_components=120, score=nan, total=   3.1s
[CV] logreg__C=1.0, logreg__penalty=l1, svd__n_components=180 ........
[CV]  logreg__C=1.0, logreg__penalty=l1, svd__n_components=180, score=nan, total=   4.7s
[CV] logreg__C=1.0, logreg__penalty=l1, svd__n_components=180 ........
[CV]  logreg__C=1.0, logreg__pe

[Parallel(n_jobs=1)]: Done  24 out of  24 | elapsed:  1.6min finished


Best score is : -0.679
Best parameters set:
	logreg__C: 1.0
	logreg__penalty: 'l2'
	svd__n_components: 180
CPU times: user 2min 24s, sys: 31.3 s, total: 2min 55s
Wall time: 1min 42s


We will do the same using MultinomialNB model on tf-idf data

In [25]:
%%time

nb_model = MultinomialNB()

# create a pipeline

pipe2 = Pipeline([('nb', nb_model)])

# parameter grid
# try with the alpha parameter
# pipe2.get_params()

param_grid = {'nb__alpha': [0.001, 0.01, 0.1, 1, 10, 100]}

# intialize the Grid SearchCV model
model = GridSearchCV(estimator = pipe2, param_grid = param_grid, cv = 2, refit = True, scoring = logloss_scorer, verbose=10, n_jobs=-1)

model.fit(X_train_tfv, y_train)
print("best score is :",format(model.best_score_))
print('best parameters are: ')
best_parameters = model.best_estimator_.get_params()

for param_name in sorted(param_grid.keys()):
    print('\t%s: %r' %(param_name, best_parameters[param_name]))

Fitting 2 folds for each of 6 candidates, totalling 12 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.


best score is : -0.4518777178062823
best parameters are: 
	nb__alpha: 0.1
CPU times: user 232 ms, sys: 112 ms, total: 344 ms
Wall time: 1.32 s


[Parallel(n_jobs=-1)]: Done   5 tasks      | elapsed:    1.2s
[Parallel(n_jobs=-1)]: Done   7 out of  12 | elapsed:    1.2s remaining:    0.8s
[Parallel(n_jobs=-1)]: Done   9 out of  12 | elapsed:    1.2s remaining:    0.4s
[Parallel(n_jobs=-1)]: Done  12 out of  12 | elapsed:    1.3s finished


## Word vectors
Create word embeddings for tokens as it is customary in NLP tasks, which will give lot more insight into our data.
Here we will create `sentence vectors` and use them as inputs to a machine learning model. This can be implementes using different approaches such as:
1. Word2Vec
2. GloVe
3. FastText

The pickled version of GloVe is available [here](https://www.kaggle.com/authman/pickled-glove840b300d-for-10sec-loading)

Glove is a dictionary with the keys as the words or tokens and the values being the corresponding vector representation or word embeddings. In this version of glove the emebdding size is 300, which means for every key in this glove dictionary, the corresponding vector representation is a 300 D array

In [26]:
import pickle

# start = time.time()
# with open('../input/pickled-glove840b300d-for-10sec-loading/glove.840B.300d.pkl', 'rb') as fp:
#     glove = pickle.load(fp)
# print('time taken to load the GloVe is {0:.1f} seconds:'.format(time.time()- start))

# print('Length of the glove', len(glove))
# print(type(glove))

# print(list(glove.keys())[:10])
# # word embedding for any key can be found out as 
# glove['the']

# del glove
# gc.collect()

I am using the **Glove vectors** available from Standford NLP which is available [here](http://www-nlp.stanford.edu/data/glove.840B.300d.zip)

Click that link to download the Glove vectors which are available in text format.

In [27]:
%%time
# load the GloVe vectors into a dictionary

# store the word embeddings
embedding_index ={}

# create file pointer
fp = open(r'../input/glove-840b300d-dj/glove.840B.300d.txt', 'r', encoding='utf8', errors='ignore')

# read line by line from the text file
for line in tqdm(fp):
    values = line.split()
    word = ''.join(values[:-300])
    #word = values[:-300][0] -- can use either. This is done to remove brackets as the line split results in array
    # create an array to store the embeddings for the word
    embed_vec = np.asarray(values[-300:], dtype= 'float32')
    embedding_index[word] = embed_vec

fp.close()

print('Found {} word vectors'.format(len(embedding_index)))

2196017it [05:42, 6411.94it/s]

Found 2195892 word vectors
CPU times: user 5min 39s, sys: 22.3 s, total: 6min 1s
Wall time: 5min 42s





Save the embedding dictonary for later use

In [28]:
pickle.dump(embedding_index, open('embedding_glove_dict.pkl', 'wb'))

Create a normalized vector for the entire sentence in the given text, based on the word embeddings from glove.
Using the `L2 norm` for normalization

In [29]:
from nltk import word_tokenize
from nltk.corpus import stopwords
stopwords = stopwords.words('english')

Find the sentence embedding by the find the word embeddings with each sentence and normalizing them by their L2 norm

In [30]:
def sent2vec(s):
    sent = str(s).lower()
    words = word_tokenize(sent)
    words = [w for w in words if not w in stopwords]
    words = [w for w in words if w.isalpha()]
    
    embeddings_found = []
    
    # store the embeddings
    for w in words:
        try:
            embeddings_found.append(embedding_index[w])
        except:
            continue
    
    # convert to numpy array to perform array computation
    embeddings_found = np.array(embeddings_found)
    row_sum = embeddings_found.sum(axis=0)
    
    # if cannot find the word, fill with zeros
    if type(row_sum) != np.ndarray:
        return np.zeros(300)
    else:
        # square root of the sum of squares
        l2_norm = np.sqrt((row_sum ** 2).sum())
        #normalize by l2 norm
        return row_sum / l2_norm

Create sentence vector for both train and validations sets. What is needed is an array of (number of sentences X 300). 300 is the embedding size

In [31]:
%%time
# Can be implemented using the below as well

x_train_glove = X_train.apply(sent2vec)
x_valid_glove = X_valid.apply(sent2vec)

# x_train_glove = [sent2vec(sent) for sent in tqdm(X_train)]
# x_valid_glove = [sent2vec(sent) for sent in tqdm(X_valid)]

print(len(x_train_glove))
print(len(x_valid_glove))

17621
1958
CPU times: user 11.5 s, sys: 0 ns, total: 11.5 s
Wall time: 11.6 s


In [32]:
# convert to array format to use in machine learning model
x_train_glove = np.array(x_train_glove.values.tolist())
x_valid_glove = np.array(x_valid_glove.values.tolist())

# use below if using uncommenting the lines in the previous cell
# x_train_glove = np.array(x_train_glove)
# x_valid_glove = np.array(x_valid_glove)

print(x_train_glove.shape)
print(x_valid_glove.shape)

(17621, 300)
(1958, 300)


Fit the xgboost model on the glove features

In [33]:
%%time
xgb_clf = xgb.XGBClassifier(nthread=-1, silent=False) ## silent controls verbosity
xgb_clf.fit(x_train_glove, y_train)
predictions = xgb_clf.predict_proba(x_valid_glove)
print('Log loss using Xgboost on the Glove vectors for sentences is : {:.3f}'.format(metrics.log_loss(y_valid, prediction)))

Log loss using Xgboost on the Glove vectors for sentences is : 0.853
CPU times: user 4min 27s, sys: 0 ns, total: 4min 27s
Wall time: 4min 27s


With some tuning of hyperparameters, rerun the xgboost classifier

In [34]:
# Using another xgboost model that was used earlier
xgb_clf = xgb.XGBClassifier(n_estimators = 200, max_depth =7, learning_rate = 0.1, verbose = 2, colsample_bytree = 0.8, 
                            subsample =0.8, n_jobs=-1, nthread=10)

xgb_clf.fit(x_train_glove, y_train)
predictions = xgb_clf.predict_proba(x_valid_glove)
print('Log loss using Xgboost on the Glove vectors for sentences is : {:.3f}'.format(metrics.log_loss(y_valid, prediction)))

Log loss using Xgboost on the Glove vectors for sentences is : 0.853


### Deep learning models
Lets use some deep learning models to improve the score
- We will train a `LSTM` and a simple `Dense network` on the `Glove features`

In [35]:
# Scale the data before using in a neural network
std_scal = preprocessing.StandardScaler()

x_train_glove_scl = std_scal.fit_transform(x_train_glove)
x_valid_glove_scl = std_scal.transform(x_valid_glove)

In [36]:
%%time

# binarize the labels into vector representations before using in the neural network
from keras.utils import np_utils

y_train_encd = np_utils.to_categorical(y_train)
y_valid_encd = np_utils.to_categorical(y_valid)

print('Output after encoding the labels\n', y_valid_encd)

Using TensorFlow backend.


Output after encoding the labels
 [[0. 1. 0.]
 [0. 1. 0.]
 [0. 1. 0.]
 ...
 [0. 0. 1.]
 [1. 0. 0.]
 [0. 1. 0.]]
CPU times: user 2.77 s, sys: 599 ms, total: 3.37 s
Wall time: 7.63 s


Creating a simple **Sequential Neural Network** with 3 layers

In [37]:
# import the layes from keras
from keras.models import Sequential
from keras.layers.core import Dense, Activation, Dropout
from keras.layers.normalization import BatchNormalization
from keras.layers.embeddings import Embedding


model= Sequential()

model.add(Dense(300, input_dim=300, activation='relu'))
model.add(Dropout(0.2))
model.add(BatchNormalization())

model.add(Dense(300, input_dim=300, activation='relu'))
model.add(Dropout(0.3))
model.add(BatchNormalization())

model.add(Dense(3))
model.add(Activation('softmax'))


print(model.summary())

# compile the model
model.compile(optimizer = 'adam', loss='categorical_crossentropy')

Model: "sequential_1"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
dense_1 (Dense)              (None, 300)               90300     
_________________________________________________________________
dropout_1 (Dropout)          (None, 300)               0         
_________________________________________________________________
batch_normalization_1 (Batch (None, 300)               1200      
_________________________________________________________________
dense_2 (Dense)              (None, 300)               90300     
_________________________________________________________________
dropout_2 (Dropout)          (None, 300)               0         
_________________________________________________________________
batch_normalization_2 (Batch (None, 300)               1200      
_________________________________________________________________
dense_3 (Dense)              (None, 3)                

In [38]:
%%time
# fit the model
model.fit(x_train_glove_scl, y_train_encd, batch_size=64, epochs =5, verbose=1,
         validation_data = (x_valid_glove_scl, y_valid_encd))

Train on 17621 samples, validate on 1958 samples
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
CPU times: user 26.1 s, sys: 3.48 s, total: 29.6 s
Wall time: 14.5 s


<keras.callbacks.callbacks.History at 0x7f9734b38710>

The scores for a simple Dense network can be further improved by adding more layers and increase drop outs. In this case a simple NN is able to get better results than xgboost

### Using LSTMs
- To use LSTMs, we need to tokenize  the text data

In [39]:
%%time
from keras.preprocessing import text, sequence

max_words = None

# the max length for a sentence is, if more needs to be truncated else padded
max_len = 70

tokenizer = text.Tokenizer(num_words = max_words)
tokenizer.fit_on_texts(list(X_train) + list(X_valid))

# seqeunce of numbers associated with the indexes
xtrain_seq = tokenizer.texts_to_sequences(list(X_train))
xvalid_seq = tokenizer.texts_to_sequences(list(X_valid))

# zero pad the sentences
xtrain_padded = sequence.pad_sequences(xtrain_seq, maxlen = max_len)
xvalid_padded = sequence.pad_sequences(xvalid_seq, maxlen = max_len)

# mapping of word to index
word_index = tokenizer.word_index

print('Number of tokens from the text:', len(word_index))

Number of tokens from the text: 25943
CPU times: user 2.17 s, sys: 5.55 ms, total: 2.18 s
Wall time: 2.17 s


In [40]:
print(X_valid.values[0])
print(xvalid_seq[0])

"I guess he's sayin' the spell," whispered Wheeler as he snatched back the telescope.
[6, 1249, 2555, 15996, 1, 2814, 1165, 7714, 16, 13, 4834, 165, 1, 2635]


Next create an `embedding matrix` for the words we have in the dataset
. This will be mapping of the word index and its word embedding

In [41]:
%%time

# each token is represented by a 300D vector or word embedding which was derived from GloVe
# initialize the matrix with zeros
embedding_matrix = np.zeros((len(word_index) + 1, 300))

for word, index in tqdm(word_index.items()):
    embedding_vector = embedding_index.get(word)
    if embedding_vector is not None:
        embedding_matrix[index] = embedding_vector

100%|██████████| 25943/25943 [00:00<00:00, 189399.79it/s]

CPU times: user 191 ms, sys: 5.03 ms, total: 196 ms
Wall time: 195 ms





Implement a LSTM with glove embeddings and two dense layers

In [42]:
from keras.layers import SpatialDropout1D
from keras.layers.recurrent import GRU, LSTM

In [43]:
%%time

model = Sequential()

# model.add(Embedding(vocabulary size, hidden_size, input_length=num_steps))
model.add(
        Embedding(len(word_index)+1, 
                  300,
                  weights = [embedding_matrix],
                  input_length = max_len, # the max length of each sequence
                  trainable = False
                ))

# spatial dropouts implement dropout across a particular channel
model.add(SpatialDropout1D(0.3))

# model.add(LSTM(hidden_size, return_sequences=True, dropout = , recurrent_dropout =))
model.add(LSTM(100, dropout = 0.3, recurrent_dropout=0.3))

model.add(Dense(1024, activation='relu'))
model.add(Dropout(0.8))

model.add(Dense(1024, activation='relu'))
model.add(Dropout(0.8))

model.add(Dense(3))
model.add(Activation('softmax'))

model.compile(loss = 'categorical_crossentropy', optimizer = 'adam')

print('Model summary', model.summary())

Model: "sequential_2"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_1 (Embedding)      (None, 70, 300)           7783200   
_________________________________________________________________
spatial_dropout1d_1 (Spatial (None, 70, 300)           0         
_________________________________________________________________
lstm_1 (LSTM)                (None, 100)               160400    
_________________________________________________________________
dense_4 (Dense)              (None, 1024)              103424    
_________________________________________________________________
dropout_3 (Dropout)          (None, 1024)              0         
_________________________________________________________________
dense_5 (Dense)              (None, 1024)              1049600   
_________________________________________________________________
dropout_4 (Dropout)          (None, 1024)             

In [44]:
%%time
model.fit(xtrain_padded, y_train_encd, batch_size=512, epochs=5, verbose=1, validation_data=[xvalid_padded, y_valid_encd] )

Train on 17621 samples, validate on 1958 samples
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
CPU times: user 5min 6s, sys: 25.9 s, total: 5min 32s
Wall time: 1min 43s


<keras.callbacks.callbacks.History at 0x7f9739beba90>

May need to run this for more epochs, but I will use `early stopping` to stop if there is no improvment in the `loss`
.Compile the model again

In [45]:
from keras.callbacks import EarlyStopping
#?EarlyStopping

In [46]:
%%time

model = Sequential()

# model.add(Embedding(vocabulary size, hidden_size, input_length=num_steps))
model.add(
        Embedding(len(word_index)+1, 
                  300,
                  weights = [embedding_matrix],
                  input_length = max_len, # the max length of each sequence
                  trainable = False
                ))

# spatial dropouts implement dropout across a particular channel
model.add(SpatialDropout1D(0.3))

# model.add(LSTM(hidden_size, return_sequences=True, dropout = , recurrent_dropout =))
model.add(LSTM(100, dropout = 0.3, recurrent_dropout=0.3))

model.add(Dense(1024, activation='relu'))
model.add(Dropout(0.8))

model.add(Dense(1024, activation='relu'))
model.add(Dropout(0.8))

model.add(Dense(3))
model.add(Activation('softmax'))

model.compile(loss = 'categorical_crossentropy', optimizer = 'adam')

# implement early stopping callback

earlystop = EarlyStopping(monitor = 'val_loss', min_delta=0, patience=3, verbose=0, mode='auto')

model.fit(xtrain_padded, y_train_encd, batch_size=512, epochs=20, verbose=1, validation_data=[xvalid_padded, y_valid_encd],
         callbacks=[earlystop])

Train on 17621 samples, validate on 1958 samples
Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20
CPU times: user 20min 34s, sys: 1min 37s, total: 22min 12s
Wall time: 6min 54s


<keras.callbacks.callbacks.History at 0x7f974d975710>

### Using Bidirectional LSTMs

Use a simpe bidirectional LSTM with glove embeddings and 2 dense layers

In [55]:
model = Sequential()
model.add(
            Embedding(input_dim = len(word_index)+1,
                      output_dim =300, 
                      weights = [embedding_matrix],
                      input_length = max_len,
                      trainable=False
                    ))
model.add(SpatialDropout1D(0.3))
model.add(LSTM(100, dropout=0.3, recurrent_dropout=0.3))
model.add(Dense(1024, activation='relu'))
model.add(Dropout(0.8))

model.add(Dense(1024, activation='relu'))
model.add(Dropout(0.8))

model.add(Dense(3))
model.add(Activation('softmax'))

model.compile(loss='categorical_crossentropy', optimizer='adam')

model.fit(xtrain_padded, y_train_encd, batch_size=512, epochs=20, verbose=1, validation_data=[xvalid_padded, y_valid_encd],
         callbacks=[earlystop])

Train on 17621 samples, validate on 1958 samples
Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20


<keras.callbacks.callbacks.History at 0x7f972e5d5bd0>

### Using GRU(gated recurrent unit) with 2 layers

In [56]:
%%time

model = Sequential()
model.add(
            Embedding(
                        input_dim = len(word_index)+1,
                        output_dim = 300,
                        weights = [embedding_matrix],
                        input_length = max_len,
                        trainable=False
            ))

model.add(SpatialDropout1D(0.3))
model.add(GRU(300, dropout=0.3, recurrent_dropout=0.3, return_sequences=True))
model.add(GRU(300, dropout=0.3, recurrent_dropout=0.3))

model.add(Dense(1024, activation='relu'))
model.add(Dropout(0.8))

model.add(Dense(1024, activation='relu'))
model.add(Dropout(0.8))

model.add(Dense(3))
model.add(Activation('softmax'))

model.compile(loss='categorical_crossentropy', optimizer='adam')

# fit the model with early stoppping
earlystop = EarlyStopping(monitor='val_loss', min_delta=0, patience=3, verbose=0, mode='auto')

model.fit(xtrain_padded, y_train_encd, batch_size=512, epochs=20, verbose=1, validation_data=[xvalid_padded, y_valid_encd],
         callbacks=[earlystop])

SyntaxError: positional argument follows keyword argument (<unknown>, line 5)