In [22]:
import warnings
warnings.filterwarnings('ignore')

import pandas as pd
import numpy as np

import xgboost as xgb
from tqdm import tqdm

from sklearn.svm import SVC
from sklearn import preprocessing, decomposition, model_selection, metrics, pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import TruncatedSVD
from sklearn.model_selection import train_test_split
from sklearn.metrics import make_scorer
from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.decomposition import TruncatedSVD
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB

stop_words = stopwords.words('english')


In [2]:
train = pd.read_csv('data/train.zip')
#test = pd.read_csv('data/test.zip')

In [3]:
train.head()

Unnamed: 0,id,text,author
0,id26305,"This process, however, afforded me no means of...",EAP
1,id17569,It never once occurred to me that the fumbling...,HPL
2,id11008,"In his left hand was a gold snuff box, from wh...",EAP
3,id27763,How lovely is spring As we looked from Windsor...,MWS
4,id12958,"Finding nothing else, not even gold, the Super...",HPL


In [4]:
def multiclass_logloss(actual, predicted, eps=1e-15):
    """
    Multi class version of Logarithmic Loss metric.
    :param actual: Array containing the actual target classes
    :param predicted: Matrix with class predictions, one probability per class
    """
    # Convert 'actual' to a binary array if it's not already:
    if len(actual.shape) == 1:
        actual2 = np.zeros((actual.shape[0], predicted.shape[1]))
        for i, val in enumerate(actual):
            actual2[i, val] = 1
        actual = actual2

    clip = np.clip(predicted, eps, 1 - eps)
    rows = actual.shape[0]
    vsota = np.sum(actual * np.log(clip))
    return -1.0 / rows * vsota

In [5]:
le= preprocessing.LabelEncoder()
y = le.fit_transform(train['author'].values)

In [10]:
unique_elements, counts_elements = np.unique(y, return_counts=True)
print(np.asarray((unique_elements, counts_elements)))

[[   0    1    2]
 [7900 5635 6044]]


In [11]:
# check if the y has been converted correctly
train['author'].value_counts()

EAP    7900
MWS    6044
HPL    5635
Name: author, dtype: int64

In [12]:
# creating validation
x_train, x_val, y_train, y_val = train_test_split(train['text'].values, y, 
                                                  stratify=y, 
                                                  random_state=42, 
                                                  test_size=0.1, shuffle=True)

In [13]:
print(x_train.shape)
print(x_val.shape)

(17621,)
(1958,)


TF-IDF(Term Frequency - Inverse Document Frequency) with Logistic Regression

In [14]:
tfidf = TfidfVectorizer(min_df=3, 
                        max_features=None, 
                        strip_accents='unicode', 
                        analyzer='word', 
                        token_pattern=r'\w{1,}',
                        ngram_range=(1, 3), use_idf=1,
                        smooth_idf=1,
                        sublinear_tf=1,
                        stop_words = 'english')

tfidf.fit(list(x_train), list(x_val))

TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
                dtype=<class 'numpy.float64'>, encoding='utf-8',
                input='content', lowercase=True, max_df=1.0, max_features=None,
                min_df=3, ngram_range=(1, 3), norm='l2', preprocessor=None,
                smooth_idf=1, stop_words='english', strip_accents='unicode',
                sublinear_tf=1, token_pattern='\\w{1,}', tokenizer=None,
                use_idf=1, vocabulary=None)

In [15]:
x_train_tfidf =  tfidf.transform(x_train) 
x_val_tfidf = tfidf.transform(x_val)

In [27]:
log_tfidf= LogisticRegression(C= 1)#C = 1/λ for regularization

log_tfidf.fit(x_train_tfidf, y_train)
pred = log_tfidf.predict_proba(x_val_tfidf)

f"logloss with TFIDF: {round(multiclass_logloss(y_val, pred),3)} %"

'logloss with TFIDF: 0.569 %'

word counts

In [24]:
cv = CountVectorizer(analyzer='word',
                     token_pattern=r'\w{1,}',
                     ngram_range=(1, 3), 
                     stop_words = 'english')

cv.fit(list(x_train), list(x_val))

CountVectorizer(analyzer='word', binary=False, decode_error='strict',
                dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
                lowercase=True, max_df=1.0, max_features=None, min_df=1,
                ngram_range=(1, 3), preprocessor=None, stop_words='english',
                strip_accents=None, token_pattern='\\w{1,}', tokenizer=None,
                vocabulary=None)

In [25]:
x_train_cv =  cv.transform(x_train) 
x_val_cv = cv.transform(x_val)

In [28]:
log_cv= LogisticRegression(C= 1)#C = 1/λ for regularization

log_cv.fit(x_train_cv, y_train)
pred = log_cv.predict_proba(x_val_cv)

f"logloss with CounterVectorizer: {round(multiclass_logloss(y_val, pred),3)} %"

'logloss with CounterVectorizer: 0.527 %'

Naive Bayes on TFIDF

In [29]:
nb = MultinomialNB()

nb.fit(x_train_tfidf, y_train)
pred = nb.predict_proba(x_val_tfidf)

f"logloss with Naive Bayes on TFIDF: {round(multiclass_logloss(y_val, pred),3)} %"

'logloss with Naive Bayes on TFIDF: 0.578 %'

Naive Bayes on CounterVerctorizer

In [31]:
nb = MultinomialNB()

nb.fit(x_train_cv, y_train)
pred = nb.predict_proba(x_val_cv)

f"logloss with Naive Bayes on CV: {round(multiclass_logloss(y_val, pred),3)} %"

'logloss with Naive Bayes on CV: 0.487 %'

SVM on TFDIF

Since SVMs take a lot of time, I reduce the number of features from the TF-IDF using Singular Value Decomposition before applying SVM. Also, I will do standardize the data!

In [32]:
# take out the 100 components
svd = decomposition.TruncatedSVD(n_components=100)

svd.fit(x_train_tfidf)
x_train_svd = svd.transform(x_train_tfidf)
x_val_svd = svd.transform(x_val_tfidf)

In [33]:
# standardize

sc = preprocessing.StandardScaler()
sc.fit(x_train_svd)
x_train_svd_sc = sc.transform(x_train_svd)
x_val_svd_sc = sc.transform(x_val_svd)

In [35]:
svc = SVC(C=1.0, probability=True) 
svc.fit(x_train_svd_sc, y_train)

SVC(C=1.0, break_ties=False, cache_size=200, class_weight=None, coef0=0.0,
    decision_function_shape='ovr', degree=3, gamma='scale', kernel='rbf',
    max_iter=-1, probability=True, random_state=None, shrinking=True, tol=0.001,
    verbose=False)

In [37]:
pred = svc.predict_proba(x_val_svd_sc)

f"logloss with SVM on TFIDF: {round(multiclass_logloss(y_val, pred),3)} %"

'logloss with SVM on TFIDF: 0.746 %'

SVM on CV

In [38]:
svd.fit(x_train_cv)
x_train_svd = svd.transform(x_train_cv)
x_val_svd = svd.transform(x_val_cv)

sc.fit(x_train_svd)
x_train_svd_sc = sc.transform(x_train_svd)
x_val_svd_sc = sc.transform(x_val_svd)

svc = SVC(C=1.0, probability=True) 
svc.fit(x_train_svd_sc, y_train)


NameError: name 'x_val_svc_sc' is not defined

In [39]:
pred = svc.predict_proba(x_val_svd_sc)

f"logloss with SVM on CV: {round(multiclass_logloss(y_val, pred),3)} %"

'logloss with SVM on CV: 0.79 %'

XG Boost 

In [40]:
xg = xgb.XGBClassifier(max_depth=7, 
                       n_estimators=200,
                       colsample_bytree=0.8,
                       subsample=0.8,
                       nthread=10,
                       learning_rate=0.1)

xg.fit(x_train_svd, y_train)

XGBClassifier(base_score=0.5, booster=None, colsample_bylevel=1,
              colsample_bynode=1, colsample_bytree=0.8, gamma=0, gpu_id=-1,
              importance_type='gain', interaction_constraints=None,
              learning_rate=0.1, max_delta_step=0, max_depth=7,
              min_child_weight=1, missing=nan, monotone_constraints=None,
              n_estimators=200, n_jobs=10, nthread=10, num_parallel_tree=1,
              objective='multi:softprob', random_state=0, reg_alpha=0,
              reg_lambda=1, scale_pos_weight=None, subsample=0.8,
              tree_method=None, validate_parameters=False, verbosity=None)

In [42]:
pred = xg.predict_proba(x_val_svd)

f"logloss with XGboost on CV decomposed: {round(multiclass_logloss(y_val, pred),3)} %"

'logloss with XGboost on CV decomposed: 0.799 %'

Overall, NB on Counter Vectorizer performed the best. What if I use Grid Search?

In [43]:
# scoring method
from sklearn.metrics import make_scorer

scores = metrics.make_scorer(multiclass_logloss, 
                            greater_is_better= False,
                            needs_proba= True)

In [58]:
trunc = TruncatedSVD()
sc = preprocessing.StandardScaler()
lg = LogisticRegression()

# pipeline
mod_pipe= pipeline.Pipeline([('trunc', trunc),
                          ('sc', sc),
                          ('lg', lg)])

# grid
param_grid = {'trunc__n_components':[100,200],
              'lg__C': [0.1, 1, 10],
              'lg__penalty':['l1','l2']
             }

In [59]:
model = GridSearchCV(estimator=mod_pipe,
                     param_grid=param_grid,
                     scoring=scores,
                     verbose=10,
                     n_jobs=-1,
                     iid= True,
                     refit= True,
                     cv=2)

model.fit(x_train_tfidf, y_train)

print("Best score: %0.3f" % model.best_score_)
print("Best parameters set:")

best_parameters = model.best_estimator_.get_params()

for param_name in sorted(param_grid.keys()):
    print("\t%s: %r" % (param_name, best_parameters[param_name]))

Fitting 2 folds for each of 12 candidates, totalling 24 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done   2 tasks      | elapsed:    8.3s
[Parallel(n_jobs=-1)]: Done   9 tasks      | elapsed:   15.1s
[Parallel(n_jobs=-1)]: Done  12 out of  24 | elapsed:   22.0s remaining:   22.0s
[Parallel(n_jobs=-1)]: Done  15 out of  24 | elapsed:   22.4s remaining:   13.4s
[Parallel(n_jobs=-1)]: Done  18 out of  24 | elapsed:   29.2s remaining:    9.7s
[Parallel(n_jobs=-1)]: Done  21 out of  24 | elapsed:   31.3s remaining:    4.4s
[Parallel(n_jobs=-1)]: Done  24 out of  24 | elapsed:   31.9s remaining:    0.0s
[Parallel(n_jobs=-1)]: Done  24 out of  24 | elapsed:   31.9s finished


Best score: -0.731
Best parameters set:
	lg__C: 1
	lg__penalty: 'l2'
	trunc__n_components: 200


Somehow, this got worse than original.. 

Grid Search in NB

In [62]:
nb = MultinomialNB()

# Create the pipeline 
mod_pipe = pipeline.Pipeline([('nb', nb)])

# parameter grid
param_grid = {'nb__alpha': [0.001, 0.01, 0.1, 1, 10, 100]}

# Initialize Grid Search Model
model = GridSearchCV(estimator=mod_pipe, 
                     param_grid=param_grid, 
                     scoring=scores,
                     verbose=10,
                     n_jobs=-1,
                     iid=True,
                     refit=True,
                     cv=2)

# Fit Grid Search Model
model.fit(x_train_tfidf, y_train)  
print("Best score: %0.3f" % model.best_score_)
print("Best parameters set:")

best_parameters = model.best_estimator_.get_params()
for param_name in sorted(param_grid.keys()):
    print("\t%s: %r" % (param_name, best_parameters[param_name]))

Fitting 2 folds for each of 6 candidates, totalling 12 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done   1 tasks      | elapsed:    0.0s
[Parallel(n_jobs=-1)]: Batch computation too fast (0.0598s.) Setting batch_size=6.
[Parallel(n_jobs=-1)]: Done   3 out of  12 | elapsed:    0.0s remaining:    0.2s
[Parallel(n_jobs=-1)]: Done   5 out of  12 | elapsed:    0.0s remaining:    0.1s
[Parallel(n_jobs=-1)]: Done   7 out of  12 | elapsed:    0.1s remaining:    0.0s
[Parallel(n_jobs=-1)]: Done   9 out of  12 | elapsed:    0.1s remaining:    0.0s
[Parallel(n_jobs=-1)]: Done  12 out of  12 | elapsed:    0.1s finished


Best score: -0.492
Best parameters set:
	nb__alpha: 0.1


This is about 8% improvement from the previous NB on tfidf