In [51]:
import pandas as pd
import re, string, unicodedata
import nltk
import contractions
import inflect
from bs4 import BeautifulSoup
from nltk import word_tokenize, sent_tokenize
from nltk.corpus import stopwords
from nltk.stem import LancasterStemmer, WordNetLemmatizer
from nltk.tokenize.treebank import TreebankWordDetokenizer
import numpy, textblob, string
from sklearn import model_selection, preprocessing, linear_model, naive_bayes, metrics, svm
from sklearn import decomposition, ensemble
from keras.preprocessing import text, sequence
from keras import layers, models, optimizers
from sklearn.preprocessing import StandardScaler
from sklearn.feature_extraction.text import CountVectorizer          #For Bag of words
from sklearn.feature_extraction.text import TfidfVectorizer          #For TF-IDF
from gensim.models import Word2Vec
from sklearn.model_selection import RandomizedSearchCV
import scipy as sp
import xgboost 

In [4]:
df_train = pd.read_csv("train.csv")
df_test = pd.read_csv("test.csv")

In [5]:
df_train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 44183 entries, 0 to 44182
Data columns (total 3 columns):
 #   Column   Non-Null Count  Dtype 
---  ------   --------------  ----- 
 0   Comment  44183 non-null  object
 1   Outcome  44183 non-null  int64 
 2   Id       44183 non-null  int64 
dtypes: int64(2), object(1)
memory usage: 1.0+ MB


## 1. Data Pre-processing

In [16]:
def replace_contractions(text):
    """Replace contractions in string of text"""
    return contractions.fix(text)

def remove_non_ascii(words):
    """Remove non-ASCII characters from list of tokenized words"""
    new_words = []
    for word in words:
        new_word = unicodedata.normalize('NFKD', word).encode('ascii', 'ignore').decode('utf-8', 'ignore')
        new_words.append(new_word)
    return new_words

def to_lowercase(words):
    """Convert all characters to lowercase from list of tokenized words"""
    new_words = []
    for word in words:
        new_word = word.lower()
        new_words.append(new_word)
    return new_words

def remove_punctuation(words):
    """Remove punctuation from list of tokenized words"""
    new_words = []
    for word in words:
        new_word = re.sub(r'[^\w\s]', '', word)
        if new_word != '':
            new_words.append(new_word)
    return new_words

def replace_numbers(words):
    """Replace all interger occurrences in list of tokenized words with textual representation"""
    p = inflect.engine()
    new_words = []
    for word in words:
        if word.isdigit():
            new_word = ''
            new_words.append(new_word)
        else:
            new_words.append(word)
    return new_words

def remove_stopwords(words):
    """Remove stop words from list of tokenized words"""
    new_words = []
    for word in words:
        if word not in stopwords.words('english'):
            new_words.append(word)
    return new_words

def stem_words(words):
    """Stem words in list of tokenized words"""
    stemmer = LancasterStemmer()
    stems = []
    for word in words:
        stem = stemmer.stem(word)
        stems.append(stem)
    return stems

def lemmatize_verbs(words):
    """Lemmatize verbs in list of tokenized words"""
    lemmatizer = WordNetLemmatizer()
    lemmas = []
    for word in words:
        lemma = lemmatizer.lemmatize(word, pos='v')
        lemmas.append(lemma)
    return lemmas

def normalize(words):
    words = remove_non_ascii(words)
    words = to_lowercase(words)
    words = remove_punctuation(words)
    words = replace_numbers(words)
    words = remove_stopwords(words)
    return words

def stem_and_lemmatize(words):
    stems = stem_words(words)
    lemmas = lemmatize_verbs(stems)
    return lemmas


In [17]:
# Denoise text: Replace contradtions
df_train["clean"] = df_train.Comment.apply(replace_contractions)
df_train.head(10)

Unnamed: 0,Comment,Outcome,Id,clean
0,combining lindelof's and gregg lind's ideas: l...,1,2994,combining lindelof's and gregg lind's ideas: l...
1,in most cases r is an interpreted language tha...,1,22730,in most cases r is an interpreted language tha...
2,you can drop any row containing a missing usin...,1,49407,you can drop any row containing a missing usin...
3,you need to use strptime() to convert the stri...,1,26239,you need to use strptime() to convert the stri...
4,"i'm no r expert, but most languages use a refe...",1,35866,"I am no r expert, but most languages use a ref..."
5,"i don't know r at all, but a bit of creative g...",1,19528,"i do not know r at all, but a bit of creative ..."
6,if you don't want to modify the list in-place ...,1,36784,if you do not want to modify the list in-place...
7,i assume it helps if the matrix is sparse? yes...,1,22146,i assume it helps if the matrix is sparse? yes...
8,if you're willing to entertain an alternate pl...,1,48389,if you are willing to entertain an alternate p...
9,see ?order. you just need the last index (or f...,1,36037,see ?order. you just need the last index (or f...


In [19]:
# Tokenize the text: Using nltk packages
df_train["clean"]  = df_train["clean"].apply(nltk.word_tokenize)
df_train.head(10)

Unnamed: 0,Comment,Outcome,Id,clean
0,combining lindelof's and gregg lind's ideas: l...,1,2994,"[combining, lindelof, 's, and, gregg, lind, 's..."
1,in most cases r is an interpreted language tha...,1,22730,"[in, most, cases, r, is, an, interpreted, lang..."
2,you can drop any row containing a missing usin...,1,49407,"[you, can, drop, any, row, containing, a, miss..."
3,you need to use strptime() to convert the stri...,1,26239,"[you, need, to, use, strptime, (, ), to, conve..."
4,"i'm no r expert, but most languages use a refe...",1,35866,"[I, am, no, r, expert, ,, but, most, languages..."
5,"i don't know r at all, but a bit of creative g...",1,19528,"[i, do, not, know, r, at, all, ,, but, a, bit,..."
6,if you don't want to modify the list in-place ...,1,36784,"[if, you, do, not, want, to, modify, the, list..."
7,i assume it helps if the matrix is sparse? yes...,1,22146,"[i, assume, it, helps, if, the, matrix, is, sp..."
8,if you're willing to entertain an alternate pl...,1,48389,"[if, you, are, willing, to, entertain, an, alt..."
9,see ?order. you just need the last index (or f...,1,36037,"[see, ?, order, ., you, just, need, the, last,..."


In [20]:
# Normalize the list of words (tokens)
df_train["clean"]  = df_train["clean"].apply(normalize)
df_train.head(10)

Unnamed: 0,Comment,Outcome,Id,clean
0,combining lindelof's and gregg lind's ideas: l...,1,2994,"[combining, lindelof, gregg, lind, ideas, last..."
1,in most cases r is an interpreted language tha...,1,22730,"[cases, r, interpreted, language, runs, readev..."
2,you can drop any row containing a missing usin...,1,49407,"[drop, row, containing, missing, using, naomit..."
3,you need to use strptime() to convert the stri...,1,26239,"[need, use, strptime, convert, string, date, e..."
4,"i'm no r expert, but most languages use a refe...",1,35866,"[r, expert, languages, use, reference, countin..."
5,"i don't know r at all, but a bit of creative g...",1,19528,"[know, r, bit, creative, googling, led, http, ..."
6,if you don't want to modify the list in-place ...,1,36784,"[want, modify, list, inplace, eg, passing, lis..."
7,i assume it helps if the matrix is sparse? yes...,1,22146,"[assume, helps, matrix, sparse, yes, algorithm..."
8,if you're willing to entertain an alternate pl...,1,48389,"[willing, entertain, alternate, plotting, pack..."
9,see ?order. you just need the last index (or f...,1,36037,"[see, order, need, last, index, first, decreas..."


In [21]:
# Stem/Lementize the list of words (tokens)
df_train["clean"]  = df_train["clean"].apply(stem_and_lemmatize)
df_train.head(10)

Unnamed: 0,Comment,Outcome,Id,clean
0,combining lindelof's and gregg lind's ideas: l...,1,2994,"[combin, lindelof, greg, lind, idea, last, fun..."
1,in most cases r is an interpreted language tha...,1,22730,"[cas, r, interpret, langu, run, readevaluatepr..."
2,you can drop any row containing a missing usin...,1,49407,"[drop, row, contain, miss, us, naomit, howev, ..."
3,you need to use strptime() to convert the stri...,1,26239,"[nee, us, strptime, convert, string, dat, exam..."
4,"i'm no r expert, but most languages use a refe...",1,35866,"[r, expert, langu, us, ref, count, scheme, big..."
5,"i don't know r at all, but a bit of creative g...",1,19528,"[know, r, bite, cre, googl, lead, http, tolsto..."
6,if you don't want to modify the list in-place ...,1,36784,"[want, mod, list, inplac, eg, pass, list, el, ..."
7,i assume it helps if the matrix is sparse? yes...,1,22146,"[assum, help, matrix, spar, ye, algorithm, per..."
8,if you're willing to entertain an alternate pl...,1,48389,"[wil, entertain, altern, plot, pack, ggplot2, ..."
9,see ?order. you just need the last index (or f...,1,36037,"[see, ord, nee, last, index, first, decreas, o..."


In [22]:
# Detokenize the list of words back to text
df_train["clean"]  = df_train["clean"].apply(TreebankWordDetokenizer().detokenize)
df_train.head(10)

Unnamed: 0,Comment,Outcome,Id,clean
0,combining lindelof's and gregg lind's ideas: l...,1,2994,combin lindelof greg lind idea last funct x ta...
1,in most cases r is an interpreted language tha...,1,22730,cas r interpret langu run readevaluateprint lo...
2,you can drop any row containing a missing usin...,1,49407,drop row contain miss us naomit howev want mor...
3,you need to use strptime() to convert the stri...,1,26239,nee us strptime convert string dat exampl strp...
4,"i'm no r expert, but most languages use a refe...",1,35866,r expert langu us ref count scheme big object ...
5,"i don't know r at all, but a bit of creative g...",1,19528,know r bite cre googl lead http tolstoynewcast...
6,if you don't want to modify the list in-place ...,1,36784,want mod list inplac eg pass list el remov fun...
7,i assume it helps if the matrix is sparse? yes...,1,22146,assum help matrix spar ye algorithm perform we...
8,if you're willing to entertain an alternate pl...,1,48389,wil entertain altern plot pack ggplot2 autom s...
9,see ?order. you just need the last index (or f...,1,36037,see ord nee last index first decreas ord trick...


## 2. Feature Engineering:

In [32]:
# Doing train and test splition first:
# Notice that this is just a simple train test split not CV, since I am just going to use this validation result to 
# specify which features to use and which models to tune in the finalized infrustructure: not final
# split the dataset into training and validation datasets 
train_x, valid_x, train_y, valid_y = model_selection.train_test_split(df_train["clean"], df_train["Outcome"])

In [33]:
# 1. Count Vector features
# create a count vectorizer object 
count_vect = CountVectorizer(analyzer='word', token_pattern=r'\w{1,}')
count_vect.fit(df_train["clean"])
# transform the training and validation data using count vectorizer object
xtrain_count =  count_vect.transform(train_x)
xvalid_count =  count_vect.transform(valid_x)

In [36]:
xtrain_count

<33137x158473 sparse matrix of type '<class 'numpy.int64'>'
	with 1180663 stored elements in Compressed Sparse Row format>

In [37]:
xvalid_count

<11046x158473 sparse matrix of type '<class 'numpy.int64'>'
	with 392937 stored elements in Compressed Sparse Row format>

In [38]:
# 2. TF-IDF features
# With 3 levels:
# word level tf-idf
tfidf_vect = TfidfVectorizer(analyzer='word', token_pattern=r'\w{1,}', max_features=5000)
tfidf_vect.fit(df_train["clean"])
xtrain_tfidf =  tfidf_vect.transform(train_x)
xvalid_tfidf =  tfidf_vect.transform(valid_x)

# ngram level tf-idf 
tfidf_vect_ngram = TfidfVectorizer(analyzer='word', token_pattern=r'\w{1,}', ngram_range=(2,3), max_features=5000)
tfidf_vect_ngram.fit(df_train["clean"])
xtrain_tfidf_ngram =  tfidf_vect_ngram.transform(train_x)
xvalid_tfidf_ngram =  tfidf_vect_ngram.transform(valid_x)

# characters level tf-idf
tfidf_vect_ngram_chars = TfidfVectorizer(analyzer='char', token_pattern=r'\w{1,}', ngram_range=(2,3), max_features=5000)
tfidf_vect_ngram_chars.fit(df_train["clean"])
xtrain_tfidf_ngram_chars =  tfidf_vect_ngram_chars.transform(train_x) 
xvalid_tfidf_ngram_chars =  tfidf_vect_ngram_chars.transform(valid_x) 



In [30]:
# 4. Text / NLP based features
# create more useful features:
df_create = df_train[['Outcome']]
df_create['char_count'] = df_train['Comment'].apply(len)
df_create['word_count'] = df_train['Comment'].apply(lambda x: len(x.split()))
df_create['word_density'] = df_create['char_count'] / (df_create['word_count']+1)
df_create['punctuation_count'] = df_train['Comment'].apply(lambda x: len("".join(_ for _ in x if _ in string.punctuation)))
pos_family = {
    'noun' : ['NN','NNS','NNP','NNPS'],
    'pron' : ['PRP','PRP$','WP','WP$'],
    'verb' : ['VB','VBD','VBG','VBN','VBP','VBZ'],
    'adj' :  ['JJ','JJR','JJS'],
    'adv' : ['RB','RBR','RBS','WRB']
}

# function to check and get the part of speech tag count of a words in a given sentence
def check_pos_tag(x, flag):
    cnt = 0
    try:
        wiki = textblob.TextBlob(x)
        for tup in wiki.tags:
            ppo = list(tup)[1]
            if ppo in pos_family[flag]:
                cnt += 1
    except:
        pass
    return cnt

df_create['noun_count'] = df_train['Comment'].apply(lambda x: check_pos_tag(x, 'noun'))
df_create['verb_count'] = df_train['Comment'].apply(lambda x: check_pos_tag(x, 'verb'))
df_create['adj_count'] = df_train['Comment'].apply(lambda x: check_pos_tag(x, 'adj'))
df_create['adv_count'] = df_train['Comment'].apply(lambda x: check_pos_tag(x, 'adv'))
df_create['pron_count'] = df_train['Comment'].apply(lambda x: check_pos_tag(x, 'pron'))
df_create.head(10)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  This is separate from the ipykernel package so we can avoid doing imports until
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  after removing the cwd from sys.path.


Unnamed: 0,Outcome,char_count,word_count,word_density,punctuation_count,noun_count,verb_count,adj_count,adv_count,pron_count
0,1,435,80,5.37037,51,30,9,9,6,1
1,1,234,39,5.85,4,10,8,7,1,0
2,1,584,113,5.122807,65,27,27,10,6,6
3,1,436,67,6.411765,52,34,16,1,6,5
4,1,254,49,5.08,9,18,8,2,3,2
5,1,894,164,5.418182,49,49,30,14,10,4
6,1,567,102,5.504854,92,54,18,8,6,2
7,1,165,23,6.875,14,9,6,0,2,1
8,1,377,63,5.890625,18,19,13,4,6,3
9,1,136,20,6.47619,16,10,5,2,2,1


In [17]:
# train a LDA Model
lda_model = decomposition.LatentDirichletAllocation(n_components=20, learning_method='online', max_iter=20)
X_topics = lda_model.fit_transform(xtrain_count)
topic_word = lda_model.components_ 
vocab = count_vect.get_feature_names()

# view the topic models
n_top_words = 10
topic_summaries = []
for i, topic_dist in enumerate(topic_word):
    topic_words = numpy.array(vocab)[numpy.argsort(topic_dist)][:-(n_top_words+1):-1]
    topic_summaries.append(' '.join(topic_words))

In [18]:
topic_summaries

['data aes plot ggplot label group library x ggplot2 c',
 'model data fit import x cluster plot predict train 0000',
 'func window def server address stream connect port tkinter mysql',
 'path directory file root folder project windows fn src ospathjoin',
 'listpl3d7bff1ddbdaafe5 usa unite slope sales key2 population temperature outliers anaconda',
 'plot text word color col eval h w blue mtcars',
 'django python app language model database application user obj password',
 'date year format datetime df plyr asdate days offset echo',
 'p array k numpy q point import ax shape decimal',
 'input user template a1 virtualenv output flask view elapse 000',
 'install package http url pip request instal import python https',
 'int encode item unicode utf8 title expr bytes ascii microbenchmark',
 'mydata message member msg country blah import sign cursor params',
 'false true key none value cat return node xml td',
 'form product t1 txt ratio rank queryset polygons time relationship',
 'use pyth

## 3. Choice of Models & Train Validations

In [55]:
def train_model(classifier, feature_vector_train, label, feature_vector_valid, is_neural_net=False):
    
    
    if is_neural_net:
    # fit the training dataset on the classifier
        classifier.fit(feature_vector_train, label, epochs=3)
    # predict the labels on validation dataset
        predictions = classifier.predict(feature_vector_valid)
        predictions = predictions.argmax(axis=-1)
    else:
        classifier.fit(feature_vector_train, label)
        predictions = classifier.predict(feature_vector_valid)
    return metrics.accuracy_score(predictions, valid_y)

In [40]:
# Naive Bayes on Count Vectors
accuracy = train_model(naive_bayes.MultinomialNB(), xtrain_count, train_y, xvalid_count)
print("NB, Count Vectors: ", accuracy)

# Naive Bayes on Word Level TF IDF Vectors
accuracy = train_model(naive_bayes.MultinomialNB(), xtrain_tfidf, train_y, xvalid_tfidf)
print("NB, WordLevel TF-IDF: ", accuracy)

# Naive Bayes on Ngram Level TF IDF Vectors
accuracy = train_model(naive_bayes.MultinomialNB(), xtrain_tfidf_ngram, train_y, xvalid_tfidf_ngram)
print("NB, N-Gram Vectors: ", accuracy)

# Naive Bayes on Character Level TF IDF Vectors
accuracy = train_model(naive_bayes.MultinomialNB(), xtrain_tfidf_ngram_chars, train_y, xvalid_tfidf_ngram_chars)
print("NB, CharLevel Vectors: ", accuracy)

NB, Count Vectors:  0.6571609632446135
NB, WordLevel TF-IDF:  0.6633170378417527
NB, N-Gram Vectors:  0.6388738004707586
NB, CharLevel Vectors:  0.6377874343653811


In [41]:
# Linear Classifier on Count Vectors
accuracy = train_model(linear_model.LogisticRegression(), xtrain_count, train_y, xvalid_count)
print("LR, Count Vectors: ", accuracy)

# Linear Classifier on Word Level TF IDF Vectors
accuracy = train_model(linear_model.LogisticRegression(), xtrain_tfidf, train_y, xvalid_tfidf)
print ("LR, WordLevel TF-IDF: ", accuracy)

# Linear Classifier on Ngram Level TF IDF Vectors
accuracy = train_model(linear_model.LogisticRegression(), xtrain_tfidf_ngram, train_y, xvalid_tfidf_ngram)
print( "LR, N-Gram Vectors: ", accuracy)

# Linear Classifier on Character Level TF IDF Vectors
accuracy = train_model(linear_model.LogisticRegression(), xtrain_tfidf_ngram_chars, train_y, xvalid_tfidf_ngram_chars)
print ("LR, CharLevel Vectors: ", accuracy)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


LR, Count Vectors:  0.678164041281912


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


LR, WordLevel TF-IDF:  0.6938258193011044


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


LR, N-Gram Vectors:  0.6577946768060836
LR, CharLevel Vectors:  0.6859496650371175


In [57]:
# SVM on Ngram Level TF IDF Vectors
accuracy = train_model(svm.SVC(), xtrain_count, train_y, xvalid_count)
print ("SVM, Count Vectors: ", accuracy)

accuracy = train_model(svm.SVC(), xtrain_tfidf, train_y, xvalid_tfidf)
print ("SVM, WordLevel TF-IDF: ", accuracy)

accuracy = train_model(svm.SVC(), xtrain_tfidf_ngram, train_y, xvalid_tfidf_ngram)
print ("SVM, N-Gram Vectors: ", accuracy)

accuracy = train_model(svm.SVC(), xtrain_tfidf_ngram_chars, train_y, xvalid_tfidf_ngram_chars)
print ("SVM, CharLevel Vectors: ", accuracy)

SVM, Count Vectors:  0.6887561108093427
SVM, WordLevel TF-IDF:  0.6971754481260185
SVM, N-Gram Vectors:  0.6502806445772226
SVM, CharLevel Vectors:  0.6969943871084555


In [44]:
# RF on Count Vectors
accuracy = train_model(ensemble.RandomForestClassifier(), xtrain_count, train_y, xvalid_count)
print( "RF, Count Vectors: ", accuracy)

# RF on Word Level TF IDF Vectors
accuracy = train_model(ensemble.RandomForestClassifier(), xtrain_tfidf, train_y, xvalid_tfidf)
print ("RF, WordLevel TF-IDF: ", accuracy)

# RF on Ngram Level TF IDF Vectors
accuracy = train_model(ensemble.RandomForestClassifier(), xtrain_tfidf_ngram, train_y, xvalid_tfidf_ngram)
print( "RF, N-Gram Vectors: ", accuracy)

# RF on Character Level TF IDF Vectors
accuracy = train_model(ensemble.RandomForestClassifier(), xtrain_tfidf_ngram_chars, train_y, xvalid_tfidf_ngram_chars)
print ("RF, CharLevel Vectors: ", accuracy)

RF, Count Vectors:  0.6840485243527069
RF, WordLevel TF-IDF:  0.6902045989498461
RF, N-Gram Vectors:  0.6391453919971031
RF, CharLevel Vectors:  0.6697447039652363


In [58]:
# Adaboost on Count Vectors
accuracy = train_model(ensemble.AdaBoostClassifier(), xtrain_count, train_y, xvalid_count)
print( "Adaboost, Count Vectors: ", accuracy)

# Adaboost on Word Level TF IDF Vectors
accuracy = train_model(ensemble.AdaBoostClassifier(), xtrain_tfidf, train_y, xvalid_tfidf)
print ("Adaboost, WordLevel TF-IDF: ", accuracy)

# Adaboost on Ngram Level TF IDF Vectors
accuracy = train_model(ensemble.AdaBoostClassifier(), xtrain_tfidf_ngram, train_y, xvalid_tfidf_ngram)
print( "Adaboost, N-Gram Vectors: ", accuracy)

# Adaboost on Character Level TF IDF Vectors
accuracy = train_model(ensemble.AdaBoostClassifier(), xtrain_tfidf_ngram_chars, train_y, xvalid_tfidf_ngram_chars)
print ("Adaboost, CharLevel Vectors: ", accuracy)

Adaboost, Count Vectors:  0.6637696903856599
Adaboost, WordLevel TF-IDF:  0.6618685497012493
Adaboost, N-Gram Vectors:  0.5816585189208764
Adaboost, CharLevel Vectors:  0.6606011225783089


In [52]:
# Extereme Gradient Boosting on Count Vectors
accuracy = train_model(xgboost.XGBClassifier(), xtrain_count.tocsc(), train_y, xvalid_count.tocsc())
print ("Xgb, Count Vectors: ", accuracy)

# Extereme Gradient Boosting on Word Level TF IDF Vectors
accuracy = train_model(xgboost.XGBClassifier(), xtrain_tfidf.tocsc(), train_y, xvalid_tfidf.tocsc())
print ("Xgb, WordLevel TF-IDF: ", accuracy)

# Extereme Gradient Boosting on Ngram Level TF IDF Vectors
accuracy = train_model(xgboost.XGBClassifier(), xtrain_tfidf_ngram.tocsc(), train_y, xvalid_tfidf_ngram.tocsc())
print( "Xgb, N-Gram Vectors: ", accuracy)

# Extereme Gradient Boosting on Character Level TF IDF Vectors
accuracy = train_model(xgboost.XGBClassifier(), xtrain_tfidf_ngram_chars.tocsc(), train_y, xvalid_tfidf_ngram_chars.tocsc())
print ("Xgb, CharLevel Vectors: ", accuracy)

Xgb, Count Vectors:  0.6673003802281369
Xgb, WordLevel TF-IDF:  0.6653087090349448
Xgb, N-Gram Vectors:  0.5831975375701611
Xgb, CharLevel Vectors:  0.6701973565091436


In [56]:
def create_model_architecture(input_size):
    # create input layer 
    input_layer = layers.Input((input_size, ), sparse=True)
    
    # create hidden layer
    hidden_layer = layers.Dense(100, activation="relu")(input_layer)
    
    # create output layer
    output_layer = layers.Dense(1, activation="softmax")(hidden_layer)

    classifier = models.Model(inputs = input_layer, outputs = output_layer)
    classifier.compile(optimizer=optimizers.Adam(), loss='binary_crossentropy')
    return classifier 

classifier_count = create_model_architecture(xtrain_count.shape[1])
classifier_tfidf = create_model_architecture(xtrain_tfidf.shape[1])
classifier_tfidf_ngram = create_model_architecture(xtrain_tfidf_ngram.shape[1])
classifier_tfidf_character = create_model_architecture(xtrain_tfidf_ngram_chars.shape[1])

accuracy = train_model(classifier_count, xtrain_count, train_y, xvalid_count, is_neural_net=True)
print("NN, Count Vectors",  accuracy)

accuracy = train_model(classifier_tfidf, xtrain_tfidf, train_y, xvalid_tfidf, is_neural_net=True)
print("NN, WordLevel TF-IDF",  accuracy)

accuracy = train_model(classifier_tfidf_ngram, xtrain_tfidf_ngram, train_y, xvalid_tfidf_ngram, is_neural_net=True)
print("NN, Ngram Level TF IDF Vectors",  accuracy)

accuracy = train_model(classifier_tfidf_character, xtrain_tfidf_ngram_chars, train_y, xvalid_tfidf_ngram_chars, is_neural_net=True)
print("NN, CharLevel Vectors",  accuracy)

Epoch 1/3
Epoch 2/3
Epoch 3/3
NN, Count Vectors 0.4384392540286076
Epoch 1/3
Epoch 2/3
Epoch 3/3
NN, WordLevel TF-IDF 0.4384392540286076
Epoch 1/3
Epoch 2/3
Epoch 3/3
NN, Ngram Level TF IDF Vectors 0.4384392540286076
Epoch 1/3
Epoch 2/3
Epoch 3/3
NN, CharLevel Vectors 0.4384392540286076


In [75]:
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import StratifiedKFold
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.ensemble import ExtraTreesClassifier

## 4. Stacking Models:

### 4.1. Logistic Classifier:

In [61]:
# Grid search cross validation and parameter tuning:
logreg = LogisticRegression()
logreg_grid = {"C":[0.01, 0.1, 1, 10], "penalty":["l1","l2"]} # l1 lasso l2 ridge
logreg_cv = GridSearchCV(logreg,logreg_grid,cv=5,verbose=10)
logreg_cv.fit(xtrain_tfidf, train_y)
print(logreg_cv.best_params_)

Fitting 5 folds for each of 8 candidates, totalling 40 fits
[CV] C=0.01, penalty=l1 ..............................................
[CV] .................... C=0.01, penalty=l1, score=nan, total=   0.0s
[CV] C=0.01, penalty=l1 ..............................................
[CV] .................... C=0.01, penalty=l1, score=nan, total=   0.0s
[CV] C=0.01, penalty=l1 ..............................................
[CV] .................... C=0.01, penalty=l1, score=nan, total=   0.0s
[CV] C=0.01, penalty=l1 ..............................................
[CV] .................... C=0.01, penalty=l1, score=nan, total=   0.0s
[CV] C=0.01, penalty=l1 ..............................................
[CV] .................... C=0.01, penalty=l1, score=nan, total=   0.0s
[CV] C=0.01, penalty=l2 ..............................................
[CV] .................. C=0.01, penalty=l2, score=0.578, total=   0.1s
[CV] C=0.01, penalty=l2 ..............................................
[CV] ............

[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
ValueError: Solver lbfgs supports only 'l2' or 'none' penalties, got l1 penalty.

[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    0.0s remaining:    0.0s
[Parallel(n_jobs=1)]: Done   2 out of   2 | elapsed:    0.0s remaining:    0.0s
[Parallel(n_jobs=1)]: Done   3 out of   3 | elapsed:    0.0s remaining:    0.0s
[Parallel(n_jobs=1)]: Done   4 out of   4 | elapsed:    0.0s remaining:    0.0s
[Parallel(n_jobs=1)]: Done   5 out of   5 | elapsed:    0.1s remaining:    0.0s
[Parallel(n_jobs=1)]: Done   6 out of   6 | elapsed:    0.1s remaining:    0.0s
[Parallel(n_jobs=1)]: Done   7 out of   7 | elapsed:    0.2s remaining:    0.0s


[CV] .................. C=0.01, penalty=l2, score=0.577, total=   0.1s
[CV] C=0.01, penalty=l2 ..............................................
[CV] .................. C=0.01, penalty=l2, score=0.577, total=   0.1s
[CV] C=0.01, penalty=l2 ..............................................
[CV] .................. C=0.01, penalty=l2, score=0.578, total=   0.1s
[CV] C=0.1, penalty=l1 ...............................................
[CV] ..................... C=0.1, penalty=l1, score=nan, total=   0.0s
[CV] C=0.1, penalty=l1 ...............................................
[CV] ..................... C=0.1, penalty=l1, score=nan, total=   0.0s
[CV] C=0.1, penalty=l1 ...............................................
[CV] ..................... C=0.1, penalty=l1, score=nan, total=   0.0s
[CV] C=0.1, penalty=l1 ...............................................
[CV] ..................... C=0.1, penalty=l1, score=nan, total=   0.0s
[CV] C=0.1, penalty=l1 ...............................................
[CV] .

[Parallel(n_jobs=1)]: Done   8 out of   8 | elapsed:    0.3s remaining:    0.0s
[Parallel(n_jobs=1)]: Done   9 out of   9 | elapsed:    0.3s remaining:    0.0s
ValueError: Solver lbfgs supports only 'l2' or 'none' penalties, got l1 penalty.



[CV] ................... C=0.1, penalty=l2, score=0.677, total=   0.2s
[CV] C=0.1, penalty=l2 ...............................................
[CV] ................... C=0.1, penalty=l2, score=0.675, total=   0.2s
[CV] C=0.1, penalty=l2 ...............................................
[CV] ................... C=0.1, penalty=l2, score=0.671, total=   0.2s
[CV] C=0.1, penalty=l2 ...............................................
[CV] ................... C=0.1, penalty=l2, score=0.674, total=   0.2s
[CV] C=0.1, penalty=l2 ...............................................
[CV] ................... C=0.1, penalty=l2, score=0.668, total=   0.2s
[CV] C=1, penalty=l1 .................................................
[CV] ....................... C=1, penalty=l1, score=nan, total=   0.0s
[CV] C=1, penalty=l1 .................................................
[CV] ....................... C=1, penalty=l1, score=nan, total=   0.0s
[CV] C=1, penalty=l1 .................................................
[CV] .

ValueError: Solver lbfgs supports only 'l2' or 'none' penalties, got l1 penalty.



[CV] ..................... C=1, penalty=l2, score=0.688, total=   0.5s
[CV] C=1, penalty=l2 .................................................


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


[CV] ..................... C=1, penalty=l2, score=0.685, total=   0.5s
[CV] C=1, penalty=l2 .................................................


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


[CV] ..................... C=1, penalty=l2, score=0.682, total=   0.5s
[CV] C=1, penalty=l2 .................................................


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


[CV] ..................... C=1, penalty=l2, score=0.688, total=   0.5s
[CV] C=1, penalty=l2 .................................................


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
ValueError: Solver lbfgs supports only 'l2' or 'none' penalties, got l1 penalty.



[CV] ..................... C=1, penalty=l2, score=0.680, total=   0.5s
[CV] C=10, penalty=l1 ................................................
[CV] ...................... C=10, penalty=l1, score=nan, total=   0.0s
[CV] C=10, penalty=l1 ................................................
[CV] ...................... C=10, penalty=l1, score=nan, total=   0.0s
[CV] C=10, penalty=l1 ................................................
[CV] ...................... C=10, penalty=l1, score=nan, total=   0.0s
[CV] C=10, penalty=l1 ................................................
[CV] ...................... C=10, penalty=l1, score=nan, total=   0.0s
[CV] C=10, penalty=l1 ................................................
[CV] ...................... C=10, penalty=l1, score=nan, total=   0.0s
[CV] C=10, penalty=l2 ................................................


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


[CV] .................... C=10, penalty=l2, score=0.673, total=   0.4s
[CV] C=10, penalty=l2 ................................................


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


[CV] .................... C=10, penalty=l2, score=0.675, total=   0.5s
[CV] C=10, penalty=l2 ................................................


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


[CV] .................... C=10, penalty=l2, score=0.671, total=   0.5s
[CV] C=10, penalty=l2 ................................................


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


[CV] .................... C=10, penalty=l2, score=0.674, total=   0.5s
[CV] C=10, penalty=l2 ................................................


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
[Parallel(n_jobs=1)]: Done  40 out of  40 | elapsed:    6.2s finished


[CV] .................... C=10, penalty=l2, score=0.668, total=   0.5s
{'C': 1, 'penalty': 'l2'}
0.693735288792323


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


In [68]:
# calculate predicted probabilities for X_test_dtm (well calibrated)
y_pred_prob_logreg_cv = logreg_cv.predict_proba(xvalid_tfidf)[:, 1]
y_pred_prob_logreg_cv
# make class predictions for X_test_dtm
y_pred_class_logreg_cv = logreg_cv.predict(xvalid_tfidf)
# calculate accuracy
print(metrics.accuracy_score(valid_y, y_pred_class_logreg_cv))

0.693735288792323


### 4.2. Random Forest:

In [63]:
# import and instantiate a logistic regression model
rf = RandomForestClassifier()
rf_grid = {'n_estimators': [100, 200, 500],
           'max_features': ['sqrt','auto'],
           'max_depth': [10, 20, 50]}
rf_cv = GridSearchCV(rf,rf_grid,cv=5,verbose=10)
rf_cv.fit(xtrain_tfidf, train_y)
print(rf_cv.best_params_)

Fitting 5 folds for each of 18 candidates, totalling 90 fits
[CV] max_depth=10, max_features=sqrt, n_estimators=100 ...............


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


[CV]  max_depth=10, max_features=sqrt, n_estimators=100, score=0.598, total=   2.4s
[CV] max_depth=10, max_features=sqrt, n_estimators=100 ...............


[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    2.4s remaining:    0.0s


[CV]  max_depth=10, max_features=sqrt, n_estimators=100, score=0.601, total=   2.0s
[CV] max_depth=10, max_features=sqrt, n_estimators=100 ...............


[Parallel(n_jobs=1)]: Done   2 out of   2 | elapsed:    4.4s remaining:    0.0s


[CV]  max_depth=10, max_features=sqrt, n_estimators=100, score=0.597, total=   1.9s
[CV] max_depth=10, max_features=sqrt, n_estimators=100 ...............


[Parallel(n_jobs=1)]: Done   3 out of   3 | elapsed:    6.3s remaining:    0.0s


[CV]  max_depth=10, max_features=sqrt, n_estimators=100, score=0.594, total=   2.0s
[CV] max_depth=10, max_features=sqrt, n_estimators=100 ...............


[Parallel(n_jobs=1)]: Done   4 out of   4 | elapsed:    8.3s remaining:    0.0s


[CV]  max_depth=10, max_features=sqrt, n_estimators=100, score=0.593, total=   1.9s
[CV] max_depth=10, max_features=sqrt, n_estimators=200 ...............


[Parallel(n_jobs=1)]: Done   5 out of   5 | elapsed:   10.2s remaining:    0.0s


[CV]  max_depth=10, max_features=sqrt, n_estimators=200, score=0.594, total=   4.0s
[CV] max_depth=10, max_features=sqrt, n_estimators=200 ...............


[Parallel(n_jobs=1)]: Done   6 out of   6 | elapsed:   14.2s remaining:    0.0s


[CV]  max_depth=10, max_features=sqrt, n_estimators=200, score=0.599, total=   3.9s
[CV] max_depth=10, max_features=sqrt, n_estimators=200 ...............


[Parallel(n_jobs=1)]: Done   7 out of   7 | elapsed:   18.1s remaining:    0.0s


[CV]  max_depth=10, max_features=sqrt, n_estimators=200, score=0.592, total=   3.8s
[CV] max_depth=10, max_features=sqrt, n_estimators=200 ...............


[Parallel(n_jobs=1)]: Done   8 out of   8 | elapsed:   21.9s remaining:    0.0s


[CV]  max_depth=10, max_features=sqrt, n_estimators=200, score=0.590, total=   3.6s
[CV] max_depth=10, max_features=sqrt, n_estimators=200 ...............


[Parallel(n_jobs=1)]: Done   9 out of   9 | elapsed:   25.4s remaining:    0.0s


[CV]  max_depth=10, max_features=sqrt, n_estimators=200, score=0.594, total=   3.6s
[CV] max_depth=10, max_features=sqrt, n_estimators=500 ...............
[CV]  max_depth=10, max_features=sqrt, n_estimators=500, score=0.600, total=   9.7s
[CV] max_depth=10, max_features=sqrt, n_estimators=500 ...............
[CV]  max_depth=10, max_features=sqrt, n_estimators=500, score=0.596, total=   9.0s
[CV] max_depth=10, max_features=sqrt, n_estimators=500 ...............
[CV]  max_depth=10, max_features=sqrt, n_estimators=500, score=0.591, total=   8.8s
[CV] max_depth=10, max_features=sqrt, n_estimators=500 ...............
[CV]  max_depth=10, max_features=sqrt, n_estimators=500, score=0.596, total=   9.0s
[CV] max_depth=10, max_features=sqrt, n_estimators=500 ...............
[CV]  max_depth=10, max_features=sqrt, n_estimators=500, score=0.592, total=   9.1s
[CV] max_depth=10, max_features=auto, n_estimators=100 ...............
[CV]  max_depth=10, max_features=auto, n_estimators=100, score=0.597, 

[CV]  max_depth=50, max_features=sqrt, n_estimators=100, score=0.677, total=  18.2s
[CV] max_depth=50, max_features=sqrt, n_estimators=100 ...............
[CV]  max_depth=50, max_features=sqrt, n_estimators=100, score=0.664, total=  18.5s
[CV] max_depth=50, max_features=sqrt, n_estimators=100 ...............
[CV]  max_depth=50, max_features=sqrt, n_estimators=100, score=0.671, total=  19.4s
[CV] max_depth=50, max_features=sqrt, n_estimators=200 ...............
[CV]  max_depth=50, max_features=sqrt, n_estimators=200, score=0.679, total=  38.4s
[CV] max_depth=50, max_features=sqrt, n_estimators=200 ...............
[CV]  max_depth=50, max_features=sqrt, n_estimators=200, score=0.694, total=  36.7s
[CV] max_depth=50, max_features=sqrt, n_estimators=200 ...............
[CV]  max_depth=50, max_features=sqrt, n_estimators=200, score=0.676, total=  41.1s
[CV] max_depth=50, max_features=sqrt, n_estimators=200 ...............
[CV]  max_depth=50, max_features=sqrt, n_estimators=200, score=0.668, 

[Parallel(n_jobs=1)]: Done  90 out of  90 | elapsed: 34.0min finished


{'max_depth': 50, 'max_features': 'sqrt', 'n_estimators': 200}
0.693735288792323


In [67]:
# calculate predicted probabilities for X_test_dtm (well calibrated)
y_pred_prob_RandomForest = rf_cv.predict_proba(xvalid_tfidf)[:, 1]
y_pred_prob_RandomForest
# make class predictions for X_test_dtm
y_pred_class_RandomForest = rf_cv.predict(xvalid_tfidf)
# calculate accuracy
print(metrics.accuracy_score(valid_y, y_pred_class_RandomForest))

0.6794314684048525


### 4.3. XGB:

In [72]:
xgb = xgboost.XGBClassifier()
xgb_grid = {'learning_rate': [0.1, 0.2, 0.5],
             'n_estimators': [100, 200, 500],
             'gamma': [0, 0.1]}
xgb_cv = GridSearchCV(xgb,xgb_grid,cv=5,verbose=10)
xgb_cv.fit(xtrain_tfidf, train_y)
print(xgb_cv.best_params_)

Fitting 5 folds for each of 18 candidates, totalling 90 fits
[CV] gamma=0, learning_rate=0.1, n_estimators=100 ....................


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


[CV]  gamma=0, learning_rate=0.1, n_estimators=100, score=0.658, total=  10.5s
[CV] gamma=0, learning_rate=0.1, n_estimators=100 ....................


[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:   10.5s remaining:    0.0s


[CV]  gamma=0, learning_rate=0.1, n_estimators=100, score=0.672, total=   9.9s
[CV] gamma=0, learning_rate=0.1, n_estimators=100 ....................


[Parallel(n_jobs=1)]: Done   2 out of   2 | elapsed:   20.5s remaining:    0.0s


[CV]  gamma=0, learning_rate=0.1, n_estimators=100, score=0.664, total=   9.7s
[CV] gamma=0, learning_rate=0.1, n_estimators=100 ....................


[Parallel(n_jobs=1)]: Done   3 out of   3 | elapsed:   30.2s remaining:    0.0s


[CV]  gamma=0, learning_rate=0.1, n_estimators=100, score=0.653, total=   9.7s
[CV] gamma=0, learning_rate=0.1, n_estimators=100 ....................


[Parallel(n_jobs=1)]: Done   4 out of   4 | elapsed:   40.0s remaining:    0.0s


[CV]  gamma=0, learning_rate=0.1, n_estimators=100, score=0.654, total=   9.7s
[CV] gamma=0, learning_rate=0.1, n_estimators=200 ....................


[Parallel(n_jobs=1)]: Done   5 out of   5 | elapsed:   49.7s remaining:    0.0s


[CV]  gamma=0, learning_rate=0.1, n_estimators=200, score=0.672, total=  19.1s
[CV] gamma=0, learning_rate=0.1, n_estimators=200 ....................


[Parallel(n_jobs=1)]: Done   6 out of   6 | elapsed:  1.1min remaining:    0.0s


[CV]  gamma=0, learning_rate=0.1, n_estimators=200, score=0.680, total=  19.0s
[CV] gamma=0, learning_rate=0.1, n_estimators=200 ....................


[Parallel(n_jobs=1)]: Done   7 out of   7 | elapsed:  1.5min remaining:    0.0s


[CV]  gamma=0, learning_rate=0.1, n_estimators=200, score=0.677, total=  19.0s
[CV] gamma=0, learning_rate=0.1, n_estimators=200 ....................


[Parallel(n_jobs=1)]: Done   8 out of   8 | elapsed:  1.8min remaining:    0.0s


[CV]  gamma=0, learning_rate=0.1, n_estimators=200, score=0.666, total=  19.0s
[CV] gamma=0, learning_rate=0.1, n_estimators=200 ....................


[Parallel(n_jobs=1)]: Done   9 out of   9 | elapsed:  2.1min remaining:    0.0s


[CV]  gamma=0, learning_rate=0.1, n_estimators=200, score=0.666, total=  19.0s
[CV] gamma=0, learning_rate=0.1, n_estimators=500 ....................
[CV]  gamma=0, learning_rate=0.1, n_estimators=500, score=0.681, total=  46.9s
[CV] gamma=0, learning_rate=0.1, n_estimators=500 ....................
[CV]  gamma=0, learning_rate=0.1, n_estimators=500, score=0.693, total=  46.9s
[CV] gamma=0, learning_rate=0.1, n_estimators=500 ....................
[CV]  gamma=0, learning_rate=0.1, n_estimators=500, score=0.685, total=  46.2s
[CV] gamma=0, learning_rate=0.1, n_estimators=500 ....................
[CV]  gamma=0, learning_rate=0.1, n_estimators=500, score=0.681, total=  46.2s
[CV] gamma=0, learning_rate=0.1, n_estimators=500 ....................
[CV]  gamma=0, learning_rate=0.1, n_estimators=500, score=0.678, total=  46.3s
[CV] gamma=0, learning_rate=0.2, n_estimators=100 ....................
[CV]  gamma=0, learning_rate=0.2, n_estimators=100, score=0.666, total=   9.7s
[CV] gamma=0, learnin

[CV]  gamma=0.1, learning_rate=0.2, n_estimators=100, score=0.670, total=  10.7s
[CV] gamma=0.1, learning_rate=0.2, n_estimators=200 ..................
[CV]  gamma=0.1, learning_rate=0.2, n_estimators=200, score=0.676, total=  18.8s
[CV] gamma=0.1, learning_rate=0.2, n_estimators=200 ..................
[CV]  gamma=0.1, learning_rate=0.2, n_estimators=200, score=0.688, total=  19.0s
[CV] gamma=0.1, learning_rate=0.2, n_estimators=200 ..................
[CV]  gamma=0.1, learning_rate=0.2, n_estimators=200, score=0.679, total=  18.9s
[CV] gamma=0.1, learning_rate=0.2, n_estimators=200 ..................
[CV]  gamma=0.1, learning_rate=0.2, n_estimators=200, score=0.671, total=  19.1s
[CV] gamma=0.1, learning_rate=0.2, n_estimators=200 ..................
[CV]  gamma=0.1, learning_rate=0.2, n_estimators=200, score=0.674, total=  19.8s
[CV] gamma=0.1, learning_rate=0.2, n_estimators=500 ..................
[CV]  gamma=0.1, learning_rate=0.2, n_estimators=500, score=0.689, total=  48.6s
[CV] ga

[Parallel(n_jobs=1)]: Done  90 out of  90 | elapsed: 38.4min finished


{'gamma': 0.1, 'learning_rate': 0.2, 'n_estimators': 500}


In [73]:
# calculate predicted probabilities for X_test_dtm (well calibrated)
y_pred_prob_clf_XGB  = xgb_cv.predict_proba(xvalid_tfidf)[:, 1]
y_pred_prob_clf_XGB 
# make class predictions for X_test_dtm
y_pred_class_clf_XGB = xgb_cv.predict(xvalid_tfidf)
# calculate accuracy
print(metrics.accuracy_score(valid_y, y_pred_class_clf_XGB))

0.700615607459714


### 4.4. AdaBoost

In [69]:
kfold = StratifiedKFold(n_splits=5, random_state=2017)
num_rounds = 10
adaboost = AdaBoostClassifier(DecisionTreeClassifier(max_depth=1),n_estimators=200)
# Use early_stopping_rounds to stop the cv when there is no score imporovement
adaboost.fit(xtrain_tfidf, train_y)



AdaBoostClassifier(algorithm='SAMME.R',
                   base_estimator=DecisionTreeClassifier(ccp_alpha=0.0,
                                                         class_weight=None,
                                                         criterion='gini',
                                                         max_depth=1,
                                                         max_features=None,
                                                         max_leaf_nodes=None,
                                                         min_impurity_decrease=0.0,
                                                         min_impurity_split=None,
                                                         min_samples_leaf=1,
                                                         min_samples_split=2,
                                                         min_weight_fraction_leaf=0.0,
                                                         presort='deprecated',
                          

In [70]:
# calculate predicted probabilities for X_test_dtm (well calibrated)
y_pred_prob_adaboost  = adaboost.predict_proba(xvalid_tfidf)[:, 1]
y_pred_prob_adaboost
# make class predictions for X_test_dtm
y_pred_class_adaboost = adaboost.predict(xvalid_tfidf)
# calculate accuracy
print(metrics.accuracy_score(valid_y, y_pred_class_adaboost))

0.6811515480717002


### 4.5. MLP NN

In [77]:
NN = MLPClassifier(hidden_layer_sizes=(100,50), max_iter=1,activation = 'relu',solver='adam',random_state=1,verbose=10)
NN.fit(xtrain_tfidf, train_y)
# calculate predicted probabilities for X_test_dtm (well calibrated)
y_pred_prob_NN = NN.predict_proba(xvalid_tfidf)[:, 1]
y_pred_prob_NN
# make class predictions for X_test_dtm
y_pred_class_NN = NN.predict(xvalid_tfidf)
# calculate accuracy
metrics.accuracy_score(valid_y, y_pred_class_NN)

Iteration 1, loss = 0.62191475




0.6923773311606011

### 4.6. ExtraTree Classifier

In [79]:
extratree = ExtraTreesClassifier()
# Use early_stopping_rounds to stop the cv when there is no score imporovement
extratree.fit(xtrain_tfidf, train_y)
# calculate predicted probabilities for X_test_dtm (well calibrated)
y_pred_prob_extratree = extratree.predict_proba(xvalid_tfidf)[:, 1]
y_pred_prob_extratree
# make class predictions for X_test_dtm
y_pred_class_extratree = extratree.predict(xvalid_tfidf)
# calculate accuracy
metrics.accuracy_score(valid_y, y_pred_class_extratree)

0.6943690023537932

### 4.7. CNN

In [80]:
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.utils.np_utils import to_categorical
from keras.layers import Embedding
from keras.layers import Dense, Input, Flatten
from keras.layers import Conv1D, MaxPooling1D, Embedding, Dropout
from keras.models import Model
from keras.callbacks import ModelCheckpoint
import matplotlib.pyplot as plt
import numpy as np
plt.switch_backend('agg')
%matplotlib inline

In [139]:
MAX_SEQUENCE_LENGTH = 1000
MAX_NB_WORDS = 20000
EMBEDDING_DIM = 50
VALIDATION_SPLIT = 0.25001

In [83]:
tokenizer = Tokenizer(num_words=MAX_NB_WORDS)
tokenizer.fit_on_texts(df_train["clean"])
sequences = tokenizer.texts_to_sequences(df_train["clean"])

word_index = tokenizer.word_index
print('Number of Unique Tokens',len(word_index))

Number of Unique Tokens 137164


In [147]:
data = pad_sequences(sequences, maxlen=MAX_SEQUENCE_LENGTH)
labels = to_categorical(np.asarray(df_train["Outcome"]))
print('Shape of Data Tensor:', data.shape)
print('Shape of Label Tensor:', labels.shape)

indices = np.arange(data.shape[0])
np.random.shuffle(indices)
data = data[indices]
labels = labels[indices]
nb_validation_samples = int(VALIDATION_SPLIT * data.shape[0])

x_train = data[:-nb_validation_samples]
y_train = labels[:-nb_validation_samples]
x_val = data[-nb_validation_samples:]
y_val = labels[-nb_validation_samples:]
print('Shape of train Data Tensor:', x_train.shape)
print('Shape of validate Data Tensor:', x_val.shape)

Shape of Data Tensor: (44183, 1000)
Shape of Label Tensor: (44183, 2)
Shape of train Data Tensor: (33137, 1000)
Shape of validate Data Tensor: (11046, 1000)


In [142]:
embeddings_index = {}
f = open('glove.6B.50d.txt',encoding='utf8')
for line in f:
    values = line.split()
    word = values[0]
    coefs = np.asarray(values[1:], dtype='float32')
    embeddings_index[word] = coefs
f.close()

print('Total %s word vectors in Glove 6B 50d.' % len(embeddings_index))

Total 400000 word vectors in Glove 6B 50d.


In [143]:
# Word Embedding:
embedding_matrix = np.random.random((len(word_index) + 1, EMBEDDING_DIM))
for word, i in word_index.items():
    embedding_vector = embeddings_index.get(word)
    if embedding_vector is not None:
        # words not found in embedding index will be all-zeros.
        embedding_matrix[i] = embedding_vector

embedding_layer = Embedding(len(word_index) + 1,
                            EMBEDDING_DIM,weights=[embedding_matrix],
                            input_length=MAX_SEQUENCE_LENGTH,trainable=True)

In [144]:
sequence_input = Input(shape=(MAX_SEQUENCE_LENGTH,), dtype='int32')
embedded_sequences = embedding_layer(sequence_input)
l_cov1= Conv1D(128, 5, activation='relu')(embedded_sequences)
l_pool1 = MaxPooling1D(5)(l_cov1)
l_cov2 = Conv1D(128, 5, activation='relu')(l_pool1)
l_pool2 = MaxPooling1D(5)(l_cov2)
l_cov3 = Conv1D(128, 5, activation='relu')(l_pool2)
l_pool3 = MaxPooling1D(35)(l_cov3)  # global max pooling
l_flat = Flatten()(l_pool3)
l_dense = Dense(128, activation='relu')(l_flat)
preds = Dense(2, activation='softmax')(l_dense)

model = Model(sequence_input, preds)
model.compile(loss='categorical_crossentropy',
              optimizer='rmsprop',
              metrics=['acc'])

print("Simplified convolutional neural network")
model.summary()
cp=ModelCheckpoint('model_cnn.hdf5',monitor='val_acc',verbose=1,save_best_only=True)


Simplified convolutional neural network
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_9 (InputLayer)         (None, 1000)              0         
_________________________________________________________________
embedding_1 (Embedding)      (None, 1000, 50)          6858250   
_________________________________________________________________
conv1d_1 (Conv1D)            (None, 996, 128)          32128     
_________________________________________________________________
max_pooling1d_1 (MaxPooling1 (None, 199, 128)          0         
_________________________________________________________________
conv1d_2 (Conv1D)            (None, 195, 128)          82048     
_________________________________________________________________
max_pooling1d_2 (MaxPooling1 (None, 39, 128)           0         
_________________________________________________________________
conv1d_3 (Conv1D)            (None,

In [148]:
history=model.fit(x_train, y_train, validation_data=(x_val, y_val),epochs=3, batch_size=2,callbacks=[cp])

Train on 33137 samples, validate on 11046 samples
Epoch 1/3

Epoch 00001: val_acc improved from -inf to 0.60176, saving model to model_cnn.hdf5
Epoch 2/3

Epoch 00002: val_acc improved from 0.60176 to 0.64657, saving model to model_cnn.hdf5
Epoch 3/3

Epoch 00003: val_acc improved from 0.64657 to 0.65816, saving model to model_cnn.hdf5


In [149]:
xtest_pred_prob_result_deep_cnn = model.predict(x_val)[:, 1]

## 5. Results:

In [150]:
test_pred = {
    "RandomForest":y_pred_prob_RandomForest,
    "logreg_cv":y_pred_prob_logreg_cv,
    "NN":y_pred_prob_NN,
    "XGB":y_pred_prob_clf_XGB,
    "adaboost":y_pred_prob_adaboost,
    "extratree": y_pred_prob_extratree,
    "deep_cnn": xtest_pred_prob_result_deep_cnn
}
df_test_pred = pd.DataFrame(test_pred)

In [151]:
df_test_pred.head(10)

Unnamed: 0,RandomForest,logreg_cv,NN,XGB,adaboost,extratree,deep_cnn
0,0.713676,0.777976,0.740634,0.657034,0.501595,0.82,0.479676
1,0.457048,0.382006,0.44249,0.35994,0.498207,0.27,1.0
2,0.554012,0.505793,0.461528,0.649797,0.501189,0.74,0.561898
3,0.644851,0.842044,0.808806,0.851214,0.503453,0.69,0.500754
4,0.596983,0.565473,0.513614,0.446266,0.500414,0.74,0.496739
5,0.467835,0.584228,0.608317,0.533376,0.499331,0.568667,0.494948
6,0.68313,0.881192,0.849912,0.745085,0.501021,0.79,0.469123
7,0.426103,0.310727,0.250585,0.275192,0.498028,0.23,0.457482
8,0.603898,0.845299,0.831312,0.663779,0.500685,0.87,1.0
9,0.593986,0.643697,0.654914,0.507463,0.500448,0.59,0.596445


In [152]:
# Duplicate preprocess steps for test data:
df_test["clean"] = df_test.Comment.apply(replace_contractions)
df_test["clean"]  = df_test["clean"].apply(nltk.word_tokenize)
df_test["clean"]  = df_test["clean"].apply(normalize)
df_test["clean"]  = df_test["clean"].apply(stem_and_lemmatize)
df_test["clean"]  = df_test["clean"].apply(TreebankWordDetokenizer().detokenize)
df_test.head(10)

Unnamed: 0,Comment,Id,clean
0,use variables in the outer function instead of...,68045,us vary out funct instead glob vary get best a...
1,if you're looking for something as nice as pyt...,60790,look someth nic python x not think luck stand...
2,"i use the tail() function: tail(vector, n=1) t...",53896,us tail funct tail vect n1 nic thing tail work...
3,clearly i should have worked on this for anoth...,50204,clear work anoth hour post quest obvy retrospe...
4,"you are, indeed, passing the object around and...",60771,indee pass object around us mem think op objec...
5,most of the algorithms for eigen value computa...,77143,algorithm eig valu comput scal bigoh n3 n rowc...
6,see tip 7 about adjusting the margins. excerpt...,69808,see tip adjust margin excerpt remov spac rese...
7,"see ?which.max > which.max( matrix[,2] ) [1] 2",78092,see whichmax whichmax matrix
8,rolling means/maximums/medians in the zoo pack...,64047,rol meansmaximumsm zoo pack rollm movingav ttr...
9,"for no good reason i'm aware of, dev.off(), un...",73281,good reason aw devoff unlik dev rel funct lik ...


In [153]:
# word level tf-idf
test_tfidf =  tfidf_vect.transform(df_test["clean"])

In [154]:
#y_pred_prob_result = nb.predict_proba(X_result_dtm)[:, 1]
y_pred_prob_result_RandomForest = rf_cv.predict_proba(test_tfidf)[:, 1]
y_pred_prob_result_logreg_cv = logreg_cv.predict_proba(test_tfidf)[:, 1]
y_pred_prob_result_NN = NN.predict_proba(test_tfidf)[:, 1]
y_pred_prob_result_clf_XGB = xgb_cv.predict_proba(test_tfidf)[:, 1]
y_pred_prob_result_adaboost = adaboost.predict_proba(test_tfidf)[:, 1]
y_pred_prob_result_extratree = extratree.predict_proba(test_tfidf)[:, 1]

In [155]:
sequences_test = tokenizer.texts_to_sequences(df_test["clean"])
data_test = pad_sequences(sequences_test, maxlen=MAX_SEQUENCE_LENGTH)
print('Shape of Data Tensor:', data_test.shape)
y_pred_prob_result_deep_cnn = model.predict(data_test)[:, 1]

Shape of Data Tensor: (28200, 1000)


In [157]:
predictions_result = {
    "RandomForest":y_pred_prob_result_RandomForest,
    "logreg_cv":y_pred_prob_result_logreg_cv,
    "NN":y_pred_prob_result_NN,
    "XGB":y_pred_prob_result_clf_XGB,
    "adaboost":y_pred_prob_result_adaboost,
    "extratree":y_pred_prob_result_extratree,
    "deep_cnn":y_pred_prob_result_deep_cnn
}

In [159]:
df_pred_result = pd.DataFrame(predictions_result)
df_pred_result.head(10)

Unnamed: 0,RandomForest,logreg_cv,NN,XGB,adaboost,extratree,deep_cnn
0,0.53448,0.779456,0.754049,0.633159,0.500919,0.5,0.545244
1,0.565033,0.702514,0.677042,0.753,0.501089,0.68,0.578214
2,0.536936,0.62457,0.637022,0.657852,0.501919,0.65,0.520217
3,0.545158,0.618712,0.674909,0.396508,0.501036,0.49,0.590322
4,0.80938,0.851264,0.846711,0.764513,0.502041,0.84,1.0
5,0.675821,0.340298,0.345103,0.487131,0.500894,0.62,0.614686
6,0.611977,0.781798,0.790784,0.73399,0.503667,0.52,0.549541
7,0.513002,0.435319,0.375095,0.489146,0.500237,0.564,0.505566
8,0.482264,0.395722,0.350307,0.325369,0.497271,0.424,0.500445
9,0.5321,0.778034,0.783298,0.499852,0.500265,0.65,0.541041


In [161]:
df_pred_result.shape

(28200, 7)

In [163]:
# Train the Stacking model:
#split X and y into training and testing sets
from sklearn.model_selection import train_test_split
X_result_train, X_result_test, y_result_train, y_result_test = train_test_split(df_test_pred, valid_y, random_state=1)
print(X_result_train.shape)
print(X_result_test.shape)

(8284, 7)
(2762, 7)


In [164]:
from sklearn.model_selection import GridSearchCV
grid={"C":np.logspace(-3,1,5), "penalty":["l1","l2"]}# l1 lasso l2 ridge
logreg_cv_result=GridSearchCV(logreg,grid,cv=5,verbose=10)
logreg_cv_result.fit(X_result_train, y_result_train)
print("tuned hpyerparameters :(best parameters) ",logreg_cv_result.best_params_)
print("accuracy :",logreg_cv_result.best_score_)

Fitting 5 folds for each of 10 candidates, totalling 50 fits
[CV] C=0.001, penalty=l1 .............................................
[CV] ................... C=0.001, penalty=l1, score=nan, total=   0.0s
[CV] C=0.001, penalty=l1 .............................................
[CV] ................... C=0.001, penalty=l1, score=nan, total=   0.0s
[CV] C=0.001, penalty=l1 .............................................
[CV] ................... C=0.001, penalty=l1, score=nan, total=   0.0s
[CV] C=0.001, penalty=l1 .............................................
[CV] ................... C=0.001, penalty=l1, score=nan, total=   0.0s
[CV] C=0.001, penalty=l1 .............................................
[CV] ................... C=0.001, penalty=l1, score=nan, total=   0.0s
[CV] C=0.001, penalty=l2 .............................................
[CV] ................. C=0.001, penalty=l2, score=0.640, total=   0.1s
[CV] C=0.001, penalty=l2 .............................................
[CV] ...........

[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
ValueError: Solver lbfgs supports only 'l2' or 'none' penalties, got l1 penalty.

[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    0.0s remaining:    0.0s
[Parallel(n_jobs=1)]: Done   2 out of   2 | elapsed:    0.0s remaining:    0.0s
[Parallel(n_jobs=1)]: Done   3 out of   3 | elapsed:    0.0s remaining:    0.0s
[Parallel(n_jobs=1)]: Done   4 out of   4 | elapsed:    0.0s remaining:    0.0s
[Parallel(n_jobs=1)]: Done   5 out of   5 | elapsed:    0.0s remaining:    0.0s
[Parallel(n_jobs=1)]: Done   6 out of   6 | elapsed:    0.1s remaining:    0.0s
[Parallel(n_jobs=1)]: Done   7 out of   7 | elapsed:    0.1s remaining:    0.0s
[Parallel(n_jobs=1)]: Done   8 out of   8 | elapsed:    0.1s remaining:    0.0s
[Parallel(n_jobs=1)]: Done   9 out of   9 | elapsed:    0.1s remaining:    0.0s
ValueError: Solver lbfgs supports only 'l2' or 'none' penalties, got l1 penalty.



[CV] .................. C=0.01, penalty=l2, score=0.716, total=   0.0s
[CV] C=0.01, penalty=l2 ..............................................
[CV] .................. C=0.01, penalty=l2, score=0.710, total=   0.0s
[CV] C=0.01, penalty=l2 ..............................................
[CV] .................. C=0.01, penalty=l2, score=0.708, total=   0.0s
[CV] C=0.1, penalty=l1 ...............................................
[CV] ..................... C=0.1, penalty=l1, score=nan, total=   0.0s
[CV] C=0.1, penalty=l1 ...............................................
[CV] ..................... C=0.1, penalty=l1, score=nan, total=   0.0s
[CV] C=0.1, penalty=l1 ...............................................
[CV] ..................... C=0.1, penalty=l1, score=nan, total=   0.0s
[CV] C=0.1, penalty=l1 ...............................................
[CV] ..................... C=0.1, penalty=l1, score=nan, total=   0.0s
[CV] C=0.1, penalty=l1 ...............................................
[CV] .

ValueError: Solver lbfgs supports only 'l2' or 'none' penalties, got l1 penalty.

ValueError: Solver lbfgs supports only 'l2' or 'none' penalties, got l1 penalty.



[CV] ................... C=1.0, penalty=l2, score=0.709, total=   0.0s
[CV] C=1.0, penalty=l2 ...............................................
[CV] ................... C=1.0, penalty=l2, score=0.721, total=   0.0s
[CV] C=1.0, penalty=l2 ...............................................
[CV] ................... C=1.0, penalty=l2, score=0.719, total=   0.0s
[CV] C=10.0, penalty=l1 ..............................................
[CV] .................... C=10.0, penalty=l1, score=nan, total=   0.0s
[CV] C=10.0, penalty=l1 ..............................................
[CV] .................... C=10.0, penalty=l1, score=nan, total=   0.0s
[CV] C=10.0, penalty=l1 ..............................................
[CV] .................... C=10.0, penalty=l1, score=nan, total=   0.0s
[CV] C=10.0, penalty=l1 ..............................................
[CV] .................... C=10.0, penalty=l1, score=nan, total=   0.0s
[CV] C=10.0, penalty=l1 ..............................................
[CV] .

ValueError: Solver lbfgs supports only 'l2' or 'none' penalties, got l1 penalty.

[Parallel(n_jobs=1)]: Done  50 out of  50 | elapsed:    0.6s finished


[CV] .................. C=10.0, penalty=l2, score=0.719, total=   0.0s
tuned hpyerparameters :(best parameters)  {'C': 10.0, 'penalty': 'l2'}
accuracy : 0.7113722634759868


In [165]:
new_pred_prob = logreg_cv_result.predict(df_pred_result)
import numpy as np

y_result_pred_class_ensemble = []
for i in range(len(new_pred_prob)):
    if new_pred_prob [i] > 0.5:
        y_result_pred_class_ensemble.append(1)
    else:
        y_result_pred_class_ensemble.append(0)

y_result_pred_class_ensemble = np.array(y_result_pred_class_ensemble)

In [166]:
result = pd.DataFrame ({
    "Id": df_test.Id,
    "Outcome": y_result_pred_class_ensemble
})

In [167]:
result.head()

Unnamed: 0,Id,Outcome
0,68045,1
1,60790,1
2,53896,1
3,50204,1
4,60771,1


In [168]:
result.to_csv('20200327_Stacking_v2.csv', index=False)