**Objective:**
Predicting if a question asked on Quora is sincere or not.

In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python docker image: https://github.com/kaggle/docker-python
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import string
import re, string, unicodedata
import os
import time
# Wall-clock reference point; later cells print elapsed time relative to it.
start_time = time.time()
# Show which files are present in the input directory.
print(os.listdir("../input"))

In [None]:
# Load the two competition splits
train = pd.read_csv('../input/train.csv')
test = pd.read_csv('../input/test.csv')

# Report load time and the raw shapes
print("--- %s seconds for Data Loading ---" % (time.time() - start_time))
print('Shape of train:', train.shape)
print('Shape of test:', test.shape)

# Tag every row with its origin so the splits can be recovered after stacking
train['Dataset'] = 'train'
test['Dataset'] = 'test'

# Stack train on top of test under a fresh 0..n-1 index
all_data = pd.concat((train, test), axis=0, ignore_index=True)

**Pre-Processing**
Using the library NLTK, we are going to start with text pre-processing.  It is predominantly comprised of three steps:
* Noise Removal
* Lexicon Normalization
* Object Standardization

To go into more detail, we take the following steps to clean and process the text:
* Create Tokens
* Remove non-ASCII characters, convert to lower case, and remove punctuation and whitespace. Replace numbers with their word equivalents.
* Removing Stop words: NLTK corpus contains 179 stop words such as "for", "having", "yours" and so on.
* Stem / Lemmatize words

In [None]:
import nltk
#!pip install inflect #Make sure the Kernal has Internet Connected (Check Settings)
#import inflect
from nltk.tokenize import word_tokenize
from nltk.tokenize import sent_tokenize
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from nltk.stem.wordnet import WordNetLemmatizer 

In [None]:
eng_stopwords = set(stopwords.words("english"))

# Coerce the raw question text to str once and split it on whitespace once;
# every hand-crafted feature below is derived from these two views.
raw_text = all_data["question_text"].astype(str)
raw_words = raw_text.str.split()

## Number of words in the text ##
all_data["num_words"] = raw_words.apply(len)

## Number of unique words in the text ##
all_data["num_unique_words"] = raw_words.apply(lambda words: len(set(words)))

## Number of characters in the text ##
all_data["num_chars"] = raw_text.apply(len)

## Number of stopwords in the text (matched on the lower-cased words) ##
all_data["num_stopwords"] = raw_text.str.lower().str.split().apply(
    lambda words: sum(1 for w in words if w in eng_stopwords))

## Number of punctuation characters in the text ##
all_data["num_punctuations"] = raw_text.apply(
    lambda text: sum(1 for ch in text if ch in string.punctuation))

## Number of upper case words in the text ##
all_data["num_words_upper"] = raw_words.apply(
    lambda words: sum(1 for w in words if w.isupper()))

## Number of title case words in the text ##
all_data["num_words_title"] = raw_words.apply(
    lambda words: sum(1 for w in words if w.istitle()))

## Average length of the words in the text ##
all_data["mean_word_len"] = raw_words.apply(
    lambda words: np.mean([len(w) for w in words]))

# The intermediate views are large; release them once the features exist.
del raw_text, raw_words

In [None]:
# Lower-case every token and collapse runs of whitespace into single spaces
all_data['question_text'] = all_data['question_text'].apply(lambda x: " ".join(x.lower() for x in x.split()))
# Remove punctuation. regex=True must be explicit: pandas >= 2.0 treats the
# pattern as a literal string by default, which would make this a silent no-op.
all_data['question_text'] = all_data['question_text'].str.replace(r'[^\w\s]', '', regex=True)
# Remove digits (same regex=True requirement as above)
all_data['question_text'] = all_data['question_text'].str.replace(r'[0-9]', '', regex=True)
# Remove stop words and words with length <= 2
from nltk.corpus import stopwords
# A set gives O(1) membership tests; the stop-word list is probed once per token.
stop = set(stopwords.words('english'))
all_data['question_text'] = all_data['question_text'].apply(lambda x: " ".join(x for x in x.split() if x not in stop and len(x)>2))
# Lemmatize each remaining token as a verb ('v' part-of-speech tag)
from nltk.stem import WordNetLemmatizer
wl = WordNetLemmatizer()
all_data['question_text'] = all_data['question_text'].apply(lambda x: " ".join(wl.lemmatize(x,'v') for x in x.split()))

In [None]:
# Elapsed time since start_time was set, reported after the text clean-up.
print("--- %s seconds for Data Transformation ---" % (time.time() - start_time))

In [None]:
from tqdm import tqdm, tqdm_notebook
from nltk.sentiment.vader import SentimentIntensityAnalyzer as SIA

# NLTK sentiment cell
print('\nGetting sentiments...')
sia = SIA()
sentiments = np.zeros(len(all_data))

# Score every question with VADER and keep only the 'compound' value.
for i, text in tqdm_notebook(enumerate(all_data['question_text']), total=len(all_data)):
    sentiments[i] = sia.polarity_scores(text)['compound']

all_data['sentiment'] = pd.Series(sentiments)
# Bringing Sentiment scores to [0,1] range: (score + 1) / 2
all_data['sentiment_target'] = (all_data['sentiment'] + 1) / 2

print("--- %s seconds for Adding Sentiment ---" % (time.time() - start_time))

In [None]:
# Re-derive the training split so it carries the columns added to all_data;
# the shape is printed before and after to show the new columns.
print(train.shape)
train = all_data.loc[all_data['Dataset'] == 'train']
print(train.shape)

LDA

In [None]:
import gensim
from gensim import corpora
from gensim.models.ldamulticore import LdaMulticore

topic_cnt = 100
# Fixed seed so the topic model (and every feature derived from it) can be
# reproduced. NOTE(review): with workers > 1 gensim may still not be
# bit-identical between runs - confirm against the gensim documentation.
LDA_SEED = 43

# Tokenize every question exactly once; the dictionary and the bag-of-words
# representation are both built from these tokens.
all_data["question_text_tokens"] = list(map(nltk.word_tokenize, all_data.question_text))

# The vocabulary is learned from the training questions only.
train_tokens = all_data.loc[all_data.Dataset == 'train', "question_text_tokens"]
dct = corpora.Dictionary(train_tokens.tolist())
# Drop tokens appearing in fewer than 20 documents or in more than 50% of them.
dct.filter_extremes(no_below=20, no_above=0.5)

# Reindexes the remaining words after filtering
dct.compactify()
print("Left with {} words.".format(len(dct.values())))

#Make a BOW for every document
def document_to_bow(df):
    """Add a 'bow' column to df (in place) holding dct.doc2bow of each row's tokens."""
    df['bow'] = list(map(lambda doc: dct.doc2bow(doc), df.question_text_tokens))

document_to_bow(all_data)

train = all_data[all_data.Dataset == 'train']

print("Created BOW")
corpus = train.bow
num_topics = topic_cnt
#A multicore approach to decrease training time
LDAmodel = LdaMulticore(corpus=corpus,
                        id2word=dct,
                        num_topics=num_topics,
                        workers=4,
                        chunksize=4000,
                        passes=7,
                        alpha='asymmetric',
                        random_state=LDA_SEED)
print("--- %s seconds for LDA ---" % (time.time() - start_time))

In [None]:
def document_to_lda_features(lda_model, document):
    """Transform a bag-of-words document into its topic proportions.

    Parameters
    ----------
    lda_model : object exposing ``get_document_topics(document, minimum_probability=...)``
        The topic model to query. The model passed in is the one that is used
        (previously the global ``LDAmodel`` was read and this argument ignored).
    document : list of (token_id, count) tuples
        Bag-of-words representation of one document.

    Returns
    -------
    numpy.ndarray
        One weight per (topic_id, weight) pair returned by the model, in the
        order the model returns them; empty if the model returns no pairs.
    """
    # minimum_probability=0 asks the model not to drop low-weight topics.
    topic_importances = lda_model.get_document_topics(document, minimum_probability=0)
    return np.array([weight for _, weight in topic_importances], dtype=float)

# One topic-proportion vector per question, computed from its bag of words.
all_data['lda_features'] = [document_to_lda_features(LDAmodel, doc)
                            for doc in all_data.bow]


print("--- %s seconds for LDA to Features ---" % (time.time() - start_time))

In [None]:
# Replace anything that is not a topic vector (e.g. a missing value) with an
# all-zero vector. A plain column assignment is used instead of chained
# indexing, and ndarray values are accepted as valid vectors.
all_data["lda_features"] = all_data["lda_features"].apply(
    lambda d: d if isinstance(d, (list, np.ndarray)) else np.zeros(topic_cnt))
# Expand the vectors into one numeric column per topic (columns named 0..k-1),
# aligned on all_data's own index. Built inline so no second full-size copy of
# the matrix is left behind in the kernel namespace.
all_data = pd.concat(
    [all_data,
     pd.DataFrame(np.vstack(all_data["lda_features"].values), index=all_data.index)],
    axis=1)

In [None]:
# Re-derive the training split so it includes the per-topic columns;
# the shape is printed before and after for comparison.
print(train.shape)
train = all_data.loc[all_data['Dataset'] == 'train']
print(train.shape)

In [None]:
# Mean of the per-question 'lda_features' vectors, taken separately over the
# training rows with target == 1 and with target == 0.
Insincere_topic_distribution = train.loc[train.target == 1, 'lda_features'].mean()
Sincere_topic_distribution = train.loc[train.target == 0, 'lda_features'].mean()

def get_topic_top_words(lda_model, topic_id, nr_top_words=20):
    """Return the top words for topic_id from lda_model.

    Parameters
    ----------
    lda_model : object exposing ``get_topic_terms(topic_id, topn=...)`` and ``id2word``
    topic_id : int
        Topic to describe.
    nr_top_words : int, default 20
        Number of terms requested from the model.

    Returns
    -------
    list of str
        Words in the order the model returns the terms. A concrete list is
        returned (not a one-shot iterator) so it can be read more than once;
        an empty term list yields an empty list.
    """
    id_tuples = lda_model.get_topic_terms(topic_id, topn=nr_top_words)
    # Use the ids as returned: routing them through np.array would cast them
    # to float alongside the weights.
    return [lda_model.id2word[word_id] for word_id, _ in id_tuples]

# The five highest-weighted topics of each class, kept once each. A topic that
# ranks in the top five for both classes would otherwise yield two columns
# with the same name and a duplicated feature.
top_topics = []
for distribution in (Insincere_topic_distribution, Sincere_topic_distribution):
    for x in sorted(np.argsort(distribution)[-5:]):
        if x not in top_topics:
            top_topics.append(x)

# Get the top 15 words from each topic
topic_words = [[dct[term_id] for term_id, _ in LDAmodel.get_topic_terms(topic_id, 15)]
               for topic_id in top_topics]

# One output column per selected topic
topic_cols = ["Topic_%d_count" % (topic_id) for topic_id in top_topics]

# Get the number of words in question i from topic j as a feature:
# the count of distinct question tokens that are among topic j's top words.
topic_count = pd.DataFrame(index=all_data.index)
for col, words in zip(topic_cols, topic_words):
    vocab = set(words)
    topic_count[col] = all_data['question_text_tokens'].apply(
        lambda tokens: len(vocab.intersection(tokens)))

print("--- %s seconds for get the number of words in question i from topic j as a feature ---" % (time.time() - start_time))

all_data = pd.concat([ all_data, topic_count], axis=1)
del topic_count

In [None]:
# Marker in the output stream: everything below is model training.
print("Modeling Section")

In [None]:
# Drop the reference to the topic model; it is not used by the cells below.
del LDAmodel

**Modeling Section:**


In [None]:
# Shapes before the refresh
print(test.shape)
print(train.shape)
# Rebuild both splits from all_data so they carry every engineered column
test = all_data.loc[all_data['Dataset'] == 'test']
print(test.shape)
train = all_data.loc[all_data['Dataset'] == 'train']
print(train.shape)

In [None]:
# Columns that are never fed to a model (ids, raw text, intermediates, label)
col_rm_list = [
    'qid', 'question_text', 'bow', 'Dataset', 'sentiment',
    'target', 'question_text_tokens', 'lda_features',
]

# Distinct names of the "Topic_<id>_count" columns
topic_features = list(set(all_data.filter(regex='Topic').columns))

# Hand-crafted text statistics plus the rescaled sentiment score
other_features = [
    'num_words', 'num_unique_words', 'num_chars',
    'num_stopwords', 'num_punctuations', 'num_words_upper',
    'num_words_title', 'mean_word_len', 'sentiment_target',
]

# Every remaining column of train: not excluded above and not a topic count
eng_features = [
    col for col in train.columns
    if col not in (col_rm_list + topic_features)
]

In [None]:
from sklearn.metrics import f1_score
from sklearn.model_selection import KFold
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_val_score
from tqdm import tqdm

Model With Basic Features + Sentiment

In [None]:
# 5-fold CV logistic regression on the basic text statistics + sentiment.
kf = KFold(n_splits=5, shuffle=True, random_state=43)
n_folds = kf.get_n_splits()

train_target = train['target'].values

# Plain arrays are indexed by position, which is what KFold's indices are;
# label-based .loc only matched because train happens to have a 0..n-1 index.
x_all = train[other_features].values
x_test = test[other_features].values

test_pred_ots = np.zeros(x_test.shape[0])   # fold-averaged test probabilities
oof_pred_ots = np.zeros(x_all.shape[0])     # out-of-fold train probabilities

for train_index, val_index in tqdm(kf.split(x_all), total=n_folds):
    x_train, x_val = x_all[train_index], x_all[val_index]
    y_train = train_target[train_index]
    classifier = LogisticRegression(C= 0.1)
    classifier.fit(x_train, y_train)
    val_preds = classifier.predict_proba(x_val)[:,1]
    preds = classifier.predict_proba(x_test)[:,1]
    # Each fold contributes an equal share, tied to the actual number of folds.
    test_pred_ots += preds / n_folds
    oof_pred_ots[val_index] = val_preds
print("--- %s seconds for Model Other Features ---" % (time.time() - start_time))

In [None]:
from sklearn import metrics

# Pick the decision threshold (0.10 .. 0.90 in steps of 0.05) that maximises
# F1 on the out-of-fold predictions; 0.5 is kept if no threshold scores > 0.
thresh_opt_ots = 0.5
f1_opt = 0
for thresh in np.arange(0.1, 0.91, 0.05):
    thresh = np.round(thresh, 2)
    f1 = metrics.f1_score(train_target, (oof_pred_ots.astype(float) >thresh).astype(int))
    #print("F1 score at threshold {0} is {1}".format(thresh, f1))
    if f1_opt < f1:
        f1_opt = f1
        thresh_opt_ots = thresh
print(thresh_opt_ots)
# Built-in int instead of np.int: the alias was removed in NumPy 1.24.
pred_train_ots = (oof_pred_ots > thresh_opt_ots).astype(int)
f1_score(train_target, pred_train_ots)

Model with Topic Counts

In [None]:
# 5-fold CV logistic regression on the topic word-count features.
kf = KFold(n_splits=5, shuffle=True, random_state=43)
n_folds = kf.get_n_splits()

train_target = train['target'].values

# Plain arrays are indexed by position, which is what KFold's indices are;
# label-based .loc only matched because train happens to have a 0..n-1 index.
x_all = train[topic_features].values
x_test = test[topic_features].values

test_pred_tps = np.zeros(x_test.shape[0])   # fold-averaged test probabilities
oof_pred_tps = np.zeros(x_all.shape[0])     # out-of-fold train probabilities

for train_index, val_index in tqdm(kf.split(x_all), total=n_folds):
    x_train, x_val = x_all[train_index], x_all[val_index]
    y_train = train_target[train_index]
    classifier = LogisticRegression(C= 0.1)
    classifier.fit(x_train, y_train)
    val_preds = classifier.predict_proba(x_val)[:,1]
    preds = classifier.predict_proba(x_test)[:,1]
    # Each fold contributes an equal share, tied to the actual number of folds.
    test_pred_tps += preds / n_folds
    oof_pred_tps[val_index] = val_preds
print("--- %s seconds for Model Topic Count Features ---" % (time.time() - start_time))

In [None]:
from sklearn import metrics

# Pick the decision threshold (0.10 .. 0.90 in steps of 0.05) that maximises
# F1 on the out-of-fold predictions; 0.5 is kept if no threshold scores > 0.
thresh_opt_tps = 0.5
f1_opt = 0
for thresh in np.arange(0.1, 0.91, 0.05):
    thresh = np.round(thresh, 2)
    f1 = metrics.f1_score(train_target, (oof_pred_tps.astype(float) >thresh).astype(int))
    #print("F1 score at threshold {0} is {1}".format(thresh, f1))
    if f1_opt < f1:
        f1_opt = f1
        thresh_opt_tps = thresh
print(thresh_opt_tps)
# Built-in int instead of np.int: the alias was removed in NumPy 1.24.
pred_train_tps = (oof_pred_tps > thresh_opt_tps).astype(int)
f1_score(train_target, pred_train_tps)

Model For LDA

In [None]:
# 5-fold CV logistic regression on eng_features (every non-excluded,
# non-topic-count column, which includes the per-topic LDA proportions).
kf = KFold(n_splits=5, shuffle=True, random_state=43)
n_folds = kf.get_n_splits()

train_target = train['target'].values

# Plain arrays are indexed by position, which is what KFold's indices are;
# label-based .loc only matched because train happens to have a 0..n-1 index.
x_all = train[eng_features].values
x_test = test[eng_features].values

test_pred_lda = np.zeros(x_test.shape[0])   # fold-averaged test probabilities
oof_pred_lda = np.zeros(x_all.shape[0])     # out-of-fold train probabilities

for train_index, val_index in tqdm(kf.split(x_all), total=n_folds):
    x_train, x_val = x_all[train_index], x_all[val_index]
    y_train = train_target[train_index]
    classifier = LogisticRegression(C= 0.1)
    classifier.fit(x_train, y_train)
    val_preds = classifier.predict_proba(x_val)[:,1]
    preds = classifier.predict_proba(x_test)[:,1]
    # Each fold contributes an equal share, tied to the actual number of folds.
    test_pred_lda += preds / n_folds
    oof_pred_lda[val_index] = val_preds
print("--- %s seconds for Model LDA ---" % (time.time() - start_time))

In [None]:
from sklearn import metrics

# Scan thresholds 0.10 .. 0.90 (step 0.05) and remember the first one that
# gives the highest out-of-fold F1; 0.5 stays if nothing scores above 0.
f1_opt = 0
thresh_opt_lda = 0.5
for thresh in np.arange(0.1, 0.91, 0.05):
    thresh = np.round(thresh, 2)
    f1 = metrics.f1_score(train_target, (oof_pred_lda.astype(float) > thresh).astype(int))
    print("F1 score at threshold {0} is {1}".format(thresh, f1))
    if f1 > f1_opt:
        f1_opt, thresh_opt_lda = f1, thresh
print(thresh_opt_lda)

In [None]:
# Hard labels at the selected threshold (built-in int: np.int was removed in
# NumPy 1.24), followed by the resulting out-of-fold F1 as the cell output.
pred_train_lda = (oof_pred_lda > thresh_opt_lda).astype(int)
f1_score(train_target, pred_train_lda)

In [None]:
## Clean UP
# Keep only the col_rm_list columns, minus 'sentiment' and 'lda_features';
# every engineered feature column is discarded here.
all_data = all_data[[col for col in col_rm_list if col not in ('sentiment', 'lda_features')]]
print(test.shape)
print(train.shape)
# Rebuild the splits from the slimmed-down frame
test = all_data.loc[all_data['Dataset'] == 'test']
print(test.shape)
train = all_data.loc[all_data['Dataset'] == 'train']
print(train.shape)

Model TFIDF 1

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer

# Word-level TF-IDF over uni- to tri-grams, capped at 10,000 features
word_vectorizer = TfidfVectorizer(
    analyzer='word',
    token_pattern=r'\w{1,}',
    ngram_range=(1, 3),
    stop_words='english',
    strip_accents='unicode',
    sublinear_tf=True,
    max_features=10000,
)

# The vocabulary is fitted on train and test text together
word_vectorizer.fit(all_data['question_text'])
del all_data

train_word_features = word_vectorizer.transform(train['question_text'])
test_word_features = word_vectorizer.transform(test['question_text'])

In [None]:

train_target = train['target'].values

# 5-fold CV, class-balanced logistic regression on the TF-IDF matrix
kf = KFold(n_splits=5, shuffle=True, random_state=187)
oof_pred_tf = np.zeros([train.shape[0],])
test_pred_tf = 0

for i, (train_index, val_index) in tqdm(enumerate(kf.split(train))):
    x_train = train_word_features[train_index,:]
    x_val = train_word_features[val_index,:]
    y_train = train_target[train_index]
    y_val = train_target[val_index]

    classifier = LogisticRegression(class_weight = "balanced", C=0.5, solver='sag')
    classifier.fit(x_train, y_train)

    # Positive-class probabilities for the held-out fold and for the test set
    val_preds = classifier.predict_proba(x_val)[:,1]
    preds = classifier.predict_proba(test_word_features)[:,1]

    oof_pred_tf[val_index] = val_preds
    test_pred_tf += 0.2*preds
print("--- %s seconds for Model TFIDF 1 ---" % (time.time() - start_time))

In [None]:
from sklearn import metrics

# Scan thresholds 0.10 .. 0.90 (step 0.05) and remember the first one that
# gives the highest out-of-fold F1; 0.5 stays if nothing scores above 0.
f1_opt = 0
thresh_opt = 0.5
for thresh in np.arange(0.1, 0.91, 0.05):
    thresh = np.round(thresh, 2)
    f1 = metrics.f1_score(train_target, (oof_pred_tf.astype(float) > thresh).astype(int))
    print("F1 score at threshold {0} is {1}".format(thresh, f1))
    if f1 > f1_opt:
        f1_opt, thresh_opt = f1, thresh
print(thresh_opt)

In [None]:
# Hard labels at the selected threshold (built-in int: np.int was removed in
# NumPy 1.24), followed by the resulting out-of-fold F1 as the cell output.
pred_train = (oof_pred_tf > thresh_opt).astype(int)
f1_score(train_target, pred_train)

Model TFIDF 2

In [None]:
import lightgbm as lgb

def lgb_f1_score(y_hat, data):
    """Custom F1 evaluation for LightGBM.

    y_hat holds the predicted scores and data must expose get_label().
    Scores are rounded to the nearest integer before scoring because
    f1_score expects class labels. Returns the tuple
    ('f1', score, True); the final True marks higher values as better.
    """
    labels = data.get_label()
    hard_predictions = np.round(y_hat)
    score = f1_score(labels, hard_predictions)
    return 'f1', score, True

# LightGBM hyper-parameters.
# * 'metric' used to be the string 'lgb_f1_score', which is the name of a
#   Python function and not a built-in LightGBM metric. No metric is consumed
#   during training (no early stopping, no logging), so it is disabled here;
#   pass feval=lgb_f1_score to lgb.train to monitor F1 (it is scored on every
#   round for every validation set, which is slow).
# * 'num_rounds' and 'verbose_eval' were removed from the dict: the former
#   conflicted with the num_boost_round argument, the latter is not a model
#   parameter.
params = {'learning_rate': 0.05,
          'application': 'binary',
          'max_depth': 9,
          'num_leaves': 100,
          'verbosity': -1,
          'metric': 'None',
          'data_random_seed': 3,
          'bagging_fraction': 0.8,
          'feature_fraction': 0.4,
          #'nthread': 16,
          'lambda_l1': 1,
          'lambda_l2': 1}

# Single source of truth for the number of boosting rounds. The old params
# entry 'num_rounds': 2700 is an alias of the round count and took precedence
# over the num_boost_round=2500 argument, so 2700 keeps the effective setting.
num_rounds = 2700

kf = KFold(n_splits=5, shuffle=True, random_state=420)
n_folds = kf.get_n_splits()
test_pred_lgb = np.zeros(test_word_features.shape[0])
oof_pred_lgb = np.zeros([train.shape[0],])

for i, (train_index, val_index) in tqdm(enumerate(kf.split(train)), total=n_folds):
    x_train, x_val = train_word_features[train_index,:], train_word_features[val_index,:]
    y_train, y_val = train_target[train_index], train_target[val_index]

    d_train = lgb.Dataset(x_train, label=y_train)
    d_valid = lgb.Dataset(x_val, label=y_val)

    # verbose_eval is no longer passed: newer LightGBM releases reject it, and
    # with no metric configured there is nothing to log anyway.
    model = lgb.train(params,
                  train_set=d_train,
                  num_boost_round=num_rounds,
                  valid_sets=[d_train, d_valid],
                  valid_names=['train', 'val'])

    val_preds = model.predict(x_val)
    # Use the LightGBM model trained in this fold. Previously this called
    # `classifier.predict`, i.e. the leftover LogisticRegression from the
    # TF-IDF cell, which returns hard 0/1 labels.
    preds = model.predict(test_word_features)
    test_pred_lgb += preds / n_folds
    oof_pred_lgb[val_index] = val_preds
print("--- %s seconds for Model 2 ---" % (time.time() - start_time))

In [None]:
from sklearn import metrics

# Scan thresholds 0.10 .. 0.90 (step 0.05) and remember the first one that
# gives the highest out-of-fold F1; 0.5 stays if nothing scores above 0.
f1_opt = 0
thresh_opt_lgb = 0.5
for thresh in np.arange(0.1, 0.91, 0.05):
    thresh = np.round(thresh, 2)
    f1 = metrics.f1_score(train_target, (oof_pred_lgb.astype(float) > thresh).astype(int))
    print("F1 score at threshold {0} is {1}".format(thresh, f1))
    if f1 > f1_opt:
        f1_opt, thresh_opt_lgb = f1, thresh
print(thresh_opt_lgb)

In [None]:
# Hard labels at the selected threshold (built-in int: np.int was removed in
# NumPy 1.24), followed by the resulting out-of-fold F1 as the cell output.
pred_train_lgb = (oof_pred_lgb > thresh_opt_lgb).astype(int)
f1_score(train_target, pred_train_lgb)

Model Count Vectorizer

In [None]:
# Count Vectorizer
from sklearn.feature_extraction.text import CountVectorizer 

# Default-configured vectorizer. The cross-validation loops that follow
# re-create `bow` inside every fold, so this instance is never fitted.
bow = CountVectorizer()

In [None]:
# 5-fold CV, L1-penalised logistic regression on raw token counts.
kf = KFold(n_splits=5, shuffle=True, random_state=43)
n_folds = kf.get_n_splits()

# Plain arrays are indexed by position, which is what KFold's indices are.
train_text = train['question_text'].values
test_text = test['question_text'].values

test_pred_cv = np.zeros(len(test_text))    # fold-averaged test probabilities
oof_pred_cv = np.zeros(len(train_text))    # out-of-fold train probabilities

for train_index, val_index in tqdm(kf.split(train_text), total=n_folds):
    y_train = train_target[train_index]

    # The vocabulary is learned from the fold's training part only, so the
    # validation and test text must be re-encoded in every fold.
    bow = CountVectorizer()
    x_train = bow.fit_transform(train_text[train_index])
    x_val = bow.transform(train_text[val_index])
    x_test = bow.transform(test_text)

    # penalty="l1" needs a solver that supports it. scikit-learn's current
    # default (lbfgs) does not and raises; liblinear does.
    classifier = LogisticRegression(penalty = "l1", solver = "liblinear",
                                    C = 1.25, class_weight = "balanced")

    classifier.fit(x_train, y_train)
    val_preds = classifier.predict_proba(x_val)[:,1]
    preds = classifier.predict_proba(x_test)[:,1]
    test_pred_cv += preds / n_folds
    oof_pred_cv[val_index] = val_preds
print("--- %s seconds for Model 3 ---" % (time.time() - start_time))

In [None]:
from sklearn import metrics

# Scan thresholds 0.10 .. 0.90 (step 0.05) and remember the first one that
# gives the highest out-of-fold F1; 0.5 stays if nothing scores above 0.
f1_opt = 0
thresh_opt_cv = 0.5
for thresh in np.arange(0.1, 0.91, 0.05):
    thresh = np.round(thresh, 2)
    f1 = metrics.f1_score(train_target, (oof_pred_cv.astype(float) > thresh).astype(int))
    print("F1 score at threshold {0} is {1}".format(thresh, f1))
    if f1 > f1_opt:
        f1_opt, thresh_opt_cv = f1, thresh
print(thresh_opt_cv)

In [None]:
# Hard labels at the selected threshold (built-in int: np.int was removed in
# NumPy 1.24), followed by the resulting out-of-fold F1 as the cell output.
pred_train_cv = (oof_pred_cv > thresh_opt_cv).astype(int)
f1_score(train_target, pred_train_cv)

NB Count Vectorizer

In [None]:
from sklearn.model_selection import cross_val_score
from sklearn.naive_bayes import GaussianNB, MultinomialNB, BernoulliNB

In [None]:
# 5-fold CV with two Naive Bayes variants trained on the same token counts.
kf = KFold(n_splits=5, shuffle=True, random_state=43)
n_folds = kf.get_n_splits()

# Plain arrays are indexed by position, which is what KFold's indices are;
# label-based .loc only matched because train happens to have a 0..n-1 index.
train_text = train['question_text'].values
test_text = test['question_text'].values

# *_cv_2: MultinomialNB, *_cv_3: BernoulliNB
test_pred_cv_2 = np.zeros(len(test_text))
oof_pred_cv_2 = np.zeros(len(train_text))
test_pred_cv_3 = np.zeros(len(test_text))
oof_pred_cv_3 = np.zeros(len(train_text))

for train_index, val_index in tqdm(kf.split(train_text), total=n_folds):
    y_train = train_target[train_index]

    # The vocabulary is learned from the fold's training part only, so the
    # validation and test text must be re-encoded in every fold.
    bow = CountVectorizer()
    x_train = bow.fit_transform(train_text[train_index])
    x_val = bow.transform(train_text[val_index])
    x_test = bow.transform(test_text)

    classifier2 = MultinomialNB()
    classifier3 = BernoulliNB()

    classifier2.fit(x_train, y_train)
    val_preds = classifier2.predict_proba(x_val)[:,1]
    preds = classifier2.predict_proba(x_test)[:,1]
    test_pred_cv_2 += preds / n_folds
    oof_pred_cv_2[val_index] = val_preds

    classifier3.fit(x_train, y_train)
    val_preds = classifier3.predict_proba(x_val)[:,1]
    preds = classifier3.predict_proba(x_test)[:,1]
    test_pred_cv_3 += preds / n_folds
    oof_pred_cv_3[val_index] = val_preds

In [None]:
from sklearn import metrics

# MultinomialNB: pick the decision threshold (0.10 .. 0.90 in steps of 0.05)
# that maximises F1 on the out-of-fold predictions; 0.5 is kept if no
# threshold scores above 0.
thresh_opt_cv_2 = 0.5
f1_opt = 0
for thresh in np.arange(0.1, 0.91, 0.05):
    thresh = np.round(thresh, 2)
    f1 = metrics.f1_score(train_target, (oof_pred_cv_2.astype(float) >thresh).astype(int))
    #print("F1 score at threshold {0} is {1}".format(thresh, f1))
    if f1_opt < f1:
        f1_opt = f1
        thresh_opt_cv_2 = thresh
print(thresh_opt_cv_2)
# Built-in int instead of np.int: the alias was removed in NumPy 1.24.
pred_train_cv_2 = (oof_pred_cv_2 > thresh_opt_cv_2).astype(int)
f1_score(train_target, pred_train_cv_2)

In [None]:
from sklearn import metrics

# BernoulliNB: pick the decision threshold (0.10 .. 0.90 in steps of 0.05)
# that maximises F1 on the out-of-fold predictions; 0.5 is kept if no
# threshold scores above 0.
thresh_opt_cv_3 = 0.5
f1_opt = 0
for thresh in np.arange(0.1, 0.91, 0.05):
    thresh = np.round(thresh, 2)
    f1 = metrics.f1_score(train_target, (oof_pred_cv_3.astype(float) >thresh).astype(int))
    #print("F1 score at threshold {0} is {1}".format(thresh, f1))
    if f1_opt < f1:
        f1_opt = f1
        thresh_opt_cv_3 = thresh
print(thresh_opt_cv_3)
# Stored under its own name (this used to overwrite pred_train_cv_2, the
# MultinomialNB labels). Built-in int: np.int was removed in NumPy 1.24.
pred_train_cv_3 = (oof_pred_cv_3 > thresh_opt_cv_3).astype(int)
f1_score(train_target, pred_train_cv_3)

Stacking:

In [None]:
# Level-2 design matrices: one column per base model, in the same order for
# the out-of-fold (train) predictions and the fold-averaged test predictions.
stack_train = np.column_stack((
    oof_pred_ots, oof_pred_tps, oof_pred_lda, oof_pred_tf,
    oof_pred_lgb, oof_pred_cv, oof_pred_cv_2, oof_pred_cv_3,
))
stack_test = np.column_stack((
    test_pred_ots, test_pred_tps, test_pred_lda, test_pred_tf,
    test_pred_lgb, test_pred_cv, test_pred_cv_2, test_pred_cv_3,
))

print(stack_train.shape)
print(stack_test.shape)

In [None]:
# 5-fold CV, class-balanced logistic regression on the base-model predictions
kf = KFold(n_splits=5, shuffle=True, random_state=43)
oof_pred_stack = np.zeros([train.shape[0],])
test_pred_stack = 0

for i, (train_index, val_index) in tqdm(enumerate(kf.split(train))):
    x_train = stack_train[train_index,:]
    x_val = stack_train[val_index,:]
    y_train = train_target[train_index]
    y_val = train_target[val_index]

    classifier = LogisticRegression(class_weight = "balanced", C=0.5, solver='sag')
    classifier.fit(x_train, y_train)

    # Positive-class probabilities for the held-out fold and for the test set
    val_preds = classifier.predict_proba(x_val)[:,1]
    preds = classifier.predict_proba(stack_test)[:,1]

    oof_pred_stack[val_index] = val_preds
    test_pred_stack += 0.2*preds

In [None]:
from sklearn import metrics

# Scan thresholds 0.10 .. 0.90 (step 0.05) and remember the first one that
# gives the highest out-of-fold F1; 0.5 stays if nothing scores above 0.
f1_opt = 0
thresh_opt_stack = 0.5
for thresh in np.arange(0.1, 0.91, 0.05):
    thresh = np.round(thresh, 2)
    f1 = metrics.f1_score(train_target, (oof_pred_stack.astype(float) > thresh).astype(int))
    print("F1 score at threshold {0} is {1}".format(thresh, f1))
    if f1 > f1_opt:
        f1_opt, thresh_opt_stack = f1, thresh
print(thresh_opt_stack)

In [None]:
# Hard labels at the selected threshold (built-in int: np.int was removed in
# NumPy 1.24), followed by the resulting out-of-fold F1 as the cell output.
pred_train_stack = (oof_pred_stack > thresh_opt_stack).astype(int)
f1_score(train_target, pred_train_stack)

In [None]:
# Confusion Matrix
from sklearn.metrics import confusion_matrix
from sklearn import metrics
import matplotlib.pyplot as plt
import scikitplot as skplt
# Plot the stacked model's thresholded out-of-fold labels against the true
# training labels; labels=[1, 0] lists class 1 before class 0.
skplt.metrics.plot_confusion_matrix(train_target, pred_train_stack,labels = [1,0])

In [None]:
# Apply the threshold tuned on the out-of-fold stack predictions to the
# fold-averaged test probabilities. Built-in int is used because np.int was
# removed in NumPy 1.24.
pred_test_final = ( test_pred_stack> thresh_opt_stack).astype(int)
submission = pd.DataFrame.from_dict({'qid': test['qid']})
# Assigned positionally: pred_test_final is a plain array in test-row order.
submission['prediction'] = pred_test_final


In [None]:
# Share of test questions predicted as class 1, then write the submission file
n_test_rows = stack_test.shape[0]
print(np.sum(submission['prediction']) / n_test_rows)
submission.to_csv('submission.csv', index=False)