In [123]:
import sys
sys.path.insert(0, '..')

In [124]:
import logging

import pandas as pd
import numpy as np
from tqdm import tqdm, tqdm_notebook

from helpers import save_model

In [2]:
# init logging
logger = logging.getLogger()
logger.setLevel(logging.DEBUG)

# init tqdm
try:
    if get_ipython().__class__.__name__ == 'ZMQInteractiveShell':
        tqdm_notebook().pandas()
    else:
        tqdm.pandas()
except NameError:
    tqdm.pandas()




# Building the dataframes

In [3]:
_dfs = [
    {
        'df': '../data/{}_features.csv',
        'cols': [
            'last_char',
             'avg_shared_words',
             'word_count_diff',
             'levenshtein',
             'shared_words_pcnt',
             'avg_shared_trigrams',
             'shared_bigram_pcnt',
             'shared_trigram_pcnt',
             'avg_shared_quadgrams',
             'shared_quadgram_pcnt',
             'shared_entities',
             'non_shared_entities',
        ]
    },
    {
        'df': '../data/{}_lstm_output.csv',
        'cols': [
            'nn_out'
        ]
    },
    {
        'df': '../data/tfidf_{}_features.csv',
        'cols': [
            'tfidf_word_match_share'
        ]
    },
#     {
#         'df': '../data/topic_modelling_output.csv',
#         'cols': []
#     },
    {
        'df': '../data/{}_with_sim_and_ents_long.csv',
        'cols': [
            'CARDINAL_1','DATE_1','EVENT_1','FAC_1','GPE_1','LANGUAGE_1','LAW_1','LOC_1','MONEY_1','NORP_1',
            'ORDINAL_1','ORG_1','PERCENT_1','PERSON_1','PRODUCT_1','QUANTITY_1','TIME_1','WORK_OF_ART_1',
            'CARDINAL_2','DATE_2','EVENT_2','FAC_2','GPE_2','LANGUAGE_2','LAW_2','LOC_2','MONEY_2','NORP_2',
            'ORDINAL_2','ORG_2','PERCENT_2','PERSON_2','PRODUCT_2','QUANTITY_2','TIME_2','WORK_OF_ART_2',
        ]
    },
]

In [4]:
def build_df(df, merge_col='id', _set='train'):
    """
    Creates the dataframe for either train or test set identically.
    
    Parameter
        df: base dataframe containing the ids and questions
        merge_col: specify the column name how to merge the dataframe together
        _set: pass either 'test' or 'train'
        
    Returns
        df: fully merged dataframe
    """
    for _df in _dfs:
        path = _df['df'].format(_set)
        df = df.merge(pd.read_csv(path).loc[:,[merge_col] + _df['cols']], on=merge_col, how='left')
        logging.info('Merged in {}'.format(path))
    return df

In [98]:
def build_results_set(df, preds_array, file_path):
    """
    Builds the csv in the format that can be uploaded to kaggle.
    
    Parameter
        df: test dataframe containing the test ids and that was used to make the predictions
        preds_array: the predicition array that was return by the model
        file_path: specify the path and file name to store the output csv
    """
    p = pd.DataFrame({"test_id": df['test_id']})
    p['is_duplicate'] = preds_array
    p['is_duplicate'] = np.around(p['is_duplicate'].values)
    p.is_duplicate = p.is_duplicate.astype(int)
    p.to_csv(file_path, index=False)

### Training Set

In [6]:
df_train = pd.read_csv('../data/train_data.csv')
df_train = df_train.drop(['is_duplicate'], axis=1).merge(pd.read_csv('../data/train_labels.csv'), on='id')

In [7]:
df_train = build_df(df_train, merge_col='id', _set='train')
df_train = df_train.fillna(0)

INFO:root:Merged in ../data/train_features.csv
INFO:root:Merged in ../data/train_lstm_output.csv
INFO:root:Merged in ../data/tfidf_train_features.csv
INFO:root:Merged in ../data/train_with_sim_and_ents_long.csv


### Test Set

In [78]:
df_test = pd.read_csv('../data/test_data.csv')

In [79]:
df_test = build_df(df_test, merge_col='test_id', _set='test')
df_test = df_test.fillna(0)

INFO:root:Merged in ../data/test_features.csv
INFO:root:Merged in ../data/test_lstm_output.csv
INFO:root:Merged in ../data/tfidf_test_features.csv
INFO:root:Merged in ../data/test_with_sim_and_ents_long.csv


# Classification
1. Logistic Regression
2. Stepwise Logistic Regression
3. Decision Tree
4. Random Forest
5. SVM
6. Neural Network

In [80]:
df_test.describe()

Unnamed: 0,test_id,last_char,avg_shared_words,word_count_diff,levenshtein,shared_words_pcnt,avg_shared_trigrams,shared_bigram_pcnt,shared_trigram_pcnt,avg_shared_quadgrams,...,MONEY_2,NORP_2,ORDINAL_2,ORG_2,PERCENT_2,PERSON_2,PRODUCT_2,QUANTITY_2,TIME_2,WORK_OF_ART_2
count,81126.0,81126.0,81126.0,81126.0,81126.0,81126.0,81126.0,81126.0,81126.0,81126.0,...,81126.0,81126.0,81126.0,81126.0,81126.0,81126.0,81126.0,81126.0,81126.0,81126.0
mean,201935.133447,0.980426,4.784866,3.684787,0.578928,0.452134,1.358985,0.246322,0.153545,0.870116,...,0.005239,0.05823,0.013571,0.203794,0.002404,0.117139,0.016924,0.008431,0.007692,0.013756
std,116366.394811,0.138534,3.449371,4.825042,0.218786,0.257994,2.654754,0.260056,0.239421,2.303912,...,0.078568,0.283124,0.121828,0.504203,0.058595,0.379341,0.143036,0.104637,0.096356,0.120433
min,15.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,101527.0,1.0,3.0,1.0,0.404145,0.25,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
50%,202071.5,1.0,4.0,2.0,0.581197,0.444444,0.0,0.166667,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
75%,301947.75,1.0,6.0,5.0,0.755556,0.647059,2.0,0.4,0.235294,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
max,404278.0,1.0,50.0,213.0,1.0,1.285714,42.0,1.166667,1.0,40.0,...,3.0,6.0,3.0,7.0,4.0,8.0,4.0,6.0,4.0,3.0


### Logistic Regression

In [24]:
from sklearn.linear_model import LogisticRegression
import statsmodels.api as sm
from scipy import stats
stats.chisqprob = lambda chisq, df: stats.chi2.sf(chisq, df)

In [32]:
y = df_train['is_duplicate']
x = df_train.drop(['id', 'question1', 'question2', 'is_duplicate'], axis=1)
x.head()

Unnamed: 0,last_char,avg_shared_words,word_count_diff,levenshtein,shared_words_pcnt,avg_shared_trigrams,shared_bigram_pcnt,shared_trigram_pcnt,avg_shared_quadgrams,shared_quadgram_pcnt,...,MONEY_2,NORP_2,ORDINAL_2,ORG_2,PERCENT_2,PERSON_2,PRODUCT_2,QUANTITY_2,TIME_2,WORK_OF_ART_2
0,1.0,12.0,2.0,0.926829,0.923077,9.0,0.833333,0.818182,8.0,0.8,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,1.0,4.0,5.0,0.647482,0.380952,0.0,0.105263,0.0,0.0,0.0,...,0.0,1.0,0.0,0.0,0.0,2.0,0.0,0.0,0.0,0.0
2,1.0,4.0,4.0,0.454545,0.333333,0.0,0.090909,0.0,0.0,0.0,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
3,1.0,0.0,2.0,0.069565,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,1.0,2.0,6.0,0.365217,0.2,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [33]:
lm = sm.Logit(y, x)
result = lm.fit()
print(result.summary())

Optimization terminated successfully.
         Current function value: 0.150956
         Iterations 8
                           Logit Regression Results                           
Dep. Variable:           is_duplicate   No. Observations:               323164
Model:                          Logit   Df Residuals:                   323114
Method:                           MLE   Df Model:                           49
Date:                Sun, 03 Dec 2017   Pseudo R-squ.:                  0.7707
Time:                        14:36:28   Log-Likelihood:                -48784.
converged:                       True   LL-Null:                   -2.1275e+05
                                        LLR p-value:                     0.000
                             coef    std err          z      P>|z|      [0.025      0.975]
------------------------------------------------------------------------------------------
last_char                 -3.4974      0.036    -95.851      0.000      -3.569      

In [31]:
lm.predict(df_test.drop(['test_id', 'question1', 'question2'], axis=1).values)

ValueError: shapes (323164,50) and (81126,100) not aligned: 50 (dim 1) != 81126 (dim 0)

### Neural Network

In [127]:
from keras import models
from keras import layers

In [128]:
nfeatures = 50

y = df_train['is_duplicate'].values
X = df_train.drop(['id', 'question1', 'question2', 'is_duplicate'], axis=1).values

In [129]:
nn = models.Sequential()
nn.add(layers.Dense(units=200, activation='relu', input_shape=(nfeatures,)))
nn.add(layers.Dense(units=50, activation='relu'))
nn.add(layers.Dense(units=1, activation='sigmoid'))

In [130]:
nn.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

In [132]:
hist = nn.fit(X, y, epochs=100, verbose=1, batch_size=1000, validation_split = 1/3.0)

Train on 215442 samples, validate on 107722 samples
Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 28/100
Epoch 29/100
Epoch 30/100
Epoch 31/100
Epoch 32/100
Epoch 33/100
Epoch 34/100
Epoch 35/100
Epoch 36/100
Epoch 37/100
Epoch 38/100
Epoch 39/100
Epoch 40/100
Epoch 41/100
Epoch 42/100
Epoch 43/100
Epoch 44/100
Epoch 45/100
Epoch 46/100
Epoch 47/100
Epoch 48/100
Epoch 49/100
Epoch 50/100
Epoch 51/100
Epoch 52/100
Epoch 53/100
Epoch 54/100
Epoch 55/100
Epoch 56/100
Epoch 57/100
Epoch 58/100


Epoch 59/100
Epoch 60/100
Epoch 61/100
Epoch 62/100
Epoch 63/100
Epoch 64/100
Epoch 65/100
Epoch 66/100
Epoch 67/100
Epoch 68/100
Epoch 69/100
Epoch 70/100
Epoch 71/100
Epoch 72/100
Epoch 73/100
Epoch 74/100
Epoch 75/100
Epoch 76/100
Epoch 77/100
Epoch 78/100
Epoch 79/100
Epoch 80/100
Epoch 81/100
Epoch 82/100
Epoch 83/100
Epoch 84/100
Epoch 85/100
Epoch 86/100
Epoch 87/100
Epoch 88/100
Epoch 89/100
Epoch 90/100
Epoch 91/100
Epoch 92/100
Epoch 93/100
Epoch 94/100
Epoch 95/100
Epoch 96/100
Epoch 97/100
Epoch 98/100
Epoch 99/100
Epoch 100/100


In [134]:
y_pred = nn.predict_classes(df_test.drop(['test_id', 'question1', 'question2'], axis=1).values)
build_results_set(df_test, y_pred, 'nn_binary_classification3.csv')

In [135]:
save_model(network, '../models/nn-classification-200-50-dense/')

INFO:root:Generating weights
INFO:root:Saved weights to ../models/nn-classification-200-50-densemodel
INFO:root:Saved model configs to ../models/nn-classification-200-50-denseconfig.json


### SVM

In [139]:
from sklearn import svm
from sklearn.model_selection import train_test_split

In [137]:
y = df_train['is_duplicate']
X = df_train.drop(['id', 'question1', 'question2', 'is_duplicate'], axis=1)

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=42)

In [142]:
model = svm.SVC(kernel='linear', C=1, gamma=1) 
# there is various option associated with it, like changing kernel, gamma and C value. Will discuss more # about it in next section.Train the model using the training sets and check score

In [None]:
model.fit(X_train, y_train)
model.score(X_train, y_train)

In [None]:
predicted = model.predict(x_test)