In [20]:
import logging

import pandas as pd
import numpy as np
from tqdm import tqdm, tqdm_notebook

In [21]:
# init logging
logger = logging.getLogger()
logger.setLevel(logging.DEBUG)

# init tqdm
try:
    if get_ipython().__class__.__name__ == 'ZMQInteractiveShell':
        tqdm_notebook().pandas()
    else:
        tqdm.pandas()
except NameError:
    tqdm.pandas()




# Building the dataframes

In [22]:
_dfs = [
    {
        'df': '../data/{}_features.csv',
        'cols': [
            'last_char',
             'avg_shared_words',
             'word_count_diff',
             'levenshtein',
             'shared_words_pcnt',
             'avg_shared_trigrams',
             'shared_bigram_pcnt',
             'shared_trigram_pcnt',
             'avg_shared_quadgrams',
             'shared_quadgram_pcnt',
             'shared_entities',
             'non_shared_entities',
        ]
    },
    {
        'df': '../data/{}_lstm_output.csv',
        'cols': [
            'nn_out'
        ]
    },
    {
        'df': '../data/tfidf_{}_features.csv',
        'cols': [
            'tfidf_word_match_share'
        ]
    },
#     {
#         'df': '../data/topic_modelling_output.csv',
#         'cols': []
#     },
    {
        'df': '../data/{}_with_sim_and_ents_long.csv',
        'cols': [
            'CARDINAL_1','DATE_1','EVENT_1','FAC_1','GPE_1','LANGUAGE_1','LAW_1','LOC_1','MONEY_1','NORP_1',
            'ORDINAL_1','ORG_1','PERCENT_1','PERSON_1','PRODUCT_1','QUANTITY_1','TIME_1','WORK_OF_ART_1',
            'CARDINAL_2','DATE_2','EVENT_2','FAC_2','GPE_2','LANGUAGE_2','LAW_2','LOC_2','MONEY_2','NORP_2',
            'ORDINAL_2','ORG_2','PERCENT_2','PERSON_2','PRODUCT_2','QUANTITY_2','TIME_2','WORK_OF_ART_2',
        ]
    },
]

In [64]:
def build_df(df, merge_col='id', _set='train'):
    """
    Creates the dataframe for either train or test set identically.
    
    Parameter
        df: base dataframe containing the ids and questions
        merge_col: specify the column name how to merge the dataframe together
        _set: pass either 'test' or 'train'
        
    Returns
        df: fully merged dataframe
    """
    for _df in _dfs:
        path = _df['df'].format(_set)
        df = df.merge(pd.read_csv(path).loc[:,[merge_col] + _df['cols']], on=merge_col, how='left')
        logging.info('Merged in {}'.format(path))
    return df

In [65]:
def build_results_set(df, preds_array, file_path):
    """
    Builds the csv in the format that can be uploaded to kaggle.
    
    Parameter
        df: test dataframe containing the test ids and that was used to make the predictions
        preds_array: the predicition array that was return by the model
        file_path: specify the path and file name to store the output csv
    """
    pd.DataFrame({"test_id": df['test_id'], "is_duplicate": preds_array}).to_csv(file_path, index=False)

### Training Set

In [56]:
df_train = pd.read_csv('../data/train_data.csv')
df_train = df_train.drop(['is_duplicate'], axis=1).merge(pd.read_csv('../data/train_labels.csv'), on='id')

In [57]:
df_train = build_df(df_train, merge_col='id', _set='train')

INFO:root:Merged in ../data/train_features.csv
INFO:root:Merged in ../data/train_lstm_output.csv
INFO:root:Merged in ../data/tfidf_train_features.csv
INFO:root:Merged in ../data/train_with_sim_and_ents_long.csv


### Test Set

In [40]:
df_test = pd.read_csv('../data/test_data.csv')

In [41]:
df_test = build_df(df_test, merge_col='test_id', _set='test')

INFO:root:Merged in ../data/test_features.csv
INFO:root:Merged in ../data/test_lstm_output.csv
INFO:root:Merged in ../data/tfidf_test_features.csv
INFO:root:Merged in ../data/test_with_sim_and_ents_long.csv


# Classification
1. Logistic Regression
2. Stepwise Logistic Regression
3. Decision Tree
4. Random Forest
5. SVM
6. Neural Network

### Logistic Regression

In [43]:
from sklearn.linear_model import LogisticRegression
import statsmodels.api as sm
from scipy import stats
stats.chisqprob = lambda chisq, df: stats.chi2.sf(chisq, df)

In [60]:
y = df_train['is_duplicate']
x = df_train.drop(['id', 'question1', 'question2', 'is_duplicate'], axis=1)
x.head()

Unnamed: 0,last_char,avg_shared_words,word_count_diff,levenshtein,shared_words_pcnt,avg_shared_trigrams,shared_bigram_pcnt,shared_trigram_pcnt,avg_shared_quadgrams,shared_quadgram_pcnt,...,MONEY_2,NORP_2,ORDINAL_2,ORG_2,PERCENT_2,PERSON_2,PRODUCT_2,QUANTITY_2,TIME_2,WORK_OF_ART_2
0,1.0,12.0,2.0,0.926829,0.923077,9.0,0.833333,0.818182,8.0,0.8,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,1.0,4.0,5.0,0.647482,0.380952,0.0,0.105263,0.0,0.0,0.0,...,0.0,1.0,0.0,0.0,0.0,2.0,0.0,0.0,0.0,0.0
2,1.0,4.0,4.0,0.454545,0.333333,0.0,0.090909,0.0,0.0,0.0,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
3,1.0,0.0,2.0,0.069565,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,1.0,2.0,6.0,0.365217,0.2,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [62]:
lm = sm.Logit(y, x)
result = lm.fit()
# print(result.summary())

Optimization terminated successfully.
         Current function value: nan
         Iterations 1
