In [5]:
import sys
sys.path.insert(0, '..')

In [None]:
import logging

import pandas as pd
import numpy as np
from tqdm import tqdm, tqdm_notebook

from helpers import save_model

In [None]:
# init logging
logger = logging.getLogger()
logger.setLevel(logging.DEBUG)

# init tqdm
try:
    if get_ipython().__class__.__name__ == 'ZMQInteractiveShell':
        tqdm_notebook().pandas()
    else:
        tqdm.pandas()
except NameError:
    tqdm.pandas()

# Building the dataframes
Define functions to enable easy building of dataframes

In [8]:
_dfs = [
    {
        'df': '../data/{}_features.csv',
        'cols': [
            'last_char',
             'avg_shared_words',
             'word_count_diff',
             'levenshtein',
             'shared_words_pcnt',
             'avg_shared_trigrams',
             'shared_bigram_pcnt',
             'shared_trigram_pcnt',
             'avg_shared_quadgrams',
             'shared_quadgram_pcnt',
             'shared_entities',
             'non_shared_entities',
        ]
    },
    {
        'df': '../data/{}_lstm_output.csv',
        'cols': [
            'nn_out'
        ]
    },
    {
        'df': '../data/tfidf_{}_features.csv',
        'cols': [
            'tfidf_word_match_share'
        ]
    },
     {
         'df': '../data/topic_modelling_output.csv',
         'cols': []
     },
    {
        'df': '../data/{}_with_sim_and_ents_long.csv',
        'cols': [
            'CARDINAL_1','DATE_1','EVENT_1','FAC_1','GPE_1','LANGUAGE_1','LAW_1','LOC_1','MONEY_1','NORP_1',
            'ORDINAL_1','ORG_1','PERCENT_1','PERSON_1','PRODUCT_1','QUANTITY_1','TIME_1','WORK_OF_ART_1',
            'CARDINAL_2','DATE_2','EVENT_2','FAC_2','GPE_2','LANGUAGE_2','LAW_2','LOC_2','MONEY_2','NORP_2',
            'ORDINAL_2','ORG_2','PERCENT_2','PERSON_2','PRODUCT_2','QUANTITY_2','TIME_2','WORK_OF_ART_2',
        ]
    },
]

In [9]:
def build_df(df, merge_col='id', _set='train'):
    """
    Creates the dataframe for either train or test set identically.
    
    Parameter
        df: base dataframe containing the ids and questions
        merge_col: specify the column name how to merge the dataframe together
        _set: pass either 'test' or 'train'
        
    Returns
        df: fully merged dataframe
    """
    for _df in _dfs:
        path = _df['df'].format(_set)
        df = df.merge(pd.read_csv(path).loc[:,[merge_col] + _df['cols']], on=merge_col, how='left')
        logging.info('Merged in {}'.format(path))
    return df

In [10]:
def build_results_set(df, preds_array, file_path):
    """
    Builds the csv in the format that can be uploaded to kaggle.
    
    Parameter
        df: test dataframe containing the test ids and that was used to make the predictions
        preds_array: the predicition array that was return by the model
        file_path: specify the path and file name to store the output csv
    """
    p = pd.DataFrame({"test_id": df['test_id']})
    p['is_duplicate'] = preds_array
    p['is_duplicate'] = np.around(p['is_duplicate'].values)
    p.is_duplicate = p.is_duplicate.astype(int)
    p.to_csv(file_path, index=False)

### Import Training Set

In [11]:
df_train = pd.read_csv('../data/train_data.csv')
df_train = df_train.drop(['is_duplicate'], axis=1).merge(pd.read_csv('../data/train_labels.csv'), on='id')

In [None]:
df_train = build_df(df_train, merge_col='id', _set='train')
df_train = df_train.fillna(0)

### Import Test Set

In [13]:
df_test = pd.read_csv('../data/test_data.csv')

In [None]:
df_test = build_df(df_test, merge_col='test_id', _set='test')
df_test = df_test.fillna(0)

# Classification

### Logistic Regression
Classify test data using logistic regression

In [24]:
from sklearn.linear_model import LogisticRegression
import statsmodels.api as sm
from scipy import stats
stats.chisqprob = lambda chisq, df: stats.chi2.sf(chisq, df)

In [32]:
y = df_train['is_duplicate']
x = df_train.drop(['id', 'question1', 'question2', 'is_duplicate'], axis=1)
x.head()

Unnamed: 0,last_char,avg_shared_words,word_count_diff,levenshtein,shared_words_pcnt,avg_shared_trigrams,shared_bigram_pcnt,shared_trigram_pcnt,avg_shared_quadgrams,shared_quadgram_pcnt,...,MONEY_2,NORP_2,ORDINAL_2,ORG_2,PERCENT_2,PERSON_2,PRODUCT_2,QUANTITY_2,TIME_2,WORK_OF_ART_2
0,1.0,12.0,2.0,0.926829,0.923077,9.0,0.833333,0.818182,8.0,0.8,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,1.0,4.0,5.0,0.647482,0.380952,0.0,0.105263,0.0,0.0,0.0,...,0.0,1.0,0.0,0.0,0.0,2.0,0.0,0.0,0.0,0.0
2,1.0,4.0,4.0,0.454545,0.333333,0.0,0.090909,0.0,0.0,0.0,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
3,1.0,0.0,2.0,0.069565,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,1.0,2.0,6.0,0.365217,0.2,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [33]:
lm = sm.Logit(y, x)
result = lm.fit()
print(result.summary())

Optimization terminated successfully.
         Current function value: 0.150956
         Iterations 8
                           Logit Regression Results                           
Dep. Variable:           is_duplicate   No. Observations:               323164
Model:                          Logit   Df Residuals:                   323114
Method:                           MLE   Df Model:                           49
Date:                Sun, 03 Dec 2017   Pseudo R-squ.:                  0.7707
Time:                        14:36:28   Log-Likelihood:                -48784.
converged:                       True   LL-Null:                   -2.1275e+05
                                        LLR p-value:                     0.000
                             coef    std err          z      P>|z|      [0.025      0.975]
------------------------------------------------------------------------------------------
last_char                 -3.4974      0.036    -95.851      0.000      -3.569      

In [31]:
lm.predict(df_test.drop(['test_id', 'question1', 'question2'], axis=1).values)

ValueError: shapes (323164,50) and (81126,100) not aligned: 50 (dim 1) != 81126 (dim 0)

### Neural Network
Classify test data using neural network

Built in Keras

In [127]:
from keras import models
from keras import layers

In [128]:
nfeatures = 50

y = df_train['is_duplicate'].values
X = df_train.drop(['id', 'question1', 'question2', 'is_duplicate'], axis=1).values

In [129]:
nn = models.Sequential()
nn.add(layers.Dense(units=200, activation='relu', input_shape=(nfeatures,)))
nn.add(layers.Dense(units=50, activation='relu'))
nn.add(layers.Dense(units=1, activation='sigmoid'))

In [130]:
nn.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

In [None]:
hist = nn.fit(X, y, epochs=100, verbose=1, batch_size=1000, validation_split = 1/3.0)

In [134]:
y_pred = nn.predict_classes(df_test.drop(['test_id', 'question1', 'question2'], axis=1).values)
build_results_set(df_test, y_pred, 'nn_binary_classification3.csv')

In [None]:
save_model(network, '../models/nn-classification-200-50-dense/')

### SVM
Clasify test data using SVM

In [16]:
from sklearn import svm
from sklearn.model_selection import train_test_split

In [27]:
y = df_train['is_duplicate']
X = df_train.drop(['id', 'question1', 'question2', 'is_duplicate'], axis=1)

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=42)

In [28]:
sv = svm.SVC(kernel='linear', C=1, gamma=1) 

In [None]:
sv.fit(X_train.values, y_train.values)
sv.score(X_train.values, y_train.values)

In [None]:
y_pred = sv.predict(X_test.values)
print(r2_score(y_test.values, y_pred))
print(mean_squared_error(y_test.values, y_pred))

In [None]:
y_pred = sv.predict(df_test.drop(['test_id', 'question1', 'question2'], axis=1).values)
build_results_set(df_test, y_pred, 'svm_classification1.csv')

### Random Forest
Classify test data using random forest

In [19]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import r2_score, mean_squared_error

In [20]:
y = df_train['is_duplicate']
X = df_train.drop(['id', 'question1', 'question2', 'is_duplicate'], axis=1)

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=42)

In [21]:
forest = RandomForestClassifier(max_depth=2, random_state=0)
forest.fit(X_train.values, y_train.values)
print(forest.feature_importances_)

[ 0.          0.00734771  0.07335273  0.30867292  0.06791558  0.02375899
  0.07877609  0.00463782  0.02962118  0.00840603  0.00072106  0.03134265
  0.17746575  0.18340498  0.          0.          0.          0.          0.
  0.          0.          0.          0.          0.          0.          0.0042285
  0.          0.          0.          0.          0.          0.00034801
  0.          0.          0.          0.          0.          0.          0.
  0.          0.          0.          0.          0.          0.          0.
  0.          0.          0.          0.        ]


In [22]:
y_pred = forest.predict(X_test.values)
print(r2_score(y_test.values, y_pred))
print(mean_squared_error(y_test.values, y_pred))

0.648709281351
0.0818510009846


In [26]:
y_pred = forest.predict(df_test.drop(['test_id', 'question1', 'question2'], axis=1).values)
build_results_set(df_test, y_pred, 'randomforest_classification1.csv')