In [1]:
import pandas as pd

In [2]:
# read the training set from train.csv (in the working directory) into a DataFrame
data = pd.read_csv('./train.csv')

In [3]:
# preview the first rows to check the column layout
data.head()

Unnamed: 0,id,qid1,qid2,question1,question2,is_duplicate
0,0,1,2,What is the step by step guide to invest in sh...,What is the step by step guide to invest in sh...,0
1,1,3,4,What is the story of Kohinoor (Koh-i-Noor) Dia...,What would happen if the Indian government sto...,0
2,2,5,6,How can I increase the speed of my internet co...,How can Internet speed be increased by hacking...,0
3,3,7,8,Why am I mentally very lonely? How can I solve...,Find the remainder when [math]23^{24}[/math] i...,0
4,4,9,10,"Which one dissolve in water quikly sugar, salt...",Which fish would survive in salt water?,0


In [4]:
# remove rows whose question2 value is NaN
# (`.ix` was removed in pandas 1.0; `.loc` accepts the same boolean mask)
data = data.loc[data.question2.notnull()]

In [5]:
from nltk.corpus import stopwords

In [6]:
# get english stopwords from nltk package
# scikit-learn documents `stop_words` as a list (recent releases reject a set),
# so keep a sorted, de-duplicated list rather than a set
stops = sorted(set(stopwords.words("english")))

In [7]:
# import useful libraries
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.pipeline import FeatureUnion, Pipeline
from sklearn.feature_extraction.text import TfidfTransformer, TfidfVectorizer
from sklearn.preprocessing import FunctionTransformer
import numpy as np
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier, GradientBoostingClassifier

In [8]:
# helper used by FunctionTransformer to pick one column out of the input frame
def get_question(data, column):
    """Return every row of ``data`` restricted to ``column``.

    Parameters
    ----------
    data : object supporting ``.loc[rows, cols]`` indexing
        In this notebook, a pandas DataFrame.
    column : label or list of labels
        Passed straight through as the column indexer.

    Returns
    -------
    Whatever ``data.loc[:, column]`` yields for the given indexer.
    """
    all_rows = slice(None)
    return data.loc[all_rows, column]

In [129]:
# pipeline to process data + classification training
rf_pipeline = Pipeline(steps=[
    # each question column is vectorized on its own, then the two feature
    # blocks are placed side by side
    ('union', FeatureUnion(transformer_list=[
        # --- question1 branch ---
        ('process_q1', Pipeline(steps=[
            # select the 'question1' column from the incoming frame
            ('get_q1', FunctionTransformer(
                func=get_question,
                validate=False,
                kw_args={'column': 'question1'},
            )),
            # TF-IDF features (counting + weighting in one step):
            #   stop_words   -> nltk english stop words
            #   max_features -> cap on the number of terms kept
            #   max_df       -> drop very frequent terms that are not
            #                   captured by the stop-word list
            ('vectorizer1', TfidfVectorizer(
                max_df=0.8,
                max_features=100,
                stop_words=stops,
            )),
        ])),
        # --- question2 branch: same treatment ---
        ('process_q2', Pipeline(steps=[
            ('get_q2', FunctionTransformer(
                func=get_question,
                validate=False,
                kw_args={'column': 'question2'},
            )),
            ('vectorizer2', TfidfVectorizer(
                max_df=0.8,
                max_features=100,
                stop_words=stops,
            )),
        ])),
    ])),
    # train a model on the combined 'question1' / 'question2' features
    ('clf', RandomForestClassifier(n_estimators=100, n_jobs=-1)),
])

In [9]:
from sklearn.model_selection import train_test_split

In [10]:
# split data: features are every column except the first ('Unnamed: 0') and the
# last ('is_duplicate'), which is the target.
# `.ix` was removed in pandas 1.0, so the columns are selected by position with `.iloc`.
# A fixed random_state keeps the 70/30 split identical across kernel restarts.
X_train, X_test, y_train, y_test = train_test_split(
    data.iloc[:, 1:-1], data.iloc[:, -1], test_size=0.3, random_state=42)

In [131]:
%%time
# train model using the pipeline: fits every preprocessing step and then the
# classifier on the training split (%%time reports how long the cell takes)
rf_pipeline.fit(X_train, y_train)

CPU times: user 32min 34s, sys: 12.3 s, total: 32min 46s
Wall time: 9min 3s


Pipeline(steps=[('union', FeatureUnion(n_jobs=1,
       transformer_list=[('process_q1', Pipeline(steps=[('get_q1', FunctionTransformer(accept_sparse=False,
          func=<function get_question at 0x13b6f2ea0>, inv_kw_args=None,
          inverse_func=None, kw_args={'column': 'question1'}, pass_y=False,
   ...ators=100, n_jobs=-1, oob_score=False,
            random_state=None, verbose=0, warm_start=False))])

In [11]:
from sklearn.metrics import log_loss

In [133]:
# calculate logloss using the test set predictions by the pipeline object
# (predict_proba supplies the per-class probabilities that log_loss scores)
log_loss(y_test, rf_pipeline.predict_proba(X_test))

0.64762410624341504

In [12]:
from sklearn.decomposition import NMF

In [146]:
# pipeline to process data + classification training
rf_pipeline = Pipeline(steps=[
    # each question column is vectorized and reduced on its own, then the two
    # feature blocks are placed side by side
    ('union', FeatureUnion(
        transformer_list=[
            # --- question1 branch ---
            ('process_q1', Pipeline(steps=[
                # select the 'question1' column from the incoming frame
                ('get_q1', FunctionTransformer(
                    func=get_question,
                    validate=False,
                    kw_args={'column': 'question1'},
                )),
                # raw term counts (no TF-IDF weighting in this variant):
                #   stop_words -> nltk english stop words
                #   max_df     -> drop very frequent terms that are not
                #                 captured by the stop-word list
                ('vectorizer1', CountVectorizer(
                    max_df=0.8,
                    stop_words=stops,
                )),
                # instead of hand-picking which terms to keep, compress the
                # counts into 100 NMF components
                ('nmf_1', NMF(n_components=100)),
            ])),
            # --- question2 branch: same treatment ---
            ('process_q2', Pipeline(steps=[
                ('get_q2', FunctionTransformer(
                    func=get_question,
                    validate=False,
                    kw_args={'column': 'question2'},
                )),
                ('vectorizer2', CountVectorizer(
                    max_df=0.8,
                    stop_words=stops,
                )),
                ('nmf_2', NMF(n_components=100)),
            ])),
        ],
        n_jobs=-1,
    )),
    # train a model on the combined 'question1' / 'question2' features
    ('clf', RandomForestClassifier(n_estimators=100, n_jobs=-1)),
])

In [147]:
%%time
# train model using the pipeline: fits both CountVectorizer + NMF branches and
# then the random forest on the training split
rf_pipeline.fit(X_train, y_train)

CPU times: user 10min 2s, sys: 12.2 s, total: 10min 14s
Wall time: 14min 12s


Pipeline(steps=[('union', FeatureUnion(n_jobs=-1,
       transformer_list=[('process_q1', Pipeline(steps=[('get_q1', FunctionTransformer(accept_sparse=False,
          func=<function get_question at 0x13b6f2ea0>, inv_kw_args=None,
          inverse_func=None, kw_args={'column': 'question1'}, pass_y=False,
  ...ators=100, n_jobs=-1, oob_score=False,
            random_state=None, verbose=0, warm_start=False))])

In [148]:
# calculate logloss using the test set predictions by the pipeline object
# (predict_proba supplies the per-class probabilities that log_loss scores)
log_loss(y_test, rf_pipeline.predict_proba(X_test))

0.47304461401597586

In [150]:
# pipeline to process data + classification training (gradient boosting variant)
gb_pipeline = Pipeline(steps=[
    # each question column is vectorized and reduced on its own, then the two
    # feature blocks are placed side by side
    ('union', FeatureUnion(
        transformer_list=[
            # --- question1 branch ---
            ('process_q1', Pipeline(steps=[
                # select the 'question1' column from the incoming frame
                ('get_q1', FunctionTransformer(
                    func=get_question,
                    validate=False,
                    kw_args={'column': 'question1'},
                )),
                # raw term counts (no TF-IDF weighting in this variant):
                #   stop_words -> nltk english stop words
                #   max_df     -> drop very frequent terms that are not
                #                 captured by the stop-word list
                ('vectorizer1', CountVectorizer(
                    max_df=0.8,
                    stop_words=stops,
                )),
                # instead of hand-picking which terms to keep, compress the
                # counts into 100 NMF components
                ('dim_red_1', NMF(n_components=100)),
            ])),
            # --- question2 branch: same treatment ---
            ('process_q2', Pipeline(steps=[
                ('get_q2', FunctionTransformer(
                    func=get_question,
                    validate=False,
                    kw_args={'column': 'question2'},
                )),
                ('vectorizer2', CountVectorizer(
                    max_df=0.8,
                    stop_words=stops,
                )),
                ('dim_red_2', NMF(n_components=100)),
            ])),
        ],
        n_jobs=-1,
    )),
    # train a model on the combined 'question1' / 'question2' features
    ('clf', GradientBoostingClassifier(n_estimators=100)),
])

In [151]:
%%time
# train model using the pipeline: fits both CountVectorizer + NMF branches and
# then the gradient boosting classifier on the training split
gb_pipeline.fit(X_train, y_train)

CPU times: user 8min 35s, sys: 12.6 s, total: 8min 48s
Wall time: 37min 50s


Pipeline(steps=[('union', FeatureUnion(n_jobs=-1,
       transformer_list=[('process_q1', Pipeline(steps=[('get_q1', FunctionTransformer(accept_sparse=False,
          func=<function get_question at 0x13b6f2ea0>, inv_kw_args=None,
          inverse_func=None, kw_args={'column': 'question1'}, pass_y=False,
  ...=100, presort='auto', random_state=None,
              subsample=1.0, verbose=0, warm_start=False))])

In [152]:
# calculate logloss using the test set predictions by the pipeline object
# (predict_proba supplies the per-class probabilities that log_loss scores)
log_loss(y_test, gb_pipeline.predict_proba(X_test))

0.58641317946014271

In [13]:
from sklearn.metrics.pairwise import cosine_similarity

In [14]:
def get_cos_sim(X):
    """Row-wise cosine similarity between two aligned feature matrices.

    The similarity is computed pair by pair (row i of ``X[0]`` against row i
    of ``X[1]``). The full all-pairs matrix is never built, so memory stays
    proportional to the input instead of growing as n_samples squared.

    Parameters
    ----------
    X : sequence of two matrices
        ``X[0]`` and ``X[1]`` must have the same shape
        ``(n_samples, n_features)``; each may be dense or scipy sparse.

    Returns
    -------
    numpy.ndarray of shape ``(n_samples, 1)``
        Cosine similarity of each row pair; 0.0 where either row is all zeros.
    """
    # local import: scipy.sparse is not imported at the top of the notebook
    from scipy import sparse

    # csr_matrix accepts dense and sparse input, giving a single code path
    left = sparse.csr_matrix(X[0])
    right = sparse.csr_matrix(X[1])

    def _row_sums(mat):
        # sparse .sum(axis=1) returns an (n, 1) matrix; flatten to a float vector
        return np.asarray(mat.sum(axis=1), dtype=float).ravel()

    dots = _row_sums(left.multiply(right))
    norms = np.sqrt(_row_sums(left.multiply(left)) * _row_sums(right.multiply(right)))

    # leave 0.0 where a row has zero norm instead of dividing by zero
    sims = np.divide(dots, norms, out=np.zeros_like(dots), where=norms > 0)
    return sims.reshape(-1, 1)

In [15]:
def process_questions_func(X, stops):
    """Bag-of-words encode both question columns with one shared vocabulary.

    A new CountVectorizer is created and fitted on every call, so the
    vocabulary always comes from the ``X`` passed in.

    Parameters
    ----------
    X : mapping / DataFrame with 'question1' and 'question2' entries
        Each entry is an iterable of text documents.
    stops : stop-word collection
        Forwarded to ``CountVectorizer(stop_words=...)``.

    Returns
    -------
    tuple
        ``(counts_q1, counts_q2)``: the count matrices for 'question1' and
        'question2', expressed over the same vocabulary (same columns).
    """
    vectorizer = CountVectorizer(stop_words=stops)
    # Learn the vocabulary from BOTH columns in a single pass. Fitting on
    # question1 alone would drop terms that occur only in question2, which
    # distorts any similarity computed between the two matrices.
    corpus = list(X['question1']) + list(X['question2'])
    vectorizer.fit(corpus)
    return (vectorizer.transform(X['question1']), vectorizer.transform(X['question2']))

In [16]:
def debug_shape(X):
    """Print ``X.shape`` and hand ``X`` back unchanged.

    Can be wrapped in a FunctionTransformer and dropped between pipeline steps
    to inspect the shape of the data flowing through.
    """
    dims = X.shape
    print(dims)
    return X

In [17]:
# pipeline to process data + classification training
rf_pipeline = Pipeline(steps=[
    ('preprocessing', FeatureUnion(transformer_list=[
        # --- feature block 1: cosine similarity between the two questions ---
        ('cos_sim', Pipeline(steps=[
            # count-vectorize both question columns
            ('process_questions', FunctionTransformer(
                func=process_questions_func,
                validate=False,
                kw_args={'stops': stops},
            )),
            # collapse the pair of count matrices into one similarity column
            ('cosine_sim', FunctionTransformer(func=get_cos_sim, validate=False)),
        ])),
        # --- feature block 2: per-question NMF components ---
        ('union', FeatureUnion(transformer_list=[
            # question1 branch
            ('process_q1', Pipeline(steps=[
                # select the 'question1' column from the incoming frame
                ('get_q1', FunctionTransformer(
                    func=get_question,
                    validate=False,
                    kw_args={'column': 'question1'},
                )),
                # raw term counts:
                #   stop_words -> nltk english stop words
                #   max_df     -> drop very frequent terms that are not
                #                 captured by the stop-word list
                ('vectorizer1', CountVectorizer(
                    max_df=0.8,
                    stop_words=stops,
                )),
                # instead of hand-picking which terms to keep, compress the
                # counts into 100 NMF components
                ('dim_red', NMF(n_components=100)),
            ])),
            # question2 branch: same treatment
            ('process_q2', Pipeline(steps=[
                ('get_q2', FunctionTransformer(
                    func=get_question,
                    validate=False,
                    kw_args={'column': 'question2'},
                )),
                ('vectorizer2', CountVectorizer(
                    max_df=0.8,
                    stop_words=stops,
                )),
                ('dim_red', NMF(n_components=100)),
            ])),
        ])),
    ])),
    # train a model on the similarity column plus the NMF features
    ('clf', RandomForestClassifier(n_estimators=100, n_jobs=-1)),
])

In [None]:
%%time
# train model using the pipeline: fits the cosine-similarity branch, both
# CountVectorizer + NMF branches, and then the random forest on the training split
rf_pipeline.fit(X_train, y_train)

In [None]:
%%time
# calculate logloss using the test set predictions by the pipeline object
# (predict_proba supplies the per-class probabilities that log_loss scores)
log_loss(y_test, rf_pipeline.predict_proba(X_test))