In [1]:
import pandas as pd

In [2]:
# Load the training data and drop rows where either question is missing.
# `DataFrame.ix` no longer exists in current pandas, so the filter uses
# `dropna` instead. `question1` is filtered as well as `question2` because
# both columns are vectorised later and a NaN document cannot be tokenised.
data = pd.read_csv('./train.csv').dropna(subset=['question1', 'question2'])

In [3]:
from nltk.corpus import stopwords
# English stop words shipped with nltk, de-duplicated into a set
stops = {word for word in stopwords.words("english")}

In [4]:
# import useful libraries
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.pipeline import FeatureUnion, Pipeline
from sklearn.feature_extraction.text import TfidfTransformer, TfidfVectorizer
from sklearn.preprocessing import FunctionTransformer
import numpy as np
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier, GradientBoostingClassifier

In [5]:
from sklearn.model_selection import train_test_split
# Split data: the features are every column after the first and before the
# last one, the target is the last column. Positional `.iloc` replaces the
# removed `.ix` indexer (the column labels are strings, so `.ix` was already
# slicing by position here).
features = data.iloc[:, 1:-1]
target = data.iloc[:, -1]
# A fixed random_state keeps the hold-out set identical between runs so the
# reported log-loss values are comparable.
X_train, X_test, y_train, y_test = train_test_split(features, target, test_size=0.3, random_state=42)

In [6]:
from sklearn.metrics import log_loss
from sklearn.decomposition import NMF

In [7]:
from sklearn.metrics.pairwise import cosine_similarity

In [8]:
def get_cos_sim(X):
    """Return the cosine similarity of each aligned row pair of two matrices.

    Row ``i`` of the result is the cosine similarity between ``X[0][i]`` and
    ``X[1][i]``. The similarity is computed row by row, so the work and the
    memory grow linearly with the number of rows instead of materialising the
    full pairwise (rows x rows) similarity matrix just to read its diagonal.

    Parameters
    ----------
    X : sequence of two 2-D matrices
        ``X[0]`` and ``X[1]`` hold one feature vector per row and must have
        the same shape. Each may be a dense array or a scipy sparse matrix.

    Returns
    -------
    numpy.ndarray of shape (n_rows, 1)
        Float similarities. A pair in which either vector is all zeros gets
        a similarity of 0.0.

    Raises
    ------
    ValueError
        If the two matrices do not have the same shape.
    """
    import numpy as np
    from scipy import sparse

    left, right = X[0], X[1]
    use_sparse = sparse.issparse(left) or sparse.issparse(right)
    if use_sparse:
        left = sparse.csr_matrix(left)
        right = sparse.csr_matrix(right)
    else:
        left = np.asarray(left, dtype=float)
        right = np.asarray(right, dtype=float)

    if left.shape != right.shape:
        raise ValueError(
            "get_cos_sim expects two matrices of the same shape, got %r and %r"
            % (left.shape, right.shape)
        )

    if use_sparse:
        def row_sums(matrix):
            # sparse .sum(axis=1) yields a column matrix; flatten to 1-D floats
            return np.asarray(matrix.sum(axis=1), dtype=float).ravel()

        dots = row_sums(left.multiply(right))
        left_sq = row_sums(left.multiply(left))
        right_sq = row_sums(right.multiply(right))
    else:
        dots = np.einsum('ij,ij->i', left, right)
        left_sq = np.einsum('ij,ij->i', left, left)
        right_sq = np.einsum('ij,ij->i', right, right)

    denom = np.sqrt(left_sq) * np.sqrt(right_sq)
    sims = np.zeros(dots.shape, dtype=float)
    # leave 0.0 wherever one of the vectors has zero length
    np.divide(dots, denom, out=sims, where=denom > 0)
    return sims.reshape(-1, 1)

In [9]:
def process_questions_func(X, stops):
    """Turn both question columns into term-count matrices over one vocabulary.

    The vocabulary is learned once from the text of both columns, so a word
    that appears only in ``question2`` still gets a column and contributes
    to any similarity computed from the two matrices. (Learning it from
    ``question1`` alone silently dropped such words.)

    Parameters
    ----------
    X : column-indexable table
        Must provide ``X['question1']`` and ``X['question2']``, each an
        iterable of text documents.
    stops : str, collection of str, or None
        Stop words forwarded to ``CountVectorizer``. A string or ``None`` is
        passed through unchanged; any other collection (e.g. a set) is
        converted to a list, the collection type the vectorizer documents.

    Returns
    -------
    tuple
        ``(question1_counts, question2_counts)`` as returned by
        ``CountVectorizer.transform``; both share the same columns.
    """
    from itertools import chain

    if stops is None or isinstance(stops, str):
        stop_words = stops
    else:
        stop_words = list(stops)

    vectorizer = CountVectorizer(stop_words=stop_words)
    # single fit over both columns (the original fitted twice on question1)
    vectorizer.fit(chain(X['question1'], X['question2']))
    return (vectorizer.transform(X['question1']), vectorizer.transform(X['question2']))

In [10]:
def debug_shape(X):
    """Print ``X.shape`` and hand ``X`` back untouched.

    Returning the input unchanged lets this be dropped between two pipeline
    steps to inspect the shape of the data flowing through.
    """
    dims = X.shape
    print(dims)
    return X

In [12]:
def get_question(data, column):
    """Return every row of ``data`` restricted to the label(s) in ``column``.

    The selection is label-based (``.loc``), so ``column`` is looked up by
    name rather than by position.
    """
    selected = data.loc[:, column]
    return selected

In [13]:
# pipeline to process data + classification training
PIPELINE_SEED = 42  # fixed seed so the NMF factorisations and the forest are repeatable

# scikit-learn documents `stop_words` as a string, a list or None, so turn the
# set of stop words into a list once and share it with every vectorizer.
stop_word_list = sorted(stops)


def _question_branch(column, suffix):
    """Build the term-count + NMF sub-pipeline for one question column.

    Parameters
    ----------
    column : str
        Name of the column to extract (e.g. 'question1').
    suffix : str
        Appended to the 'get_q' and 'vectorizer' step names so the two
        branches keep distinct step names ('get_q1', 'vectorizer1', ...).

    Returns
    -------
    Pipeline
        Column selection -> CountVectorizer -> NMF with 100 components.
    """
    return Pipeline([
        # get data from the requested column
        ('get_q' + suffix, FunctionTransformer(get_question,
                                               kw_args={'column': column},
                                               validate=False)),
        # plain term counts (no tf-idf weighting); the stop words come from
        # nltk, and max_df removes frequent terms the stop list does not cover
        ('vectorizer' + suffix, CountVectorizer(stop_words=stop_word_list,
                                                max_df=0.8)),
        # instead of deciding which features to keep, reduce the counts to
        # 100 NMF components
        ('dim_red', NMF(100, random_state=PIPELINE_SEED)),
    ])


rf_pipeline = Pipeline([
    ('preprocessing', FeatureUnion([
        # one feature: cosine similarity between the two questions' counts
        ('cos_sim', Pipeline([
            ('process_questions', FunctionTransformer(process_questions_func,
                                                      kw_args={'stops': stop_word_list},
                                                      validate=False)),
            ('cosine_sim', FunctionTransformer(get_cos_sim, validate=False)),
        ])),
        # process question1 and question2 independently for NMF
        ('union', FeatureUnion([
            ('process_q1', _question_branch('question1', '1')),
            ('process_q2', _question_branch('question2', '2')),
        ])),
    ])),
    # use the processed columns 'question1' and 'question2' to train a model
    ('clf', RandomForestClassifier(100, n_jobs=-1, random_state=PIPELINE_SEED)),
])

In [None]:
%%time
# fit every step of the pipeline on the training split
rf_pipeline.fit(X=X_train, y=y_train)

In [None]:
%%time
# predict class probabilities for the held-out rows, then score them with log-loss
test_probabilities = rf_pipeline.predict_proba(X_test)
log_loss(y_test, test_probabilities)