In [1]:
import numpy as np
import matplotlib.pyplot as plt
from sklearn import neighbors
from sklearn import linear_model
import random
import pandas as pd
from sklearn.ensemble import GradientBoostingClassifier, GradientBoostingRegressor,\
                             RandomForestRegressor, AdaBoostRegressor
import itertools
from time import time

train_filename = "train.csv"
#test_filename = "test.csv"

def load_data(fname):
    df = pd.DataFrame.from_csv(fname, index_col=None)    
    return df    

from IPython.core.display import display, HTML
display(HTML("<style>.container { width:100% !important; }</style>"))
pd.set_option('display.max_colwidth', 65)  # default: 50

%matplotlib inline

In [2]:
def contains_weird_digit(x):
    for a in x:
        if not (str.isalnum(a) or str.isspace(a) or a in '!?.,;-()\'":/\\$+=#@%&'):
            return True
    return False
#train[train.question1.apply(contains_weird_digit)].shape
#train.question1[:10].apply(lambda q: filter(lambda x:str.isalnum(x) or str.isspace(x), q))

In [3]:
print 'Loading challenge...',
train = load_data(train_filename)
train = train.dropna()
# test = load_data(test_filename)
# test.iloc[np.where(test.isnull())]='why'
print 'done.'

Loading challenge... done.


In [4]:
train.question1 = train.question1.apply(lambda x: x.lower())
train.question2 = train.question2.apply(lambda x: x.lower())
# test.question1 = test.question1.apply(lambda x: x.lower())
# test.question2 = test.question2.apply(lambda x: x.lower())

In [5]:
train.loc[train.question1.apply(contains_weird_digit) , "question1"] = np.nan
train.loc[train.question2.apply(contains_weird_digit) , "question1"] = np.nan
train.dropna(inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  if __name__ == '__main__':
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  from ipykernel import kernelapp as app


In [6]:
start = time()
words_lst_iterator1 = itertools.chain(*[''.join(map(lambda x: x if str.isalnum(x) else ' ', q)).split() for q in train.question1])
words_lst_iterator2 = itertools.chain(*[''.join(map(lambda x: x if str.isalnum(x) else ' ', q)).split() for q in train.question2])

from collections import Counter
c = Counter(itertools.chain(words_lst_iterator1, words_lst_iterator2))
end = time()
print '{:.2f} seconds'.format(end-start)

24.35 seconds


In [7]:
words = sorted(c.keys(), key=lambda x:c[x], reverse=True)
freq = [c[w] for w in words]

# plt.plot(freq[:100])
# plt.xlabel('word num')
# plt.ylabel('num occurences')

In [8]:
def intersect(x, y):
    if type(x) is not set:
        x = set(x)
    return set(filter(x.__contains__, y))

def union(x,y):
    return set(x).union(y)

def word_weight(w):
    return 1

def word_weight2(w):
    return 1./(c[w]+1)

def word_weight3(w):
    return 1./np.sqrt(c[w]+1)

def word_weight4(w):
    return 1./(c[w]+1)**2

In [9]:
uncommon_words_set = set(words[100:])

def similarity1(lst1, lst2, weight_fun = word_weight):
    return (1 + sum(weight_fun(w) for w in intersect(lst1, lst2))) *1./ (1 + sum(weight_fun(w) for w in union(lst1, lst2)))

def similarity2(lst1, lst2, weight_fun = word_weight):
    return  (1 + sum(weight_fun(w) for w in intersect(uncommon_words_set, intersect(lst1, lst2)))) *1./\
            (1 + sum(weight_fun(w) for w in intersect(uncommon_words_set, union(lst1, lst2))))
    
def similarity3(lst1, lst2):
    return similarity1(lst1, lst2, weight_fun=word_weight2)

def similarity4(lst1, lst2):
    return similarity1(lst1, lst2, weight_fun=word_weight3)

def similarity5(lst1, lst2):
    return similarity1(lst1, lst2, weight_fun=word_weight4)

In [10]:
train2 = train.iloc[:]

In [11]:
start = time()
for n in [1,2]:
    train2['question{}_words'.format(n)] = [''.join(map(lambda x: x if str.isalnum(x) else ' ', q)).split() for q in train2['question{}'.format(n)]]
#     test['question{}_words'.format(n)] = [''.join(map(lambda x: x if str.isalnum(x) else ' ', q)).split() for q in test['question{}'.format(n)]]
end = time()
print '{:.2f} seconds'.format(end-start)

20.77 seconds


In [12]:
start = time()
sim_fun_lst = [similarity1, similarity2, similarity3, similarity4, similarity5]
for sim_fun in sim_fun_lst:
    train2[sim_fun.__name__] = [sim_fun(lst1, lst2) for lst1,lst2 in itertools.izip(train2.question1_words, train2.question2_words)]
end = time()
print '{:.2f} seconds'.format(end-start)

73.11 seconds


In [13]:
def indicator1(x):
    return 'how can you' in x or 'how can i' in x or 'how do i' in x or 'how do you' in x

train2['question1_ind1'] = train2.question1.apply(indicator1)
train2['question2_ind1'] = train2.question2.apply(indicator1)
# train2['ind1_prod'] = train2['question1_ind1']*train2['question2_ind1']

In [14]:
def indicator2(x):
    return 'india' in x
# print train2.question1.apply(indicator2).mean()
train2['question1_ind2'] = train2.question1.apply(indicator2)
train2['question2_ind2'] = train2.question2.apply(indicator2)

In [15]:
for n in [1,2]:
    train2['len{}'.format(n)] = train2['question{}_words'.format(n)].apply(len)

In [16]:
columns = []
columns += ['similarity{}'.format(k) for k in [1,2,3,4,5]]
columns += ['question{}_ind{}'.format(n,i) for n in [1,2] for i in [1]]
columns += ['len{}'.format(n) for n in [1,2]]
# columns += ['question{}_{}'.format(n,q) for n in [1,2] for q in q_words]
print columns

['similarity1', 'similarity2', 'similarity3', 'similarity4', 'similarity5', 'question1_ind1', 'question2_ind1', 'len1', 'len2']


In [17]:
train_size = train2.shape[0]/2
train_set = train2.iloc[:train_size]
valid_set = train2.iloc[train_size:]

In [18]:
def logloss(yhat, y):
    return -np.mean(y*np.log(yhat) + (1-y)*np.log(1-yhat))
def baseline_score(train_df, test_df):
    p = train_df.is_duplicate.mean()
    print 'p={}'.format(p)
    return logloss(np.array([p]*test_df.shape[0]), test_df.is_duplicate)

In [19]:
baseline_score(train_set, valid_set)

p=0.375830053795


0.6585011171362957

In [20]:
print 'columns: {}'.format(columns)
from sklearn.ensemble import GradientBoostingClassifier
clf = GradientBoostingClassifier(verbose=1, n_estimators=70, learning_rate=0.1, max_depth=3, min_samples_leaf=1000)
clf.fit(train_set[columns], train_set.is_duplicate)

columns: ['similarity1', 'similarity2', 'similarity3', 'similarity4', 'similarity5', 'question1_ind1', 'question2_ind1', 'len1', 'len2']
      Iter       Train Loss   Remaining Time 
         1           1.2806            1.72m
         2           1.2448            1.05m
         3           1.2139           48.69s
         4           1.1884           40.87s
         5           1.1657           35.94s
         6           1.1465           32.64s
         7           1.1301           30.15s
         8           1.1156           28.33s
         9           1.1031           26.78s
        10           1.0921           25.49s
        20           1.0283           18.70s
        30           1.0057           14.31s
        40           0.9955           10.16s
        50           0.9890            6.51s
        60           0.9848            3.18s
        70           0.9813            0.00s


GradientBoostingClassifier(criterion='friedman_mse', init=None,
              learning_rate=0.1, loss='deviance', max_depth=3,
              max_features=None, max_leaf_nodes=None,
              min_impurity_split=1e-07, min_samples_leaf=1000,
              min_samples_split=2, min_weight_fraction_leaf=0.0,
              n_estimators=70, presort='auto', random_state=None,
              subsample=1.0, verbose=1, warm_start=False)

In [21]:
print 'columns: {}'.format(columns)
yhat = clf.predict_proba(valid_set[columns])[:,1] + clf.predict_proba(valid_set[columns])[:,1]
print logloss(yhat, valid_set.is_duplicate)

columns: ['similarity1', 'similarity2', 'similarity3', 'similarity4', 'similarity5', 'question1_ind1', 'question2_ind1', 'len1', 'len2']
0.635343941035


  from ipykernel import kernelapp as app


In [22]:
for name, score in zip(columns, clf.feature_importances_):
    print '{:20}: {}'.format(name, score)

similarity1         : 0.348207951847
similarity2         : 0.228791734666
similarity3         : 0.122591582212
similarity4         : 0.0790651971588
similarity5         : 0.0152460397491
question1_ind1      : 0.00598714950562
question2_ind1      : 0.0208234836714
len1                : 0.0965417834897
len2                : 0.0827450777001
