In [1]:
import numpy as np
import matplotlib.pyplot as plt
from sklearn import neighbors
from sklearn import linear_model
import random
import pandas as pd
from sklearn.ensemble import GradientBoostingClassifier, GradientBoostingRegressor,\
                             RandomForestRegressor, AdaBoostRegressor
import itertools
from time import time
import os

train_filename = "train.csv"
test_filename = "test.csv"

def load_data(fname):
    df = pd.DataFrame.from_csv(fname, index_col=None)    
    return df    

from IPython.core.display import display, HTML
display(HTML("<style>.container { width:100% !important; }</style>"))
pd.set_option('display.max_colwidth', 65)  # default: 50

import pickle
base_path = r'C:\Users\Dean\Desktop\Kaggle Quora Challenge\Checkpoints'

def save_checkpoint(train, test, checkpoint):
    with open(os.path.join(base_path, 'checkpoint_'+checkpoint+'_train.pkl'), 'wb') as f:
        pickle.dump(train, f)
    with open(os.path.join(base_path, 'checkpoint_'+checkpoint+'_test.pkl'), 'wb') as f:
        pickle.dump(test, f)

def load_checkpoint(checkpoint):
    with open(os.path.join(base_path, 'checkpoint_'+checkpoint+'_train.pkl'), 'rb') as f:
        train = pickle.load(f)
    with open(os.path.join(base_path, 'checkpoint_'+checkpoint+'_test.pkl'), 'rb') as f:
        test = pickle.load(f)
    return train, test
        
%matplotlib inline

In [2]:
def contains_weird_digit(x):
    for a in x:
        if not (str.isalnum(a) or str.isspace(a) or a in '!?.,;-()\'":/\\$+=#@%&'):
            return True
    return False
#train[train.question1.apply(contains_weird_digit)].shape
#train.question1[:10].apply(lambda q: filter(lambda x:str.isalnum(x) or str.isspace(x), q))

In [3]:
start = time()
print 'Loading challenge...',
train = load_data(train_filename)
train = train.dropna()
test = load_data(test_filename)
test.iloc[np.where(test.isnull())]='why'
print 'done.'
end = time()
print '{:.2f} seconds'.format(end-start)

Loading challenge... done.
11.34 seconds


In [4]:
# train = train.iloc[:4000]
# test = test.iloc[:20000]

In [5]:
# start = time()
# save_checkpoint(train, test, checkpoint='AfterLoading')
# end = time()
# print '{:.2f} seconds'.format(end-start)

In [6]:
# start = time()
# train, test = load_checkpoint('AfterLoading')
# end = time()
# print '{:.2f} seconds'.format(end-start)

In [7]:
start = time()
train.question1 = train.question1.apply(lambda x: x.lower())
train.question2 = train.question2.apply(lambda x: x.lower())
test.question1 = test.question1.apply(lambda x: x.lower())
test.question2 = test.question2.apply(lambda x: x.lower())
end = time()
print '{:.2f} seconds'.format(end-start)

4.05 seconds


In [8]:
start = time()
train.question1[train.question1.apply(contains_weird_digit)] = np.nan
train.question1[train.question2.apply(contains_weird_digit)] = np.nan
train.dropna(inplace=True)
end = time()
print '{:.2f} seconds'.format(end-start)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  from ipykernel import kernelapp as app


12.59 seconds


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  app.launch_new_instance()


In [9]:
start = time()
words_lst_iterator1 = itertools.chain(*[''.join(map(lambda x: x if str.isalnum(x) else ' ', q)).split() for q in train.question1])
words_lst_iterator2 = itertools.chain(*[''.join(map(lambda x: x if str.isalnum(x) else ' ', q)).split() for q in train.question2])

from collections import Counter
c = Counter(itertools.chain(words_lst_iterator1, words_lst_iterator2))

with open(os.path.join(base_path, 'Counter.pkl'), 'wb') as f:
    pickle.dump(c, f)

end = time()
print '{:.2f} seconds'.format(end-start)

23.28 seconds


In [10]:
start = time()
words = sorted(c.keys(), key=lambda x:c[x], reverse=True)
freq = [c[w] for w in words]
end = time()
print '{:.2f} seconds'.format(end-start)
# plt.plot(freq[:100])
# plt.xlabel('word num')
# plt.ylabel('num occurences')

0.16 seconds


In [11]:
def intersect(x, y):
    if type(x) is not set:
        x = set(x)
    return set(filter(x.__contains__, y))

def union(x,y):
    return set(x).union(y)

def word_weight(w):
    return 1

def word_weight2(w):
    return 1./(c[w]+1)

def word_weight3(w):
    return 1./np.sqrt(c[w]+1)

def word_weight4(w):
    return 1./(c[w]+1)**2

In [12]:
uncommon_words_set = set(words[100:])

def similarity1(lst1, lst2, weight_fun = word_weight):
    return (1 + sum(weight_fun(w) for w in intersect(lst1, lst2))) *1./ (1 + sum(weight_fun(w) for w in union(lst1, lst2)))

def similarity2(lst1, lst2, weight_fun = word_weight):
    return  (1 + sum(weight_fun(w) for w in intersect(uncommon_words_set, intersect(lst1, lst2)))) *1./\
            (1 + sum(weight_fun(w) for w in intersect(uncommon_words_set, union(lst1, lst2))))
    
def similarity3(lst1, lst2):
    return similarity1(lst1, lst2, weight_fun=word_weight2)

def similarity4(lst1, lst2):
    return similarity1(lst1, lst2, weight_fun=word_weight3)

def similarity5(lst1, lst2):
    return similarity1(lst1, lst2, weight_fun=word_weight4)

In [13]:
start = time()
for n in [1,2]:
    train['question{}_words'.format(n)] = [''.join(map(lambda x: x if str.isalnum(x) else ' ', q)).split() for q in train['question{}'.format(n)]]
    test['question{}_words'.format(n)] = [''.join(map(lambda x: x if str.isalnum(x) else ' ', q)).split() for q in test['question{}'.format(n)]]
end = time()
print '{:.2f} seconds'.format(end-start)

175.22 seconds


In [14]:
start = time()
sim_fun_lst = [similarity1, similarity2, similarity3, similarity4, similarity5]
for sim_fun in sim_fun_lst:
    train[sim_fun.__name__] = [sim_fun(lst1, lst2) for lst1,lst2 in itertools.izip(train.question1_words, train.question2_words)]
    test[sim_fun.__name__] = [sim_fun(lst1, lst2) for lst1,lst2 in itertools.izip(test.question1_words, test.question2_words)]
end = time()
print '{:.2f} seconds'.format(end-start)

550.28 seconds


In [15]:
def indicator1(x):
    return 'how can you' in x or 'how can i' in x or 'how do i' in x or 'how do you' in x

start = time()

train['question1_ind1'] = train.question1.apply(indicator1)
train['question2_ind1'] = train.question2.apply(indicator1)

test['question1_ind1'] = test.question1.apply(indicator1)
test['question2_ind1'] = test.question2.apply(indicator1)

end = time()
print '{:.2f} seconds'.format(end-start)

10.53 seconds


In [16]:
start = time()

for n in [1,2]:
    train['len{}'.format(n)] = train['question{}_words'.format(n)].apply(len)
    test['len{}'.format(n)] = test['question{}_words'.format(n)].apply(len)
end = time()
print '{:.2f} seconds'.format(end-start)

15.83 seconds


In [17]:
start = time()
save_checkpoint(train, test, checkpoint='AfterPreprocess')
end = time()
print '{:.2f} seconds'.format(end-start)

MemoryError: 

In [18]:
columns = []
columns += ['similarity{}'.format(k) for k in [1,2,3,4,5]]
columns += ['question{}_ind{}'.format(n,i) for n in [1,2] for i in [1]]
columns += ['len{}'.format(n) for n in [1,2]]
# columns += ['logreg_out']
print columns

['similarity1', 'similarity2', 'similarity3', 'similarity4', 'similarity5', 'question1_ind1', 'question2_ind1', 'len1', 'len2']


In [19]:
start = time()

print 'columns: {}'.format(columns)
from sklearn.ensemble import GradientBoostingClassifier
clf = GradientBoostingClassifier(verbose=1, n_estimators=70, learning_rate=0.1, max_depth=3, min_samples_leaf=1000)
clf.fit(train[columns], train.is_duplicate)

end = time()
print '{:.2f} sec'.format(end-start)

columns: ['similarity1', 'similarity2', 'similarity3', 'similarity4', 'similarity5', 'question1_ind1', 'question2_ind1', 'len1', 'len2']
      Iter       Train Loss   Remaining Time 
         1           1.2772            1.00m
         2           1.2413           52.67s
         3           1.2111           54.56s
         4           1.1858           52.68s
         5           1.1635           50.75s
         6           1.1446           48.95s
         7           1.1282           48.79s
         8           1.1136           47.41s
         9           1.1010           46.18s
        10           1.0898           45.20s
        20           1.0265           37.52s
        30           1.0032           29.44s
        40           0.9926           21.70s
        50           0.9868           14.02s
        60           0.9824            6.87s
        70           0.9794            0.00s
54.19 sec


In [20]:
start = time()
with open(os.path.join(base_path, 'FinalModel.pkl'), 'wb') as f:
    pickle.dump(clf, f)
end = time()
print '{:.2f} sec'.format(end-start)

0.07 sec


In [None]:
for name, score in zip(columns, clf.feature_importances_):
    print '{:20}: {}'.format(name, score)

similarity1         : 0.347919685689
similarity2         : 0.208975727727
similarity3         : 0.103720615656
similarity4         : 0.119783431697
similarity5         : 0.0207014918642
question1_ind1      : 0.00632612841156
question2_ind1      : 0.0283943636783
len1                : 0.0814608772708
len2                : 0.0827176780063


In [None]:
test['is_duplicate'] = clf.predict_proba(test[columns])[:,1]

In [None]:
test['is_duplicate'].head()

In [None]:
a = 0.165 / 0.37
b = (1 - 0.165) / (1 - 0.37)
test['is_duplicate'] = test['is_duplicate'].apply(lambda x:a * x / (a * x + b * (1 - x)))

In [None]:
test['is_duplicate'].head()

In [None]:
test[['test_id','is_duplicate']].to_csv(os.path.join(base_path, 'my_submission.csv'), index=False, compression='gzip'))