In [1]:
import pandas as pd
import numpy as np
%matplotlib inline

In [29]:
# Load both splits; the first CSV column becomes the DataFrame index.
train = pd.read_csv(
    'data/train.csv',
    index_col=0,
)
test = pd.read_csv(
    'data/test.csv',
    index_col=0,
)

# Frequencies

In [30]:
%%time
# Stack every question text from both splits into one Series
# (train q1, train q2, test q1, test q2) with a fresh 0..n-1 index.
questions = pd.concat(
    [
        train['question1'],
        train['question2'],
        test['question1'],
        test['question2'],
    ],
    ignore_index=True,
)

CPU times: user 92 ms, sys: 40 ms, total: 132 ms
Wall time: 131 ms


In [31]:
%%time
# Raw occurrence count of each distinct question text across train and test.
# dropna=False keeps missing questions as their own entry instead of dropping them.
freq = questions.value_counts(dropna=False)

CPU times: user 8.6 s, sys: 364 ms, total: 8.96 s
Wall time: 8.95 s


In [41]:
%%time
# Attach, for each side of every pair, how often that question text occurs
# in the combined train + test question pool.
train['q1_freq'] = train['question1'].map(freq)
train['q2_freq'] = train['question2'].map(freq)
test['q1_freq'] = test['question1'].map(freq)
test['q2_freq'] = test['question2'].map(freq)

CPU times: user 3.02 s, sys: 16 ms, total: 3.04 s
Wall time: 3.06 s


In [49]:
# Empty feature tables that share the row index of their source split.
leaks_train, leaks_test = (
    pd.DataFrame(index=split.index) for split in (train, test)
)

In [50]:
# Row-wise smaller / larger of the two question frequencies of each pair.
leaks_train['min_freq'] = train[['q1_freq', 'q2_freq']].min(axis='columns')
leaks_train['max_freq'] = train[['q1_freq', 'q2_freq']].max(axis='columns')
leaks_test['min_freq'] = test[['q1_freq', 'q2_freq']].min(axis='columns')
leaks_test['max_freq'] = test[['q1_freq', 'q2_freq']].max(axis='columns')

# Co-occurrence

In [55]:
# All (question1, question2) pairs from train followed by test, re-indexed 0..n-1.
pairs = pd.concat(
    [
        train.loc[:, ['question1', 'question2']],
        test.loc[:, ['question1', 'question2']],
    ],
    ignore_index=True,
)

In [58]:
from collections import defaultdict

In [68]:
%%time
# Undirected adjacency map: each question -> set of questions it has been
# paired with anywhere in train or test.
q_dict = defaultdict(set)
for q1, q2 in zip(pairs['question1'], pairs['question2']):
    # Register the link in both directions.
    q_dict[q1].add(q2)
    q_dict[q2].add(q1)

CPU times: user 19.1 s, sys: 1.25 s, total: 20.4 s
Wall time: 20.3 s


In [70]:
def co_occur_count(row, neighbors=None):
    """Count the questions that were paired with both questions of ``row``.

    Parameters
    ----------
    row : mapping-like
        Anything indexable by ``'question1'`` and ``'question2'``, e.g. the
        row object handed over by ``DataFrame.apply(..., axis=1)``.
    neighbors : mapping, optional
        Maps a question to the set of questions it has been paired with.
        When omitted, the notebook-level ``q_dict`` is used.

    Returns
    -------
    int
        Size of the intersection of the two neighbor sets; 0 when either
        question has no entry in ``neighbors``.
    """
    if neighbors is None:
        neighbors = q_dict
    no_partners = frozenset()
    # .get() instead of [] so that looking up an unseen question does not
    # insert a new empty entry when ``neighbors`` is a defaultdict.
    first = neighbors.get(row['question1'], no_partners)
    second = neighbors.get(row['question2'], no_partners)
    return len(first & second)

In [76]:
%%time
# Number of shared pairing partners for every train pair (evaluated row by row).
leaks_train['co_occur_count'] = train.apply(
    co_occur_count,
    axis='columns',
)

CPU times: user 21.2 s, sys: 40 ms, total: 21.2 s
Wall time: 21.2 s


In [77]:
%%time
# Number of shared pairing partners for every test pair (evaluated row by row).
leaks_test['co_occur_count'] = test.apply(
    co_occur_count,
    axis='columns',
)

CPU times: user 1min 59s, sys: 108 ms, total: 1min 59s
Wall time: 1min 59s


# Save

In [82]:
# Persist the train-side features (index included) for downstream use.
leaks_train.to_csv(path_or_buf='data/leak_train.csv')

In [83]:
# Persist the test-side features (index included) for downstream use.
leaks_test.to_csv(path_or_buf='data/leak_test.csv')