In [1]:
%matplotlib inline
import matplotlib.pyplot as plt
plt.style.use('ggplot')
import numpy as np
import pandas as pd
import os
import eli5
from sklearn.metrics import log_loss, roc_auc_score



In [2]:
# Single random seed shared by the row sampling and both model constructors in this
# notebook, so repeated runs use the same pseudo-random choices.
SEED = 42

# Load leaks

In [3]:
# Directory containing the leak feature CSVs (leak_train.csv and leak_test.csv).
LEAKS_DIR = './data/'

In [4]:
# Read the train and test leak tables the same way: first CSV column becomes the
# index, and every remaining column name gets a 'LEAK_' prefix so the leak columns
# are easy to pick out after they are merged with the other features.
leaks_train, leaks_test = (
    pd.read_csv(os.path.join(LEAKS_DIR, csv_name), index_col=0).add_prefix('LEAK_')
    for csv_name in ('leak_train.csv', 'leak_test.csv')
)

# Load data

In [5]:
# Load the precomputed feature frames for the train and test sets from ./cv/f0/
# (presumably cross-validation fold 0 -- confirm against the feature pipeline).
# NOTE: unpickling can execute arbitrary code; only load pickle files that were
# produced by a trusted run of this project.
train = pd.read_pickle('./cv/f0/train_features.pkl')
test = pd.read_pickle('./cv/f0/test_features.pkl')

In [6]:
# Attach the LEAK_* columns to each feature frame by matching index labels.
# A left join keeps every feature row; rows without a leak entry get NaN in the
# leak columns.
# NOTE: this cell rebinds `train`/`test`, so re-running it without reloading the
# frames would merge the leak columns a second time.
index_left_join = dict(how='left', left_index=True, right_index=True)
train = train.merge(leaks_train, **index_left_join)
test = test.merge(leaks_test, **index_left_join)

In [7]:
# Sanity check: (rows, columns) of both frames after the leak columns were merged in.
train.shape, test.shape

((404288, 82), (2345790, 82))

In [8]:
# Name of the label column, and every other column of `train` as a candidate feature.
TARGET = 'TARGET'
FEATURES_ALL = train.columns.difference(['TARGET'])

In [9]:
# Feature set actually used for fitting and prediction: all candidate columns
# except the three leak columns listed here.
excluded_leak_columns = ['LEAK_co_occur_count', 'LEAK_max_freq', 'LEAK_min_freq']
FEATURES = FEATURES_ALL.difference(excluded_leak_columns)
# To keep the leak columns in the model, use the full set instead:
#FEATURES = FEATURES_ALL

# Preprocess data

In [10]:
# Replace missing feature values with the sentinel -100.0 and force float dtype,
# identically for both frames. -100.0 is the same value the XGBClassifier below is
# configured to treat as "missing" (missing=-100.0).
for frame in (train, test):
    frame[FEATURES] = frame[FEATURES].fillna(-100.0).astype('float')

# Up/down-sampling

In [11]:
# Build a re-balanced training frame: the negative rows appear twice and a
# seeded random 80% of the positive rows appear once. Row order is
# negatives, sampled positives, negatives.
labels = train[TARGET]
pos_rows = train[labels == 1]
neg_rows = train[labels == 0]
pos_sample = pos_rows.sample(frac=0.8, random_state=SEED)
train_ud = pd.concat([neg_rows, pos_sample, neg_rows])

# Train model

In [11]:
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier

In [37]:
# Random-forest candidate model: 200 fully grown trees (no depth limit, leaves may
# hold a single sample), trained on 16 parallel jobs with a fixed seed.
# NOTE: the next cell rebinds `model` to an XGBClassifier, so this estimator is only
# the one that gets fitted if that cell is skipped.
# max_features='sqrt' is exactly what 'auto' meant for RandomForestClassifier; the
# 'auto' spelling was deprecated in scikit-learn 1.1 and removed in 1.3, so 'sqrt'
# keeps the same behaviour while running on both old and new versions.
model = RandomForestClassifier(n_estimators=200, max_depth=None, min_samples_leaf=1,
                               max_features='sqrt', n_jobs=16, random_state=SEED)

In [12]:
# Gradient-boosted tree model that is actually fitted below.
# `missing=-100.0` matches the sentinel written by the fillna step, so those cells
# are handled as missing values rather than as real measurements.
xgb_params = dict(
    max_depth=7,
    learning_rate=0.02,
    n_estimators=2500,
    subsample=0.6,
    base_score=0.2,
    seed=SEED,
    missing=-100.0,
)
model = XGBClassifier(**xgb_params)

In [13]:
%%time
# Fit the current `model` on the full training frame (selected feature columns vs.
# the label column). The recorded run below took about half an hour of wall time.
model.fit(train[FEATURES], train[TARGET])
# Alternative kept for reference: fit on the re-sampled frame `train_ud` instead.
#model.fit(train_ud[FEATURES], train_ud[TARGET])

CPU times: user 6h 21min 9s, sys: 20.6 s, total: 6h 21min 30s
Wall time: 31min 21s


XGBClassifier(base_score=0.2, colsample_bylevel=1, colsample_bytree=1,
       gamma=0, learning_rate=0.02, max_delta_step=0, max_depth=7,
       min_child_weight=1, missing=-100.0, n_estimators=2500, nthread=-1,
       objective='binary:logistic', reg_alpha=0, reg_lambda=1,
       scale_pos_weight=1, seed=42, silent=True, subsample=0.6)

In [14]:
# Display the fitted model's feature weights, labelled with the feature column names
# (the FEATURES index is converted to a plain list for eli5).
eli5.show_weights(model, feature_names=FEATURES.tolist())

Weight,Feature
0.2311,VW_QUAD_BIGRAM_pred
0.0708,JAC_sim
0.0471,BOW_TFIDF_sim
0.0457,RAW_tok_set_ratio
0.0303,VW_QUAD_pred
0.0263,RAW_loc_jaccard
0.0196,RAW_loc_agree
0.0143,RAW_agree_digit
0.0141,RAW_simple_ratio
0.0127,RAW_tok_sort_ratio


# Predict

In [15]:
# Score the test frame. predict_proba returns one column per class; column 1 is
# taken as the probability of the second class (label 1, given the 0/1 target used
# in this notebook). The result is indexed like `test` so it can be aligned later.
positive_proba = model.predict_proba(test[FEATURES])[:, 1]
pred = pd.Series(data=positive_proba, index=test.index)

In [16]:
# Constants for the probability re-weighting below.
# `a` is the ratio of the two positive rates and `b` the ratio of the matching
# negative rates; with these values reweigh(0.37) evaluates to 0.165.
# NOTE(review): 0.37 and 0.165 are presumably the positive-class rates of the
# training data and of the target (test) distribution -- confirm the source.
NEW_POS_RATE = 0.165
OLD_POS_RATE = 0.37
a = NEW_POS_RATE / OLD_POS_RATE
b = (1 - NEW_POS_RATE) / (1 - OLD_POS_RATE)


def reweigh(x):
    """Re-weight a probability using the module-level factors ``a`` and ``b``.

    The positive part ``x`` is scaled by ``a`` and the negative part ``1 - x`` by
    ``b``; the result is the scaled positive part divided by the sum of both, so
    0 maps to 0 and 1 maps to 1.

    Parameters
    ----------
    x : float or array-like supporting arithmetic (e.g. a pandas Series)
        Probability (or probabilities) to adjust.

    Returns
    -------
    Same kind as ``x``: the adjusted probability values.
    """
    pos_part = a * x
    neg_part = b * (1 - x)
    return pos_part / (pos_part + neg_part)


def reweigh_alt(x):
    """Alternative re-weighting that scales only the positive part.

    The positive part ``x`` is multiplied by a fixed factor of 0.3627 while the
    negative part ``1 - x`` is left unscaled; the result is renormalised the same
    way as in ``reweigh``.

    Parameters
    ----------
    x : float or array-like supporting arithmetic (e.g. a pandas Series)
        Probability (or probabilities) to adjust.

    Returns
    -------
    Same kind as ``x``: the adjusted probability values.
    """
    scale = 0.3627
    pos_part = scale * x
    return pos_part / (pos_part + (1 - x))

In [17]:
# Submission skeleton: one slot for every id 0 .. 2345795, initialised to NaN so
# that ids which never receive a model prediction can be detected afterwards.
N_SUBMISSION_ROWS = 2345796
submission_ids = np.arange(N_SUBMISSION_ROWS)
all_pred = pd.Series(np.nan, index=submission_ids)

In [18]:
# Compare the number of predicted rows with the number of rows the submission holds.
print(len(pred), len(all_pred))

2345790 2345796


In [19]:
# Store the re-weighted probabilities in the rows of `all_pred` labelled by
# `pred.index`; ids absent from `pred` keep their NaN placeholder.
# The commented-out lines are the alternatives: reweigh_alt() or the raw model output.
#all_pred.loc[pred.index] = reweigh_alt(pred)
all_pred.loc[pred.index] = reweigh(pred)
#all_pred.loc[pred.index] = pred

In [20]:
# Count the submission rows that did not receive a prediction.
all_pred.isnull().sum()

6

In [21]:
# Give the rows without a model prediction a constant fallback probability
# (modifies `all_pred` in place). The commented-out line is the 0.0 alternative.
# NOTE(review): 0.17426442474 is not derived anywhere in this notebook -- presumably
# an estimate of the overall positive rate; confirm its origin and name the constant.
#all_pred.fillna(0.0, inplace=True)
all_pred.fillna(0.17426442474, inplace=True)

In [22]:
# Confirm the submission still has the expected number of rows after filling.
len(all_pred)

2345796

In [23]:
# Label the index and the values; these names become the two column headers of the
# CSV written in the next step.
all_pred.index.name = 'test_id'
all_pred.name = 'is_duplicate'

In [24]:
# Write the submission file into the working directory; header=True emits the
# header row built from the index name and the series name.
all_pred.to_csv('./submission.csv', header=True)

In [25]:
# Shell out to `head` to eyeball the first lines of the written file.
!head submission.csv

test_id,is_duplicate
0,0.0006135582807473838
1,0.15079011023044586
2,0.13276518881320953
3,0.0007430342957377434
4,0.17351564764976501
5,0.0009033497772179544
6,0.21100372076034546
7,0.9900713562965393
8,0.2147016078233719
