In [6]:
%pylab inline
import pandas as pd
from lightgbm import LGBMClassifier, plot_metric, plot_importance
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import cross_val_score, StratifiedKFold

Populating the interactive namespace from numpy and matplotlib


In [32]:
%%time
# --- Load training impressions and drop bot-like traffic --------------------
# NOTE(review): the extension in this path looks like it contains a non-Latin
# 'с' (unlike 'test.csv' below) -- confirm it matches the file name on disk.
data = pd.read_csv('champ_10_datas/impressions.сsv',
    parse_dates=['event_datetime_m'], infer_datetime_format=True)

# Number of impressions for every (show, user) pair.
grouped = data.groupby(['id_show', 'id_user']).size()

# Remove every row of the first user that has more than 10000 impressions of
# a single show.
bot = grouped[grouped > 10000].index.get_level_values('id_user')[0]
data = data[data['id_user'] != bot]

# Remove all (show, user) pairs with more than 380 impressions. After the left
# merge the pair count sits in a column named 0, which is null exactly for the
# pairs that are kept.
data = data.merge(grouped[grouped > 380].reset_index(),
    how='left', on=['id_show', 'id_user'])
data = data[data[0].isnull()].drop(0, axis=1)

# Remove the single most frequent (user, show, timestamp) combination.
bot = data.groupby(['id_user', 'id_show', 'event_datetime_m']).size().idxmax()
data = data[~((data['id_user'] == bot[0]) &
    (data['id_show'] == bot[1]) & (data['event_datetime_m'] == bot[2]))]

# --- Stack test rows on top of the filtered training rows -------------------
# pd.concat replaces DataFrame.append, which no longer exists in pandas 2.x.
# ignore_index=True gives the fresh 0..n-1 index the original obtained via
# reset_index(drop=True).
test_rows = pd.read_csv('champ_10_datas/test.csv', index_col='id',
    parse_dates=['event_datetime_m'], infer_datetime_format=True)
data = pd.concat([test_rows, data], ignore_index=True)

# --- Calendar features ------------------------------------------------------
data['event_hour'] = data['event_datetime_m'].dt.hour
data['event_day'] = data['event_datetime_m'].dt.dayofweek

# --- Position of each impression within its (user, show) pair ---------------
# A running sum of ones over the time-sorted rows: 'desc' counts from the
# latest impression (1 = latest), 'asc' from the earliest (1 = earliest).
data['order'] = np.ones(len(data))
desc = data.sort_values('event_datetime_m', ascending=False).groupby(
    ['id_user', 'id_show'])[['order']].cumsum().rename(columns={'order':'desc'})
asc = data.sort_values('event_datetime_m', ascending=True).groupby(
    ['id_user', 'id_show'])[['order']].cumsum().rename(columns={'order':'asc'})
data = data.merge(desc, left_index=True, right_index=True).merge(
    asc, left_index=True, right_index=True).drop('order', axis=1)

# --- Time from each impression to the pair's last impression ----------------
# total_seconds() is used on purpose: `.dt.seconds` only returns the seconds
# component of the timedelta (0..86399) and silently discards whole days.
pair_last_seen = data.groupby(
    ['id_user', 'id_show'])['event_datetime_m'].transform('max')
data['delta_event_time'] = (
    pair_last_seen - data['event_datetime_m']).dt.total_seconds()

# --- Number of rows sharing the same (user, show, timestamp) ----------------
one_minute = data.groupby(['id_user', 'id_show',
    'event_datetime_m']).size().reset_index().rename(columns={0:'one_minute'})
data = data.merge(one_minute, how='left', on=['id_user', 'id_show', 'event_datetime_m'])

# Order rows chronologically and drop the raw timestamp so it is not used as a
# feature; presumably this ordering is what makes the later positional 80/20
# split a time-based one.
data = data.sort_values('event_datetime_m').drop('event_datetime_m', axis=1)

  mask |= (ar1 == a)


CPU times: user 52.5 s, sys: 2.35 s, total: 54.8 s
Wall time: 54.8 s


In [3]:
%%time
# Per-show rating statistics: the mean and standard deviation of every
# aggregated column, indexed by id_show.
show_rating = pd.read_csv(
    'champ_10_datas/show_rating.сsv',
    parse_dates=['date_time'],
    infer_datetime_format=True,
)

# Replace the timestamp by its age in whole days relative to the newest row.
date = show_rating['date_time']
show_rating['date_time'] = (date.max() - date).dt.days

# Group once and aggregate twice; columns present in both aggregates get the
# '_mean' / '_std' suffixes.
ratings_by_show = show_rating.groupby('id_show')
show_rating = ratings_by_show.mean().join(
    ratings_by_show.std(), lsuffix='_mean', rsuffix='_std')

CPU times: user 52.2 s, sys: 496 ms, total: 52.7 s
Wall time: 52.6 s


In [4]:
%%time
# User attributes, indexed by id_user.
client_data = pd.read_csv(
    'champ_10_datas/client_data.сsv',
    index_col='id_user',
    parse_dates=['create_datetime'],
    infer_datetime_format=True,
)

# Replace the creation timestamp by its age in whole days relative to the
# newest value in the file.
date = client_data['create_datetime']
client_data['create_datetime'] = (date.max() - date).dt.days

# Encode sex numerically (male -> 1.0, female -> 0.0); the float cast raises
# if any other non-numeric label is left in the column.
for label, code in (('male', 1.0), ('female', 0.0)):
    client_data.loc[client_data['sex'] == label, 'sex'] = code
client_data['sex'] = client_data['sex'].astype('float')

# Ages below 16 or above 80 are blanked out (set to missing).
age = client_data['age']
implausible_age = (age < 16) | (age > 80)
client_data.loc[implausible_age, 'age'] = None

CPU times: user 1.39 s, sys: 12 ms, total: 1.4 s
Wall time: 1.4 s


In [5]:
%%time
# Show attributes: one row per id_show, plus the number of source rows
# (IdBuilding entries) that show had in the file.
show_rows = pd.read_csv('champ_10_datas/show_data.сsv').drop(
    ['parent_genre_id', 'child_genre_id', 'organizer_id'], axis=1)

rows_by_show = show_rows.groupby('id_show')
buildings = rows_by_show['IdBuilding'].size()

# first() keeps the first non-null value of each column per show; the
# IdBuilding column is then overwritten by the per-show row count and renamed.
show_data = rows_by_show.first()
show_data['IdBuilding'] = buildings
show_data = show_data.rename(columns={'IdBuilding': 'buildings'})

CPU times: user 40 ms, sys: 0 ns, total: 40 ms
Wall time: 41.3 ms


In [19]:
%%time
# Distinct (show, user) pairs from the clicks-without-impressions file, each
# flagged with a constant 1.0 so a later left merge can mark matching rows.
no_impressions = (
    pd.read_csv('champ_10_datas/clicks_no_impressions.сsv',
                usecols=['id_show', 'id_user'])
    .drop_duplicates()
    .assign(no_impressions=1.0)
)

CPU times: user 11.5 s, sys: 648 ms, total: 12.1 s
Wall time: 12.1 s


In [33]:
%%time
# Attach user, show, rating and "click without impression" features to every
# impression row, then drop the raw identifiers so they are not used as
# features. All merges are left joins, so no impression row is lost.
data = data.merge(client_data, how='left', left_on='id_user', right_index=True).merge(
    show_data, how='left', left_on='id_show', right_index=True).merge(
    show_rating, how='left', left_on='id_show', right_index=True).merge(
    no_impressions, how='left', on=['id_user', 'id_show']).drop(
    ['id_show', 'id_user'], axis=1)

# Rows without a match in `no_impressions` get 0. Assign the result back
# instead of calling fillna(inplace=True) on the column selection: that chained
# in-place form does not update `data` under pandas Copy-on-Write.
data['no_impressions'] = data['no_impressions'].fillna(0)

CPU times: user 4.49 s, sys: 1.2 s, total: 5.68 s
Wall time: 5.68 s


In [59]:
%%time
# Rows with a known `is_clicked` value form the training set; rows where it is
# missing are the ones to predict.
has_label = data['is_clicked'].notnull()

train_labels = data.loc[has_label, 'is_clicked']
train_objects = data.loc[has_label].drop('is_clicked', axis=1)
test_objects = data.loc[~has_label].drop('is_clicked', axis=1)

CPU times: user 524 ms, sys: 292 ms, total: 816 ms
Wall time: 814 ms


In [66]:
# Positional hold-out split: the first 80% of the training rows are used for
# fitting and the remaining 20% for validation (no shuffling).
m = len(train_objects)
cut = int(0.8*m)

val_train_objects = train_objects.iloc[:cut]
val_train_labels = train_labels.iloc[:cut]
val_test_objects = train_objects.iloc[cut:]
val_test_labels = train_labels.iloc[cut:]

In [76]:
%%time
# Validation run: fit on the first part of the training rows and report the
# metric on the held-out part after every boosting round.
model = LGBMClassifier(max_depth=4, n_estimators=200)
model.fit(
    val_train_objects,
    val_train_labels,
    eval_set=[(val_test_objects, val_test_labels)],
)

[1]	valid_0's binary_logloss: 0.60934
[2]	valid_0's binary_logloss: 0.540699
[3]	valid_0's binary_logloss: 0.483551
[4]	valid_0's binary_logloss: 0.435295
[5]	valid_0's binary_logloss: 0.394197
[6]	valid_0's binary_logloss: 0.35887
[7]	valid_0's binary_logloss: 0.328344
[8]	valid_0's binary_logloss: 0.301802
[9]	valid_0's binary_logloss: 0.27863
[10]	valid_0's binary_logloss: 0.258334
[11]	valid_0's binary_logloss: 0.240485
[12]	valid_0's binary_logloss: 0.224759
[13]	valid_0's binary_logloss: 0.21087
[14]	valid_0's binary_logloss: 0.198573
[15]	valid_0's binary_logloss: 0.187673
[16]	valid_0's binary_logloss: 0.177983
[17]	valid_0's binary_logloss: 0.169376
[18]	valid_0's binary_logloss: 0.16171
[19]	valid_0's binary_logloss: 0.154874
[20]	valid_0's binary_logloss: 0.148764
[21]	valid_0's binary_logloss: 0.143317
[22]	valid_0's binary_logloss: 0.138417
[23]	valid_0's binary_logloss: 0.134035
[24]	valid_0's binary_logloss: 0.130143
[25]	valid_0's binary_logloss: 0.126624
[26]	valid_0's

In [78]:
%%time
# Final model: same hyper-parameters as the validation run, fitted on every
# labelled row.
model = LGBMClassifier(max_depth=4, n_estimators=200)
model.fit(X=train_objects, y=train_labels)

CPU times: user 3min 48s, sys: 776 ms, total: 3min 49s
Wall time: 33.7 s


In [79]:
# Per-feature diagnostics written to importances.csv:
#   splits - the fitted model's feature_importances_ entry for the feature
#   auc    - |ROC AUC - 0.5| * 100 when the raw feature is used as the score
#   cor    - Pearson correlation with the label, in percent
# Rows where the feature is missing are excluded from auc/cor.
records = []
for name, splits in zip(train_objects.columns, model.feature_importances_):
    present = train_objects[name].notnull()
    values = train_objects.loc[present, name]
    labels = train_labels[present]
    auc = roc_auc_score(labels, values)
    cor = np.corrcoef(labels, values)[0, 1] * 100
    records.append((name, splits, abs(auc - 0.5) * 100, cor))

# Build the frame from typed columns. Routing the mixed (str, number) tuples
# through np.array would turn every value into a string, and the sort below
# would then be lexicographic rather than numeric.
importances = pd.DataFrame(
    records, columns=['name', 'splits', 'auc', 'cor']).set_index('name')
importances.index.name = None  # keep the CSV index header unnamed
importances = importances.sort_values(['auc', 'cor', 'splits'])
importances.to_csv('importances.csv')

In [81]:
%%time
# Score the unlabelled rows; the second column of predict_proba is kept as the
# submitted value.
pred_labels = model.predict_proba(test_objects)[:, 1]

# NOTE(review): `_ID_` is simply the row position 0..n-1 of `test_objects`.
# Those rows were sorted by event time and re-indexed by column merges in
# earlier cells, so this position may not equal the `id` column of test.csv --
# confirm that the submission format expects this ordering.
submit = pd.DataFrame({'_VAL_': pred_labels}).rename_axis('_ID_')

CPU times: user 13 s, sys: 40 ms, total: 13.1 s
Wall time: 1.96 s


In [83]:
# Write the submission; the index is written as the first column under its
# name `_ID_`, followed by `_VAL_`.
submit.to_csv('submit.csv', index=True)