In [1]:
import pandas as pd
import matplotlib as mp
import numpy as np
import datetime
import pprint
from math import sqrt

from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split
import xgboost as xgb
from sklearn.model_selection import GridSearchCV

%matplotlib inline

# Root directory of the data files
root_dir = '../data/'
# Columns used as the (multi-)index of both the feature table and the target table
group=['ref_hash','window_nr']

---
## Lectura de Features

In [2]:
# Load the feature table and index it by the columns listed in `group` (ref_hash, window_nr)
features = pd.read_csv('training_set.csv',low_memory=False).set_index(group)

In [3]:
# Preview the first rows of the feature table
features.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,secs_to_next_mean,secs_since_last_arrival,auctions_total,auctions_last_hour,amount_auctions_in_weekend,is_last_weekend,amount_dif_src,amount_events,secs_since_last_event,wifi,timeToClick_mean,amount_dif_advertisers,amount_installs,secs_to_next_install_mean,last_event,before_last_event,before_before_last_event
ref_hash,window_nr,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1
2564673204772915246,1,765.0,88625.0,216.0,0.0,0.0,False,3.0,0.0,0.0,False,198.0,0.0,0.0,7632.0,0.0,0.0,0.0
4441121667607578179,1,111.0,29052.0,1188.0,0.0,640.0,True,5.0,0.0,0.0,False,198.0,0.0,0.0,7632.0,0.0,0.0,0.0
7721769811471055264,1,281.0,333.0,908.0,23.0,218.0,True,3.0,99.0,196475.0,False,198.0,0.0,0.0,7632.0,2.0,2.0,2.0
1258642015983312729,1,8365.0,121224.0,12.0,0.0,0.0,False,1.0,0.0,0.0,False,198.0,0.0,0.0,7632.0,0.0,0.0,0.0
6707090658317158573,1,453.0,4.0,516.0,35.0,171.0,True,4.0,0.0,0.0,False,198.0,0.0,0.0,7632.0,0.0,0.0,0.0


In [5]:
# Mean of the boolean is_last_weekend flag, i.e. the fraction of rows where it is True
features.is_last_weekend.mean()

0.15368840452334914

In [6]:
# Collapse the per-window rows into a single row per ref_hash by averaging every feature.
# Boolean columns (is_last_weekend, wifi) are averaged as well, so they become fractions.
new_features = features.groupby('ref_hash').mean()

In [8]:
# Preview the per-ref_hash averaged features
new_features.head()

Unnamed: 0_level_0,secs_to_next_mean,secs_since_last_arrival,auctions_total,auctions_last_hour,amount_auctions_in_weekend,is_last_weekend,amount_dif_src,amount_events,secs_since_last_event,wifi,timeToClick_mean,amount_dif_advertisers,amount_installs,secs_to_next_install_mean,last_event,before_last_event,before_before_last_event
ref_hash,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1
40621409780134,7894.0,0.0,0.0,0.0,0.0,0.0,0.0,9.0,32279.0,0.0,186.0,0.0,0.0,7005.0,363.0,364.0,7.0
41863526108385,5760.333333,23402.0,11.333333,0.0,11.0,0.333333,1.0,51.0,127196.666667,0.0,194.0,0.0,1.0,5488.666667,1.666667,1.666667,1.0
69039685746313,8173.0,0.0,0.0,0.0,0.0,0.0,0.0,10.0,12692.0,1.0,192.0,0.0,0.0,8178.5,287.0,151.0,287.0
90072729247980,8452.0,0.0,0.0,0.0,0.0,0.0,0.0,3.0,192523.0,0.0,198.0,0.0,0.0,9352.0,0.0,0.0,0.0
135153013040192,114.0,70659.0,7.0,0.0,7.0,1.0,1.0,0.0,0.0,0.0,198.0,0.0,0.0,7632.0,0.0,0.0,0.0


In [9]:
# Load the target table and index it by the same (ref_hash, window_nr) columns as the features
targets = pd.read_csv('targets.csv').set_index(group)

In [10]:
# Count missing values in each target column
targets.isnull().sum()

target_st    0
target_sc    0
dtype: int64

In [11]:
# Inner join on the shared index: only (ref_hash, window_nr) pairs present in both
# the feature table and the target table are kept.
training_set = features.merge(targets, how='inner', left_index=True,right_index=True)

In [12]:
# Preview the joined features + targets
training_set.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,secs_to_next_mean,secs_since_last_arrival,auctions_total,auctions_last_hour,amount_auctions_in_weekend,is_last_weekend,amount_dif_src,amount_events,secs_since_last_event,wifi,timeToClick_mean,amount_dif_advertisers,amount_installs,secs_to_next_install_mean,last_event,before_last_event,before_before_last_event,target_st,target_sc
ref_hash,window_nr,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1
4881735474151208235,1,8033.0,0.0,0.0,0.0,0.0,False,0.0,2.0,23418.0,False,198.0,0.0,0.0,7632.0,211.0,7.0,0.0,259200.0,259200.0
4679881814211948892,1,45.0,156639.0,5.0,0.0,0.0,False,2.0,0.0,0.0,False,25.335,1.0,0.0,7632.0,0.0,0.0,0.0,63602.0,259200.0
4177895333092974918,1,3990.0,119428.0,15.0,0.0,0.0,False,2.0,0.0,0.0,False,198.0,0.0,0.0,7632.0,0.0,0.0,0.0,259200.0,259200.0
4312326900897087220,1,4703.0,10305.0,53.0,0.0,22.0,True,4.0,0.0,0.0,False,198.0,0.0,0.0,7632.0,0.0,0.0,0.0,8056.0,259200.0
4911541169216784265,2,7894.0,0.0,0.0,0.0,0.0,False,0.0,14.0,69048.0,False,186.0,0.0,1.0,40.0,14.0,13.0,19.0,259200.0,259200.0


In [13]:
# Every column that does not come from the target table is a model input.
feature_cols = [col for col in training_set.columns if col not in targets.columns]

# Both models are trained on the same inputs; only the target column differs.
x_st = training_set[feature_cols]
y_st = training_set['target_st']

x_sc = training_set[feature_cols]
y_sc = training_set['target_sc']

---
## Training 

In [14]:
# Hold out 30% of the rows for evaluation. Fixing random_state makes the split,
# and therefore the errors reported further down, reproducible across kernel restarts.
# Both targets share the same feature frame, so the same seed selects the same rows.
split_seed = 42
xtrain_st, xtest_st, ytrain_st, ytest_st = train_test_split(
    x_st, y_st, train_size=0.7, random_state=split_seed)
xtrain_sc, xtest_sc, ytrain_sc, ytest_sc = train_test_split(
    x_sc, y_sc, train_size=0.7, random_state=split_seed)

In [15]:
# One squared-error XGBoost regressor per target; fit() returns the estimator itself,
# so construction and training can be chained.
model_st = xgb.XGBRegressor(objective='reg:squarederror').fit(xtrain_st, ytrain_st)
model_sc = xgb.XGBRegressor(objective='reg:squarederror').fit(xtrain_sc, ytrain_sc)

# Show the fitted estimator (and its hyperparameters) as the cell output
model_sc

  if getattr(data, 'base', None) is not None and \


XGBRegressor(base_score=0.5, booster='gbtree', colsample_bylevel=1,
             colsample_bynode=1, colsample_bytree=1, gamma=0,
             importance_type='gain', learning_rate=0.1, max_delta_step=0,
             max_depth=3, min_child_weight=1, missing=None, n_estimators=100,
             n_jobs=1, nthread=None, objective='reg:squarederror',
             random_state=0, reg_alpha=0, reg_lambda=1, scale_pos_weight=1,
             seed=None, silent=None, subsample=1, verbosity=1)

In [None]:
def xgb_model_select(X, y):
    """Grid-search XGBRegressor hyperparameters with 10-fold cross-validation.

    Tries n_estimators in (20, 50, 80) and max_depth in (3, 4, 5) with the
    squared-error objective, scoring each candidate by negative mean squared
    error. Prints the parameter grid, the best score, the best value of each
    searched parameter and the elapsed wall-clock time in seconds.

    Parameters
    ----------
    X : features passed unchanged to ``GridSearchCV.fit``.
    y : target values passed unchanged to ``GridSearchCV.fit``.

    Returns
    -------
    The fitted ``GridSearchCV`` object, so callers can reuse
    ``best_estimator_`` / ``best_params_`` instead of re-reading the printout.
    """
    print('Select Model...')
    start_time = datetime.datetime.now()

    xgb_clf = xgb.XGBRegressor()
    parameters = {
        'objective': ['reg:squarederror'],
        'n_estimators': list(range(20, 110, 30)),
        'max_depth': list(range(3, 6)),
    }
    grid_search = GridSearchCV(
        estimator=xgb_clf,
        param_grid=parameters,
        cv=10,
        n_jobs=4,
        scoring='neg_mean_squared_error',
    )

    print("parameters:")
    pprint.pprint(parameters)
    grid_search.fit(X, y)

    # best_score_ is the negated MSE (higher is better), hence a negative number
    print("Best score: %0.3f" % grid_search.best_score_)
    print("Best parameters set:")
    best_parameters = grid_search.best_estimator_.get_params()
    for param_name in sorted(parameters.keys()):
        print("\t%s: %r" % (param_name, best_parameters[param_name]))

    # total_seconds() covers the whole duration; timedelta.seconds alone drops
    # whole days and would under-report a search running longer than 24 hours.
    elapsed = datetime.datetime.now() - start_time
    print('Select Done..., Time Cost: %d' % int(elapsed.total_seconds()))
    return grid_search

In [None]:
# Run the hyperparameter grid search on the full (x_sc, y_sc) data
xgb_model_select(x_sc,y_sc)

In [16]:
# Predict on the held-out rows of each split
ypred_st = model_st.predict(xtest_st)
ypred_sc = model_sc.predict(xtest_sc)

# Root mean squared error of each model on its held-out rows
rmse_st = sqrt(mean_squared_error(ytest_st, ypred_st))
rmse_sc = sqrt(mean_squared_error(ytest_sc, ypred_sc))

print("Error St.:", rmse_st)
print("Error Sc.:", rmse_sc)

Error St.: 83474.15593257314
Error Sc.: 50173.18506919942


In [17]:
# Importance scores the fitted target_st model assigns to its input features
model_st.feature_importances_

array([0.05539307, 0.17030178, 0.55616677, 0.00647278, 0.15716563,
       0.00169483, 0.00576718, 0.0016135 , 0.00348918, 0.00254951,
       0.00071846, 0.01198499, 0.        , 0.00168364, 0.01490699,
       0.0070548 , 0.00303694], dtype=float32)

In [18]:
# Importance scores the fitted target_sc model assigns to its input features
model_sc.feature_importances_

array([0.00897633, 0.01655433, 0.00974001, 0.0063948 , 0.0079583 ,
       0.01149138, 0.00731163, 0.73167884, 0.10984632, 0.00589204,
       0.0057311 , 0.0123169 , 0.01029207, 0.0239522 , 0.01101724,
       0.01092913, 0.00991735], dtype=float32)

---
## Predicción

In [19]:
# Labels to submit together with the predictions
ids_path = root_dir + 'target_competencia_ids.csv'
to_predict = pd.read_csv(ids_path, usecols=['ref_hash'])

to_predict.head()

Unnamed: 0,ref_hash
0,1000169251625791246_sc
1,1000169251625791246_st
2,1000395625957344683_sc
3,1000395625957344683_st
4,1003027494996471685_sc


In [22]:
# Prediction input: the features averaged per ref_hash
X = new_features

In [23]:
# Predict both targets for every ref_hash in X.
# NOTE(review): y_st / y_sc previously held the training targets
# (training_set['target_st'] / training_set['target_sc']); this cell rebinds them to
# prediction arrays, so the train/test split cell must not be re-run after this one
# without first re-creating the original targets.
y_st = model_st.predict(X)
y_sc = model_sc.predict(X)

In [24]:
# Build the submission rows from the model predictions. Submission ids have the form
# "<ref_hash>_st" / "<ref_hash>_sc", so each ref_hash in X yields one row per target.
ref_hashes = X.reset_index()['ref_hash'].astype(str)

results_st = (ref_hashes + '_st').rename('ref_hash').to_frame()
results_st['obj'] = y_st

results_sc = (ref_hashes + '_sc').rename('ref_hash').to_frame()
results_sc['obj'] = y_sc

# pd.concat replaces DataFrame.append, which was removed in pandas 2.0
results = pd.concat([results_st, results_sc])

In [25]:
# Fallback for ref_hashes that appear in the target table but not in X:
# use the mean of their known targets as the prediction.
known_hashes = X.reset_index()['ref_hash']

targets_mean = targets.reset_index()
targets_mean = (
    targets_mean.loc[~targets_mean['ref_hash'].isin(known_hashes)]
    .groupby('ref_hash')[['target_st', 'target_sc']]
    .mean()
    .reset_index()
)

# Same "<ref_hash>_st" / "<ref_hash>_sc" id format as the model-based rows
t_st = (targets_mean['ref_hash'].astype(str) + '_st').rename('ref_hash').to_frame()
t_st['obj'] = targets_mean['target_st']

t_sc = (targets_mean['ref_hash'].astype(str) + '_sc').rename('ref_hash').to_frame()
t_sc['obj'] = targets_mean['target_sc']

# pd.concat replaces DataFrame.append, which was removed in pandas 2.0
r = pd.concat([t_st, t_sc])

In [26]:
# Combine the model predictions with the fallback rows. Rebuilding from the three source
# frames (instead of appending to `results`) keeps this cell idempotent: re-running it
# no longer duplicates the fallback rows. pd.concat replaces the removed DataFrame.append.
results = pd.concat([results_st, results_sc, r])

In [27]:
# Number of requested ids that received a value: non-null 'obj' entries after the left merge
to_predict.merge(results, on='ref_hash', how='left')['obj'].count()

8056

In [28]:
# Build the final submission file
max_secs = 3*24*60*60

# Left merge keeps every requested id; ids without a prediction get max_secs
submission = to_predict.merge(results, on='ref_hash', how='left')
submission = submission.fillna(max_secs)
submission.to_csv('predictions_xgb1.csv', header=True, index=False)

In [29]:
# Non-null count per column of the requested ids
to_predict.count()

ref_hash    8074
dtype: int64