In [None]:
import pandas as pd
import matplotlib as mp
import numpy as np
import datetime
import pprint
from math import sqrt

from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split
import xgboost as xgb
from sklearn.model_selection import GridSearchCV

%matplotlib inline

# Root directory of the data files.
root_dir = '../data/'
# Key columns used as the index of the feature and target tables.
group = ['ref_hash', 'window_nr']

---
## Lectura de features

In [None]:
# Load the feature table from training_set.csv and index it by the columns in `group`.
features = (
    pd.read_csv('training_set.csv', low_memory=False)
    .set_index(group)
)

In [None]:
# Preview the first rows of the loaded feature table.
features.head()

In [None]:
# Load the target table from targets.csv and index it by the columns in `group`.
targets = (
    pd.read_csv('targets.csv')
    .set_index(group)
)

In [None]:
# Count the missing values in each target column.
targets.isnull().sum()

In [None]:
# Inner-join features and targets on their index, keeping only the keys
# that are present in both tables.
training_set = features.merge(
    targets,
    how='inner',
    left_index=True,
    right_index=True,
)

In [None]:
# Preview the first rows of the merged features + targets table.
training_set.head()

In [None]:
# Feature columns are every column of the merged frame that is not a target column.
feature_cols = [col for col in training_set.columns if col not in targets.columns]

# Both targets are modelled from the same set of feature columns.
x_st = training_set[feature_cols]
y_st = training_set['target_st']

x_sc = training_set[feature_cols]
y_sc = training_set['target_sc']

---
## Training 

In [None]:
# Hold out 30% of the rows of each problem for evaluation.
# A fixed random_state makes the split (and therefore the errors reported
# further down) reproducible across kernel restarts.
split_seed = 42
xtrain_st, xtest_st, ytrain_st, ytest_st = train_test_split(
    x_st, y_st, train_size=0.7, random_state=split_seed
)
xtrain_sc, xtest_sc, ytrain_sc, ytest_sc = train_test_split(
    x_sc, y_sc, train_size=0.7, random_state=split_seed
)

In [None]:
# One squared-error XGBoost regressor per target, each fitted on its own
# training split.
model_st = xgb.XGBRegressor(objective='reg:squarederror')
model_st.fit(xtrain_st, ytrain_st)

model_sc = xgb.XGBRegressor(objective='reg:squarederror')
model_sc.fit(xtrain_sc, ytrain_sc)

In [None]:
def xgb_model_select(X, y):
    """Grid-search XGBoost regressor hyper-parameters and report the best ones.

    Runs a 10-fold cross-validated ``GridSearchCV`` over ``n_estimators``
    (20, 50, 80) and ``max_depth`` (3, 4, 5) with the squared-error objective,
    scored by negative mean squared error, and prints the search grid, the
    best score, the best parameters and the elapsed wall-clock time.

    Args:
        X: Feature matrix passed to ``GridSearchCV.fit``.
        y: Target values passed to ``GridSearchCV.fit``.

    Returns:
        The fitted ``GridSearchCV`` object, so callers can reuse
        ``best_estimator_`` / ``best_params_`` instead of re-running the search.
    """
    print ('Select Model...')
    start_time = datetime.datetime.now()
    xgb_reg = xgb.XGBRegressor()
    parameters = {
        'objective': ['reg:squarederror'],
        'n_estimators': list(range(20, 110, 30)),
        'max_depth': list(range(3, 6)),
    }
    grid_search = GridSearchCV(
        estimator=xgb_reg,
        param_grid=parameters,
        cv=10,
        n_jobs=4,
        scoring='neg_mean_squared_error',
    )
    print("parameters:")
    pprint.pprint(parameters)
    grid_search.fit(X, y)
    # The score is a negated MSE (scoring='neg_mean_squared_error'), so it is <= 0.
    print("Best score: %0.3f" % grid_search.best_score_)
    print("Best parameters set:")
    best_parameters = grid_search.best_estimator_.get_params()
    for param_name in sorted(parameters):
        print("\t%s: %r" % (param_name, best_parameters[param_name]))
    elapsed = datetime.datetime.now() - start_time
    # total_seconds() covers the whole duration; timedelta.seconds would
    # silently drop any full days from a long-running search.
    print ('Select Done..., Time Cost: %d' % elapsed.total_seconds())
    return grid_search

In [None]:
# Run the hyper-parameter search for the `target_sc` problem on the full
# (unsplit) x_sc / y_sc data.
xgb_model_select(x_sc,y_sc)

In [None]:
# Root-mean-squared error of the `target_st` model on its held-out rows.
ypred_st = model_st.predict(xtest_st)
rmse_st = sqrt(mean_squared_error(ytest_st, ypred_st))
print("Error St.:", rmse_st)

# Root-mean-squared error of the `target_sc` model on its held-out rows.
ypred_sc = model_sc.predict(xtest_sc)
rmse_sc = sqrt(mean_squared_error(ytest_sc, ypred_sc))
print("Error Sc.:", rmse_sc)

In [None]:
# Per-feature importance scores of the fitted `target_st` model.
model_st.feature_importances_

In [None]:
# Per-feature importance scores of the fitted `target_sc` model.
model_sc.feature_importances_

---
## Predicción

In [None]:
# Labels to submit together with the predictions.
# Only the `ref_hash` column of the ids file is read.
ids_path = root_dir + 'target_competencia_ids.csv'
to_predict = pd.read_csv(ids_path, usecols=['ref_hash'])
to_predict.head()

In [None]:
# Keep only the feature rows whose window_nr equals 3 and restore the
# (ref_hash, window_nr) index afterwards.
# NOTE(review): presumably window 3 is the window the predictions are made from — confirm.
features_flat = features.reset_index()
in_window_3 = features_flat['window_nr'] == 3
X = features_flat.loc[in_window_3].set_index(group)

In [None]:
# Predict both targets for the rows selected into X.
# NOTE(review): this rebinds y_st / y_sc, which earlier held the training
# targets (training_set['target_st'] / training_set['target_sc']). Re-running
# an earlier cell that uses those names after this one will see the
# prediction arrays instead.
y_st = model_st.predict(X)
y_sc = model_sc.predict(X)

In [None]:
# Build the submission rows for the model predictions: one row per ref_hash in X
# and per target, keyed as '<ref_hash>_st' / '<ref_hash>_sc'.
# y_st / y_sc here are the arrays returned by model_st.predict(X) / model_sc.predict(X).
predicted_hashes = X.reset_index()['ref_hash']

results_st = predicted_hashes.transform(lambda x: str(x) + '_st').rename('ref_hash').to_frame()
results_st['obj'] = y_st

results_sc = predicted_hashes.transform(lambda x: str(x) + '_sc').rename('ref_hash').to_frame()
results_sc['obj'] = y_sc

# DataFrame.append was removed in pandas 2.0; pd.concat stacks the rows the
# same way (original row labels are kept).
results = pd.concat([results_st, results_sc])

In [None]:
# Fallback for ref_hashes that have no row in X: use the mean of their
# known targets as the prediction.
targets_mean = targets.reset_index()
hashes_in_X = X.reset_index()['ref_hash']
targets_mean = (
    targets_mean.loc[~targets_mean['ref_hash'].isin(hashes_in_X)]
    .groupby('ref_hash')[['target_st', 'target_sc']]
    .mean()
    .reset_index()
)

# Same '<ref_hash>_st' / '<ref_hash>_sc' key format as the model predictions.
t_st = targets_mean['ref_hash'].transform(lambda x: str(x) + '_st').rename('ref_hash').to_frame()
t_st['obj'] = targets_mean['target_st']

t_sc = targets_mean['ref_hash'].transform(lambda x: str(x) + '_sc').rename('ref_hash').to_frame()
t_sc['obj'] = targets_mean['target_sc']

# DataFrame.append was removed in pandas 2.0; pd.concat stacks the rows the
# same way (original row labels are kept).
r = pd.concat([t_st, t_sc])

In [None]:
# Combine the model predictions with the mean-based fallback rows.
# Rebuilding from results_st / results_sc / r (instead of extending `results`
# in place) keeps this cell idempotent: re-running it does not add the
# fallback rows a second time. pd.concat also replaces DataFrame.append,
# which was removed in pandas 2.0.
results = pd.concat([results_st, results_sc, r])

In [None]:
# Number of requested ids that get a non-null `obj` value after the left join
# with the assembled results.
to_predict.merge(results, on='ref_hash', how='left')['obj'].count()

In [None]:
# Assemble the final submission file.
# Requested ids with no match in `results` get a default value of
# 3 days expressed in seconds.
max_secs = 3 * 24 * 60 * 60
submission = to_predict.merge(results, on='ref_hash', how='left').fillna(max_secs)
submission.to_csv('predictions_xgb.csv', header=True, index=False)

In [None]:
# Non-null count per column of the ids that must be predicted.
to_predict.count()