In [1]:
import pandas as pd 
# Load the cleaned training set from the working directory and preview
# its first five rows (five is the default for head()).
train_csv_path = "clean-train-data.csv"
df = pd.read_csv(train_csv_path)
df.head()

Unnamed: 0,patient_id,name_of_drug,use_case_for_drug,effectiveness_rating,number_of_times_prescribed,day,year,month,sentiment,base_score
0,206461,2065,296,9,27,20,2012,5,-0.296,8.022969
1,95260,921,0,8,192,27,2010,4,0.8603,7.858458
2,92703,1210,71,5,17,14,2009,12,0.7645,6.341969
3,35696,332,373,9,37,27,2016,11,0.9403,6.590176
4,155963,436,66,2,43,28,2015,11,-0.8187,6.144782


In [2]:
import numpy as np
from sklearn.model_selection import train_test_split, GridSearchCV, RandomizedSearchCV
from sklearn.metrics import accuracy_score
import xgboost as xgb
from xgboost.sklearn import XGBRegressor

  _nan_object_mask = _nan_object_array != _nan_object_array


In [3]:
# Booster configuration handed to xgb.train() further down.
# NOTE(review): newer XGBoost releases appear to name this objective
# 'reg:squarederror' — confirm against the installed version before changing.
params = dict(
    booster='gbtree',
    objective='reg:linear',
    subsample=0.8,
    colsample_bytree=0.85,
    eta=0.01,
    max_depth=16,
    seed=42,  # fixed seed so repeated runs use the same row/column sampling
)

In [4]:
# Feature matrix: positional columns 1-8 of the training frame.
# NOTE(review): which named columns these are depends on whether the CSV
# carries a saved index column — confirm against df.columns.
X = df.iloc[:, 1:9].values

# Select the target by name instead of by position, so a shifted column
# layout cannot silently swap in a different column. The submission written
# later in this notebook is a `base_score` prediction.
assert 'base_score' not in df.columns[1:9], "target column leaked into the features"
y = df['base_score'].values

# Hold out 12% of the rows; random_state pins the split so reruns match.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.12, random_state=0)
dtrain = xgb.DMatrix(X_train, y_train)
dtest = xgb.DMatrix(X_test, y_test)

# Evaluation sets reported every boosting round; the training log shows the
# 'test' entry is the one used for early stopping.
watchlist = [(dtrain, 'train'), (dtest, 'test')]

In [5]:
# Fit the booster: up to 2000 rounds, stopping once the watched 'test' RMSE
# has gone 50 rounds without improving. Per-round metrics are printed.
xgb_model = xgb.train(
    params=params,
    dtrain=dtrain,
    num_boost_round=2000,
    evals=watchlist,
    early_stopping_rounds=50,
    verbose_eval=True,
)

[15:03:43] src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 154 extra nodes, 0 pruned nodes, max_depth=16
[0]	train-rmse:6.30527	test-rmse:6.29773
Multiple eval metrics have been passed: 'test-rmse' will be used for early stopping.

Will train until test-rmse hasn't improved in 50 rounds.
[15:03:43] src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 228 extra nodes, 0 pruned nodes, max_depth=16
[1]	train-rmse:6.24448	test-rmse:6.23688
[15:03:43] src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 66 extra nodes, 0 pruned nodes, max_depth=12
[2]	train-rmse:6.18416	test-rmse:6.17677
[15:03:43] src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 262 extra nodes, 0 pruned nodes, max_depth=16
[3]	train-rmse:6.12312	test-rmse:6.11576
[15:03:43] src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 258 extra nodes, 0 pruned nodes, max_depth=16
[4]	train-rmse:6.06269	test-rmse:6.05534
[15:03:44] src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 266 extra nodes

In [6]:
# Score the held-out competition file and write the submission CSV.
# The test set is loaded into its own name so the training frame `df` is not
# overwritten; re-running the train/test split cell after this one would
# otherwise operate on the test file.
test_df = pd.read_csv('clean-test-data.csv')

# Same positional feature slice as the one used to build the training matrix.
test = test_df.iloc[:, 1:9].values

base_score = xgb_model.predict(xgb.DMatrix(test))

# `columns=` pins the output column order; without it the order follows dict
# ordering, which older pandas releases sort alphabetically.
my_submission = pd.DataFrame(
    {'patient_id': test_df.patient_id, 'base_score': base_score},
    columns=['patient_id', 'base_score'],
)
my_submission.to_csv('final_submission_xgboost-tune.csv', index=False)

In [7]:
# Feature importance by 'gain'. Keys are the auto-generated names f0..f7
# (see the printed output), which follow the order of the feature columns.
gain_by_feature = xgb_model.get_score(importance_type='gain')
print(gain_by_feature)

{'f2': 8.303178461749486, 'f1': 0.04443164497866751, 'f4': 0.039705511853832365, 'f0': 0.032367332456646886, 'f3': 6.7960900860694125, 'f5': 0.052047931712499086, 'f7': 0.07144746882484697, 'f6': 0.04274375949117718}
