In [1]:
import pandas as pd
import numpy as np

from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.metrics import mean_squared_error
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV
from sklearn.model_selection import KFold
from sklearn.utils import shuffle

import xgboost as xgb
from xgboost import XGBRegressor

import warnings
warnings.filterwarnings('ignore')

In [2]:
train_df = pd.read_csv('../data/CalorieConsumption/train.csv')
test_df = pd.read_csv('../data/CalorieConsumption/test.csv')

In [3]:
train_df = train_df.drop(['ID'], axis=1)
test_df = test_df.drop(['ID'], axis=1)

In [4]:
train_df = pd.get_dummies(train_df)
test_df = pd.get_dummies(test_df)

In [5]:
X = train_df.drop(['Calories_Burned'],axis= 1)
y = train_df['Calories_Burned']

In [6]:
# 최적값을 구하고 싶은 파라미터를 정리 
dists = {
    'xgboostregressor__n_estimators' : [230,240,250,260,270,280],
    'xgboostregressor__max_depth' : [5,6,7],
    'xgboostregressor__gamma' : [0, 1, 2],
    'xgboostregressor__eta' : [0.01, 0.02, 0.03, 0.04, 0.05],
    'xgboostregressor__subsample' : [0.7,0.75, 0.8],
    'xgboostregressor__mean_child_weight' : [0.9, 1, 1.1],
    'xgboostregressor__importance_type' : ['gain'],
    'xgboostregressor__reg_lambda' : [0.9, 1, 1.1],
}

# RandomizedSearchCV 작성
regressor = RandomizedSearchCV(
    XGBRegressor(random_state=100),
    param_distributions=dists, # 파라미터 입력
    n_iter = 50,   # random search 탐색 횟수
    cv = 5,        # cv 검증을 위한 분할 검증 횟수
    scoring='neg_mean_squared_error',  # 오차 평가방법
    verbose=1,     # 진행상황
    random_state = 100
  )

regressor.fit(X, y)

Fitting 5 folds for each of 50 candidates, totalling 250 fits
Parameters: { "xgboostregressor__eta", "xgboostregressor__gamma", "xgboostregressor__importance_type", "xgboostregressor__max_depth", "xgboostregressor__mean_child_weight", "xgboostregressor__n_estimators", "xgboostregressor__reg_lambda", "xgboostregressor__subsample" } are not used.

Parameters: { "xgboostregressor__eta", "xgboostregressor__gamma", "xgboostregressor__importance_type", "xgboostregressor__max_depth", "xgboostregressor__mean_child_weight", "xgboostregressor__n_estimators", "xgboostregressor__reg_lambda", "xgboostregressor__subsample" } are not used.

Parameters: { "xgboostregressor__eta", "xgboostregressor__gamma", "xgboostregressor__importance_type", "xgboostregressor__max_depth", "xgboostregressor__mean_child_weight", "xgboostregressor__n_estimators", "xgboostregressor__reg_lambda", "xgboostregressor__subsample" } are not used.

Parameters: { "xgboostregressor__eta", "xgboostregressor__gamma", "xgboostregres

In [7]:
best_params = regressor.best_params_
best_params

{'xgboostregressor__subsample': 0.7,
 'xgboostregressor__reg_lambda': 1.1,
 'xgboostregressor__n_estimators': 250,
 'xgboostregressor__mean_child_weight': 1.1,
 'xgboostregressor__max_depth': 6,
 'xgboostregressor__importance_type': 'gain',
 'xgboostregressor__gamma': 2,
 'xgboostregressor__eta': 0.04}

In [8]:
rgr = XGBRegressor(
            n_estimators=250,
            max_depth=6,
            gamma = 2,
            eta = 0.07,
            subsample = 0.6,
            min_child_weight = 1,
            reg_lambda = 1,
            random_state=100).fit(X,y)

In [9]:
scores = cross_val_score(rgr, X, y, scoring='neg_mean_squared_error', cv=5)
rmse = np.sqrt(-scores)
print('RMSE:', np.round(rmse, 3))
print('RMSE average: %0.3f' % (rmse.mean()))
print(rgr.score(X, y))

RMSE: [1.937 2.142 1.655 1.805 1.646]
RMSE average: 1.837
0.9997594524341591


In [10]:
prediction = rgr.predict(test_df)

In [11]:
for i, feature in enumerate(test_df.columns):
    print(f'{feature} : {rgr.feature_importances_[i]}')

Exercise_Duration : 0.9238816499710083
Body_Temperature(F) : 0.00020200919243507087
BPM : 0.04093547165393829
Height(Feet) : 0.0006939938175491989
Height(Remainder_Inches) : 0.00017940200632438064
Weight(lb) : 0.003673447761684656
Age : 0.018363798037171364
Weight_Status_Normal Weight : 0.00158427853602916
Weight_Status_Obese : 0.00018113479018211365
Weight_Status_Overweight : 0.000642169383354485
Gender_F : 0.009662585332989693
Gender_M : 0.0


In [12]:
submission = pd.read_csv('../data/CalorieConsumption/sample_submission.csv', index_col = 0)

In [13]:
submission['Calories_Burned'] = prediction
submission = submission.round(1)

In [14]:
submission.to_csv('submission02.csv')