In [115]:
# Importing Libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
warnings.filterwarnings('ignore')
from sklearn.model_selection import cross_val_score

In [117]:
#Importing Dataset

df= pd.read_csv("/users/fara/Downloads/playground-series-s5e5/train.csv")
df.head()

Unnamed: 0,id,Sex,Age,Height,Weight,Duration,Heart_Rate,Body_Temp,Calories
0,0,male,36,189.0,82.0,26.0,101.0,41.0,150.0
1,1,female,64,163.0,60.0,8.0,85.0,39.7,34.0
2,2,female,51,161.0,64.0,7.0,84.0,39.8,29.0
3,3,male,20,192.0,90.0,25.0,105.0,40.7,140.0
4,4,female,38,166.0,61.0,25.0,102.0,40.6,146.0


# Exploratory Data Analysis

In [120]:
df.shape

(750000, 9)

In [122]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 750000 entries, 0 to 749999
Data columns (total 9 columns):
 #   Column      Non-Null Count   Dtype  
---  ------      --------------   -----  
 0   id          750000 non-null  int64  
 1   Sex         750000 non-null  object 
 2   Age         750000 non-null  int64  
 3   Height      750000 non-null  float64
 4   Weight      750000 non-null  float64
 5   Duration    750000 non-null  float64
 6   Heart_Rate  750000 non-null  float64
 7   Body_Temp   750000 non-null  float64
 8   Calories    750000 non-null  float64
dtypes: float64(6), int64(2), object(1)
memory usage: 51.5+ MB


In [124]:
df.isna().sum()

#data is clean, no imputation required

id            0
Sex           0
Age           0
Height        0
Weight        0
Duration      0
Heart_Rate    0
Body_Temp     0
Calories      0
dtype: int64

In [126]:
#creating X Y vectors

X= df.drop(columns= ['id', 'Calories'])
X

Unnamed: 0,Sex,Age,Height,Weight,Duration,Heart_Rate,Body_Temp
0,male,36,189.0,82.0,26.0,101.0,41.0
1,female,64,163.0,60.0,8.0,85.0,39.7
2,female,51,161.0,64.0,7.0,84.0,39.8
3,male,20,192.0,90.0,25.0,105.0,40.7
4,female,38,166.0,61.0,25.0,102.0,40.6
...,...,...,...,...,...,...,...
749995,male,28,193.0,97.0,30.0,114.0,40.9
749996,female,64,165.0,63.0,18.0,92.0,40.5
749997,male,60,162.0,67.0,29.0,113.0,40.9
749998,male,45,182.0,91.0,17.0,102.0,40.3


In [128]:
Y= df[['Calories']]
Y

Unnamed: 0,Calories
0,150.0
1,34.0
2,29.0
3,140.0
4,146.0
...,...
749995,230.0
749996,96.0
749997,221.0
749998,109.0


In [130]:
# converting categorical column 'Sex' as Male:0, female:1

Sex_dum= {"male":0, "female": 1}

X["Sex"].replace(Sex_dum, inplace=True)

X.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 750000 entries, 0 to 749999
Data columns (total 7 columns):
 #   Column      Non-Null Count   Dtype  
---  ------      --------------   -----  
 0   Sex         750000 non-null  int64  
 1   Age         750000 non-null  int64  
 2   Height      750000 non-null  float64
 3   Weight      750000 non-null  float64
 4   Duration    750000 non-null  float64
 5   Heart_Rate  750000 non-null  float64
 6   Body_Temp   750000 non-null  float64
dtypes: float64(5), int64(2)
memory usage: 40.1 MB


In [134]:
X.describe().transpose().round(2)

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
Sex,750000.0,0.5,0.5,0.0,0.0,1.0,1.0,1.0
Age,750000.0,41.42,15.18,20.0,28.0,40.0,52.0,79.0
Height,750000.0,174.7,12.82,126.0,164.0,174.0,185.0,222.0
Weight,750000.0,75.15,13.98,36.0,63.0,74.0,87.0,132.0
Duration,750000.0,15.42,8.35,1.0,8.0,15.0,23.0,30.0
Heart_Rate,750000.0,95.48,9.45,67.0,88.0,95.0,103.0,128.0
Body_Temp,750000.0,40.04,0.78,37.1,39.6,40.3,40.7,41.5


In [136]:
from sklearn.model_selection import train_test_split

x_train,x_test, y_train, y_test= train_test_split(X,Y,test_size=0.25, random_state=0)

In [138]:
x_train.shape, y_train.shape, x_test.shape, y_test.shape

((562500, 7), (562500, 1), (187500, 7), (187500, 1))

In [141]:
from sklearn.metrics import mean_squared_log_error, make_scorer

def rmsle(y_true,y_pred):
    print("in rmsle")
    y_true= np.maximum(y_true,0)
    y_pred= np.maximum(y_pred,0)

    
    return np.sqrt(mean_squared_log_error(y_true,y_pred))



In [143]:
# Model : Extreme Gradient Boosting training

from xgboost import XGBRegressor
XGB= XGBRegressor(n_estimators=500,
                learning_rate= 0.03,
                max_depth=8,
                subsample= 0.8,
                )

XGB.fit(x_train,y_train)

In [144]:
# Predicting test values

y_pred= XGB.predict(x_test)

print(y_pred, y_pred.shape, type(y_pred))

[213.30106  161.08998   40.160717 ... 257.6998    94.64006    8.381494] (187500,) <class 'numpy.ndarray'>


In [145]:
# Ensuring y_test is in the required format

ytest_= y_test.values.ravel()
print(ytest_,  ytest_.shape, type(ytest_))

[216. 161.  36. ... 264.  91.   9.] (187500,) <class 'numpy.ndarray'>


In [146]:
x=rmsle(ytest_, y_pred)
print(x)

in rmsle
0.06091630768989691


In [147]:
target= Y.values.ravel()
print(target.shape, type(target))

(750000,) <class 'numpy.ndarray'>


In [154]:
# since we dont have test labels, cross validation can help determine model performance. 
# This is a 5 fold cross validation with a custom metric wrapped under make_scorer

def cross_val(model):
    sc=cross_val_score(model, X, target, cv=5, scoring= make_scorer(rmsle), n_jobs=-1)
    print(f'{model}')
    print(sc)
    print(np.mean(sc).round(4))

In [156]:
cross_val(XGB)

XGBRegressor(base_score=None, booster=None, callbacks=None,
             colsample_bylevel=None, colsample_bynode=None,
             colsample_bytree=None, device=None, early_stopping_rounds=None,
             enable_categorical=False, eval_metric=None, feature_types=None,
             feature_weights=None, gamma=None, grow_policy=None,
             importance_type=None, interaction_constraints=None,
             learning_rate=0.03, max_bin=None, max_cat_threshold=None,
             max_cat_to_onehot=None, max_delta_step=None, max_depth=8,
             max_leaves=None, min_child_weight=None, missing=nan,
             monotone_constraints=None, multi_strategy=None, n_estimators=500,
             n_jobs=None, num_parallel_tree=None, ...)
[0.06047822 0.06179863 0.06258578 0.06115557 0.05977571]
0.0612


In [158]:
#Reading the test dataframe

x_test= pd.read_csv("/users/fara/Downloads/playground-series-s5e5/test.csv")

x_test.head()

Unnamed: 0,id,Sex,Age,Height,Weight,Duration,Heart_Rate,Body_Temp
0,750000,male,45,177.0,81.0,7.0,87.0,39.8
1,750001,male,26,200.0,97.0,20.0,101.0,40.5
2,750002,female,29,188.0,85.0,16.0,102.0,40.4
3,750003,female,39,172.0,73.0,20.0,107.0,40.6
4,750004,female,30,173.0,67.0,16.0,94.0,40.5


In [160]:
#preprocessing feature Sex

Sex_dum= {"male":0,
      "female": 1}
x_test["Sex"].replace(Sex_dum, inplace=True)
x_test.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 250000 entries, 0 to 249999
Data columns (total 8 columns):
 #   Column      Non-Null Count   Dtype  
---  ------      --------------   -----  
 0   id          250000 non-null  int64  
 1   Sex         250000 non-null  int64  
 2   Age         250000 non-null  int64  
 3   Height      250000 non-null  float64
 4   Weight      250000 non-null  float64
 5   Duration    250000 non-null  float64
 6   Heart_Rate  250000 non-null  float64
 7   Body_Temp   250000 non-null  float64
dtypes: float64(5), int64(3)
memory usage: 15.3 MB


In [162]:
x_test.describe().transpose().round(2)

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
id,250000.0,874999.5,72168.93,750000.0,812499.75,874999.5,937499.25,999999.0
Sex,250000.0,0.5,0.5,0.0,0.0,1.0,1.0,1.0
Age,250000.0,41.45,15.18,20.0,28.0,40.0,52.0,79.0
Height,250000.0,174.73,12.82,127.0,164.0,174.0,185.0,219.0
Weight,250000.0,75.15,13.98,39.0,63.0,74.0,87.0,126.0
Duration,250000.0,15.42,8.35,1.0,8.0,15.0,23.0,30.0
Heart_Rate,250000.0,95.48,9.45,67.0,88.0,95.0,103.0,128.0
Body_Temp,250000.0,40.04,0.78,37.1,39.6,40.3,40.6,41.5


In [164]:
# Predicting test values

y_pred= XGB.predict(x_test.drop(columns=['id']))

### Gradient Boosting hyperparameter Tuning

In [176]:
#importing libraries

from sklearn.model_selection import GridSearchCV, RandomizedSearchCV

params_dist={'n_estimators':[500, 800],
'subsample' : [0.8],
'colsample_bytree': [0.6, 0.8, 1.0],
'gamma': [1,3,5],
'reg_alpha': [0.1, 0.5,1],
'reg_lambda': [0.5, 1, 5],
'max_depth': [3,5,7],
'learning_rate': [0.02, 0.03,0.04]}

model= XGBRegressor()

rgs= RandomizedSearchCV(estimator=model, cv=4, verbose=2, n_jobs=-1,
                        scoring= make_scorer(rmsle, greater_is_better=False), n_iter= 30,param_distributions= params_dist)





In [178]:
rgs.fit(X,target)

Fitting 4 folds for each of 30 candidates, totalling 120 fits


In [180]:
rgs.best_params_

{'subsample': 0.8,
 'reg_lambda': 1,
 'reg_alpha': 0.5,
 'n_estimators': 800,
 'max_depth': 7,
 'learning_rate': 0.04,
 'gamma': 1,
 'colsample_bytree': 0.8}

In [182]:
rgs.best_score_

-0.061588701381884256

In [184]:
print(rgs.scorer_)

make_scorer(rmsle, greater_is_better=False, response_method='predict')


In [186]:
best_params = rgs.best_params_

# Create and train final model with best parameters
final_model = XGBRegressor(**best_params, objective='reg:squarederror', random_state=42)
final_model.fit(X, target)  # Fit on the **entire dataset**


In [187]:
cross_val(final_model)

XGBRegressor(base_score=None, booster=None, callbacks=None,
             colsample_bylevel=None, colsample_bynode=None,
             colsample_bytree=0.8, device=None, early_stopping_rounds=None,
             enable_categorical=False, eval_metric=None, feature_types=None,
             feature_weights=None, gamma=1, grow_policy=None,
             importance_type=None, interaction_constraints=None,
             learning_rate=0.04, max_bin=None, max_cat_threshold=None,
             max_cat_to_onehot=None, max_delta_step=None, max_depth=7,
             max_leaves=None, min_child_weight=None, missing=nan,
             monotone_constraints=None, multi_strategy=None, n_estimators=800,
             n_jobs=None, num_parallel_tree=None, ...)
[0.06078928 0.06236449 0.06290459 0.06139645 0.05994398]
0.0615


In [167]:
import pandas as pd
import matplotlib.pyplot as plt

# Get feature importance scores
feature_importance = final_model.feature_importances_

# Create a DataFrame to sort and display
feat_df = pd.DataFrame({
    'Feature': X.columns,
    'Importance': feature_importance
}).sort_values(by='Importance', ascending=False)

print(feat_df)  # Top 10 important features


      Feature  Importance
4    Duration    0.600339
5  Heart_Rate    0.241387
6   Body_Temp    0.115963
1         Age    0.021634
0         Sex    0.015401
3      Weight    0.004741
2      Height    0.000534


In [189]:
selected_features= ['Duration', 'Heart_Rate', 'Body_Temp']

In [190]:
final_model.fit(X[selected_features], target)

In [191]:
cross_val(final_model)

XGBRegressor(base_score=None, booster=None, callbacks=None,
             colsample_bylevel=None, colsample_bynode=None,
             colsample_bytree=0.8, device=None, early_stopping_rounds=None,
             enable_categorical=False, eval_metric=None, feature_types=None,
             feature_weights=None, gamma=1, grow_policy=None,
             importance_type=None, interaction_constraints=None,
             learning_rate=0.04, max_bin=None, max_cat_threshold=None,
             max_cat_to_onehot=None, max_delta_step=None, max_depth=7,
             max_leaves=None, min_child_weight=None, missing=nan,
             monotone_constraints=None, multi_strategy=None, n_estimators=800,
             n_jobs=None, num_parallel_tree=None, ...)
[0.06078928 0.06236449 0.06290459 0.06139645 0.05994398]
0.0615


In [192]:
x_test= pd.read_csv("/users/fara/Downloads/playground-series-s5e5/test.csv")

x_test.head()

Unnamed: 0,id,Sex,Age,Height,Weight,Duration,Heart_Rate,Body_Temp
0,750000,male,45,177.0,81.0,7.0,87.0,39.8
1,750001,male,26,200.0,97.0,20.0,101.0,40.5
2,750002,female,29,188.0,85.0,16.0,102.0,40.4
3,750003,female,39,172.0,73.0,20.0,107.0,40.6
4,750004,female,30,173.0,67.0,16.0,94.0,40.5


In [200]:
xtest_final= x_test[['id','Duration', 'Heart_Rate', 'Body_Temp']]
xtest_final.head()

Unnamed: 0,id,Duration,Heart_Rate,Body_Temp
0,750000,7.0,87.0,39.8
1,750001,20.0,101.0,40.5
2,750002,16.0,102.0,40.4
3,750003,20.0,107.0,40.6
4,750004,16.0,94.0,40.5


In [202]:
y_pred1= final_model.predict(xtest_final.drop(columns=['id']))

In [204]:
y_pred= np.maximum(0,y_pred1)

In [206]:
submission= pd.DataFrame({"id": xtest_final['id'],
                       "calories": y_pred})

In [218]:
submission.to_csv('Calorie_prediction.csv', index=False)
