# XGBoost

In [None]:
# Import standard library
import os

# Import file handling
import joblib

# Import numerical and dataframe handling
import numpy as np
import pandas as pd

# Import scikit-learn data utilities
from sklearn.model_selection import train_test_split

# Import model scoring
from sklearn import metrics
from sklearn.metrics import r2_score
from sklearn.metrics import mean_absolute_error
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import cross_val_score

# Import XGBoost
import xgboost

# Import other
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import RepeatedKFold
## Data

In [None]:
# Load the full dataset into a DataFrame (path is relative to the notebook's working directory)
data = pd.read_csv(filepath_or_buffer='../data/parkinsons_updrs.data')

In [None]:
# Partition the rows on the 'sex' column: rows with value 0 go to `male`,
# rows with value 1 go to `female`. A separate model is trained on each subset.
male = data.loc[data['sex'].eq(0)]
female = data.loc[data['sex'].eq(1)]

In [None]:
# Input columns fed to the models, grouped by measurement family
train_features = [
    'age',
    # Jitter measures
    'Jitter(%)',
    'Jitter(Abs)',
    'Jitter:RAP',
    'Jitter:PPQ5',
    'Jitter:DDP',
    # Shimmer measures
    'Shimmer',
    'Shimmer(dB)',
    'Shimmer:APQ3',
    'Shimmer:APQ5',
    'Shimmer:APQ11',
    'Shimmer:DDA',
    # Remaining measure
    'HNR',
]

# Columns predicted jointly by each model
target = [
    'motor_UPDRS',
    'total_UPDRS',
]

## Model

### Male

In [None]:
# Scaler for the male feature matrix; rescales each feature into [-1, 1]
scaler_male = MinMaxScaler(feature_range=(-1, 1))

In [None]:
# Split data
# The raw (unscaled) features are split BEFORE the scaler is fitted, so the
# scaler only ever sees training rows. Fitting it on the full subset first
# would let the min/max of the validation and test rows leak into the
# preprocessing.
test_size = 0.25
split_seed = 42  # fixed seed so the splits (and the reported metrics) are reproducible

# First split: hold out `test_size` of the male rows as the test set.
x_train_male, x_test_male, y_train_male, y_test_male = train_test_split(
    male[train_features], male[target], test_size=test_size, random_state=split_seed
)
# Second split: carve a validation set (same fraction) out of the remaining rows.
x_train_male, x_val_male, y_train_male, y_val_male = train_test_split(
    x_train_male, y_train_male, test_size=test_size, random_state=split_seed
)

# Normalize data
# Fit on the training split only, then apply the same transform to the others.
x_train_male = scaler_male.fit_transform(x_train_male)
x_val_male = scaler_male.transform(x_val_male)
x_test_male = scaler_male.transform(x_test_male)

# Kept so any later cell that reads the fully-normalized matrix still works;
# rows outside the training split may fall slightly outside [-1, 1].
data_normalized_male = scaler_male.transform(male[train_features])

# NOTE(review): the split is row-level. If the dataset contains several
# recordings per subject, the same subject can end up in both train and test;
# consider a group-aware split - confirm against the data.

In [None]:
# Save scaler
# joblib.dump does not create missing parent directories, so make sure the
# target directory exists before writing (no-op when it is already there).
scaler_path_male = '/work/saved_scalers/xgboost_male'
os.makedirs(os.path.dirname(scaler_path_male), exist_ok=True)
joblib.dump(scaler_male, scaler_path_male)

['/work/saved_scalers/xgboost_male']

In [None]:
# print(x_train_male.shape, y_train_male.shape)
# print(x_test_male.shape, y_test_male.shape)
# print(x_val_male.shape, y_val_male.shape)

In [None]:
# #Parameters to tinker with: n_estimators, max_depth (1-10), eta, subsample (0-1), colsample_bytree (0-1)
# model = xgboost.XGBRegressor(eta=.1)
# model.fit(x_train_male, y_train_male)

# cv = RepeatedKFold(n_splits=10, n_repeats=3, random_state=1)
# scores = cross_val_score(model, x_train_male, y_train_male, scoring='neg_mean_absolute_error', cv=cv, n_jobs=-1)
# scores = abs(scores)
# print('Mean MAE: %.3f (%.3f)' % (scores.mean(), scores.std()) )
# print(r2_score(y_val_male, model.predict(x_val_male)))

In [None]:
# max_r2 = 0
# max_est = 0
# max_eta = 0
# r2 = []

# for i in range(1,50):
#     for k in np.arange(0,1, 0.05):
#         model = xgboost.XGBRegressor(n_estimators=i, eta=k)
#         model.fit(x_train_male, y_train_male)

#         r2_value = r2_score(y_val_male, model.predict(x_val_male))
#         r2.append(r2_value)

#         if r2_value > max_r2:
#             max_est = i
#             max_r2 = r2_value
#             max_eta = k

# print(max_r2, max_est, max_eta, r2)

# Max est: 44, Max eta: 0.15000000000000002

In [None]:
# max_r2 = 0
# max_d = 0

# for i in range(1,20):
#     model = xgboost.XGBRegressor(n_estimators=48, eta=0.15000000000000002, max_depth=i)
#     model.fit(x_train_male, y_train_male)

#     r2_value = r2_score(y_val_male, model.predict(x_val_male))
#     r2.append(r2_value)

#     if r2_value > max_r2:
#         max_d = i
#         max_r2 = r2_value

# print(max_r2, max_d)

### Female

In [None]:
# Scaler for the female feature matrix; rescales each feature into [-1, 1]
scaler_female = MinMaxScaler(feature_range=(-1, 1))

In [None]:
# Split data
# The raw (unscaled) features are split BEFORE the scaler is fitted, so the
# scaler only ever sees training rows. Fitting it on the full subset first
# would let the min/max of the validation and test rows leak into the
# preprocessing.
test_size = 0.25
split_seed = 42  # fixed seed so the splits (and the reported metrics) are reproducible

# First split: hold out `test_size` of the female rows as the test set.
x_train_female, x_test_female, y_train_female, y_test_female = train_test_split(
    female[train_features], female[target], test_size=test_size, random_state=split_seed
)
# Second split: carve a validation set (same fraction) out of the remaining rows.
x_train_female, x_val_female, y_train_female, y_val_female = train_test_split(
    x_train_female, y_train_female, test_size=test_size, random_state=split_seed
)

# Normalize data
# Fit on the training split only, then apply the same transform to the others.
x_train_female = scaler_female.fit_transform(x_train_female)
x_val_female = scaler_female.transform(x_val_female)
x_test_female = scaler_female.transform(x_test_female)

# Kept so any later cell that reads the fully-normalized matrix still works;
# rows outside the training split may fall slightly outside [-1, 1].
data_normalized_female = scaler_female.transform(female[train_features])

# NOTE(review): the split is row-level. If the dataset contains several
# recordings per subject, the same subject can end up in both train and test;
# consider a group-aware split - confirm against the data.

In [None]:
# Save scaler
# joblib.dump does not create missing parent directories, so make sure the
# target directory exists before writing (no-op when it is already there).
scaler_path_female = '/work/saved_scalers/xgboost_female'
os.makedirs(os.path.dirname(scaler_path_female), exist_ok=True)
joblib.dump(scaler_female, scaler_path_female)

['/work/saved_scalers/xgboost_female']

In [None]:
# print(x_train_female.shape, y_train_female.shape)
# print(x_test_female.shape, y_test_female.shape)
# print(x_val_female.shape, y_val_female.shape)

In [None]:
# #Parameters to tinker with: n_estimators, max_depth (1-10), eta, subsample (0-1), colsample_bytree (0-1)
# model = xgboost.XGBRegressor(eta=.1)
# model.fit(x_train_female, y_train_female)

# cv = RepeatedKFold(n_splits=10, n_repeats=3, random_state=1)
# scores = cross_val_score(model, x_train_female, y_train_female, scoring='neg_mean_absolute_error', cv=cv, n_jobs=-1)
# scores = abs(scores)
# print('Mean MAE: %.3f (%.3f)' % (scores.mean(), scores.std()) )
# print(r2_score(y_val_female, model.predict(x_val_female)))

In [None]:
# max_r2 = 0
# max_est = 0
# max_eta = 0
# r2 = []

# for i in range(1,50):
#     for k in np.arange(0,1,0.05):
#         model = xgboost.XGBRegressor(n_estimators=i, eta=k)
#         model.fit(x_train_female, y_train_female)

#         r2_value = r2_score(y_val_female, model.predict(x_val_female))
#         r2.append(r2_value)

#         if r2_value > max_r2:
#             max_est = i
#             max_r2 = r2_value
#             max_eta = k

# print(max_r2, max_est, max_eta)

# Max est: 18, Max eta: 0.35000000000000003

In [None]:
# max_r2 = 0
# max_d = 0

# for i in range(1,20):
#     model = xgboost.XGBRegressor(n_estimators=49, eta=0.1, max_depth=i)
#     model.fit(x_train_female, y_train_female)

#     r2_value = r2_score(y_val_female, model.predict(x_val_female))
#     r2.append(r2_value)

#     if r2_value > max_r2:
#         max_d = i
#         max_r2 = r2_value

# print(max_r2, max_d)

## Testing

In [None]:
# Build the female regressor with its chosen hyper-parameters and train it on
# the female training split (fit() hands back the fitted estimator).
model_female = xgboost.XGBRegressor(
    n_estimators=49,
    eta=0.1,
    max_depth=4,
).fit(x_train_female, y_train_female)

# Predict both targets for the held-out female test split
female_predictions = model_female.predict(x_test_female)

In [None]:
# Build the male regressor with its chosen hyper-parameters and train it on
# the male training split (fit() hands back the fitted estimator).
model_male = xgboost.XGBRegressor(
    n_estimators=49,
    eta=0.15,
    max_depth=7,
).fit(x_train_male, y_train_male)

# Predict both targets for the held-out male test split
male_predictions = model_male.predict(x_test_male)

In [None]:
# Save models
# joblib.dump does not create missing parent directories, so make sure the
# target directory exists before writing (no-op when it is already there).
models_dir = '/work/saved_models'
os.makedirs(models_dir, exist_ok=True)
joblib.dump(model_female, '/work/saved_models/xgboost_female')
joblib.dump(model_male, '/work/saved_models/xgboost_male')

['/work/saved_models/xgboost_male']

In [None]:
# Report test-set metrics for both models.
#
# RMSE is computed as the square root of the per-target MSE, averaged over the
# targets. That is the value `mean_squared_error(..., squared=False)` used to
# return for multi-target input, but it does not rely on the `squared`
# argument, which scikit-learn deprecated in 1.4 and removed in 1.6.
# (Note this is NOT the same as sqrt of the averaged MSE printed just above.)
print('Male R2_Score:', r2_score(y_test_male, male_predictions))
print('Male MAE:', mean_absolute_error(y_test_male, male_predictions))
print('Male MSE:', mean_squared_error(y_test_male, male_predictions))
male_rmse_per_target = np.sqrt(
    mean_squared_error(y_test_male, male_predictions, multioutput='raw_values')
)
print('Male RMSE:', male_rmse_per_target.mean(),'\n')

print('Female R2_Score:', r2_score(y_test_female, female_predictions))
print('Female MAE:', mean_absolute_error(y_test_female, female_predictions))
print('Female MSE:', mean_squared_error(y_test_female, female_predictions))
female_rmse_per_target = np.sqrt(
    mean_squared_error(y_test_female, female_predictions, multioutput='raw_values')
)
print('Female RMSE:', female_rmse_per_target.mean())

Male R2_Score: 0.9038128322405257
Male MAE: 2.166250101843663
Male MSE: 8.609510825774194
Male RMSE: 2.9040892804148264 

Female R2_Score: 0.9060637578606281
Female MAE: 2.2711339064800242
Female MSE: 8.319370597285047
Female RMSE: 2.8784916064363575


<a style='text-decoration:none;line-height:16px;display:flex;color:#5B5B62;padding:10px;justify-content:end;' href='https://deepnote.com?utm_source=created-in-deepnote-cell&projectId=02f714e5-f372-44d9-9c53-987f6bee66dd' target="_blank">
Created in <span style='font-weight:600;margin-left:4px;'>Deepnote</span></a>