In [56]:
import pandas as pd
import numpy as np
import sklearn
from sklearn.ensemble import GradientBoostingRegressor, RandomForestRegressor, StackingRegressor
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split, GridSearchCV, RandomizedSearchCV, cross_val_score
from sklearn.preprocessing import OneHotEncoder, PolynomialFeatures, StandardScaler
from sklearn.metrics import mean_absolute_error, r2_score
from xgboost import XGBRegressor
from lightgbm import LGBMRegressor
from sklearn.pipeline import Pipeline
import joblib  # For saving the model

#from library.sb_utils import save_file

In [68]:
# Load the current and historical datasets

current_data_file_path = '../data/processed/data_final.csv'  # Update the path to your file
data = pd.read_csv(current_data_file_path)

In [69]:
display(data.head())

Unnamed: 0,Year,Country,gold_medal,silver_medal,bronze_medal,total_medal,type_olympic,host_flag,Country_,gold_medal_sum,...,primary_enrollment_ratio,secondary_enrollment_ratio,tertiary_enrollment_ratio,internet_penetration,mobile_penetration,health_expenditure,Income Group_Low income,Income Group_Lower middle income,Income Group_Upper middle income,host_flag_1
0,2022.0,AUS,1.0,2.0,1.0,4.0,w,0.0,AUS,172.0,...,99.132988,133.277267,106.240761,96.24,107.031203,10.543639,0.0,0.0,0.0,0.0
1,2022.0,AUT,7.0,7.0,4.0,18.0,w,0.0,AUT,92.0,...,100.775627,101.458,93.940071,93.614091,123.434807,12.1,0.0,0.0,0.0,0.0
2,2022.0,BEL,1.0,0.0,1.0,2.0,w,0.0,BEL,45.0,...,101.821136,143.163513,82.688202,94.007831,101.870773,11.042908,0.0,0.0,0.0,0.0
3,2022.0,BLR,0.0,2.0,0.0,2.0,w,0.0,BLR,23.0,...,94.711967,94.535896,70.867569,89.507331,123.447255,6.57,0.0,0.0,1.0,0.0
4,2022.0,CAN,4.0,8.0,14.0,26.0,w,0.0,CAN,150.0,...,96.293083,109.334312,77.802292,92.834017,91.230625,11.154714,0.0,0.0,0.0,0.0


In [59]:
# Define numerical features
features = ['gold_medal', 'silver_medal', 'bronze_medal', 'NY.GDP.MKTP.CD', 
            'AG.LND.TOTL.K2', 'health_expenditure', 'gdp_population_interaction', 
            'SP.POP.TOTL', 'gdp_per_capita', 'SP.DYN.LE00.IN', 'tertiary_enrollment_ratio',
            'urbanization_rate', 'internet_penetration', 'employment_to_population_ratio',
            'gold_medal_mean', 'silver_medal_mean', 'bronze_medal_mean', 'total_medal_mean',
            'host_flag_sum', 'Income Group_Low income', 'Income Group_Lower middle income',
            'Income Group_Upper middle income', 'host_flag_1']

In [70]:
data = data[data['Country']!='USA']
data.shape

(118, 60)

In [71]:
# Define the target variable
target = data['total_medal']

pipeline = Pipeline([
    ('poly', PolynomialFeatures(degree=2, include_bias=False)),
    ('scaler', StandardScaler())
])

X_poly_scaled = pipeline.fit_transform(data[features])

# Split the data into training and testing sets
X_train_poly, X_test_poly, y_train, y_test = train_test_split(X_poly_scaled, target, test_size=0.2, random_state=42)

# Initialize models
models = {
    'Linear Regression': LinearRegression(),
    'Random Forest Regressor': RandomForestRegressor(random_state=42),
    'Gradient Boosting Regressor': GradientBoostingRegressor(random_state=42),
    'XGBoost': XGBRegressor(random_state=42)
    #,
    #'LightGBM': LGBMRegressor(random_state=42)
}

# Train and evaluate each model
results = {}
for model_name, model in models.items():
    model.fit(X_train_poly, y_train)
    y_pred = model.predict(X_test_poly)
    mae = mean_absolute_error(y_test, y_pred)
    r2 = r2_score(y_test, y_pred)
    results[model_name] = {'MAE': mae, 'R²': r2}

# Convert the results to a DataFrame for better visualization
results_df = pd.DataFrame(results).T

# Display the evaluation results
print("Model Evaluation Results")
print(results_df)

Model Evaluation Results
                                  MAE        R²
Linear Regression            0.333069  0.997206
Random Forest Regressor      2.106667  0.880511
Gradient Boosting Regressor  2.162656  0.886555
XGBoost                      1.644027  0.945845


In [72]:
# Perform Randomized Search for hyperparameter tuning for Gradient Boosting Regressor
param_dist = {
    'n_estimators': [100, 200, 300, 400, 500],
    'learning_rate': [0.01, 0.05, 0.1, 0.2, 0.3],
    'max_depth': [3, 4, 5, 6, 7],
    'min_samples_split': [2, 5, 10, 20],
    'min_samples_leaf': [1, 2, 4, 8]
}

# Initialize and fit RandomizedSearchCV
random_search = RandomizedSearchCV(GradientBoostingRegressor(random_state=42), param_dist, n_iter=100, cv=5, scoring='r2', n_jobs=-1, random_state=42)
random_search.fit(X_train_poly, y_train)

# Get the best parameters and train the refined model
best_params_refined = random_search.best_params_
best_gbr_model_refined = GradientBoostingRegressor(**best_params_refined, random_state=42)
best_gbr_model_refined.fit(X_train_poly, y_train)

y_pred_best = best_gbr_model_refined.predict(X_test_poly)
mae_best = mean_absolute_error(y_test, y_pred_best)
r2_best = r2_score(y_test, y_pred_best)

print(f"Best Parameters: {best_params_refined}")
print(f"Tuned Gradient Boosting Regressor with Polynomial Features - MAE: {mae_best:.2f}, R²: {r2_best:.2f}")
#print(f"Cross-validation R²: {mean_cv_score:.2f}"

Best Parameters: {'n_estimators': 100, 'min_samples_split': 20, 'min_samples_leaf': 2, 'max_depth': 5, 'learning_rate': 0.2}
Tuned Gradient Boosting Regressor with Polynomial Features - MAE: 1.66, R²: 0.94


In [73]:
# Initialize RandomizedSearchCV
param_dist = {
    'n_estimators': [100, 200, 300],
    'learning_rate': [0.01, 0.1, 0.2],
    'max_depth': [3, 4, 5],
    'subsample': [0.8, 0.9, 1.0],
    'colsample_bytree': [0.8, 0.9, 1.0],
    'gamma': [0, 0.1, 0.2]
}
random_search = RandomizedSearchCV(estimator=XGBRegressor(random_state=42), param_distributions=param_dist, n_iter=30, cv=5, scoring='r2', n_jobs=-1, random_state=42, verbose=2)
random_search.fit(X_train_poly, y_train)

# Best parameters and model evaluation
best_params = random_search.best_params_
best_model = random_search.best_estimator_
y_pred_best = best_model.predict(X_test_poly)
mae_best = mean_absolute_error(y_test, y_pred_best)
r2_best = r2_score(y_test, y_pred_best)

# Perform k-fold cross-validation
cv_scores = cross_val_score(best_model, X_poly_scaled, target, cv=10, scoring='r2')
mean_cv_score = np.mean(cv_scores)

print(f"Best Parameters: {best_params}")
print(f"Tuned XGBoost with Polynomial Features - MAE: {mae_best:.2f}, R²: {r2_best:.2f}")
print(f"Cross-validation R²: {mean_cv_score:.2f}")


Fitting 5 folds for each of 30 candidates, totalling 150 fits
Best Parameters: {'subsample': 0.8, 'n_estimators': 100, 'max_depth': 5, 'learning_rate': 0.2, 'gamma': 0, 'colsample_bytree': 0.8}
Tuned XGBoost with Polynomial Features - MAE: 0.97, R²: 0.98
Cross-validation R²: 0.84


In [74]:
best_model = random_search.best_estimator_
best_model.version = '1.0'
best_model.pandas_version = pd.__version__
best_model.numpy_version = np.__version__
best_model.sklearn_version = sklearn.__version__
best_model.X_columns = [col for col in enhanced_data[enhanced_features].columns]


In [75]:
# save the model

#modelpath = '../models'
#save_file(best_model, 'ski_resort_pricing_model.pkl', modelpath)

joblib.dump(best_model, '../models/best_model.pkl')

['../models/best_model.pkl']

[CV] END colsample_bytree=0.8, gamma=0.1, learning_rate=0.2, max_depth=4, n_estimators=200, subsample=0.9; total time=   0.1s
[CV] END colsample_bytree=0.9, gamma=0, learning_rate=0.2, max_depth=3, n_estimators=200, subsample=1.0; total time=   0.1s
[CV] END colsample_bytree=0.9, gamma=0, learning_rate=0.01, max_depth=5, n_estimators=200, subsample=1.0; total time=   0.4s
[CV] END colsample_bytree=0.9, gamma=0, learning_rate=0.01, max_depth=5, n_estimators=200, subsample=1.0; total time=   0.4s
[CV] END colsample_bytree=0.8, gamma=0, learning_rate=0.2, max_depth=5, n_estimators=300, subsample=0.8; total time=   0.2s
[CV] END colsample_bytree=0.8, gamma=0, learning_rate=0.2, max_depth=5, n_estimators=300, subsample=0.8; total time=   0.2s
[CV] END colsample_bytree=0.9, gamma=0.2, learning_rate=0.2, max_depth=3, n_estimators=200, subsample=1.0; total time=   0.1s
[CV] END colsample_bytree=0.8, gamma=0, learning_rate=0.1, max_depth=3, n_estimators=300, subsample=0.8; total time=   0.2s
[C

[CV] END colsample_bytree=0.9, gamma=0.2, learning_rate=0.2, max_depth=4, n_estimators=100, subsample=0.8; total time=   0.1s
[CV] END colsample_bytree=0.9, gamma=0, learning_rate=0.2, max_depth=3, n_estimators=200, subsample=1.0; total time=   0.1s
[CV] END colsample_bytree=1.0, gamma=0, learning_rate=0.1, max_depth=3, n_estimators=100, subsample=1.0; total time=   0.1s
[CV] END colsample_bytree=0.9, gamma=0, learning_rate=0.01, max_depth=5, n_estimators=200, subsample=1.0; total time=   0.4s
[CV] END colsample_bytree=0.9, gamma=0.1, learning_rate=0.01, max_depth=5, n_estimators=100, subsample=1.0; total time=   0.2s
[CV] END colsample_bytree=0.9, gamma=0.1, learning_rate=0.01, max_depth=5, n_estimators=100, subsample=1.0; total time=   0.2s
[CV] END colsample_bytree=0.9, gamma=0.2, learning_rate=0.1, max_depth=3, n_estimators=100, subsample=0.9; total time=   0.1s
[CV] END colsample_bytree=0.9, gamma=0.2, learning_rate=0.1, max_depth=3, n_estimators=100, subsample=0.9; total time=   