In [62]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, cross_val_score, KFold
from sklearn.linear_model import ElasticNetCV
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.preprocessing import RobustScaler
import matplotlib.pyplot as plt

In [63]:
df = pd.read_csv('final_dataset_clusters-2.csv')
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 420 entries, 0 to 419
Data columns (total 30 columns):
 #   Column                             Non-Null Count  Dtype  
---  ------                             --------------  -----  
 0   country                            420 non-null    object 
 1   updated_country                    420 non-null    object 
 2   Country Code (ISO 3166-1 alpha-3)  420 non-null    object 
 3   Year                               420 non-null    int64  
 4   gold                               420 non-null    int64  
 5   silver                             420 non-null    int64  
 6   bronze                             420 non-null    int64  
 7   total                              420 non-null    int64  
 8   medal_sum                          420 non-null    int64  
 9   check                              420 non-null    bool   
 10  GDP                                420 non-null    float64
 11  HDI                                420 non-null    float64

In [64]:
df.head()

Unnamed: 0,country,updated_country,Country Code (ISO 3166-1 alpha-3),Year,gold,silver,bronze,total,medal_sum,check,...,BMI_obesity,BMI_morbid_obesity,Mean_Height,Diabetes_in_18+,Diabetes_treated_in_30+,non-HDL_cholesterol,HDL_cholesterol,Raised_blood_pressure,Hypertension,cluster
0,Afghanistan,Afghanistan,AFG,2000,0,0,0,0,0,True,...,0.037869,0.002605,159.84242,0.124577,0.22728,3.129567,1.128193,0.285826,0.360907,1
1,Afghanistan,Afghanistan,AFG,2004,0,0,0,0,0,True,...,0.053626,0.003623,160.101298,0.139517,0.23002,3.140379,1.120109,0.294444,0.367908,1
2,Afghanistan,Afghanistan,AFG,2008,0,0,1,1,1,True,...,0.072227,0.004871,160.345587,0.156097,0.23191,3.139214,1.1106,0.299683,0.375714,1
3,Afghanistan,Afghanistan,AFG,2012,0,0,1,1,1,True,...,0.095673,0.006511,160.547954,0.175056,0.233189,3.127356,1.098825,0.302909,0.383675,1
4,Algeria,Algeria,DZA,2000,1,1,3,5,5,True,...,0.131282,0.00617,164.568663,0.120538,0.384386,3.656587,1.040695,0.303167,0.375078,1


In [65]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = 42)
X_train.shape, X_test.shape

((336, 24), (84, 24))

In [66]:
df['weighted_score'] = (df['gold'] * 3) + (df['silver'] * 2) + (df['bronze'] * 1)
df_numeric = df.drop(columns=["country", "updated_country", "Country Code (ISO 3166-1 alpha-3)","gold","silver","bronze"])

X = df_numeric.drop(columns = 'weighted_score')
y = df_numeric['weighted_score']

scaler = RobustScaler()
X_train_scaled = pd.DataFrame(scaler.fit_transform(X_train), columns=X_train.columns)
X_test_scaled = pd.DataFrame(scaler.transform(X_test), columns=X_test.columns)

In [67]:
alphas = np.logspace(-3, 1, 50)
elastic_net = ElasticNetCV(l1_ratio=0.5, alphas=alphas, cv=5, random_state=42)
elastic_net.fit(X_train_scaled, y_train)

print(f"Optimal alpha: {elastic_net.alpha_}")
print(f"Number of non-zero coefficients: {np.sum(elastic_net.coef_ != 0)}")

y_pred = elastic_net.predict(X_test_scaled)
rmse = np.sqrt(mean_squared_error(y_test, y_pred))
r2 = r2_score(y_test, y_pred)

print(f"RMSE: {rmse}")
print(f"R²: {r2}")


Optimal alpha: 0.04291934260128776
Number of non-zero coefficients: 22
RMSE: 13.137922918090222
R²: 0.9804079989212029


In [68]:
y_train_pred = elastic_net.predict(X_train)

In [69]:
train_rmse = np.sqrt(mean_squared_error(y_train, y_train_pred))
train_r2 = r2_score(y_train, y_train_pred)
print(f"RMSE: {train_rmse}")
print(f"R2: {train_r2}")

RMSE: 24908.215975609226
R2: -93158.79420654276


In [70]:
y_pred = elastic_net.predict(X_test)

In [71]:
rmse = np.sqrt(mean_squared_error(y_test, y_pred))
r2 = r2_score(y_test, y_pred)
print(f"RMSE: {rmse}") 
print(f"R2: {r2}") 

RMSE: 23089.194688145795
R2: -60511.200831970644


In [72]:
print(y)

kf = KFold(n_splits=10, shuffle=True, random_state=42)

def evaluate_model(X, y, model, kf):
    r2_scores = []
    rmse_scores = []

    for train_index, test_index in kf.split(X):  
        
        X_train, X_test = X.iloc[train_index], X.iloc[test_index]
        y_train, y_test = y.iloc[train_index], y.iloc[test_index]
        model.fit(X_train, y_train)
        y_pred = model.predict(X_test)
        r2 = r2_score(y_test, y_pred)
        rmse = np.sqrt(mean_squared_error(y_test, y_pred))
        r2_scores.append(r2)
        rmse_scores.append(rmse)

    return np.mean(r2_scores), np.mean(rmse_scores) 

mean_r2, mean_rmse = evaluate_model(X, y, elastic_net, kf)
print(f"Mean R²: {mean_r2}")
print(f"Mean RMSE: {mean_rmse}")


0      0
1      0
2      1
3      1
4      8
      ..
415    0
416    0
417    6
418    9
419    0
Name: weighted_score, Length: 420, dtype: int64
Mean R²: 0.9804088183955202
Mean RMSE: 9.815819881827952
