## Random Forest Regressor

In [3]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.preprocessing import StandardScaler, RobustScaler, MinMaxScaler
from sklearn.model_selection import cross_val_score, KFold
import matplotlib.pyplot as plt

In [4]:
df = pd.read_csv('final_dataset_clusters-2.csv')
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 420 entries, 0 to 419
Data columns (total 30 columns):
 #   Column                             Non-Null Count  Dtype  
---  ------                             --------------  -----  
 0   country                            420 non-null    object 
 1   updated_country                    420 non-null    object 
 2   Country Code (ISO 3166-1 alpha-3)  420 non-null    object 
 3   Year                               420 non-null    int64  
 4   gold                               420 non-null    int64  
 5   silver                             420 non-null    int64  
 6   bronze                             420 non-null    int64  
 7   total                              420 non-null    int64  
 8   medal_sum                          420 non-null    int64  
 9   check                              420 non-null    bool   
 10  GDP                                420 non-null    float64
 11  HDI                                420 non-null    float64

In [5]:
df.head()

Unnamed: 0,country,updated_country,Country Code (ISO 3166-1 alpha-3),Year,gold,silver,bronze,total,medal_sum,check,...,BMI_obesity,BMI_morbid_obesity,Mean_Height,Diabetes_in_18+,Diabetes_treated_in_30+,non-HDL_cholesterol,HDL_cholesterol,Raised_blood_pressure,Hypertension,cluster
0,Afghanistan,Afghanistan,AFG,2000,0,0,0,0,0,True,...,0.037869,0.002605,159.84242,0.124577,0.22728,3.129567,1.128193,0.285826,0.360907,1
1,Afghanistan,Afghanistan,AFG,2004,0,0,0,0,0,True,...,0.053626,0.003623,160.101298,0.139517,0.23002,3.140379,1.120109,0.294444,0.367908,1
2,Afghanistan,Afghanistan,AFG,2008,0,0,1,1,1,True,...,0.072227,0.004871,160.345587,0.156097,0.23191,3.139214,1.1106,0.299683,0.375714,1
3,Afghanistan,Afghanistan,AFG,2012,0,0,1,1,1,True,...,0.095673,0.006511,160.547954,0.175056,0.233189,3.127356,1.098825,0.302909,0.383675,1
4,Algeria,Algeria,DZA,2000,1,1,3,5,5,True,...,0.131282,0.00617,164.568663,0.120538,0.384386,3.656587,1.040695,0.303167,0.375078,1


In [21]:
# target feature = weighted medal scores

df['weighted_score'] = (df['gold'] * 3) + (df['silver'] * 2) + (df['bronze'] * 1)
df_numeric = df.drop(columns=["country", "updated_country", "Country Code (ISO 3166-1 alpha-3)","check"])
X = df_numeric.drop(columns = ['weighted_score','gold','silver','bronze','total','medal_sum'])
y = df_numeric['weighted_score']
X.shape, y.shape

((420, 21), (420,))

In [32]:
# X.describe(), y.describe()
X.columns

Index(['Year', 'GDP', 'HDI', 'LE', 'EYS', 'MYS', 'GNIPC', 'MMR', 'PHDI',
       'CO2_PROD', 'BMI_underweight', 'BMI_obesity', 'BMI_morbid_obesity',
       'Mean_Height', 'Diabetes_in_18+', 'Diabetes_treated_in_30+',
       'non-HDL_cholesterol', 'HDL_cholesterol', 'Raised_blood_pressure',
       'Hypertension', 'cluster'],
      dtype='object')

# Robust Scaling 

In [26]:
scaler = RobustScaler() 
X_scaled = scaler.fit_transform(X) 

In [10]:
# X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = 42)
# X_train.shape, X_test.shape

In [28]:
print(X_scaled)
print(X_scaled.min())
print(X_scaled.max())

[[-1.00000000e+00 -3.64725472e-01 -2.37522506e+00 ...  4.19657670e-01
  -2.24362220e-01  1.00000000e+00]
 [-3.33333333e-01 -3.70425624e-01 -2.21822110e+00 ...  5.62760619e-01
  -1.39792970e-01  1.00000000e+00]
 [ 3.33333333e-01 -3.68149876e-01 -1.92725963e+00 ...  6.49744544e-01
  -4.55125166e-02  1.00000000e+00]
 ...
 [-3.33333333e-01  2.17711762e-01 -1.85523947e+00 ...  5.13686760e-01
   3.41414169e-01  1.00000000e+00]
 [ 3.33333333e-01  3.40567222e-02 -1.85379906e+00 ...  4.50389463e-01
   5.06342502e-01  1.00000000e+00]
 [ 1.00000000e+00  9.64040627e-04 -1.60460929e+00 ...  3.84928104e-01
   5.70079763e-01  1.00000000e+00]]
-3.956611570247938
59.22588817796017


In [30]:
y.max()

680

## 1. K-fold cross validation
## 2. Trained Random forest regressor
## 3. Evaluated the metrics for both training and testing to check for the presence of overfitting

In [78]:
# K fold cross validation 
# Training Random forest regressor model 
# Evaluating both training and testing metrics (RMSE and R2)

kf = KFold(n_splits = 5, shuffle = True, random_state = 42)
rf = RandomForestRegressor(n_estimators = 100, random_state = 42) 
rmse_scores = []
r2_scores = []
rmse_train_scores = []
r2_train_scores = []
for train_index, test_index in kf.split(X_scaled):
    X_train, X_test = X_scaled[train_index], X_scaled[test_index]
    y_train, y_test = y.iloc[train_index], y.iloc[test_index]

    rf.fit(X_train, y_train)
 
    y_pred = rf.predict(X_test)

    train_pred = rf.predict(X_train)
    
    rmse = np.sqrt(mean_squared_error(y_test, y_pred))  
    r2 = r2_score(y_test, y_pred)

    rmse_train = np.sqrt(mean_squared_error(y_train, train_pred))
    r2_train = r2_score(y_train, train_pred)

    rmse_train_scores.append(rmse_train)
    r2_train_scores.append(r2_train)
    
    rmse_scores.append(rmse)
    r2_scores.append(r2)


print("Training Results")
print(f"Average RMSE: {np.mean(rmse_train_scores)}")
print(f"Average R2: {np.mean(r2_train_scores)}")
print('-------------------------------')
print("Testing Results")
print(f"Average RMSE: {np.mean(rmse_scores)}")
print(f"Average R2: {np.mean(r2_scores)}")

Training Results
Average RMSE: 22.305231675483885
Average R2: 0.929461489095055
-------------------------------
Testing Results
Average RMSE: 55.1427465706868
Average R2: 0.5307756679433148


## Without Scaling 

In [46]:
X1 = df_numeric.drop(columns = ['weighted_score','gold','silver','bronze','total','medal_sum']) 
y1 = df_numeric['weighted_score']

In [56]:
# K fold cross validation 
# Training Random forest regressor model 
# Evaluating both training and testing metrics (RMSE and R2)

kf = KFold(n_splits = 5, shuffle = True, random_state = 42)
rf = RandomForestRegressor(n_estimators = 200, max_depth = 20, min_samples_split = 10, random_state = 42) 
rmse_scores = []
r2_scores = []
rmse_train_scores = []
r2_train_scores = []
for train_index, test_index in kf.split(X1):
    X_train, X_test = X1.iloc[train_index], X1.iloc[test_index]
    y_train, y_test = y1.iloc[train_index], y1.iloc[test_index]

    rf.fit(X_train, y_train)
 
    y_pred = rf.predict(X_test)

    train_pred = rf.predict(X_train)
    
    rmse = np.sqrt(mean_squared_error(y_test, y_pred))  
    r2 = r2_score(y_test, y_pred)

    rmse_train = np.sqrt(mean_squared_error(y_train, train_pred))
    r2_train = r2_score(y_train, train_pred)

    rmse_train_scores.append(rmse_train)
    r2_train_scores.append(r2_train)
    
    rmse_scores.append(rmse)
    r2_scores.append(r2)


print("Training Results")
print(f"Average RMSE: {np.mean(rmse_train_scores)}")
print(f"Average R2: {np.mean(r2_train_scores)}")
print('-------------------------------')
print("Testing Results")
print(f"Average RMSE: {np.mean(rmse_scores)}")
print(f"Average R2: {np.mean(r2_scores)}")

Training Results
Average RMSE: 37.94308805669754
Average R2: 0.796195550579089
-------------------------------
Testing Results
Average RMSE: 58.41090479241058
Average R2: 0.47668418275687463


In [20]:
df_numeric['weighted_score'].mean()

37.476190476190474

In [21]:
df_numeric['weighted_score'].std()

84.37704845076048

In [22]:
df_numeric['weighted_score'].max()

680

In [23]:
df_numeric['weighted_score'].min()

0

In [24]:
# With K-fold cross validation
X = pd.DataFrame(X)
y = pd.Series(y)
# print(y)
rf_model = RandomForestRegressor(n_estimators = 100, random_state = 42) 
kf = KFold(n_splits = 10, shuffle = True, random_state = 42) 

def evaluate_model(X, y, model, kf):
    r2_scores = []
    rmse_scores = [] 

    for train_index, test_index in kf.split(X):
        # print(f"Train indices: {train_index[:5]}...")  
        # print(f"Test indices: {test_index[:5]}...")

        X_train, X_test = X.iloc[train_index], X.iloc[test_index]
        y_train, y_test = y.iloc[train_index], y.iloc[test_index]

        # print(f"X_train shape: {X_train.shape}, y_train shape: {y_train.shape}")
        # print(f"X_test shape: {X_test.shape}, y_test shape: {y_test.shape}")


        model.fit(X_train, y_train)
        y_pred = model.predict(X_test)

        r2 = r2_score(y_test, y_pred) 
        rmse = np.sqrt(mean_squared_error(y_test, y_pred))

        r2_scores.append(r2)
        rmse_scores.append(rmse)

    return np.mean(r2_scores), np.mean(rmse_scores)


mean_r2, mean_rmse = evaluate_model(X, y, rf_model, kf)

print(f"Mean R2: {mean_r2}")
print(f"Mean RMSE: {mean_rmse}")

Mean R2: 0.9847450217743674
Mean RMSE: 9.464171104600657


In [25]:
# print(X.shape)
# print(y.shape)


print(f"Min: {y.min()}")
print(f"Max: {y.max()}")
print(f"Mean value of y: {y.mean()}")
print(f"Standard deviation of y: {y.std()}")


Min: 0
Max: 680
Mean value of y: 37.476190476190474
Standard deviation of y: 84.37704845076048
