In [1]:
import joblib
import numpy as np
import pandas as pd
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.metrics import make_scorer, mean_squared_error, r2_score
from sklearn.model_selection import train_test_split, cross_val_score, KFold
from sklearn.model_selection import cross_validate

In [2]:
# Load the Vert_data.xlsx workbook and keep everything except its final five rows.
# NOTE(review): why the last five rows are excluded is not visible in this
# notebook — presumably non-data footer rows; confirm. The absolute path also
# ties the notebook to one machine.
data = pd.read_excel("/home/jovyan/CSCI 5700/Vert_data.xlsx").iloc[:-5]

In [3]:
# Print the frame's index range, column names, non-null counts and dtypes.
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 153 entries, 0 to 152
Data columns (total 27 columns):
 #   Column     Non-Null Count  Dtype  
---  ------     --------------  -----  
 0   ID         153 non-null    object 
 1   Sex        153 non-null    object 
 2   Age_mean   153 non-null    object 
 3   C2         153 non-null    float64
 4   C3         153 non-null    float64
 5   C4         153 non-null    float64
 6   C5         153 non-null    float64
 7   C6         153 non-null    float64
 8   C7         153 non-null    float64
 9   T1         153 non-null    float64
 10  T2         153 non-null    float64
 11  T3         153 non-null    float64
 12  T4         153 non-null    float64
 13  T5         153 non-null    float64
 14  T6         153 non-null    float64
 15  T7         153 non-null    float64
 16  T8         153 non-null    float64
 17  T9         153 non-null    float64
 18  T10        153 non-null    float64
 19  T11        153 non-null    float64
 20  T12       

In [4]:
# Train, cross-validate and persist one random-forest regressor per value of
# the 'Sex' column. Metrics are collected in `results`; each fitted model is
# written to "random_forest_<gender>.joblib" in the working directory.
genders = ['M', 'F', 'UD']

# Per-gender cross-validation metrics, keyed by the gender label.
results = {}

# mean_squared_error is a loss, so the scorer negates it (greater_is_better=False);
# the sign is flipped back when the metric is reported below.
mse_scorer = make_scorer(mean_squared_error, greater_is_better=False)
r2_scorer = make_scorer(r2_score)
scoring = {"mse": mse_scorer, "r2": r2_scorer}

n_splits = 5
# With an integer random_state every kf.split() call yields the same folds,
# so a single splitter can be shared by all genders.
kf = KFold(n_splits=n_splits, shuffle=True, random_state=42)

for gender in genders:
    # Rows for this gender, shuffled reproducibly and re-indexed from 0.
    data_gender = (
        data[data['Sex'] == gender]
        .sample(frac=1, random_state=42)
        .reset_index(drop=True)
    )

    # K-fold needs at least one sample per fold; report and move on instead of
    # letting KFold raise a ValueError and abort the remaining genders.
    if len(data_gender) < n_splits:
        print(f"Skipping {gender}: only {len(data_gender)} rows for {n_splits}-fold cross-validation")
        print("-" * 30)
        continue

    # Target is 'Sum_Verts'; identifier/metadata columns are excluded from the features.
    y = data_gender['Sum_Verts']
    X = data_gender.drop(columns=['Sum_Verts', 'ID', 'Age_mean', 'Sex'])

    rf = RandomForestRegressor(n_estimators=100, random_state=42)

    # One pass over the folds evaluates both metrics. The previous two
    # cross_val_score calls refitted identical forests on identical folds.
    cv_scores = cross_validate(rf, X, y, cv=kf, scoring=scoring)
    mse_scores = cv_scores["test_mse"]
    r2_scores = cv_scores["test_r2"]

    # cross_validate fits clones, so `rf` itself is still unfitted here.
    # The persisted model is trained on every row of this gender.
    rf.fit(X, y)

    mean_mse = -np.mean(mse_scores)  # undo the scorer's sign flip
    mean_r2 = np.mean(r2_scores)
    results[gender] = {
        "Mean Squared Error (Cross-Validation)": mean_mse,
        "R-squared (Cross-Validation)": mean_r2
    }

    model_filename = f"random_forest_{gender}.joblib"
    joblib.dump(rf, model_filename)

    print(f"Results for {gender}:")
    print(f"Mean Squared Error (Cross-Validation): {mean_mse}")
    print(f"R-squared (Cross-Validation): {mean_r2}")
    print(f"Model saved as {model_filename}")
    print("-" * 30)

Results for M:
Mean Squared Error (Cross-Validation): 64.7073135599998
R-squared (Cross-Validation): 0.7520434251981494
Model saved as random_forest_M.joblib
------------------------------
Results for F:
Mean Squared Error (Cross-Validation): 52.810869969538814
R-squared (Cross-Validation): 0.8443203223455521
Model saved as random_forest_F.joblib
------------------------------
Results for UD:
Mean Squared Error (Cross-Validation): 200.8912788430054
R-squared (Cross-Validation): 0.32271944754087745
Model saved as random_forest_UD.joblib
------------------------------


In [None]:
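# Reload the models saved by the training cell; file names follow
# f"random_forest_{gender}.joblib" with gender in 'M', 'F', 'UD'.
# rf_male = joblib.load('random_forest_M.joblib')
# rf_female = joblib.load('random_forest_F.joblib')
# rf_unknown = joblib.load('random_forest_UD.joblib')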