In [16]:
import random

import joblib
import numpy as np
import pandas as pd
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.metrics import make_scorer, mean_squared_error, r2_score
from sklearn.model_selection import train_test_split, cross_val_score, cross_validate, KFold

In [17]:
# Load the vertebra measurement workbook and discard its last five rows in one step
# (presumably trailing non-data rows in the sheet -- TODO confirm against the workbook).
data = pd.read_excel("/home/jovyan/CSCI 5700/Vert_data.xlsx").iloc[:-5]

In [18]:
# Summarise the loaded frame: column names, non-null counts and dtypes.
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 153 entries, 0 to 152
Data columns (total 27 columns):
 #   Column     Non-Null Count  Dtype  
---  ------     --------------  -----  
 0   ID         153 non-null    object 
 1   Sex        153 non-null    object 
 2   Age_mean   153 non-null    object 
 3   C2         153 non-null    float64
 4   C3         153 non-null    float64
 5   C4         153 non-null    float64
 6   C5         153 non-null    float64
 7   C6         153 non-null    float64
 8   C7         153 non-null    float64
 9   T1         153 non-null    float64
 10  T2         153 non-null    float64
 11  T3         153 non-null    float64
 12  T4         153 non-null    float64
 13  T5         153 non-null    float64
 14  T6         153 non-null    float64
 15  T7         153 non-null    float64
 16  T8         153 non-null    float64
 17  T9         153 non-null    float64
 18  T10        153 non-null    float64
 19  T11        153 non-null    float64
 20  T12       

In [19]:
# Preview the first rows to sanity-check the parsed values.
data.head()

Unnamed: 0,ID,Sex,Age_mean,C2,C3,C4,C5,C6,C7,T1,...,T9,T10,T11,T12,L1,L2,L3,L4,L5,Sum_Verts
0,korea1,M,21,41.61,12.92,12.9,12.98,14.43,16.62,18.34,...,22.17,23.13,23.44,25.25,26.2,27.13,27.29,28.76,29.45,506.58
1,korea2,M,21,37.56,13.35,12.73,12.64,12.69,14.7,16.89,...,20.92,20.44,21.27,23.78,24.05,22.85,24.54,24.82,25.11,462.9
2,korea3,M,21,36.63,14.61,13.09,11.88,11.79,13.29,15.99,...,22.33,23.71,24.18,25.39,26.44,28.41,29.44,29.71,30.41,496.28
3,korea4,F,40,36.23,12.59,10.64,11.59,10.73,13.42,14.79,...,19.87,20.38,20.88,22.41,22.05,24.47,24.36,24.5,24.2,438.69
4,korea5,M,50,39.38,13.19,13.23,13.75,13.32,17.65,17.9,...,22.36,22.0,21.85,23.38,24.5,25.56,27.22,28.49,29.05,501.87


In [20]:
# Train one random forest per gender group ('UD' = gender unknown, trained on all rows),
# report 5-fold cross-validated MSE and R^2, then refit on the whole group and save the model.
genders = ['M', 'F', 'UD']

results = {}

# MSE is wrapped with greater_is_better=False, so the scorer returns *negated* MSE;
# the sign is flipped back when the fold scores are aggregated below.
mse_scorer = make_scorer(mean_squared_error, greater_is_better=False)
r2_scorer = make_scorer(r2_score)
cv_scoring = {'mse': mse_scorer, 'r2': r2_scorer}

for gender in genders:
    if gender == 'UD':
        data_gender = data.sample(frac=1, random_state=42).reset_index(drop=True)  # Shuffle the entire dataset
    else:
        data_gender = data[data['Sex'] == gender].sample(frac=1, random_state=42).reset_index(drop=True)

    # Target is the summed vertebra measurement; identifiers, age and sex are not used as features.
    y = data_gender['Sum_Verts']
    X = data_gender.drop(columns=['Sum_Verts', 'ID', 'Age_mean', 'Sex'])

    rf = RandomForestRegressor(n_estimators=100, random_state=42)
    kf = KFold(n_splits=5, shuffle=True, random_state=42)

    # Evaluate both metrics in a single cross-validation pass: each fold is fitted once
    # and scored twice, instead of re-running the whole CV once per metric.
    cv_scores = cross_validate(rf, X, y, cv=kf, scoring=cv_scoring)
    mse_scores = cv_scores['test_mse']
    r2_scores = cv_scores['test_r2']
    cv_mse = -np.mean(mse_scores)
    cv_r2 = np.mean(r2_scores)

    # cross_validate fits clones, so `rf` itself is still unfitted here;
    # the model that gets saved is trained on every row of the group.
    rf.fit(X, y)
    results[gender] = {
        "Mean Squared Error (Cross-Validation)": cv_mse,
        "R-squared (Cross-Validation)": cv_r2
    }

    model_filename = f"random_forest_{gender}.joblib"
    joblib.dump(rf, model_filename)

    print(f"Results for {gender}:")
    print(f"Mean Squared Error (Cross-Validation): {cv_mse}")
    print(f"R-squared (Cross-Validation): {cv_r2}")
    print(f"Model saved as {model_filename}")
    print("-" * 30)

Results for M:
Mean Squared Error (Cross-Validation): 64.7073135599998
R-squared (Cross-Validation): 0.7520434251981494
Model saved as random_forest_M.joblib
------------------------------
Results for F:
Mean Squared Error (Cross-Validation): 52.810869969538814
R-squared (Cross-Validation): 0.8443203223455521
Model saved as random_forest_F.joblib
------------------------------
Results for UD:
Mean Squared Error (Cross-Validation): 48.23677977253375
R-squared (Cross-Validation): 0.936359400615765
Model saved as random_forest_UD.joblib
------------------------------


In [21]:
#### Load the three per-gender models that the training loop wrote to disk
# NOTE: joblib.load unpickles the file contents, so only load model files you produced yourself.
rf_male = joblib.load('random_forest_M.joblib')
rf_female = joblib.load('random_forest_F.joblib')
rf_unknown = joblib.load('random_forest_UD.joblib')

In [22]:
# Restrict the frame to its numeric columns first, then read the column labels off that view.
data_numeric = data.select_dtypes(include=[np.number])
numeric_columns = data_numeric.columns

In [73]:
### Build the table of mean values used to fill in missing vertebrae
overall_mean = data[numeric_columns].mean()
mean_values_by_gender = data.groupby('Sex')[numeric_columns].mean()

# One entry per gender label. 'UD' (gender unknown) has no group of its own,
# so the mean over the whole dataset is used for it.
per_gender_means = {
    'M': mean_values_by_gender.loc['M'],
    'F': mean_values_by_gender.loc['F'],
    'UD': overall_mean,
}

# The dict keys become columns; transpose so that each gender label is a row.
mean_values_dict = pd.DataFrame(per_gender_means).T

In [74]:
# Persist the per-gender mean table (rows M / F / UD) next to the saved models.
joblib.dump(mean_values_dict, "mean_values.joblib")

['mean_values.joblib']

In [75]:
# Reload the saved mean table; predict_sum_verts reads this global to fill in missing inputs.
# Note: this rebinds mean_values_by_gender, which earlier held the raw groupby('Sex') result.
mean_values_by_gender = joblib.load("mean_values.joblib")
mean_values_by_gender

Unnamed: 0,C2,C3,C4,C5,C6,C7,T1,T2,T3,T4,...,T9,T10,T11,T12,L1,L2,L3,L4,L5,Sum_Verts
M,38.350133,14.443867,13.7024,13.338533,13.5248,15.1884,17.455333,18.846267,18.731067,19.045333,...,21.604,22.171867,22.878533,24.2592,25.6096,26.0352,27.0888,27.6928,27.533067,489.1068
F,35.174769,12.663538,12.220769,11.939077,12.202,14.031692,15.561385,17.004462,16.965385,17.307692,...,19.559385,20.143077,20.943692,22.727846,23.837077,24.794923,25.333231,25.639385,25.346462,447.286462
UD,36.916078,13.659216,13.057778,12.706928,12.935098,14.669739,16.573137,17.998889,17.957124,18.272157,...,20.720719,21.331503,22.046993,23.565033,24.824314,25.482614,26.328301,26.791176,26.505359,470.556078


#### Once the models and the mean values have been saved, they can be loaded and used directly in later sessions

In [76]:
def predict_sum_verts(user_input, user_gender, input_decimals=None, output_decimals=2):
    """Predict ``Sum_Verts`` from a possibly incomplete set of vertebra measurements.

    Vertebrae that are absent from ``user_input`` or hold a null value are filled
    with the matching entry of the global ``mean_values_by_gender`` table, and the
    completed row is passed to the random forest for ``user_gender`` (the globals
    ``rf_male``, ``rf_female`` or ``rf_unknown``).

    Parameters
    ----------
    user_input : dict
        Maps vertebra names ('C2' ... 'L5') to measurements. The dict is read
        only: imputed values are written to an internal copy, never back to
        the caller's object.
    user_gender : str
        One of 'M', 'F' or 'UD'.
    input_decimals : int or None, optional
        When given, every non-null input is converted to ``float`` and rounded
        to this many decimals before prediction. ``None`` (default) leaves the
        inputs untouched.
    output_decimals : int or None, optional
        Number of decimals the prediction is rounded to (default 2). Pass
        ``None`` to get the unrounded model output.

    Returns
    -------
    float or None
        The predicted value, or ``None`` when a supplied value is not greater
        than 0 (a message is printed in that case).

    Raises
    ------
    ValueError
        If ``user_gender`` is not 'M', 'F' or 'UD'.
    """
    if user_gender not in ['M', 'F', 'UD']:
        raise ValueError("Invalid gender! Please input 'M', 'F', or 'UD'.")

    # Work on a private copy so the caller's dict is never filled with imputed means.
    features = dict(user_input)

    for key, value in features.items():
        if pd.notnull(value) and value <= 0:
            print(f"Error: {key} has a value of {value}, which is not greater than 0. Prediction aborted.")
            return None

    if input_decimals is not None:
        features = {
            key: (round(float(value), input_decimals) if pd.notnull(value) else value)
            for key, value in features.items()
        }

    # Advisory only: with fewer than 14 measured values the prediction still runs,
    # but most of the row consists of imputed means.
    non_null_features = [key for key, value in features.items() if not pd.isnull(value)]
    if len(non_null_features) < 14:
        print("Insufficient data: Please provide at least 14 non-null features for accurate prediction.")

    gender_means = mean_values_by_gender.loc[user_gender]

    spine_features = ['C2', 'C3', 'C4', 'C5', 'C6', 'C7', 'T1', 'T2', 'T3', 'T4', 'T5', 'T6',
                      'T7', 'T8', 'T9', 'T10', 'T11', 'T12', 'L1', 'L2', 'L3', 'L4', 'L5']

    for feature in spine_features:
        if feature not in features or pd.isnull(features[feature]):
            features[feature] = gender_means[feature]

    # `columns=` both orders the features and drops any extra keys the caller passed.
    input_df = pd.DataFrame([features], columns=spine_features)

    if user_gender == 'M':
        model = rf_male
    elif user_gender == 'F':
        model = rf_female
    else:
        model = rf_unknown

    prediction = model.predict(input_df)

    if output_decimals is None:
        return prediction[0]
    return round(prediction[0], output_decimals)

In [77]:
def mask_user_input(user_input, mask_fraction=0.3, rng=None):
    """Return a copy of ``user_input`` with a random subset of its values set to NaN.

    Parameters
    ----------
    user_input : dict
        Feature name -> value. The dict itself is left unmodified.
    mask_fraction : float, optional
        Fraction of the keys to blank out; ``int(len(user_input) * mask_fraction)``
        keys are chosen (default 0.3).
    rng : random.Random or None, optional
        Source of randomness. ``None`` (default) uses the module-level ``random``
        generator, so ``random.seed(...)`` still controls the selection; pass a
        ``random.Random(seed)`` instance for a selection independent of global state.

    Returns
    -------
    dict
        A new dict with the same keys in the same order, where the selected
        keys map to ``np.nan``.

    Raises
    ------
    ValueError
        If ``mask_fraction`` is outside ``[0, 1]``.
    """
    if not 0 <= mask_fraction <= 1:
        raise ValueError(f"mask_fraction must be between 0 and 1, got {mask_fraction}.")

    sampler = random if rng is None else rng

    # Copy first: the caller's measurements must survive the masking.
    masked = dict(user_input)
    features_to_mask = sampler.sample(list(masked.keys()), int(len(masked) * mask_fraction))
    for feature in features_to_mask:
        masked[feature] = np.nan
    return masked

# Evaluate the whole pipeline (random masking -> mean imputation -> gender model) on every row.
# The masking draws from the global `random` generator, so seed it to make the reported
# error reproducible from run to run.
random.seed(42)

eval_features = ['C2', 'C3', 'C4', 'C5', 'C6', 'C7', 'T1', 'T2', 'T3', 'T4', 'T5', 'T6',
                 'T7', 'T8', 'T9', 'T10', 'T11', 'T12', 'L1', 'L2', 'L3', 'L4', 'L5']

all_mse = []

# NOTE: the saved models were fitted on these same rows, so this is an in-sample error
# and will look better than the error on unseen subjects.
for index, test_user in data.iterrows():
    test_gender = test_user['Sex']
    # Fresh dict per row, so masking one subject can never affect another.
    user_input_data = {feature: test_user[feature] for feature in eval_features}
    actual_sum_verts = test_user['Sum_Verts']
    masked_user_input = mask_user_input(user_input_data)
    predicted_sum_verts = predict_sum_verts(masked_user_input, test_gender)
    # With a single sample this is just the squared error of that prediction.
    mse = mean_squared_error([actual_sum_verts], [predicted_sum_verts])
    all_mse.append(mse)
average_mse = np.mean(all_mse)
print(f"Average Mean Squared Error across all users: {average_mse}")

Average Mean Squared Error across all users: 54.34506274509809


In [78]:
# Spot check on the first row with every vertebra supplied (nothing masked).
check_features = ['C2', 'C3', 'C4', 'C5', 'C6', 'C7', 'T1', 'T2', 'T3', 'T4', 'T5', 'T6',
                  'T7', 'T8', 'T9', 'T10', 'T11', 'T12', 'L1', 'L2', 'L3', 'L4', 'L5']

test_user = data.iloc[0]
test_gender = test_user['Sex']
actual_sum_verts = test_user['Sum_Verts']

# Same key order as the model's feature list, one entry per vertebra.
user_input_data = {name: test_user[name] for name in check_features}

predicted_sum_verts = predict_sum_verts(user_input_data, test_gender)
mse = mean_squared_error([actual_sum_verts], [predicted_sum_verts])

print(f"Actual Sum_Verts: {actual_sum_verts}")
print(f"Predicted Sum_Verts: {predicted_sum_verts}")
print(f"Mean Squared Error: {mse}")

Actual Sum_Verts: 506.58
Predicted Sum_Verts: 505.39
Mean Squared Error: 1.4160999999999946


In [79]:
# Sparse example: only eight vertebrae are supplied, so the remaining ones are
# filled in by predict_sum_verts before the male model is queried.
user_gender_example = 'M'
user_input_example = dict(
    C2=41.5,
    C3=13.6,
    T5=22.3,
    L1=16.9,
    L2=16.9,
    L3=14.9,
    L4=24,
    L5=24,
)

predicted_sum_verts = predict_sum_verts(user_input_example, user_gender_example)
print(f"Predicted Sum_Verts for the user: {predicted_sum_verts}")

Insufficient data: Please provide at least 14 non-null features for accurate prediction.
Predicted Sum_Verts for the user: 487.75
