In [57]:
import pandas as pd
import numpy as np
import random
import statsmodels.api as sm
#print("Done")

Done


In [None]:
verts = pd.read_excel("verts.xlsx")
vert_data = verts.iloc[0:153, 3:27]
#vert_data

In [59]:
def total_sampling(data, t):
    # Ensure t is in range
    if t > 23 or t < 1:
        raise ValueError("Sample size t must be between 1 and 23.")
    
    # Sampling number of targets
    sampled_columns = random.sample(range(23), t)
    vert_sampled = data.iloc[:, sampled_columns]
    
    # Everything but sum
    vert_sampled_counter = data.iloc[:, 23:24]
    
    vert_combined = pd.concat([vert_sampled.reset_index(drop=True), vert_sampled_counter.reset_index(drop=True)], axis=1)
    vert_combined.columns = list(vert_combined.columns[:-1]) + ["Sum_Verts"]

    # Train model
    X = vert_combined.iloc[:, :-1]  # All but the last column
    y = vert_combined["Sum_Verts"]
    X = sm.add_constant(X)  # Add a constant for intercept
    lm_model = sm.OLS(y, X).fit()
    
    predictions = lm_model.predict(X)
    residuals = vert_combined["Sum_Verts"] - predictions
    vert_sampling_sse = np.sum(residuals**2)  # Sum of Squared Errors
    vert_sampling_r2 = lm_model.rsquared_adj
    
    vertebrae_names = ["C2", "C3", "C4", "C5", "C6", "C7",
                       "T1", "T2", "T3", "T4", "T5", "T6", "T7", 
                       "T8", "T9", "T10", "T11", "T12", 
                       "L1", "L2", "L3", "L4", "L5"]
    
    variable_set = [1 if name in vert_combined.columns else 0 for name in vertebrae_names]
    # Return model and other stats
    return {
        "model": lm_model,
        "SSE": vert_sampling_sse,
        "R2": vert_sampling_r2,
        "variable_set": variable_set
    }
#print("Done")

Done


In [60]:
results = total_sampling(vert_data, t=10)
print(results['SSE'])
print(results['R2'])

3178.616375050077
0.971657054127795


In [61]:
# Prepare data for prediction
new_data = vert_data.iloc[0, :23].to_frame().T  # Sample new data
new_data.iloc[:, 21:23] = np.nan
#new_data

Unnamed: 0,C2,C3,C4,C5,C6,C7,T1,T2,T3,T4,...,T8,T9,T10,T11,T12,L1,L2,L3,L4,L5
0,41.61,12.92,12.9,12.98,14.43,16.62,18.34,19.6,18.53,19.84,...,21.3,22.17,23.13,23.44,25.25,26.2,27.13,27.29,,


In [62]:
new_data_imputed = new_data.copy()
for i in range(new_data_imputed.shape[1]):
    if new_data_imputed.iloc[:, i].isna().any():
        mean_value = vert_data.iloc[:, i].mean()  # Calculate mean
        new_data_imputed.iloc[:, i] = new_data_imputed.iloc[:, i].fillna(mean_value)  # Replace NA with mean
#new_data_imputed

In [63]:
X_columns = results["model"].model.exog_names  
new_data_imputed = new_data_imputed.reindex(columns=X_columns, fill_value=0)

In [65]:
predictions = results["model"].predict(new_data_imputed)
print(predictions[0])
error = (predictions[0] - verts.iloc[0, 26]) * 100 / verts.iloc[0, 26]
print(error)

508.6145786476078
0.401630275101231
