In [1]:
import pandas as pd
import joblib
from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.metrics import mean_absolute_error, mean_squared_error
from sklearn.metrics import r2_score
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder

In [2]:
dhi_df = pd.read_csv('D:/GitHub/overload.ai/datasets/dhi.csv')
faw_df = pd.read_csv('D:/GitHub/overload.ai/datasets/faw.csv')
hw_df = pd.read_csv('D:/GitHub/overload.ai/datasets/hw.csv')
weightlifting_df = pd.read_csv('D:/GitHub/overload.ai/datasets/weightlifting_721_workouts.csv')

# Load the aggregated dataset
data_df = pd.read_csv('D:/GitHub/overload.ai/datasets/data.csv')

# Check the shape of individual datasets and aggregated dataset
individual_shapes = {
    "dhi": dhi_df.shape,
    "faw": faw_df.shape,
    "hw": hw_df.shape,
    "weightlifting": weightlifting_df.shape
}

aggregated_shape = data_df.shape

individual_shapes, aggregated_shape

({'dhi': (5124, 12),
  'faw': (538, 12),
  'hw': (786, 12),
  'weightlifting': (9932, 10)},
 (16380, 14))

In [3]:
concatenated_df = pd.concat([dhi_df, faw_df, hw_df, weightlifting_df], ignore_index=True, sort=False)

is_equivalent = concatenated_df.equals(data_df)

concatenated_columns = concatenated_df.columns
data_columns = data_df.columns

In [4]:
data_df.head()

Unnamed: 0,Participant,Date,Workout Name,Duration,Exercise Name,Set Order,Weight,Reps,Distance,Seconds,Notes,Workout Notes,RPE,Volume
0,HW,12/4/2023 15:54,Legs,45m,Squat (Barbell),1,65.0,8,0.0,0,,,,520.0
1,HW,12/4/2023 15:54,Legs,45m,Squat (Barbell),2,65.0,9,0.0,0,,,,585.0
2,HW,12/4/2023 15:54,Legs,45m,Squat (Barbell),3,65.0,7,0.0,0,,,,455.0
3,HW,12/4/2023 15:54,Legs,45m,Squat (Barbell),4,65.0,7,0.0,0,,,,455.0
4,HW,12/4/2023 15:54,Legs,45m,Leg Extension (Machine),1,59.0,8,0.0,0,,,,472.0


In [5]:
descriptive_stats = data_df.describe()

missing_values = data_df.isnull().sum()

descriptive_stats, missing_values

(          Set Order        Weight          Reps      Distance       Seconds  \
 count  16380.000000  16380.000000  16380.000000  16380.000000  16380.000000   
 mean       2.852686    119.966719      9.809096      0.053972      0.638156   
 std        1.733082    115.200526      3.865718      4.109325     22.917945   
 min        1.000000      0.000000      0.000000      0.000000      0.000000   
 25%        1.000000     30.000000      8.000000      0.000000      0.000000   
 50%        3.000000     80.000000     10.000000      0.000000      0.000000   
 75%        4.000000    190.000000     12.000000      0.000000      0.000000   
 max       11.000000   2956.000000     81.000000    363.000000   1260.000000   
 
        RPE        Volume  
 count  0.0  16380.000000  
 mean   NaN    989.501622  
 std    NaN    944.518681  
 min    NaN      0.000000  
 25%    NaN    320.000000  
 50%    NaN    720.000000  
 75%    NaN   1500.000000  
 max    NaN  17736.000000  ,
 Participant          0
 

In [6]:
data_cleaned = data_df.drop(columns=['Duration', 'Notes', 'Workout Notes', 'RPE'])
data_cleaned = data_cleaned[data_cleaned['Weight'] <= 1000]

X = data_cleaned.drop(columns=['Participant','Date', 'Workout Name', 'Exercise Name', 'Weight'])
y = data_cleaned['Weight']

# Splitting data
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)

X_train.shape, X_val.shape


((13102, 5), (3276, 5))

In [7]:
# Initialize models
models = {
    "Linear Regression": LinearRegression(),
    "Decision Trees": DecisionTreeRegressor(),
    "Random Forest": RandomForestRegressor(),
    "Gradient Boosting": GradientBoostingRegressor()
}

# Train and evaluate models
results = {}

for name, model in models.items():
    # Train
    model.fit(X_train, y_train)
    # Predict
    predictions = model.predict(X_val)
    # Evaluate
    mae = mean_absolute_error(y_val, predictions)
    rmse = mean_squared_error(y_val, predictions, squared=False)
    results[name] = {"MAE": mae, "RMSE": rmse}

results

{'Linear Regression': {'MAE': 40.531143675080244, 'RMSE': 502.1120831998039},
 'Decision Trees': {'MAE': 0.522896053943834, 'RMSE': 10.279593294364856},
 'Random Forest': {'MAE': 0.5888351349619403, 'RMSE': 9.587946744778487},
 'Gradient Boosting': {'MAE': 4.173540135866479, 'RMSE': 9.609182499328664}}

In [8]:
def forward_selection(X_train, y_train, X_val, y_val):
    remaining_features = list(X_train.columns)
    selected_features = []
    current_score, best_new_score = float('inf'), float('inf')
    
    while remaining_features and current_score == best_new_score:
        scores_with_candidates = []
        
        for candidate in remaining_features:
            model = RandomForestRegressor(n_estimators=100, random_state=42)
            model.fit(X_train[selected_features + [candidate]], y_train)
            predictions = model.predict(X_val[selected_features + [candidate]])
            score = mean_absolute_error(y_val, predictions)
            scores_with_candidates.append((score, candidate))
        
        scores_with_candidates.sort()
        best_new_score, best_candidate = scores_with_candidates[0]
        
        if best_new_score < current_score:
            remaining_features.remove(best_candidate)
            selected_features.append(best_candidate)
            current_score = best_new_score

    return selected_features, current_score

selected_features, best_score = forward_selection(X_train, y_train, X_val, y_val)

selected_features, best_score


(['Volume', 'Reps', 'Seconds'], 0.5496440405796461)

In [9]:
def evaluate_model(model, X_train, y_train, X_val, y_val):
    model.fit(X_train, y_train)
    predictions = model.predict(X_val)
    mae = mean_absolute_error(y_val, predictions)
    rmse = mean_squared_error(y_val, predictions, squared=False)
    r2 = r2_score(y_val, predictions)
    return mae, rmse, r2

new_results = {}
for name, model in models.items():
    mae, rmse, r2 = evaluate_model(model, X_train[selected_features], y_train, X_val[selected_features], y_val)
    new_results[name] = {"MAE": mae, "RMSE": rmse, "R^2": r2}

new_results

{'Linear Regression': {'MAE': 27.515797747487564,
  'RMSE': 40.964971746116845,
  'R^2': 0.8698368488558879},
 'Decision Trees': {'MAE': 0.47175328996337007,
  'RMSE': 10.213908750607928,
  'R^2': 0.9919081712720691},
 'Random Forest': {'MAE': 0.5386855040028495,
  'RMSE': 9.701857133916095,
  'R^2': 0.9926991658175314},
 'Gradient Boosting': {'MAE': 3.9757373706062964,
  'RMSE': 9.635072668991768,
  'R^2': 0.9927993330572193}}

In [12]:
data_df

Unnamed: 0,Participant,Date,Workout Name,Duration,Exercise Name,Set Order,Weight,Reps,Distance,Seconds,Notes,Workout Notes,RPE,Volume
0,HW,12/4/2023 15:54,Legs,45m,Squat (Barbell),1,65.0,8,0.0,0,,,,520.0
1,HW,12/4/2023 15:54,Legs,45m,Squat (Barbell),2,65.0,9,0.0,0,,,,585.0
2,HW,12/4/2023 15:54,Legs,45m,Squat (Barbell),3,65.0,7,0.0,0,,,,455.0
3,HW,12/4/2023 15:54,Legs,45m,Squat (Barbell),4,65.0,7,0.0,0,,,,455.0
4,HW,12/4/2023 15:54,Legs,45m,Leg Extension (Machine),1,59.0,8,0.0,0,,,,472.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
16375,DHI,9/7/2023 14:54,Recomp Full-Body,2h 10m,Lateral Raise (Dumbbell),4,7.0,15,0.0,0,,,,105.0
16376,DHI,9/7/2023 14:54,Recomp Full-Body,2h 10m,Bicep Curl (Dumbbell),1,23.0,15,0.0,0,,,,345.0
16377,DHI,9/7/2023 14:54,Recomp Full-Body,2h 10m,Bicep Curl (Dumbbell),2,32.0,10,0.0,0,,,,320.0
16378,DHI,9/7/2023 14:54,Recomp Full-Body,2h 10m,Bicep Curl (Dumbbell),3,32.0,10,0.0,0,,,,320.0


In [13]:
encoder = LabelEncoder()
data_df['Exercise_encoded'] = encoder.fit_transform(data_df['Exercise Name'])

In [16]:
X = data_df[['Volume', 'Reps', 'Seconds', 'Exercise_encoded']]
y = data_df['Weight']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [17]:
predictions = model.predict(X_test)
mae = mean_absolute_error(y_test, predictions)
mse = mean_squared_error(y_test, predictions)
r2 = r2_score(y_test, predictions)

print(f"Mean Absolute Error: {mae}")
print(f"Mean Squared Error: {mse}")
print(f"R^2 Score: {r2}")

ValueError: The feature names should match those that were passed during fit.
Feature names unseen at fit time:
- Exercise_encoded


In [18]:
from sklearn.ensemble import RandomForestRegressor

model = RandomForestRegressor(n_estimators=100, random_state=42)
model.fit(X_train, y_train)

model_path = "D:/GitHub/overload.ai/overloadai/random_forest_model.joblib"
joblib.dump(model, model_path)

model_path

'D:/GitHub/overload.ai/overloadai/random_forest_model.joblib'

In [19]:
rf_model = RandomForestRegressor(n_estimators=100, random_state=42)
rf_model.fit(X[selected_features], y)

model_path = "D:/GitHub/overload.ai/overloadai/random_forest_model2.joblib"
joblib.dump(rf_model, model_path)

model_path

'D:/GitHub/overload.ai/overloadai/random_forest_model2.joblib'