In [43]:
import pandas as pd
import os

# Reload the datasets
joint_angles_df = pd.read_csv(r"C:\Users\conne\OneDrive\Documents\GitHub\openbiomechanics\baseball_pitching\data\full_sig\joint_angles.csv")
poi_metrics_df = pd.read_csv(r"C:\Users\conne\OneDrive\Documents\GitHub\openbiomechanics\baseball_pitching\data\poi\poi_metrics.csv")

# Drop columns related to torque and energy/energy transfer from the POI dataset
cols_to_drop = [col for col in poi_metrics_df.columns if "torque" in col.lower() or "energy" in col.lower()]
poi_metrics_cleaned = poi_metrics_df.drop(columns=cols_to_drop)

# Merge the cleaned POI dataset with the joint angles dataset
merged_cleaned_df = pd.merge(joint_angles_df, poi_metrics_cleaned, on="session_pitch", how="inner")

# Display the first few rows of the cleaned and merged dataset
merged_cleaned_df.head()


Unnamed: 0,session_pitch,time,rear_ankle_angle_x,rear_ankle_angle_y,rear_ankle_angle_z,elbow_angle_x,elbow_angle_y,elbow_angle_z,rear_hip_angle_x,rear_hip_angle_y,...,rear_grf_z_max,rear_grf_mag_max,rear_grf_angle_at_max,lead_grf_x_max,lead_grf_y_max,lead_grf_z_max,lead_grf_mag_max,lead_grf_angle_at_max,peak_rfd_rear,peak_rfd_lead
0,1031_2,0.0,102.3377,24.2866,3.0583,98.5992,0.0,30.0717,-25.8661,-1.2451,...,1497.9859,1736.0368,55.7071,1278.4966,270.7849,2484.8747,2782.958,63.3501,11.6564,170.462
1,1031_2,0.0028,102.4702,24.304,3.1173,98.2178,0.0,31.48,-26.0253,-1.4272,...,1497.9859,1736.0368,55.7071,1278.4966,270.7849,2484.8747,2782.958,63.3501,11.6564,170.462
2,1031_2,0.0056,102.5925,24.3088,3.1824,97.8532,0.0,32.8288,-26.184,-1.5995,...,1497.9859,1736.0368,55.7071,1278.4966,270.7849,2484.8747,2782.958,63.3501,11.6564,170.462
3,1031_2,0.0083,102.6985,24.2957,3.2527,97.5209,0.0,34.0643,-26.3384,-1.754,...,1497.9859,1736.0368,55.7071,1278.4966,270.7849,2484.8747,2782.958,63.3501,11.6564,170.462
4,1031_2,0.0111,102.7835,24.2602,3.3269,97.2333,0.0,35.1428,-26.4854,-1.8843,...,1497.9859,1736.0368,55.7071,1278.4966,270.7849,2484.8747,2782.958,63.3501,11.6564,170.462


In [79]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
import tensorflow as tf
import matplotlib.pyplot as plt
from tensorflow.keras import regularizers


# Load the data
joint_angles_df = pd.read_csv(r"C:\Users\conne\OneDrive\Documents\GitHub\openbiomechanics\baseball_pitching\data\full_sig\joint_angles.csv")
poi_metrics_df = pd.read_csv(r"C:\Users\conne\OneDrive\Documents\GitHub\openbiomechanics\baseball_pitching\data\poi\poi_metrics.csv")

# Merge the datasets on 'session_pitch'
merged_df = pd.merge(joint_angles_df, poi_metrics_df, on=['session_pitch'])

# Filter out columns related to torque or energy/energy transfer
excluded_columns = [col for col in merged_df.columns if 'torque' in col.lower() or 'energy' in col.lower()]
merged_cleaned_df = merged_df.drop(columns=excluded_columns)

# Sample a fraction of the merged dataset, for example 10%
sample_fraction = 0.1
sampled_df = merged_cleaned_df.sample(frac=sample_fraction, random_state=42)

# Extract features and targets from the cleaned data
X_all = sampled_df.drop(columns=["session_pitch", "pitch_speed_mph", "session", "time"])
y_all = sampled_df["pitch_speed_mph"]

# Drop non-numeric columns
X_all_cleaned = X_all.select_dtypes(include=['float64', 'int64'])

# Scaling
scaler_X = StandardScaler()
scaler_Y = StandardScaler()

# Scale the data
X_all_scaled = scaler_X.fit_transform(X_all_cleaned)
y_all_scaled = scaler_Y.fit_transform(y_all.values.reshape(-1, 1))

# Adjusting the max_sequence_length for the entire dataset
max_sequence_length_all = merged_cleaned_df.groupby('session_pitch').size().max()

# Pad sequences for the entire dataset
X_all_padded, y_all_padded = [], []
for session_pitch, group in merged_cleaned_df.groupby('session_pitch'):
    X_group = X_all_scaled[group.index]
    y_group = y_all_scaled[group.index]
    
    padding_length = max_sequence_length_all - len(X_group)
    
    X_group_padded = np.vstack((np.zeros((padding_length, X_group.shape[1])), X_group))
    y_group_padded = np.vstack((np.zeros((padding_length, 1)), y_group))
    
    X_all_padded.append(X_group_padded)
    y_all_padded.append(y_group_padded[-1])  # We take the last value as the target

X_all_padded = np.array(X_all_padded)
y_all_padded = np.array(y_all_padded)

# Split the data into training and validation sets (80% training, 20% validation)
X_train, X_val, y_train, y_val = train_test_split(X_all_padded, y_all_padded, test_size=0.2, random_state=42)

# Model definition
model = tf.keras.models.Sequential()

# Masking layer to ignore the padding
model.add(tf.keras.layers.Masking(mask_value=0., input_shape=(X_train.shape[1], X_train.shape[2])))

# LSTM layers
#model.add(tf.keras.layers.LSTM(50, return_sequences=True))
model.add(tf.keras.layers.GRU(50))

# Dense layer to produce the final output
model.add(tf.keras.layers.Dense(1, kernel_regularizer=regularizers.l1(0.01)))

# Compile the model
optimizer = tf.keras.optimizers.Adam(learning_rate=0.0000000001, clipnorm=1.0)
model.compile(optimizer=optimizer, loss='mean_squared_error')

# Train the model
history = model.fit(X_train, y_train, validation_data=(X_val, y_val), epochs=50, batch_size=32)

# Plot training history
plt.figure(figsize=(12, 6))
plt.plot(history.history['loss'], label='Training Loss')
plt.plot(history.history['val_loss'], label='Validation Loss')
plt.xlabel('Epoch')
plt.ylabel('Mean Squared Error')
plt.legend()
plt.show()


IndexError: index 24771 is out of bounds for axis 0 with size 24771