In [None]:
# Trim both inputs down to the columns needed for the analysis:
#   * joint angles: every column except the trailing five
#   * POI metrics: the pitch key, the session id and the pitch speed
filtered_joint_angles_df = joint_angles_df.iloc[:, :-5]
filtered_poi_metrics_df = poi_metrics_df.loc[
    :, ['session_pitch', 'session', 'pitch_speed_mph']
]

# Join the two tables on 'session_pitch' (default inner join), so every
# joint-angle row carries its pitch's session and speed.
merged_df = filtered_joint_angles_df.merge(
    filtered_poi_metrics_df,
    on='session_pitch',
)

# Map each session (pitcher) to its 'root cause' variable: the biomechanical
# variable of the slowest pitch that is the first to depart from the early-window
# behaviour of that pitcher's other pitches.
root_cause_dict = {}

# Select the biomechanical variables by name rather than by position: pd.merge
# places 'session' and 'pitch_speed_mph' after the joint-angle columns, so a
# positional slice that skips "the first three columns" would drop a real
# variable and analyse 'session' instead.
non_feature_cols = {'session_pitch', 'time', 'session', 'pitch_speed_mph'}
feature_cols = [col for col in merged_df.columns if col not in non_feature_cols]

# groupby(sort=False) visits the sessions in order of first appearance and
# leaves out rows whose session is missing.
for session, session_data in merged_df.groupby('session', sort=False):

    # merged_df repeats the pitch speed on every time sample of a pitch, so
    # reduce to one speed per pitch; otherwise pitches recorded with more
    # samples would weigh more in the mean and standard deviation below.
    # NOTE(review): assumes a single speed per session_pitch -- confirm.
    pitch_speeds = (
        session_data.drop_duplicates('session_pitch')
        .set_index('session_pitch')['pitch_speed_mph']
        .dropna()
    )
    if pitch_speeds.empty:
        continue

    # Identify the slowest pitch (first one wins on ties) and describe the
    # speed distribution of the remaining pitches.
    slowest_pitch = pitch_speeds.idxmin()
    other_speeds = pitch_speeds.drop(slowest_pitch)
    mean_speed = other_speeds.mean()
    std_speed = other_speeds.std(ddof=0)  # population std, as np.std computes

    # Without a positive spread the z-score is undefined (NaN) or a spurious
    # -inf from dividing by zero, so there is nothing to compare against.
    if not std_speed > 0:
        continue

    # Only analyse sessions whose slowest pitch is at least one standard
    # deviation below the mean of the other pitches.
    z_score_slowest = (pitch_speeds[slowest_pitch] - mean_speed) / std_speed
    if not z_score_slowest <= -1:
        continue

    # Split the session into the slowest pitch and the reference pitches; the
    # reference statistics come from the early time window (time <= 0.2) only.
    is_slowest = session_data['session_pitch'] == slowest_pitch
    slowest_pitch_data = session_data[is_slowest]
    all_pitches_data = session_data[~is_slowest]
    early_window = all_pitches_data[all_pitches_data['time'] <= 0.2]

    # First time at which each variable of the slowest pitch lies more than
    # two standard deviations from the early-window mean.
    first_deviation_dict = {}
    for col in feature_cols:
        mean_val = early_window[col].mean()
        std_val = early_window[col].std(ddof=0)

        deviates = (slowest_pitch_data[col] - mean_val).abs() > 2 * std_val
        first_deviation_time = slowest_pitch_data.loc[deviates, 'time'].min()

        # A variable that never deviates yields NaN; leaving it out keeps the
        # min() below from depending on where the NaN sits in the dict.
        if pd.notna(first_deviation_time):
            first_deviation_dict[col] = first_deviation_time

    # No variable deviated at all: no root cause can be named for this pitcher.
    if not first_deviation_dict:
        continue

    # The root cause is the variable that deviates first (earliest time; the
    # first column wins on ties).
    root_cause_variable = min(first_deviation_dict, key=first_deviation_dict.get)
    root_cause_dict[session] = root_cause_variable

from collections import Counter

# Count how many pitchers share each 'root cause' variable. Counter tallies in a
# single pass and keeps the variables in order of first appearance; converting
# back to a plain dict keeps the result type unchanged.
root_cause_frequency = dict(Counter(root_cause_dict.values()))

# Last expression of the cell, so the notebook displays the tally.
root_cause_frequency
