In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import statsmodels.api as sm
from statsmodels.formula.api import ols
# Additional imports for regression, plotting and statistics
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.linear_model import LinearRegression
import numpy as np
import seaborn as sns
import math
from scipy.stats import ttest_ind
from statistics import mean, stdev

# Input/output locations (absolute Windows paths).
# Every backslash is doubled so that no sequence such as "\D" is parsed as an
# invalid string escape (which raises a SyntaxWarning on recent Python versions).
log_file = "C:\\Users\\ekuep\\Desktop\\ols_logs_aggreg.txt"  # OLS summaries are appended here
csv_file = "C:\\Users\\ekuep\\Desktop\\rsquared_values.csv"  # per-fit R^2 rows are appended here
file_path = "C:\\Users\\ekuep\\Desktop\\logresults_02-07-24_10-51.csv"  # input table read with pd.read_csv

  file_path = "C:\\Users\\ekuep\Desktop\\logresults_02-07-24_10-51.csv"


In [None]:
# For each participant, regresses the change in similarity score on the percent change in stance time symmetry.
# Runs through every combination of participant, sensor configuration and algorithm.

# Read the results table and give its seven columns fixed names.
data = pd.read_csv(file_path)
data.columns = ['FilePath', 'Sensor', 'GaitParam', 'Algorithm', 'Participant', 'X', 'Y']
# Force X and Y to numeric; entries that cannot be parsed become NaN.
data = data.assign(
    X=pd.to_numeric(data['X'], errors='coerce'),
    Y=pd.to_numeric(data['Y'], errors='coerce'),
)


def percent_difference(values):
    """Return each value's absolute percent difference from the first value, rounded up.

    Parameters
    ----------
    values : pandas.Series
        Numeric series; its first element (by position) is used as the baseline.

    Returns
    -------
    list
        One entry per element of ``values``:
        ``ceil(|value - base| * 100 / |base|)`` as an int, or ``nan`` where that
        quantity is undefined (missing value, missing baseline, or zero
        baseline). An empty input yields an empty list.
    """
    if len(values) == 0:
        return []

    nan = float('nan')
    base_value = values.iloc[0]  # Take the first value as base

    # With a missing or zero baseline no percentage can be computed; report
    # NaN for every entry instead of letting math.ceil raise.
    if math.isnan(base_value) or base_value == 0:
        return [nan] * len(values)

    # Divide by the magnitude of the baseline so the result stays non-negative
    # even when the baseline itself is negative.
    denominator = abs(base_value)
    result = []
    for value in values:
        pct = abs(value - base_value) * 100 / denominator
        # math.ceil cannot convert NaN/inf to an integer, so pass those through as NaN.
        result.append(math.ceil(pct) if math.isfinite(pct) else nan)
    return result

def plot_scatter_and_regress(data, algorithm, participant, sensor):
    """Fit and plot a per-participant OLS regression for one algorithm/sensor pair.

    Selects the rows of ``data`` matching ``algorithm``, ``participant`` and
    ``sensor``. X is expressed as a percent difference from the first selected
    row (via ``percent_difference``) and Y as the difference from the first
    selected row. ``Y ~ X`` is fitted with statsmodels OLS; the fit summary is
    appended to the module-level ``log_file`` and the R^2 value to the
    module-level ``csv_file``. A scatter plot with the fitted line is drawn on
    a new figure.

    NOTE: a later cell defines another function with this same name but a
    different signature; once that cell has run, this definition is replaced.

    Parameters
    ----------
    data : pandas.DataFrame
        Table with at least 'Algorithm', 'Participant', 'Sensor', 'X', 'Y'.
    algorithm, participant, sensor
        Values used to select rows from the corresponding columns.

    Returns
    -------
    None
    """
    row_mask = ((data['Algorithm'] == algorithm) &
                (data['Participant'] == participant) &
                (data['Sensor'] == sensor))
    # Work on an independent copy: columns are added below, and adding them to
    # a filtered slice of `data` triggers pandas' SettingWithCopyWarning.
    subset = data[row_mask].copy()
    print(f'Generating plot and regression for {algorithm}, {participant}, {sensor}, number of data points: {len(subset)}')

    if subset.empty:
        print(f'No data available for {algorithm}, {participant}, {sensor}')
        return

    # Both axes are expressed relative to the first selected row.
    subset['Percent_Difference'] = percent_difference(subset['X'])
    subset['Ydiff'] = subset['Y'] - subset['Y'].iloc[0]
    X = subset['Percent_Difference'].values
    Y = subset['Ydiff'].values

    subset_df = pd.DataFrame({'X': X, 'Y': Y})
    results = ols('Y~X', data=subset_df).fit()

    with open(log_file, 'a') as f:
        f.write(f'Regression results for {algorithm}, {participant}, {sensor}:\n')
        f.write(results.summary().as_text() + '\n\n')

    rsquared = results.rsquared

    with open(csv_file, 'a') as f:
        f.write(f'{algorithm},{participant},{sensor},{rsquared}\n')

    # Draw on an explicit Axes rather than relying on pyplot's "current figure".
    fig, ax = plt.subplots()
    ax.scatter(X, Y, label=f'{algorithm}-{participant}-{sensor}')
    sns.regplot(x='X', y='Y', data=subset_df, ci=None, ax=ax)
    # R^2 annotation anchored at the top-left corner of the data range.
    ax.text(x=subset_df['X'].min(), y=subset_df['Y'].max(), s=f'$R^2 = {rsquared:.2f}$')
    ax.set_xticks(X)
    ax.set_ylabel('Change in Similarity')
    ax.set_xlabel('% Change in Stance Time Symmetry')
    ax.set_title(f'Scatter Plot and Regression for {algorithm}, {participant}, {sensor}')
    #fig.savefig(f'C:\\Users\\ekuep\\Desktop\\{algorithm}-{participant}-{sensor}.png')

# Distinct participants, sensor configurations and algorithms found in the data
participants, sensors, algorithms = (
    data[column].unique() for column in ('Participant', 'Sensor', 'Algorithm')
)

# One regression/plot per combination: participants vary slowest, algorithms fastest
for participant, sensor, algorithm in (
    (p, s, a) for p in participants for s in sensors for a in algorithms
):
    plot_scatter_and_regress(data, algorithm, participant, sensor)


In [None]:
# Aggregates all participants for each algorithm and sensor configuration, then computes SRM, Cohen's d and Welch's t-test on the pooled results.

def calculate_srm(col1, col2):
    """Standardized response mean (SRM) of the paired differences ``col1 - col2``.

    Only positions where both series hold a non-missing value are used. The
    SRM is the mean of those differences divided by their sample standard
    deviation (``ddof=1``).

    Parameters
    ----------
    col1, col2 : pandas.Series
        Paired numeric observations.

    Returns
    -------
    float
        The SRM, or ``nan`` when it is undefined: no valid pair, a single
        valid pair (no sample standard deviation), or differences with zero
        standard deviation.
    """
    # Keep only the pairs where neither side is missing
    both_present = ~col1.isna() & ~col2.isna()
    if not np.any(both_present):
        return np.nan

    differences = col1[both_present] - col2[both_present]
    # A sample standard deviation needs at least two differences.
    if len(differences) < 2:
        return np.nan

    std_diff = np.std(differences, ddof=1)
    # Identical differences give a zero denominator; report "undefined"
    # explicitly instead of dividing by zero.
    if std_diff == 0:
        return np.nan

    return np.mean(differences) / std_diff

def calculate_cohens_d(group1, group2):
    """Cohen's d between two groups, standardised by the root mean of their variances.

    Computes ``(mean(group1) - mean(group2)) / sqrt((s1**2 + s2**2) / 2)``,
    where ``s1`` and ``s2`` are the sample standard deviations of the groups.

    Parameters
    ----------
    group1, group2 : sized iterable of numbers
        Observations of each group (e.g. a list or a pandas Series).

    Returns
    -------
    float
        The effect size, or ``nan`` when it is undefined: either group has
        fewer than two observations (no sample standard deviation exists) or
        both groups have zero variance.
    """
    # statistics.stdev raises StatisticsError for fewer than two data points;
    # return NaN so one sparse column does not abort a whole batch of results.
    if len(group1) < 2 or len(group2) < 2:
        return np.nan

    mean_diff = mean(group1) - mean(group2)
    pooled_std = np.sqrt((stdev(group1)**2 + stdev(group2)**2) / 2)
    # Zero spread in both groups leaves nothing to standardise by.
    if pooled_std == 0:
        return np.nan
    return mean_diff / pooled_std

# Function to aggregate participants, write effect-size statistics, and plot mean ± 1.96·SEM error bars
def plot_scatter_and_regress(data, algorithm, sensor, csv_file):
    """Aggregate all participants for one algorithm/sensor pair, write statistics, and plot.

    For every participant in the selected rows, Y is converted to an absolute
    percent change from that participant's first row. The per-participant
    values are collected into a table with columns '0', '3%', '6%', '9%'.
    Each non-baseline column is compared against column '0' with the SRM,
    Welch's t-test and Cohen's d; the results are appended to three CSV files.
    Finally the column means are plotted with error bars of 1.96 * SEM.

    Despite its name, this function fits no regression. It shares its name
    with the per-participant function defined in an earlier cell and replaces
    it once this cell has run.

    Parameters
    ----------
    data : pandas.DataFrame
        Table with at least 'Algorithm', 'Sensor', 'Participant' and 'Y'.
    algorithm, sensor
        Values used to select rows from the corresponding columns.
    csv_file
        Ignored; kept only so existing calls keep working.

    Returns
    -------
    None
    """
    # NOTE(review): the `csv_file` argument has always been overwritten by a
    # fixed path here. It stays ignored on purpose: the existing caller passes
    # the R^2 file path, so honouring it would mix SRM rows into that file.
    srm_path = "C:\\Users\\ekuep\\Desktop\\SRM_values.csv"
    ttest_path = "C:\\Users\\ekuep\\Desktop\\ttest_values.csv"
    cohens_d_path = "C:\\Users\\ekuep\\Desktop\\CohensD_values.csv"

    subset = data[(data['Algorithm'] == algorithm) &
                  (data['Sensor'] == sensor)]
    print(f'Generating plot and regression for {algorithm}, {sensor}, number of data points: {len(subset)}')

    if subset.empty:
        return

    # One row per participant, one column per condition. Rows beyond the
    # pre-allocated 12 are added by .loc assignment if more participants exist.
    df = pd.DataFrame(columns=['0', '3%', '6%', '9%'], index=range(12))
    for i, participant in enumerate(subset['Participant'].unique()):
        participant_y = subset.loc[subset['Participant'] == participant, 'Y']
        baseline = participant_y.iloc[0]
        Y = (abs(participant_y - baseline) * 100 / baseline).values.tolist()

        # Only participants with 3 or 4 rows are recorded; any other count is skipped.
        if len(Y) == 3:
            df.loc[i] = Y + [None]  # Add None to make it 4 elements
        elif len(Y) == 4:
            df.loc[i] = Y

    print(df)

    # The table was created without data, so convert it to float once and run
    # every statistic below on numeric columns (None becomes NaN).
    df_numeric = df.astype(float)

    srm_results = {}
    t_test_results = {}
    cohen_d_results = {}

    start_column = '0'
    baseline_values = df_numeric[start_column].dropna()
    for col in df_numeric.columns:
        # Skip the baseline column itself and columns with no values at all
        if col == start_column or df_numeric[col].isna().all():
            continue
        condition_values = df_numeric[col].dropna()

        srm_results[col] = calculate_srm(baseline_values, condition_values)

        # Welch's t-test (unequal variances)
        t_stat, p_val = ttest_ind(baseline_values, condition_values, equal_var=False)
        t_test_results[col] = (t_stat, p_val)

        cohen_d_results[col] = calculate_cohens_d(baseline_values, condition_values)

    # Append results to the CSV files
    with open(srm_path, "a") as srm_out:
        for key, value in srm_results.items():
            srm_out.write(f"{algorithm},{sensor},{key},{value}\n")

    # The statistic and p-value go into separate fields. Writing them as a
    # dict repr embedded a comma (and Python syntax) inside a single CSV cell.
    with open(ttest_path, "a") as ttest_out:
        for key, (t_stat, p_val) in t_test_results.items():
            ttest_out.write(f"{algorithm},{sensor},{key},{t_stat},{p_val}\n")

    with open(cohens_d_path, "a") as cohens_d_out:
        for key, value in cohen_d_results.items():
            cohens_d_out.write(f"{algorithm},{sensor},{key},{value}\n")

    mean_values = df_numeric.mean()
    sem_values = df_numeric.sem()  # Standard error of the mean
    x_labels = df_numeric.columns

    fig, ax = plt.subplots()
    # Presentation variant: fmt='o', capsize=15, linewidth=5, markersize=10, capthick=5 (font size 26)
    # The baseline column is left out of the plot.
    ax.errorbar(x_labels[1:], mean_values.iloc[1:], yerr=1.96 * sem_values.iloc[1:], fmt='o', capsize=5)
    ax.set_xlabel('%Δ Stance Time Symmetry')
    ax.set_ylabel('%Δ Similarity Score')
    ax.set_title(f"Algorithm: {algorithm}, Sensor: {sensor}")
    fig.tight_layout()
    #fig.savefig(f"C:\\Users\\ekuep\\Desktop\\percent_values_meanerrorbars_{algorithm}_{sensor}.png")  # Save plot
        
# Distinct sensor configurations and algorithms found in the data
sensors, algorithms = (data[column].unique() for column in ('Sensor', 'Algorithm'))

# One aggregated plot per (sensor, algorithm) pair; sensors vary slowest
for sensor, algorithm in ((s, a) for s in sensors for a in algorithms):
    plot_scatter_and_regress(data, algorithm, sensor, csv_file)
        