# Humans:

Note: for this code to work, the `data/humans` folder must contain one sub-folder per experiment. Each experiment folder must in turn contain one sub-folder per graph size N (e.g. `N150`, `N400`, `N1000`), holding the CSV files recorded for that N.

In [3]:
from pathlib import Path
import pandas as pd
import numpy as np
from scipy.special import ndtr
from scipy.optimize import curve_fit

In [10]:
class HumanAnalysisPipeline:
    """Analyse the human data of one experiment, stage by stage.

    The stages are meant to run in this order (see ``run_pipeline``):
    ``read_raw_data`` -> ``calculate_demographics`` -> ``clean_data`` ->
    ``calculate_fraction_correct`` -> ``calculate_fitted_params``.
    Each stage stores its result on the instance so the next one can use it.

    Args:
        experiment_name: Name of the sub-folder of ``./data/humans`` to read.
        analysis_type: ``"global"`` pools all files; ``"subject-wise"`` keeps
            one result per file (each file is given a ``subject_number``).

    Raises:
        ValueError: If ``analysis_type`` is not one of the two accepted values.
    """

    VALID_ANALYSIS_TYPES = ("global", "subject-wise")

    # Columns removed by clean_data(); missing ones are ignored.
    IRRELEVANT_COLUMNS = (
        "timeout", "failed_images", "failed_audio", "failed_video",
        "view_history", "trial_index", "time_elapsed", "internal_node_id",
        "success", "stimulus", "graphs_couple",
    )

    def __init__(self, experiment_name, analysis_type="global"):
        self.experiment_name = experiment_name
        if analysis_type not in self.VALID_ANALYSIS_TYPES:
            raise ValueError("analysis_type must be either 'global' or 'subject-wise'")
        self.analysis_type = analysis_type

    @staticmethod
    def _show(frame):
        """Render ``frame`` with IPython's ``display`` when available, else print it."""
        try:
            display(frame)
        except NameError:
            # ``display`` only exists inside IPython/Jupyter sessions.
            print(frame)

    @staticmethod
    def _sigmoid(k, k0, delta):
        """Return the fitted curve: rises from 0.5 towards 1 around ``k0`` with width ``delta``."""
        return (1 + ndtr((k - k0) / delta)) / 2

    @staticmethod
    def _fraction_correct(correct):
        """Return the mean of the non-missing ``correct`` values, or None if there are none."""
        valid = correct.dropna()
        if valid.empty:
            return None
        return float(valid.astype(float).mean())

    def _fit_sigmoid(self, k_values, fc_values, p0, label):
        """Fit ``_sigmoid`` to the data; return ``(k0, delta)`` or NaNs if the fit fails."""
        try:
            params = curve_fit(
                self._sigmoid,
                np.asarray(k_values, dtype=float),
                np.asarray(fc_values, dtype=float),
                p0=p0,
            )[0]
        except (RuntimeError, TypeError, ValueError) as e:
            # RuntimeError: no convergence; TypeError: fewer points than
            # parameters; ValueError: non-finite input. One bad fit should not
            # discard every other result.
            print(f"Could not fit sigmoid for {label}: {e}")
            return np.nan, np.nan
        return params[0], params[1]

    def read_raw_data(self):
        """Read every CSV under ``./data/humans/<experiment_name>/<folder>/``.

        The concatenated rows are stored in ``self.humans_complete_raw_df``.
        In subject-wise mode each file receives a ``subject_number`` that
        starts at 1 inside every folder, so it identifies a subject only
        within that folder. Folders and files are visited in sorted order so
        the numbering is reproducible. If the experiment directory is missing,
        a message is printed and an empty dataframe is stored.
        """
        print("|Reading raw data...")

        # Define the path to the experiment directory:
        experiment_path = Path(f'./data/humans/{self.experiment_name}')

        if not experiment_path.exists():
            print(f"Experiment directory {experiment_path} does not exist.")
            # Always leave the attribute defined so later stages can test it.
            self.humans_complete_raw_df = pd.DataFrame()
            return

        frames = []
        for folder in sorted(experiment_path.glob('*')):
            if not folder.is_dir():
                continue
            print(f"||Entered folder: {folder}")

            for file_index, file in enumerate(sorted(folder.glob('*.csv'))):
                if not file.is_file():
                    continue
                print(f"|||Reading file: {file}")

                try:
                    df = pd.read_csv(file)
                except (OSError, ValueError) as e:
                    # pandas parser/empty-data/decoding errors are ValueErrors.
                    print(f"Error reading file {file}: {e}")
                    continue

                if self.analysis_type == "subject-wise":
                    # Assigning a subject number to each file
                    # NOTE: assign subject id instead of number?
                    df['subject_number'] = file_index + 1
                frames.append(df)

        # Concatenate once instead of growing a dataframe inside the loop.
        if frames:
            self.humans_complete_raw_df = pd.concat(frames, axis=0, ignore_index=True)
        else:
            self.humans_complete_raw_df = pd.DataFrame()

        print(self.humans_complete_raw_df)

        print("|Completed reading raw data.")

    def calculate_demographics(self):
        """Print the average age and the number of female participants.

        Only rows whose ``trial_type`` is ``"survey-html-form"`` are used.
        The results are also stored in ``self.average_age`` (None when no age
        could be read) and ``self.female_count``.
        """
        print("|Calculating demographics.")

        raw = self.humans_complete_raw_df
        # isolating survey trials to calculate average age:
        survey_rows = raw[raw.trial_type == "survey-html-form"]

        ages = []
        female_count = 0
        for index, row in survey_rows.iterrows():
            response = row.response
            print(response)

            if not isinstance(response, str):
                # e.g. NaN: there is nothing to slice.
                print(f"Error reading age for row {index}: Invalid age format")
                continue

            try:
                # Age is read from the 4th and 3rd to last characters of the
                # response string, so only two-digit ages are supported.
                # NOTE(review): assumes gender comes first and age last in
                # the response string -- confirm against the survey form.
                ages.append(int(response[-4:-2]))
            except ValueError:
                print(f"Error reading age for row {index}: Invalid age format")

            # incrementing female count:
            if response[11:13] == "fe":
                female_count += 1

        # np.mean([]) returns NaN with a warning rather than raising, so the
        # empty case must be tested explicitly.
        if ages:
            average_age = np.mean(ages)
            print(f"Average age: {average_age}")
        else:
            average_age = None
            print("Error calculating average age: No valid age data found")

        print(f"Female count: {female_count}")

        self.average_age = average_age
        self.female_count = female_count

        print("|Completed calculating demographics.")

    def clean_data(self):
        """Keep only the final responses of the experiment trials.

        Rows with ``trial_type == "canvas-keyboard-response"`` are kept, the
        columns in ``IRRELEVANT_COLUMNS`` are dropped when present, and only
        rows whose ``response`` is ``'arrowright'`` or ``'arrowleft'`` remain.
        The result is stored in ``self.cleaned_data``. On failure an error is
        printed and ``self.cleaned_data`` is left untouched.
        """
        print("|Cleaning data...")

        try:
            raw = self.humans_complete_raw_df
            # - isolating experiment trials ("canvas-keyboard-response" ones):
            trials = raw[raw.trial_type == "canvas-keyboard-response"]
            # - dropping irrelevant variables (absent columns are skipped):
            trials = trials.drop(columns=list(self.IRRELEVANT_COLUMNS), errors="ignore")
            # - isolating trials where final responses were given
            #   (shuffles have " " as response):
            cleaned = trials[trials['response'].isin(['arrowright', 'arrowleft'])].copy()
        except (AttributeError, KeyError) as e:
            print(f"Error cleaning data: {e}")
            return

        self.cleaned_data = cleaned
        print(self.cleaned_data)
        print("|Completed cleaning data.")

    def calculate_fraction_correct(self):
        """Compute the fraction of correct responses for each (N, K).

        In subject-wise mode the fraction is computed per subject as well.
        Missing ``correct`` values are excluded from both numerator and
        denominator, and combinations without any valid trial are skipped.
        The result, with the extra columns ``K/sqrt(N)`` and ``K/N``, is
        stored in ``self.df_humans_fraction_correct``.

        Raises:
            ValueError: If ``self.analysis_type`` is not a supported value.
        """
        print("|Calculating fraction correct...")

        if self.analysis_type not in self.VALID_ANALYSIS_TYPES:
            raise ValueError("Invalid analysis type")
        subject_wise = self.analysis_type == "subject-wise"

        data = self.cleaned_data
        records = []
        for N_value in data['graph_size'].unique():
            # isolating data of current N value:
            currentN_data = data[data.graph_size == N_value]
            for K_value in currentN_data['clique_size'].unique():
                # isolating data of current clique size:
                currentK_data = currentN_data[currentN_data.clique_size == K_value]

                if not subject_wise:
                    fc = self._fraction_correct(currentK_data.correct)
                    if fc is not None:
                        records.append({'N': N_value, 'K': K_value, 'fc': fc})
                    continue

                for subject_number in currentN_data['subject_number'].unique():
                    subject_data = currentK_data[currentK_data.subject_number == subject_number]
                    # A subject may have no trial at this K: skip it instead
                    # of dividing by zero.
                    fc = self._fraction_correct(subject_data.correct)
                    if fc is not None:
                        records.append({'subject_number': subject_number, 'N': N_value, 'K': K_value, 'fc': fc})

        columns = (['subject_number'] if subject_wise else []) + ['N', 'K', 'fc']
        df_humans_fraction_correct = pd.DataFrame(records, columns=columns)
        if not records:
            # Give the empty frame numeric columns so the ratios below work.
            df_humans_fraction_correct = df_humans_fraction_correct.astype(float)

        # creating new variables and adding them to the dataframe:
        df_humans_fraction_correct['K/sqrt(N)'] = df_humans_fraction_correct['K'] / np.sqrt(df_humans_fraction_correct['N'])
        df_humans_fraction_correct['K/N'] = df_humans_fraction_correct['K'] / df_humans_fraction_correct['N']
        # saving dataframe:
        self.df_humans_fraction_correct = df_humans_fraction_correct
        # visualizing dataframe:
        self._show(df_humans_fraction_correct)
        print("|Completed calculating fraction correct.")

    # def calculate_shuffles(self):
    # TODO: perform analysis on the number of shuffles (starting from raw data)

    def calculate_fitted_params(self, p0=(70, 20)):
        """Fit the sigmoid to fraction correct vs. K for every N.

        In subject-wise mode one fit is made per subject and N. Fits that
        fail are reported and stored as NaN. The result is stored in
        ``self.df_humans_fitted_params``.

        Args:
            p0: Initial guess ``(K0, Delta)`` passed to ``curve_fit``.

        Raises:
            ValueError: If ``self.analysis_type`` is not a supported value.
        """
        print("|Calculating fitted parameters...")

        if self.analysis_type not in self.VALID_ANALYSIS_TYPES:
            raise ValueError("Invalid analysis type")
        subject_wise = self.analysis_type == "subject-wise"

        fractions = self.df_humans_fraction_correct
        records = []
        # looping through all N values:
        for N_value in fractions['N'].unique():
            # selecting rows for current N value:
            current_data = fractions[fractions['N'] == N_value]

            if not subject_wise:
                k0, delta = self._fit_sigmoid(current_data['K'], current_data['fc'], p0, f"N={N_value}")
                records.append({'N': N_value, 'K0': k0, 'Delta': delta})
                continue

            # looping through all subjects for current N value:
            for subject in current_data['subject_number'].unique():
                current_subject_data = current_data[current_data['subject_number'] == subject]
                k0, delta = self._fit_sigmoid(
                    current_subject_data['K'],
                    current_subject_data['fc'],
                    p0,
                    f"subject {subject}, N={N_value}",
                )
                records.append({'subject_number': subject, 'N': N_value, 'K0': k0, 'Delta': delta})

        columns = (['subject_number'] if subject_wise else []) + ['N', 'K0', 'Delta']
        df_humans_fitted_params = pd.DataFrame(records, columns=columns)
        # saving dataframe:
        self.df_humans_fitted_params = df_humans_fitted_params
        # visualizing dataframe:
        self._show(df_humans_fitted_params)
        print("|Completed calculating fitted parameters.")

    # def generate_plots(self):
    # TODO: generate boxplot to compare the fraction of correct responses at different N values (larger N -> larger fraction correct)

    def run_pipeline(self):
        """Run every stage in order; stop early when no raw data was found."""
        self.read_raw_data()
        if self.humans_complete_raw_df.empty:
            # Later stages need the raw columns; nothing useful can be done.
            print("|No raw data available: stopping pipeline.")
            return
        self.calculate_demographics()
        self.clean_data()
        self.calculate_fraction_correct()
        self.calculate_fitted_params()
        # TODO: return a complete dataframe that can be used for general plots
        # return humans_complete_df

In [12]:
# Build the human-data pipeline for the mock experiment (one result per
# subject), then execute all of its stages in order.
pipeline = HumanAnalysisPipeline(
    "30-08-2024_mock-data",
    analysis_type="subject-wise",
)
pipeline.run_pipeline()

|Reading raw data...
||Entered folder: data\humans\30-08-2024_mock-data\N1000
|||Reading file: data\humans\30-08-2024_mock-data\N1000\HUPLACLIP_pilot_8.9.2024_8.45.11.csv
|||Reading file: data\humans\30-08-2024_mock-data\N1000\HUPLACLIP_pilot_8.9.2024_9.7.13.csv
||Entered folder: data\humans\30-08-2024_mock-data\N150
|||Reading file: data\humans\30-08-2024_mock-data\N150\HUPLACLIP_pilot_8.9.2024_9.19.41.csv
|||Reading file: data\humans\30-08-2024_mock-data\N150\HUPLACLIP_pilot_8.9.2024_9.27.46.csv
||Entered folder: data\humans\30-08-2024_mock-data\N400
|||Reading file: data\humans\30-08-2024_mock-data\N400\HUPLACLIP_pilot_8.9.2024_9.45.14.csv
     success timeout failed_images failed_audio failed_video  \
0       True   False            []           []           []   
1        NaN     NaN           NaN          NaN          NaN   
2       True   False            []           []           []   
3        NaN     NaN           NaN          NaN          NaN   
4        NaN     NaN         

Unnamed: 0,N,K,fc,K/sqrt(N),K/N
0,1000.0,300.0,1.0,9.486833,0.3
0,1000.0,267.0,0.916667,8.443281,0.267
0,1000.0,233.0,1.0,7.368107,0.233
0,1000.0,217.0,0.916667,6.862143,0.217
0,1000.0,200.0,0.958333,6.324555,0.2
0,1000.0,183.0,1.0,5.786968,0.183
0,1000.0,167.0,0.958333,5.281004,0.167
0,1000.0,150.0,1.0,4.743416,0.15
0,1000.0,133.0,0.875,4.205829,0.133
0,1000.0,117.0,0.916667,3.699865,0.117


|Completed calculating fraction correct.
|Calculating fitted parameters...


Unnamed: 0,N,K0,Delta
0,1000.0,106.941549,21.539478
0,150.0,41.376802,5.666245
0,400.0,60.670849,5.271773


|Completed calculating fitted parameters.


# Machines:

Note: for this code to work, the `data/machines` folder must contain one sub-folder for each full experiment, run at a single value of N. Each of these folders contains one sub-folder per model, holding that model's results. (The code below currently reads from `../results/data/<experiment_name>`.)

In [13]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np
# from cmcrameri import cm
# from statsmodels.formula.api import ols
from scipy.special import ndtr
from scipy.optimize import curve_fit, differential_evolution
# from scipy.stats import linregress, ttest_rel, spearmanr, shapiro, sem
# from pathlib import Path
from math import sqrt
# from math import log, e

import sys
import os
sys.path.append(os.path.abspath(os.path.join('..')))

In [20]:
class MachinesAnalysisPipeline:
    """Fit a sigmoid to the fraction-correct curve of every model of one experiment.

    Args:
        experiment_name: Name of the sub-folder of ``../results/data`` to read.
        graph_size: Graph size N used in the result file names
            (``<model>_N<graph_size>_fraction_correct.csv``). Defaults to 224.
    """

    def __init__(self, experiment_name, graph_size=224):
        self.experiment_name = experiment_name
        self.graph_size = graph_size

    @staticmethod
    def _sigmoid(k, k0, delta):
        """Return the fitted curve: rises from 0.5 towards 1 around ``k0`` with width ``delta``."""
        return (1 + ndtr((k - k0) / delta)) / 2

    # defining the function to calculate fitted parameters for all models:
    def calculate_fitted_params(self):
        """Fit ``(k0, delta)`` for each model folder that has a result file.

        Every sub-folder of the experiment folder is treated as one model.
        Folders without the expected CSV are skipped. Models are processed in
        sorted order so the output is reproducible; a fit that fails is
        reported and stored as NaN. The result (columns ``k0``, ``delta``,
        ``model``) is printed and stored in ``self.fitted_params_df``.

        Raises:
            FileNotFoundError: If the experiment folder does not exist
                (raised by ``os.listdir``).
        """
        # go to exp folder:
        folder_path = os.path.join('..', 'results', 'data', self.experiment_name)

        #TODO:
        # - loop through all folders and read folder name (N value) -> used in final dataframe
        # - in each N folder, loop over all folders and extract fitted params for each model
        # - combine all fitted params in a single dataframe (columns: model, K0, N, Delta)

        rows = []
        # loop over all folders in the experiment folder (each folder corresponds to a model)
        for folder_name in sorted(os.listdir(folder_path)):
            model_dir = os.path.join(folder_path, folder_name)
            if not os.path.isdir(model_dir):
                continue
            file_path = os.path.join(model_dir, f'{folder_name}_N{self.graph_size}_fraction_correct.csv')
            if not os.path.isfile(file_path):
                continue

            df_fraction_correct = pd.read_csv(file_path)
            try:
                k0, delta = curve_fit(
                    self._sigmoid,
                    df_fraction_correct['clique size'],
                    df_fraction_correct['fraction correct'],
                    p0=(70, 20),
                )[0]
            except (RuntimeError, TypeError, ValueError) as e:
                # RuntimeError: no convergence; TypeError: fewer points than
                # parameters; ValueError: non-finite input. Keep going so the
                # other models are still reported.
                print(f"Could not fit sigmoid for model {folder_name}: {e}")
                k0, delta = np.nan, np.nan
            rows.append({'k0': k0, 'delta': delta, 'model': folder_name})

        # Build the recap once; the columns exist even when nothing was found.
        fitted_params_df = pd.DataFrame(rows, columns=['k0', 'delta', 'model'])

        print(fitted_params_df)
        self.fitted_params_df = fitted_params_df

    def run_pipeline(self):
        """Run every stage of the machines analysis."""
        self.calculate_fitted_params()
        # TODO: return a complete dataframe that can be used for general plots
        # return machines_complete_df

In [21]:
# Build the machines pipeline for the "2024-09-04_all-steps" experiment and
# execute all of its stages.
pipeline = MachinesAnalysisPipeline("2024-09-04_all-steps")
pipeline.run_pipeline()

TypeError: MachinesAnalysisPipeline.calculate_fitted_params() missing 1 required positional argument: 'exp_name'

# Combined plot: