In [1]:
import warnings

import pandas as pd
from scipy import stats

In [2]:
def create_list_df(column_to_check, df_data, columns_with_measures):
    """Split the complete rows of ``df_data`` into one DataFrame per category.

    Only the measure columns plus ``column_to_check`` are kept, and any row
    with a missing value in one of those columns is dropped before splitting.

    Parameters
    ----------
    column_to_check : label
        Column whose distinct values define the groups.
    df_data : pandas.DataFrame
        Frame holding both the measures and the grouping column.
    columns_with_measures : list
        Labels of the measure columns to keep alongside the grouping column.

    Returns
    -------
    list of pandas.DataFrame
        One frame per distinct value of ``column_to_check``, ordered by first
        appearance of that value in the filtered data.
    """
    selected_columns = columns_with_measures + [column_to_check]
    complete_rows = df_data[selected_columns].dropna()
    categories = complete_rows[column_to_check]

    # One boolean-mask selection per category, in order of first appearance.
    return [
        complete_rows.loc[categories == level]
        for level in categories.unique()
    ]

In [3]:
def compare_distributions_measures(dict_distributions):
    """Run a Kruskal-Wallis H-test across the samples stored in a mapping.

    Parameters
    ----------
    dict_distributions : dict
        Maps a group label to the sequence of observations for that group.
        The labels themselves are not used by the test.

    Returns
    -------
    The result object of ``scipy.stats.kruskal`` (statistic, p-value),
    computed with every mapped sample passed as a separate group.
    """
    samples = list(dict_distributions.values())
    return stats.kruskal(*samples)

In [4]:
def _significance_level(p_value):
    """Encode a p-value as an ordinal significance level.

    Returns -1 when ``p > 0.1`` (or the p-value is NaN), 1 when
    ``0.05 < p <= 0.1``, 2 when ``0.01 < p <= 0.05`` and 3 when ``p <= 0.01``.
    """
    # NaN compares False against every threshold, so without this guard an
    # undefined p-value would fall through to the strongest level (3).
    if pd.isna(p_value) or p_value > 0.1:
        return -1
    if p_value > 0.05:
        return 1
    if p_value > 0.01:
        return 2
    return 3


def statistical_comparison(columns_with_measures, column_to_check, list_df):
    """Compare each measure across the category frames in ``list_df``.

    For every measure, the values found in each frame are collected under the
    category of that frame (the first distinct value of ``column_to_check``)
    and handed to ``compare_distributions_measures``.

    Parameters
    ----------
    columns_with_measures : iterable
        Labels of the measure columns to test, one result row per measure.
    column_to_check : label
        Column holding the category of each frame in ``list_df``.
    list_df : list of pandas.DataFrame
        One frame per category, each containing the measures and
        ``column_to_check``.

    Returns
    -------
    list of list
        Rows of ``[measure, statistic, p-value, significance]`` where
        ``significance`` is -1 (p > 0.1 or undefined), 1 (p <= 0.1),
        2 (p <= 0.05) or 3 (p <= 0.01).
    """
    matrix_response = []

    for measure in columns_with_measures:
        # Category label -> list of observed values for this measure.
        samples_by_category = {
            subset[column_to_check].unique()[0]: subset[measure].tolist()
            for subset in list_df
        }

        response_test = compare_distributions_measures(samples_by_category)
        statistic = response_test[0]
        p_value = response_test[1]

        matrix_response.append(
            [measure, statistic, p_value, _significance_level(p_value)]
        )

    return matrix_response

In [11]:
def process_data_statistical_comparison(dataset, columns_with_measures, excluded_columns=None):
    """Test every candidate grouping column of ``dataset`` against the measures.

    Each column that is neither excluded nor a measure is used as the grouping
    variable: the data are split with ``create_list_df`` and compared with
    ``statistical_comparison``. Columns for which that fails are skipped with
    a warning instead of aborting the whole run.

    Parameters
    ----------
    dataset : pandas.DataFrame
        Frame containing the measures and the candidate grouping columns.
    columns_with_measures : list
        Labels of the measure columns to compare.
    excluded_columns : iterable, optional
        Columns that must never be used as grouping variable. Defaults to the
        identifier and measure columns that were hard-coded here before.

    Returns
    -------
    pandas.DataFrame
        Columns ``measure``, ``statistic``, ``p-value``, ``significance`` and
        ``variable``; rows with ``significance == -1`` are removed and the
        rest is sorted by descending ``significance``. Empty (but with those
        columns) when no grouping column produced a result.
    """
    if excluded_columns is None:
        excluded_columns = (
            'id_name', 'N°_diada', 'm1', 'm2', 'm3', 'm4',
            'average_m', 'average_t', 'average_d1', 'average_d2',
        )
    # A measure can never be its own grouping variable; skip it explicitly
    # rather than relying on the downstream failure it would trigger.
    skipped_columns = set(excluded_columns) | set(columns_with_measures)
    result_columns = ['measure', 'statistic', 'p-value', 'significance']

    list_full_comparison = []

    for column in dataset.columns:
        if column in skipped_columns:
            continue
        try:
            list_df = create_list_df(column, dataset, columns_with_measures)
            response_full_comparison = statistical_comparison(columns_with_measures, column, list_df)

            df_comparison = pd.DataFrame(response_full_comparison, columns=result_columns)
            df_comparison['variable'] = column
            list_full_comparison.append(df_comparison)
        except Exception as error:
            # Best effort: a column that cannot be tested is reported and
            # skipped, but KeyboardInterrupt/SystemExit are no longer swallowed.
            warnings.warn(
                f"Skipping column {column!r}: {type(error).__name__}: {error}",
                stacklevel=2,
            )

    # pd.concat raises ValueError on an empty list; return an empty summary.
    if not list_full_comparison:
        return pd.DataFrame(columns=result_columns + ['variable'])

    df_summary = pd.concat(list_full_comparison, axis=0)
    df_summary = df_summary[df_summary["significance"] != -1]
    df_summary = df_summary.sort_values(by=['significance'], ascending=False)
    return df_summary


In [12]:
# Load the dataset from a CSV file located relative to the notebook directory.
df_data = pd.read_csv("../../results_update_process/data_with_features_average.csv")
# Run the comparison with 'm1'..'m4' as measures.
# NOTE: `columns_with_measures` is a notebook-level name that later cells rebind.
columns_with_measures = ['m1', 'm2', 'm3', 'm4']
df_summary_full = process_data_statistical_comparison(df_data, columns_with_measures)


In [14]:
# Display the column labels of the loaded dataset.
df_data.columns

Index(['id_name', 'ADS_CHILD_M', 'ADS_CHILD_V', 'ADS_CHILD_TA', 'ADS_CHILD_SA',
       'ADS_CHILD_A', 'ADS_CHILD_P', 'ADS_MAIN_CAREGIVER_M',
       'ADS_MAIN_CAREGIVER_V', 'ADS_MAIN_CAREGIVER_TA',
       'ADS_MAIN_CAREGIVER_SA', 'ADS_MAIN_CAREGIVER_A', 'ADS_MAIN_CAREGIVER_P',
       'Attachment_ads', 'apego_SSP', 'Apego_dic_ADS', 'Apego_dic_SSP', 'm1',
       'm2', 'm3', 'm4', 'change_attachment_y', 'sensitivity2',
       'cis_total_mean', 'Rango_ITERS_y', 'insecure_bebe2', 'Género_bebé_y',
       'Grupo_étnico_bebé_y', 'average_m', 'average_t', 'average_d1',
       'average_d2'],
      dtype='object')

In [7]:
# Repeat the comparison with 'average_m' and 'average_t' as measures.
# NOTE: this rebinds the notebook-level name `columns_with_measures`.
columns_with_measures = ['average_m', 'average_t']
df_summary_jornada = process_data_statistical_comparison(df_data, columns_with_measures)

In [8]:
# Repeat the comparison with 'average_d1' and 'average_d2' as measures.
# NOTE: this rebinds the notebook-level name `columns_with_measures`.
columns_with_measures = ['average_d1', 'average_d2']
df_summary_dia = process_data_statistical_comparison(df_data, columns_with_measures)

In [9]:
# Display the summary computed with 'average_d1' / 'average_d2' as measures.
df_summary_dia

Unnamed: 0,measure,statistic,p-value,significance,variable
0,average_d1,6.547451,0.037865,2,ADS_MAIN_CAREGIVER_P
0,average_d1,6.266636,0.099334,1,ADS_CHILD_TA
0,average_d1,6.387069,0.094224,1,ADS_CHILD_SA
0,average_d1,5.175132,0.075203,1,change_attachment_y
1,average_d2,3.125438,0.077079,1,Rango_ITERS_y


In [10]:
# Display the summary computed with 'average_m' / 'average_t' as measures.
# NOTE(review): in the saved outputs the 'average_t' rows shown here carry the
# same statistic and p-value as the 'average_d1' rows shown for df_summary_dia
# (e.g. 6.547451 / 0.037865 for ADS_MAIN_CAREGIVER_P), and the execution counts
# of the cells are out of order -- confirm the two measures are really distinct
# and that these outputs are not stale (restart the kernel and run all cells).
df_summary_jornada

Unnamed: 0,measure,statistic,p-value,significance,variable
1,average_t,6.547451,0.037865,2,ADS_MAIN_CAREGIVER_P
1,average_t,6.266636,0.099334,1,ADS_CHILD_TA
1,average_t,6.387069,0.094224,1,ADS_CHILD_SA
0,average_m,3.240741,0.071828,1,Apego_dic_SSP
1,average_t,5.175132,0.075203,1,change_attachment_y
