In [1]:
import pandas as pd
from scipy import stats


In [2]:
def create_list_df(column_to_check, df_data, columns_with_measures):
    """Split the complete rows of ``df_data`` into one frame per category.

    Parameters
    ----------
    column_to_check : label
        Column whose distinct values define the groups.
    df_data : pandas.DataFrame
        Source table. It is only read, never modified.
    columns_with_measures : list
        Measure columns kept alongside ``column_to_check``.

    Returns
    -------
    list of pandas.DataFrame
        One frame per distinct value of ``column_to_check``, in the order
        returned by ``Series.unique()``. Each frame holds only the selected
        columns and only rows with no missing value in any of them.
    """
    selected_columns = columns_with_measures + [column_to_check]

    # A row is discarded as soon as any selected column is missing, so every
    # measure ends up being compared on the same set of rows.
    complete_rows = df_data[selected_columns].dropna()
    group_labels = complete_rows[column_to_check]

    return [
        complete_rows.loc[group_labels == category]
        for category in group_labels.unique()
    ]

In [3]:
def compare_distributions_measures(dict_distributions):
    """Run a Kruskal-Wallis H-test across the samples held in a mapping.

    Parameters
    ----------
    dict_distributions : mapping
        Maps a group label to the sequence of observations for that group.
        The labels themselves are not used by the test.

    Returns
    -------
    The object returned by ``scipy.stats.kruskal``; position 0 is the
    H statistic and position 1 the p-value.
    """
    samples = [dict_distributions[label] for label in dict_distributions]
    return stats.kruskal(*samples)


In [4]:
def statistical_comparison(columns_with_measures, column_to_check, list_df):
    """Test every measure for differences between the category groups.

    Parameters
    ----------
    columns_with_measures : iterable
        Names of the measure columns to test, one test per measure.
    column_to_check : label
        Grouping column; the first unique value found in each frame is used
        as that frame's group label.
    list_df : list of pandas.DataFrame
        One non-empty frame per category, each containing the measure
        columns and ``column_to_check``.

    Returns
    -------
    list of list
        One ``[measure, statistic, p_value, significance]`` row per measure,
        in the order of ``columns_with_measures``. ``significance`` is the
        code produced by ``_significance_level``.
    """
    matrix_response = []

    for measure in columns_with_measures:
        # Frames sharing the same label overwrite each other; the last wins.
        data_to_process = {
            element[column_to_check].unique()[0]: element[measure].tolist()
            for element in list_df
        }

        response_test = compare_distributions_measures(data_to_process)
        statistic, p_value = response_test[0], response_test[1]

        matrix_response.append(
            [measure, statistic, p_value, _significance_level(p_value)]
        )

    return matrix_response


def _significance_level(p_value):
    """Translate a p-value into the significance code used in the summary.

    Returns 3 for ``p <= 0.01``, 2 for ``0.01 < p <= 0.05``, 1 for
    ``0.05 < p <= 0.1`` and -1 for anything else.
    """
    # Every comparison against NaN is False, so an undefined p-value falls
    # through to -1 rather than being reported as the strongest level.
    if p_value <= 0.01:
        return 3
    if p_value <= 0.05:
        return 2
    if p_value <= 0.1:
        return 1
    return -1

In [5]:
df_data = pd.read_csv("../relevant_results/input_data_post_filter/data_with_features.csv")
columns_with_measures = ['m1', 'm2', 'm3', 'm4']

# Identifier columns and the measures themselves are never used as grouping
# variables. The measures are taken from the list above so the two cannot
# drift apart.
columns_to_skip = ['id_name', 'N°_diada'] + columns_with_measures

list_full_comparison = []
# Grouping columns whose comparison failed, mapped to the error raised, so
# that a dropped variable is visible instead of vanishing silently.
skipped_columns = {}

for column in df_data.columns:
    if column in columns_to_skip:
        continue

    try:
        list_df = create_list_df(column, df_data, columns_with_measures)
        response_full_comparison = statistical_comparison(columns_with_measures, column, list_df)

        df_comparison = pd.DataFrame(response_full_comparison, columns=['measure', 'statistic', 'p-value', 'significance'])
        df_comparison['variable'] = column
    except Exception as error:
        # Still best-effort: one variable that cannot be tested must not stop
        # the run. Catching Exception rather than everything lets
        # KeyboardInterrupt and SystemExit propagate.
        skipped_columns[column] = error
        print("Skipped column", repr(column), "->", repr(error))
    else:
        list_full_comparison.append(df_comparison)


In [6]:
# Stack the per-variable result tables and order the rows by significance
# code, highest first.
df_summary = (
    pd.concat(list_full_comparison, axis=0)
    .sort_values(by=['significance'], ascending=False)
)
df_summary

Unnamed: 0,measure,statistic,p-value,significance,variable
1,m2,11.141729,0.003807,3,ADS_MAIN_CAREGIVER_P
1,m2,6.409259,0.040574,2,change_attachment
2,m3,6.229588,0.044388,2,Grupo_étnico_bebé
0,m1,5.290000,0.021448,2,Género_bebé
1,m2,9.518673,0.023134,2,ADS_CHILD_TA
...,...,...,...,...,...
2,m3,1.460522,0.481783,-1,ADS_MAIN_CAREGIVER_M
1,m2,1.460522,0.481783,-1,ADS_MAIN_CAREGIVER_M
0,m1,2.732222,0.255097,-1,ADS_MAIN_CAREGIVER_M
3,m4,0.561788,0.905122,-1,ADS_CHILD_P


In [7]:
# Write the sorted summary to disk without the row index.
summary_csv_path = "../relevant_results/significance_evaluation/summary_statistical_evaluation.csv"
df_summary.to_csv(summary_csv_path, index=False)