In [81]:
import numpy as np
import pandas as pd
from pathlib import Path

import pingouin as pg

#### Set up paths

In [2]:
# Project folders are resolved relative to the current working directory:
# its parent holds the 'sourcedata' (input) and 'output' (results) folders.
code_dir = Path.cwd()
statistics_dir = code_dir.parents[0]
source_dir = statistics_dir / 'sourcedata'
output_dir = statistics_dir / 'output'

#### Import data frame

In [23]:
# Read the matched-cohort table with the subject ID as index;
# cells containing 'NP' or nothing are parsed as missing values.
statistics_data = pd.read_csv(
    source_dir / 'postcovid_matched_cohort.csv',
    index_col="sub_id",
    na_values=['NP', ''],
)


In [24]:
# Show the loaded cohort table as the cell output for a quick visual check.
statistics_data

Unnamed: 0_level_0,age,sex,years_of_education,hypertension,diabetes,hyperlipidemia,smoking_ever,cohort,tmt_a,tmt_b,word_list_recall,animal_naming_test,mini_mental_state,distance,weights,subclass
sub_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1
sub-002144a4,46,1,14.000000,1,0,0,1,0,31.0,61.0,10.0,24.0,30.0,0.413564,1,117
sub-003db57e,51,0,18.000000,1,0,0,1,1,45.0,74.0,9.0,30.0,29.0,0.402720,1,1
sub-0171efa5,71,0,15.000000,1,0,0,1,0,,,3.0,16.0,28.0,0.159342,1,178
sub-018ad760,51,0,16.000000,0,1,0,0,1,31.0,55.0,10.0,29.0,28.0,0.497247,1,112
sub-026201f6,61,0,19.000000,1,0,0,0,1,42.0,64.0,9.0,25.0,28.0,0.343569,1,147
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
sub-fd8e4d9c,53,1,13.000000,0,0,0,1,1,31.0,66.0,9.0,34.0,28.0,0.309960,1,138
sub-fe3db72a,62,0,18.000000,1,0,0,0,0,21.0,58.0,8.0,26.0,28.0,0.310253,1,150
sub-fec1d802,47,0,20.000000,1,0,0,1,0,,,10.0,29.0,28.0,0.501172,1,180
sub-fee2bf9b,55,0,15.130799,0,0,1,0,0,,,,,,0.444681,1,174


In [None]:
# Single ANCOVA as a sanity check of the pingouin call used in stats_2groups below.
# pingouin's ancova needs the outcome (dv) and grouping column (between) in addition
# to the data frame; calling it with the data frame alone raises.
# NOTE(review): tmt_a is used as an example outcome here - adjust if another was intended.
pg.ancova(
    data=statistics_data,
    dv='tmt_a',
    between='cohort',
    covar=['age', 'sex', 'years_of_education'],
)

##### Define statistics function

In [148]:
def stats_2groups(df, columns, stats, group_col, covar=None):
    """Describe two groups column by column and test each column for a group difference.

    Rows with ``df[group_col] == 0`` are treated as controls, rows with
    ``df[group_col] == 1`` as patients. The total size of both groups is printed.

    Parameters
    ----------
    df : pandas.DataFrame
        Data frame with the group column, the columns to describe and any covariates.
    columns : list of str
        Columns to describe.
    stats : list of str
        Statistical test to apply to the respective entry of ``columns``
        (length has to match): ``"ttest"``, ``"mwu"`` or ``"ancova"``.
        For columns of dtype ``object`` a chi-square test of independence is
        computed as well and is what gets reported when the test name is none
        of the three above.
    group_col : str
        Column that differentiates the groups (coded 0 / 1).
    covar : str or list of str, optional
        Covariates forwarded to the ANCOVA; only used for columns tested with
        ``"ancova"``.

    Returns
    -------
    pandas.DataFrame
        One column per entry of ``columns``. Rows:

        * numeric columns: ``contr_*`` / ``pat_*`` with mean, std, count,
          median, IQR25 and IQR75 taken from ``DataFrame.describe()``;
        * ``object`` columns: ``contr_percent`` / ``pat_percent``, the share of
          the first observed level among the first two observed levels;
        * ``covariates`` and ``main_effect`` for ANCOVA columns;
        * ``stat`` and ``pval`` of the test (for ``"mwu"`` the reported
          statistic is pingouin's ``RBC`` column), ``p_bonferroni``
          (``pval`` times the number of columns, capped at 1) and ``p_fdr``
          (Benjamini-Hochberg correction across all columns);
        * separator rows ``""`` and ``" "`` filled with ``"---"``.

    Raises
    ------
    ValueError
        If ``columns`` and ``stats`` differ in length, or if a non-``object``
        column is paired with an unknown test name.
    """
    # Validate before the third-party imports so a bad call fails fast and clearly.
    if len(columns) != len(stats):
        raise ValueError(
            f"columns and stats must have the same length "
            f"(got {len(columns)} and {len(stats)})"
        )

    from scipy.stats import chi2_contingency
    from statsmodels.stats.multitest import multipletests
    from pingouin import ancova, mwu, ttest

    n_tests = len(columns)

    data_controls = df[df[group_col] == 0]
    data_patients = df[df[group_col] == 1]

    df_control_describe = data_controls.describe()
    df_pat_describe = data_patients.describe()

    # (row suffix in the result, key in DataFrame.describe()) in output order.
    summary_rows = (
        ("mean", "mean"),
        ("std", "std"),
        ("count", "count"),
        ("median", "50%"),
        ("IQR25", "25%"),
        ("IQR75", "75%"),
    )

    target_df = pd.DataFrame(columns=columns)

    for col, stat in zip(columns, stats):
        is_categorical = df[col].dtypes == "object"

        if is_categorical:
            # Only the first two non-missing levels enter the percentages.
            values = [x for x in df[col].unique() if not pd.isnull(x)]

            n_contr_1 = data_controls[data_controls[col] == values[0]].shape[0]
            n_contr_2 = data_controls[data_controls[col] == values[1]].shape[0]
            target_df.loc["contr_percent", col] = n_contr_1 / (n_contr_1 + n_contr_2)

            n_pat_1 = data_patients[data_patients[col] == values[0]].shape[0]
            n_pat_2 = data_patients[data_patients[col] == values[1]].shape[0]
            target_df.loc["pat_percent", col] = n_pat_1 / (n_pat_1 + n_pat_2)

            chi_stat, p_val, _, _ = chi2_contingency(pd.crosstab(df[group_col], df[col]).T)
            # Keep the chi-square statistic as this column's result so that a
            # value from a previously processed column can never leak in.
            statistic = chi_stat
            target_df.loc["stat", col] = chi_stat
            target_df.loc["pval", col] = p_val
        else:
            for suffix, key in summary_rows:
                target_df.loc[f"contr_{suffix}", col] = df_control_describe[col][key]

            target_df.loc["", col] = "---"

            for suffix, key in summary_rows:
                target_df.loc[f"pat_{suffix}", col] = df_pat_describe[col][key]

        if stat == "ttest":
            stat_df = ttest(data_controls[col], data_patients[col])
            p_val = float(stat_df.loc["T-test", "p-val"])
            statistic = float(stat_df.loc["T-test", "T"])
        elif stat == "mwu":
            stat_df = mwu(x=data_controls[col], y=data_patients[col])
            p_val = float(stat_df.loc["MWU", "p-val"])
            statistic = float(stat_df.loc["MWU", "RBC"])
        elif stat == "ancova":
            stat_df = ancova(df, dv=col, between=group_col, covar=covar)
            # Row 0 of the ANCOVA table is read as the group (between) effect.
            p_val = float(stat_df.loc[0, "p-unc"])
            statistic = float(stat_df.loc[0, "F"])
            target_df.loc["covariates", col] = covar
            target_df.loc["main_effect", col] = group_col
        elif not is_categorical:
            raise ValueError(
                f"Unknown test {stat!r} for column {col!r}; "
                "expected 'ttest', 'mwu' or 'ancova'"
            )

        target_df.loc[" ", col] = "---"
        target_df.loc["stat", col] = statistic
        target_df.loc["pval", col] = p_val

        # Cap per cell instead of chained assignment on a row selection, which
        # does not write back to the frame under pandas Copy-on-Write.
        target_df.loc["p_bonferroni", col] = min(p_val * n_tests, 1)

        # Reserve the FDR row now so the row order does not depend on which
        # tests later columns use; it is filled once all p-values are known.
        if "p_fdr" not in target_df.index:
            target_df.loc["p_fdr"] = float("nan")

    if n_tests:
        # The FDR correction needs the complete set of p-values, so run it once here.
        target_df.loc["p_fdr"] = multipletests(
            target_df.loc["pval"].astype(float), alpha=0.05, method="fdr_bh"
        )[1]

    print("n (total) controls: ", data_controls.shape[0])
    print("n (total) patients: ", data_patients.shape[0])

    return target_df



##### ANCOVAs for cognitive and quality of life scores

In [149]:
# Group comparison (ANCOVA adjusted for age, sex and years of education) for each cognitive score.
cognitive_columns = ['tmt_a', 'tmt_b', 'animal_naming_test', 'word_list_recall', 'mini_mental_state']
cognitive_stats = stats_2groups(
    statistics_data,
    cognitive_columns,
    ['ancova'] * len(cognitive_columns),
    'cohort',
    ['age', 'sex', 'years_of_education'],
)
# DataFrame.to_csv does not create missing folders, so make sure the output folder exists.
output_dir.mkdir(parents=True, exist_ok=True)
cognitive_stats.to_csv(output_dir.joinpath('postcovid_cognitive_scores_statistics.csv'))

n (total) controls:  223
n (total) patients:  223
