# Generate Summary Statistics for Studies
**Figures/Tables**: Table 1, Supp Table 4  
This notebook generates summary statistics (number of patients, number of visits, months of follow-up, and ALSFRS-R slope) for each study from the study data dictionaries.

In [1]:
# Render matplotlib figures inline in the notebook output.
%matplotlib inline
from IPython.core.interactiveshell import InteractiveShell
# Display the value of every bare expression statement in a cell, not only the
# last one. Later cells rely on this: they show a table mid-cell and then save it.
InteractiveShell.ast_node_interactivity = "all"

In [2]:
import pandas as pd
import numpy as np
import joblib
import sys
from pathlib import Path

from analysis_utils import * 

In [3]:
def generate_summary_statistics(data_path, data_name, proj_lis):
    """Summarize cohort size, visit count, follow-up length and slope per study.

    For every project in ``proj_lis`` the file
    ``data_path / 'data_{proj}_{data_name}.pkl'`` is loaded and reduced to a
    single table row.

    Args:
        data_path: ``pathlib.Path`` of the directory holding the data files.
        data_name: Suffix identifying the data variant (e.g. ``'min3_alsfrst'``).
        proj_lis: Iterable of project keys; each must be one of
            ``'aals'``, ``'ceft'``, ``'emory'``, ``'proact'``.

    Returns:
        pandas.DataFrame with one row per project (in input order) and columns
        ``dataset``, ``study_type``, ``total_pats``, ``avg_num_vis``,
        ``avg_month_follow``, ``avg_slope``. The three ``avg_*`` columns hold
        preformatted ``"mean (SD)"`` strings, not numbers.

    Raises:
        AssertionError: If the data file for a project does not exist.
        KeyError: If a project key has no entry in the study-type lookup.
    """
    columns = ['dataset', 'study_type', 'total_pats', 'avg_num_vis', 'avg_month_follow', 'avg_slope']
    study_type_dict = {'aals': 'Observational', 'ceft': 'Clinical Trial', 'emory': 'Observational', 'proact':'Clinical Trial'}

    # Collect one dict per project and build the frame once at the end;
    # DataFrame.append was removed in pandas 2.0 and was quadratic in a loop.
    records = []
    for proj in proj_lis:
        full_data_path = data_path / ('data_{}_{}.pkl').format(proj, data_name)
        assert full_data_path.exists(), 'File does not exist: {}'.format(full_data_path.resolve())
        # NOTE: joblib.load unpickles the file; only point this at trusted data.
        cur_dat = joblib.load(full_data_path)
        study_type = study_type_dict[proj]

        num_pats = len(cur_dat['SI'])
        # Drop the first column so the anchor onset is excluded from the counts.
        XA = cur_dat['XA'][:, 1:]
        # NaN entries mark missing visits, so count the non-NaN entries per row.
        num_visits = np.sum(~np.isnan(XA), axis=1)
        # NOTE(review): the numpy .std() here and for months_follow uses ddof=0,
        # whereas slope_df.std() below uses ddof=1 if slope_df is a pandas Series
        # -- confirm whether the mixed SD conventions are intended.
        avg_num_visits = "{:.0f} ({:.0f})".format(num_visits.mean(), num_visits.std())

        # Span between first and last recorded visit; presumably XA is in years,
        # hence the factor of 12 to report months -- TODO confirm units.
        months_follow = ((np.nanmax(XA, axis=1)-np.nanmin(XA, axis=1))*12)
        avg_months_follow = "{:.2f} ({:.2f})".format(months_follow.mean(), months_follow.std())

        # Rows with any missing value are dropped before averaging the slope.
        slope_df = calc_slope_mogp_data(cur_dat).dropna(how='any')['slope']
        avg_slope = "{:.2f} ({:.2f})".format(slope_df.mean(), slope_df.std())

        records.append({'dataset': proj, 'study_type':study_type, 'total_pats': num_pats, 'avg_num_vis': avg_num_visits, 'avg_month_follow':avg_months_follow, 'avg_slope':avg_slope})

    # Passing columns keeps the expected header even when proj_lis is empty.
    return pd.DataFrame(records, columns=columns)

def format_table(df_table, dataset_labels, column_labels):
    """Return a presentation-ready copy of a summary table.

    Rows are ordered by ``total_pats`` (largest first), the ``dataset`` values
    are translated through ``dataset_labels``, and the columns are renamed
    according to ``column_labels``. The input frame is not modified.
    """
    return (
        df_table
        .sort_values(by=['total_pats'], ascending=False)
        .assign(dataset=lambda frame: frame['dataset'].map(dataset_labels))
        .rename(columns=column_labels)
    )


In [4]:
# Project keys to summarize, plus the display labels applied by format_table
projects = ['aals', 'ceft', 'emory', 'proact']

# Project key -> dataset name shown in the table
dat_lab = {
    'aals': 'AALS',
    'ceft': 'CEFT',
    'emory': 'EMORY',
    'proact': 'PRO-ACT',
}

# Internal column name -> column header shown in the table
col_lab = {
    'dataset': 'Dataset',
    'study_type': 'Study Type',
    'total_pats': 'Total No. Participants Included',
    'avg_num_vis': 'Mean (SD) No. Visits',
    'avg_month_follow': 'Mean (SD) Months Followed',
    'avg_slope': 'Mean (SD) ALSFRS-R Slope',
}

In [5]:
# Summary table for the ALSFRS-R model data restricted to 3 or more visits,
# covering every project listed in `projects`
data_min3 = Path('data/model_data/1_alsfrsr_all')
sum_data_min3 = format_table(
    generate_summary_statistics(data_min3, 'min3_alsfrst', projects),
    dat_lab,
    col_lab,
)

# Bare expression: rendered as cell output (ast_node_interactivity is "all")
sum_data_min3

# Write the formatted table to disk without the index column
sum_data_min3.to_csv('reports/table_study_summary_stats.csv', index=False)

Unnamed: 0,Dataset,Study Type,Total No. Participants Included,Mean (SD) No. Visits,Mean (SD) Months Followed,Mean (SD) ALSFRS-R Slope
3,PRO-ACT,Clinical Trial,2923,10 (4),11.86 (6.33),-1.09 (0.99)
1,CEFT,Clinical Trial,476,10 (5),19.09 (10.58),-1.20 (0.79)
0,AALS,Observational,456,5 (2),16.64 (8.78),-0.75 (0.65)
2,EMORY,Observational,399,6 (4),19.47 (14.74),-1.21 (1.20)


In [6]:
# Generate summaries for the PROACT/CEFT prediction data (4 or more visits)
# and sparsity data (10 or more visits), then stack them into one table
data_min4 = Path('data/model_data/2_sparsity_prediction/prediction')
sum_data_min4 = generate_summary_statistics(data_min4, 'min4_predict_full', ['proact', 'ceft'])
sum_data_min4 = format_table(sum_data_min4, dat_lab, col_lab)
# insert() modifies the frame in place; position 1 puts the column right after 'Dataset'
sum_data_min4.insert(1, 'Inclusion Criteria', '>= 4 visits')

data_min10 = Path('data/model_data/2_sparsity_prediction/sparsity')
sum_data_min10 = generate_summary_statistics(data_min10, 'min10_sparse_full', ['proact', 'ceft'])
sum_data_min10 = format_table(sum_data_min10, dat_lab, col_lab)
sum_data_min10.insert(1, 'Inclusion Criteria', '>= 10 visits')

# pd.concat replaces DataFrame.append, which was removed in pandas 2.0;
# ignore_index renumbers the rows 0..n-1 across both tables
sum_data_agg = pd.concat([sum_data_min4, sum_data_min10], ignore_index=True)

sum_data_agg

# Save table
sum_data_agg.to_csv('reports/supp_table_study_summ_stats_predsparse.csv', index=False)

Unnamed: 0,Dataset,Inclusion Criteria,Study Type,Total No. Participants Included,Mean (SD) No. Visits,Mean (SD) Months Followed,Mean (SD) ALSFRS-R Slope
0,PRO-ACT,>= 4 visits,Clinical Trial,2814,10 (4),12.18 (6.24),-1.05 (0.87)
1,CEFT,>= 4 visits,Clinical Trial,453,10 (5),19.79 (10.35),-1.15 (0.71)
2,PRO-ACT,>= 10 visits,Clinical Trial,1327,14 (3),15.55 (6.70),-0.89 (0.65)
3,CEFT,>= 10 visits,Clinical Trial,228,14 (4),26.91 (9.21),-0.81 (0.43)
