In [None]:
#needs a kernel that can load matplotlib, pandas and numpy (it does not need to be a microsim kernel)
#if you run this on OSC and the kernel dies while running it, rerun the notebook with more than one core

In [1]:
import os
import matplotlib.pyplot as plt
import pandas as pd
#pandas display settings: a value of None removes the limit on shown columns, column width and rows
for displayOption in ('display.max_columns', 'max_colwidth', 'display.max_rows'):
    pd.set_option(displayOption, None)
#floats are printed with a thousands separator and four decimals
pd.options.display.float_format = '{:,.4f}'.format
import numpy as np

In [2]:
#directory that holds inputLog.csv
#set the MICROSIM_RESULTS_DIR environment variable to point the notebook at another location
#(eg on OSC) without editing this cell; the path below is used when the variable is not set
#resultsDir = "/users/PAS2164/deligkaris/MICROSIM/SIMULATIONS/PRELIMINARY-TRIALS-100-CV-ALL-DEMENTIA-ALL"
resultsDir = os.environ.get(
    "MICROSIM_RESULTS_DIR",
    "/Users/deligkaris.1/OneDrive - The Ohio State University Wexner Medical Center/MICROSIM/SIMULATIONS/PRELIMINARY-TRIALS-100-CV-ALL-DEMENTIA-ALL")
if not os.path.isdir(resultsDir):
    #same exception type os.chdir would raise, but with a message that names the problem
    raise FileNotFoundError(f"results directory not found: {resultsDir}")
os.chdir(resultsDir) #the working directory is changed, so relative paths now resolve inside resultsDir
data=pd.read_csv("inputLog.csv") #raw log of regression results, still including the nan rows

In [3]:
#preview of the first rows of the raw log
#some regression results are nan
data.head() 

Unnamed: 0,reg,se,pvalue,duration,sampleSize,outcome,analysis,dementiaRisk,cvRisk
0,-28.6097,1130140.1029,1.0,3,100,death,logisticRegression-death,0.0,0.0
1,,,,3,100,death,logisticRegression-death,0.0,0.0
2,,,,3,100,death,logisticRegression-death,0.0,0.0
3,,,,3,100,death,logisticRegression-death,0.0,0.0
4,,,,3,100,death,logisticRegression-death,0.0,0.0


In [4]:
#(number of rows, number of columns) of the raw log, nan rows included
data.shape

(600000, 9)

In [None]:
#number of regressions that returned nan, followed by that number as a fraction of all rows
nNanRegs = data["reg"].isna().sum()
nNanRegs, nNanRegs/data.shape[0]

In [None]:
#random look at 20 of the rows whose regression coefficient is nan
#nans tend to be small sample sizes (but not always), results from logistic regression
regIsNan = data["reg"].isna()
data.loc[regIsNan].sample(n=20)

In [None]:
#number of nan regressions for every sample size
#if the result is nan, most likely it is a small sample size
#but from one 20,000 sample trial I get 200 100-sample trials and 5354/598 is a factor of ~10, so smaller
#sample trials are less likely (by a factor of ~20) to return nan in regression
data.loc[data["reg"].isna(), "sampleSize"].value_counts()

In [None]:
#number of nan regressions for every outcome
data.loc[data["reg"].isna(), "outcome"].value_counts()

In [None]:
#number of nan regressions for every analysis type
#linear regression did not result in nan
data.loc[data["reg"].isna(), "analysis"].value_counts()

In [None]:
#number of nan regressions for every dementia risk
data.loc[data["reg"].isna(), "dementiaRisk"].value_counts()

In [None]:
#number of nan regressions for every cv risk
data.loc[data["reg"].isna(), "cvRisk"].value_counts()

In [None]:
#number of nan regressions for every (dementiaRisk, cvRisk) pair: dementia risk in rows, cv risk in columns
#size() counts the rows of each group; count() would instead count the non-null entries of every
#remaining column separately, and "reg" is nan in all of the rows selected here
#observations from the original run: small and large cv risks are more likely to return nan
#in logistic regression
#note the massive 4015 for dementia risk 0.0002 and 0.0061

data[data["reg"].isna()].groupby(["dementiaRisk","cvRisk"]).size().unstack(fill_value=0)

In [5]:
#from here on work only with complete rows: a row holding a nan in any column is discarded
results = data.dropna(axis="index", how="any").copy()

In [6]:
#random look at 10 of the rows that are left after dropping nan
results.sample(10)

Unnamed: 0,reg,se,pvalue,duration,sampleSize,outcome,analysis,dementiaRisk,cvRisk
433769,-0.0473,0.1776,0.7899,20,1000,death,logisticRegression-death,0.0059,0.0061
5572,-0.0238,0.0366,0.5151,15,20000,deathstroke-mi-,logisticRegression-deathstroke-mi-,0.0,0.0
202180,0.15,0.441,0.7338,3,1000,_gcp-last,linearRegression-_gcp-last,0.0002,0.0061
474610,0.0516,0.0354,0.1454,5,5000,_qalys-sum,linearRegression-_qalys-sum,0.0059,0.0132
140161,-0.016,0.1508,0.9155,3,1000,_gcp-mean,linearRegression-_gcp-mean,0.0002,0.0
422150,0.4521,0.432,0.2954,3,1000,deathstroke-mi-dementia-,logisticRegression-deathstroke-mi-dementia-,0.0059,0.0026
297591,0.0127,0.0419,0.7608,15,20000,_gcp-mean,linearRegression-_gcp-mean,0.0013,0.0026
228463,0.4499,0.4802,0.3488,5,200,death,logisticRegression-death,0.0002,0.0132
272301,0.0204,0.0401,0.6108,3,15000,_gcp-mean,linearRegression-_gcp-mean,0.0013,0.0008
386277,0.2565,0.2171,0.2375,3,10000,deathstroke-mi-dementia-,logisticRegression-deathstroke-mi-dementia-,0.0059,0.0008


In [7]:
#(number of rows, number of columns) after dropping the rows with nan
results.shape

(586236, 9)

In [8]:
#distinct values of every trial parameter, as arrays in order of first appearance in results
outcomes, dementiaRisks, cvRisks, sampleSizes, durations = (
    results[parameter].unique()
    for parameter in ("outcome", "dementiaRisk", "cvRisk", "sampleSize", "duration"))

In [9]:
#running mean of the regression coefficient: placeholder column in the results dataframe...
results["runMeanReg"] = np.nan
#...and the temporary pandas series that will hold the running means while they are computed
runMeanReg = None
#mean regression coefficient, one entry per "outcome,duration,sampleSize" key
#every value is an array with one row per cv risk and one column per dementia risk
meanReg = dict()

In [None]:
#calculate means and running means for regression
#for every (outcome, duration, sampleSize) an array of mean coefficients is stored in meanReg under the
#key "outcome,duration,sampleSize", with cv risk along the rows and dementia risk along the columns
#the running (expanding) mean of every parameter combination keeps the index labels of results, so the
#concatenated series can later be assigned to a results column
#the pieces are collected in a list that is reset here and concatenated once at the end: re-running this
#cell therefore gives the same runMeanReg instead of appending duplicate index labels to the old one
runMeanRegParts=[]
for outcome in outcomes:
    dfForOutcome=results.loc[results["outcome"]==outcome] #narrow the data step by step, the full frame is masked only once per outcome
    for duration in durations:
        dfForDuration=dfForOutcome.loc[dfForOutcome["duration"]==duration]
        for sampleSize in sampleSizes:
            dfForSampleSize=dfForDuration.loc[dfForDuration["sampleSize"]==sampleSize]
            #initialize array for specific outcome, duration, sample size
            meanRegKey=f"{outcome},{duration},{sampleSize}"
            meanReg[meanRegKey]=np.zeros((len(cvRisks),len(dementiaRisks)))
            for iCvRisk in range(len(cvRisks)):
                for iDementiaRisk in range(len(dementiaRisks)):
                    regs=dfForSampleSize.loc[ #regression coefficients of this parameter combination
                                    (dfForSampleSize["dementiaRisk"]==dementiaRisks[iDementiaRisk]) &
                                    (dfForSampleSize["cvRisk"]==cvRisks[iCvRisk]), "reg"]
                    meanReg[meanRegKey][iCvRisk,iDementiaRisk]=regs.mean() #store mean (nan when there is no data)
                    runMeanRegParts.append(regs.expanding().mean()) #running mean, in row order
#a single concat at the end; an empty series is used when there was nothing to loop over
runMeanReg=pd.concat(runMeanRegParts) if runMeanRegParts else pd.Series(dtype="float64")

In [None]:
#store running means in results dataframe; the series values are matched to the rows by index label
results["runMeanReg"]=runMeanReg

In [None]:
#first rows of results, now including the runMeanReg column
results.head()

In [None]:
#just a reminder of the values that every trial parameter takes
#TODO: unsure about the source of nan at the end
outcomes,durations,sampleSizes,cvRisks,dementiaRisks

In [None]:
#running mean of the regression coefficient against the position of the trial, one figure for every
#selected parameter combination
#note: any plot that includes fewer than 100 data points is due to regression returning nan
#note: I have not implemented anything yet on dealing with nan, eg resubmitting the calculation
#note: the single dot at the end is just for validation of my running average; it is drawn at the last
#      available position (not at a fixed x=99) so it coincides with the final running mean even when
#      some regressions were dropped

#for outcome in outcomes: #plot for all outcomes
for outcome in [outcomes[0]]:   #set a specific outcome
    #for duration in durations: #plot for all durations
    for duration in [durations[1]]: 
        #for sampleSize in sampleSizes: #plot for all sample sizes
        for sampleSize in [sampleSizes[0]]:
            #for iCvRisk in range(len(cvRisks)): #plot for all cv risks
            for iCvRisk in [0]: #set a specific risk, 0-4            
                #for iDementiaRisk in range(len(dementiaRisks)): #plot for all dementia risks
                for iDementiaRisk in [0]: #set a specific dementia risk, 0-4
                    plotData=results.loc[ #get all relevant data
                    (results["outcome"]==outcome) & 
                    (results["sampleSize"]==sampleSize) &
                    (results["dementiaRisk"]==dementiaRisks[iDementiaRisk]) & 
                    (results["cvRisk"]==cvRisks[iCvRisk]) &
                    (results["duration"]==duration), "runMeanReg"].copy()
                    fig, ax = plt.subplots()
                    ax.scatter(range(len(plotData)),plotData)
                    if len(plotData)>0: #nothing to validate when every regression of this combination was nan
                        ax.scatter(len(plotData)-1,meanReg[f"{outcome},{duration},{sampleSize}"][iCvRisk,iDementiaRisk])
                    ax.set(xlabel="trial (position within parameter combination)",
                           ylabel="running mean of regression coefficient",
                           title=f"{outcome}, duration={duration}, sampleSize={sampleSize}, "
                                 f"cvRisk={cvRisks[iCvRisk]}, dementiaRisk={dementiaRisks[iDementiaRisk]}")
                    plt.show()


In [None]:
#heatmaps of the mean regression coefficient over the grid of cv and dementia risks, one figure for
#every selected (outcome, duration, sampleSize)
for outcome in [outcomes[0]]:    #change to outcomes in order to see all heatmaps associated with all outcomes
    for duration in durations:
        for sampleSize in [sampleSizes[0]]:  #change to sampleSizes to see heatmaps associated with all sample sizes
            #use the loop variables (not the first element of every list), otherwise every iteration
            #shows the same heatmap
            meanRegKey=f"{outcome},{duration},{sampleSize}"
            print(meanRegKey)
            #meanReg arrays have cv risk along the rows and dementia risk along the columns; imshow draws
            #rows along y and columns along x, so transpose to get cv risk on x and dementia risk on y
            dataForPlot=meanReg[meanRegKey].T
            fig, ax = plt.subplots()
            im = ax.imshow(dataForPlot)

            # Show all ticks and label them with the respective list entries
            ax.set_xticks(np.arange(len(cvRisks)), labels=cvRisks)
            ax.set_yticks(np.arange(len(dementiaRisks)), labels=dementiaRisks)
            ax.set(xlabel='cv risk', ylabel='dementia risk')

            # Rotate the tick labels and set their alignment.
            plt.setp(ax.get_xticklabels(), rotation=45, ha="right",
            rotation_mode="anchor")

            cbar = ax.figure.colorbar(im, ax=ax)
            cbar.ax.set_ylabel("mean effect size", rotation=-90, va="bottom")

            ax.set_title(f"mean effect size ({meanRegKey})")
            fig.tight_layout()
            plt.show()