In [None]:
#note: must use a microsim kernel or a kernel that can load all necessary python modules

In [11]:
import os
import pandas as pd
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)
from pandarallel import pandarallel
import multiprocessing as mp

In [12]:
from microsim.bp_treatment_strategies import *

from microsim.outcome_model_repository import OutcomeModelRepository
from microsim.outcome_model_type import OutcomeModelType
from microsim.person import Person

from microsim.population import NHANESDirectSamplePopulation

from microsim.bp_treatment_strategies import SprintTreatment

from microsim.sim_settings import simSettings

from microsim.trials.trial import Trial
from microsim.trials.trialset import TrialsetParallel, TrialsetSerial
from microsim.trials.trial_description import TrialDescription
from microsim.trials.logistic_regression_analysis import LogisticRegressionAnalysis
from microsim.trials.linear_regression_analysis import LinearRegressionAnalysis
from microsim.trials.outcome_assessor import OutcomeAssessor
from microsim.trials.attribute_outcome_assessor import AttributeOutcomeAssessor
from microsim.trials.attribute_outcome_assessor import AssessmentMethod
from microsim.trials.risk_filter import RiskFilter
from microsim.trials.trial_utils import get_analysis_name, randomizationSchema

In [13]:
#any microsim dir will work, just need to access the NHANES data
microsimDir = "/Users/deligkaris.1/OneDrive - The Ohio State University Wexner Medical Center/MICROSIM/CODE/microsim"
os.chdir(microsimDir)

In [14]:
#at the beginning turn on pandarallel if you are planning to use TrialsetSerial
pandarallel.initialize(verbose=1) #microsim by default now does not initialize pandarallel
simSettings.pandarallelFlag = True #with this flag all new population instances will be set to use pandarallel

In [15]:
#with pandarallel on these two steps are faster
pop = NHANESDirectSamplePopulation(10000, 1999)
#alive, df = pop.advance_vectorized(30)

In [16]:
anyEvent2 = LogisticRegressionAnalysis(OutcomeAssessor([OutcomeAssessor.DEATH, OutcomeType.STROKE, OutcomeType.MI, OutcomeType.DEMENTIA, OutcomeAssessor.CI]))
cogEvent = LogisticRegressionAnalysis(OutcomeAssessor([OutcomeAssessor.CI,OutcomeType.DEMENTIA]))
vascularEventOrDeath = LogisticRegressionAnalysis(OutcomeAssessor([OutcomeAssessor.DEATH, OutcomeType.STROKE, OutcomeType.MI]))
anyEvent = LogisticRegressionAnalysis(OutcomeAssessor([OutcomeAssessor.DEATH, OutcomeType.STROKE, OutcomeType.MI, OutcomeType.DEMENTIA]))
death = LogisticRegressionAnalysis(OutcomeAssessor([OutcomeAssessor.DEATH]))
qalys = LinearRegressionAnalysis(AttributeOutcomeAssessor("_qalys", AssessmentMethod.SUM))
meanGCP = LinearRegressionAnalysis(AttributeOutcomeAssessor("_gcp", AssessmentMethod.MEAN))
lastGCP = LinearRegressionAnalysis(AttributeOutcomeAssessor("_gcp", AssessmentMethod.LAST))

In [17]:
#we will later define how many processes to launch, this is just for validation
#and so that we do not set the number of processes to a number greater than the number of available cores
#print("code has access to cores ",len(os.sched_getaffinity(0)))

In [18]:
#these are usually set from the input file, but to make things easier here 
#just set them on the notebook (which means no script will test for their meaningfulness)
inputSampleSizes = [1000,2000,10000] #for quick tests
#inputSampleSizes = [100, 200, 500, 1000, 5000, 10000, 15000, 20000]
inputDurations = [1,2] #for quick tests
#inputDurations = [3,5,10,15,20]
#inputDemThresholds = [2.4845839854531493e-08, 0.00018576417292080007, 0.0012917270937081809, 0.005870510161620921, 0.025739443157677927]
#inputCvThresholds = [1.167603052003119e-06, 0.0008193743487641601, 0.0026191105926681, 0.006091251406939853, 0.0132184645579298]
#inputDemThresholds = [2.4845839854531493e-08,0.00018576417292080007] #for quick tests
#inputCvThresholds = [1.167603052003119e-06,0.0008193743487641601] #for quick tests
inputDemThresholds = [2.4845839854531493e-08] #for quick tests
inputCvThresholds = [1.167603052003119e-06] #for quick tests
inputTrialsetSize = 3
inputProcesses = 3

In [19]:
#if you try running a TrialsetParallel instance with pandarallel on, you will get an exception
#but these could be included in the TrialsetParallel.run() method
simSettings.pandarallelFlag = False #any new population instance will have pandarallel off 
pop.use_pandarallel(False) #for mp to pickle this instance that used pandarallel, must change its attributes

In [20]:
%%time
#a drawback of including these thresholds outside of the trialset instances is that these cannot be parallelized
#so in every iteration, I have to wait until all trials of the trialset are done in order to proceed to the next
#iteration/trialset (=time lost)
#the more thresholds I have, the greater the time lost 
#but it is relative to have many trials the set has and how many cores I am using
results = None
for dem in inputDemThresholds:
    for cv in inputCvThresholds:
            myRiskFilter = RiskFilter({OutcomeModelType.DEMENTIA : dem , OutcomeModelType.CARDIOVASCULAR : cv})
            #this trial description will be the same for all trials in the trial set
            desc = TrialDescription(sampleSizes= inputSampleSizes,
                                                durations = inputDurations,
                                                inclusionFilter=myRiskFilter.exceedsThresholds,
                                                exclusionFilter=None,
                                                analyses=[anyEvent2,
                                                          cogEvent,
                                                        death,
                                                        anyEvent,
                                                        vascularEventOrDeath,
                                                        qalys,
                                                        meanGCP,
                                                        lastGCP],
                                                treatment=SprintTreatment(),
                                                randomizationSchema=randomizationSchema)
            #create the set
            trialset = TrialsetParallel(desc,
                                        pop,
                                        inputTrialsetSize,
                                        inputProcesses, #an extra argument for TrialsetParallel
                                        additionalLabels=myRiskFilter._outcomeRiskThresholds)
            #trialset = TrialsetSerial(desc,
            #                          pop,
            #                          inputTrialsetSize,
            #                          additionalLabels=myRiskFilter._outcomeRiskThresholds)
            if __name__ ==  '__main__': #launching processes with multiprocesses requires this
                if results is None:
                    results = trialset.run()
                else:
                    results = pd.concat([results,trialset.run()]).reset_index(drop=True)

starting trial 0 now
starting trial 1 now
starting trial 2 now


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  totalBPMedsAddedCapped.loc[totalBPMedsAddedCapped >= BaseTreatmentStrategy.MAX_BP_MEDS] = BaseTreatmentStrategy.MAX_BP_MEDS
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  totalBPMedsAddedCapped.loc[totalBPMedsAddedCapped >= BaseTreatmentStrategy.MAX_BP_MEDS] = BaseTreatmentStrategy.MAX_BP_MEDS
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  totalBPMedsAddedCapped.loc[totalBPMedsAddedCapped >= BaseTreatmentStrategy.MAX_BP_MEDS] = BaseTreatmentStrategy.MAX_BP_MEDS


















A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  totalBPMedsAddedCapped.loc[totalBPMedsAddedCapped >= BaseTreatmentStrategy.MAX_BP_MEDS] = BaseTreatmentStrategy.MAX_BP_MEDS
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  totalBPMedsAddedCapped.loc[totalBPMedsAddedCapped >= BaseTreatmentStrategy.MAX_BP_MEDS] = BaseTreatmentStrategy.MAX_BP_MEDS
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  totalBPMedsAddedCapped.loc[totalBPMedsAddedCapped >= BaseTreatmentStrategy.MAX_BP_MEDS] = BaseTreatmentStrategy.MAX_BP_MEDS


















ending trial 0 now
ending trial 1 now
ending trial 2 now
CPU times: user 315 ms, sys: 57.7 ms, total: 372 ms
Wall time: 1min 37s


In [21]:
results.head()

Unnamed: 0,reg,se,pvalue,intercept,meanUntreated,meanTreated,duration,sampleSize,outcome,analysis,OutcomeModelType.DEMENTIA,OutcomeModelType.CARDIOVASCULAR
0,0.041189,0.04014,0.304833,0.144754,0.536125,0.546352,1,10000,death-stroke-mi-dementia-ci-,logisticRegression-death-stroke-mi-dementia-ci-,2.484584e-08,1e-06
1,0.040361,0.040138,0.314632,0.143956,0.535927,0.545949,1,10000,ci-dementia-,logisticRegression-ci-dementia-,2.484584e-08,1e-06
2,0.526438,0.730571,0.471165,-7.425557,0.000595,0.001008,1,10000,death-,logisticRegression-death-,2.484584e-08,1e-06
3,0.015216,0.632772,0.980816,-6.914334,0.000992,0.001008,1,10000,death-stroke-mi-dementia-,logisticRegression-death-stroke-mi-dementia-,2.484584e-08,1e-06
4,0.015216,0.632772,0.980816,-6.914334,0.000992,0.001008,1,10000,death-stroke-mi-,logisticRegression-death-stroke-mi-,2.484584e-08,1e-06


In [86]:
results.head(50)

Unnamed: 0,reg,se,pvalue,intercept,meanUntreated,meanTreated,duration,sampleSize,outcome,analysis,OutcomeModelType.DEMENTIA,OutcomeModelType.CARDIOVASCULAR,meanDifference,testLinear,diff,oddsControl,oddsTreated,absRiskControl,absRiskTreated
0,0.04118899,0.04014034,0.3048329,0.144754,0.536125,0.546352,1,10000,death-stroke-mi-dementia-ci-,logisticRegression-death-stroke-mi-dementia-ci-,2.484584e-08,1e-06,0.010227,False,0.03096216,1.155755,1.204354,0.5361254,0.546352
1,0.04036099,0.04013825,0.3146324,0.143956,0.535927,0.545949,1,10000,ci-dementia-,logisticRegression-ci-dementia-,2.484584e-08,1e-06,0.010022,False,0.03033873,1.154833,1.202397,0.535927,0.545949
2,0.5264384,0.7305708,0.4711648,-7.425557,0.000595,0.001008,1,10000,death-,logisticRegression-death-,2.484584e-08,1e-06,0.000412,False,0.5260262,0.0005958292,0.001008675,0.0005954744,0.001008
3,0.01521551,0.632772,0.9808161,-6.914334,0.000992,0.001008,1,10000,death-stroke-mi-dementia-,logisticRegression-death-stroke-mi-dementia-,2.484584e-08,1e-06,1.5e-05,False,0.01520031,0.0009934433,0.001008675,0.0009924573,0.001008
4,0.01521551,0.632772,0.9808161,-6.914334,0.000992,0.001008,1,10000,death-stroke-mi-,logisticRegression-death-stroke-mi-,2.484584e-08,1e-06,1.5e-05,False,0.01520031,0.0009934433,0.001008675,0.0009924573,0.001008
5,0.0001924111,0.0004471553,0.6669864,0.999405,0.999405,0.999597,1,10000,_qalys-sum,linearRegression-_qalys-sum,2.484584e-08,1e-06,0.000192,True,8.58184e-15,2.716664,2.717186,0.7309415,0.730979
6,-0.06735917,0.07035068,0.3383496,53.799524,53.799524,53.732164,1,10000,_gcp-mean,linearRegression-_gcp-mean,2.484584e-08,1e-06,-0.067359,True,6.949719e-13,2.316521e+23,2.165621e+23,1.0,1.0
7,-0.1372868,0.1402368,0.3276213,50.878722,50.878722,50.741435,1,10000,_gcp-last,linearRegression-_gcp-last,2.484584e-08,1e-06,-0.137287,True,6.039613e-13,1.248384e+22,1.088241e+22,1.0,1.0
8,0.08143047,0.1276355,0.5234788,0.224944,0.556,0.576,1,1000,death-stroke-mi-dementia-ci-,logisticRegression-death-stroke-mi-dementia-ci-,2.484584e-08,1e-06,0.02,False,0.06143047,1.252252,1.358491,0.556,0.576
9,0.08143047,0.1276355,0.5234788,0.224944,0.556,0.576,1,1000,ci-dementia-,logisticRegression-ci-dementia-,2.484584e-08,1e-06,0.02,False,0.06143047,1.252252,1.358491,0.556,0.576


In [26]:
results["meanDifference"] = results["meanTreated"] - results["meanUntreated"]
results["diffLinear"] = results["reg"] - results["meanDifference"]
results["testLinear"] = (abs(results["diff"]) < 10**(-10))

In [83]:
#check the linear regression results (ignore logistic regression results right now)
results.loc[ (results["outcome"]=="_qalys-sum") | 
             (results["outcome"]=="_gcp-mean") |
            (results["outcome"]=="_gcp-last"), ["testLinear", "reg" ,"meanDifference","diff"] ]

Unnamed: 0,testLinear,reg,meanDifference,diff
5,True,0.0001924111,0.000192,8.58184e-15
6,True,-0.06735917,-0.067359,6.949719e-13
7,True,-0.1372868,-0.137287,6.039613e-13
13,True,4.163336e-16,0.0,4.163336e-16
14,True,0.068,0.068,2.502165e-14
15,True,0.058,0.058,1.706968e-14
21,True,4.163336e-16,0.0,4.163336e-16
22,True,0.086,0.086,3.222422e-14
23,True,0.232,0.232,3.469447e-14
29,True,4.163336e-16,0.0,4.163336e-16


In [85]:
import numpy as np
results["oddsControl"] = np.exp(results["intercept"])
results["oddsTreated"] = np.exp(results["intercept"]+results["reg"])
results["absRiskControl"] = results["oddsControl"] / (1+results["oddsControl"])
results["absRiskTreated"] = results["oddsTreated"] / (1+results["oddsTreated"])

In [93]:
results["diffControl"]= abs(results["absRiskControl"] - results["meanUntreated"]) 
results["diffTreated"]= abs(results["absRiskTreated"] - results["meanTreated"])
results["testLogisticControl"] = (abs(results["diffControl"]) < 10**(-6))
results["testLogisticTreated"] = (abs(results["diffTreated"]) < 10**(-6))

In [94]:
#check the logistic regression results (ignore linear regression results right now)
results.loc[ (results["outcome"]=="death-stroke-mi-dementia-ci-") | 
             (results["outcome"]=="ci-dementia") |
             (results["outcome"]=="death") |
             (results["outcome"]=="death-stroke-mi-"), 
              ["testLogisticTreated","testLogisticControl","diffControl", "diffTreated"] ]

Unnamed: 0,testLogisticTreated,testLogisticControl,diffControl,diffTreated
0,True,True,2.220446e-16,2.220446e-16
4,True,True,1.6046190000000002e-17,1.452831e-17
8,True,True,1.110223e-16,1.110223e-16
12,False,False,,
16,True,True,1.110223e-16,0.0
20,True,True,9.284003e-10,2.770763e-10
24,True,True,1.110223e-16,1.110223e-16
28,True,True,9.283959e-10,2.770756e-10
32,True,True,1.110223e-16,0.0
36,False,False,,


In [23]:
#test a single trial, and its regression results

In [None]:
testTrial=Trial(desc,pop, #an extra argument for TrialsetParallel
              additionalLabels=myRiskFilter._outcomeRiskThresholds)

In [None]:
testTrial.run()

In [None]:
data=death.get_dataframe(testTrial.treatedPop._people.tolist(),testTrial.untreatedPop._people.tolist())

In [None]:
import statsmodels.formula.api as smf
reg = smf.logit("outcome ~ treatment", data).fit(disp=False, method_kwargs={"warn_convergence": False})

In [None]:
reg.params

In [None]:
reg.params["Intercept"]

In [None]:
reg.summary()