In [None]:
#note: must use a microsim kernel or a kernel that can load all necessary python modules

In [1]:
import os
import pandas as pd
#show all columns and all rows when a dataframe is displayed (None removes the pandas display limit)
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)
from pandarallel import pandarallel
import multiprocessing as mp

In [2]:
from microsim.bp_treatment_strategies import *

from microsim.outcome_model_repository import OutcomeModelRepository
from microsim.outcome_model_type import OutcomeModelType
from microsim.person import Person

from microsim.population import NHANESDirectSamplePopulation

from microsim.bp_treatment_strategies import SprintTreatment

from microsim.sim_settings import simSettings

from microsim.trials.trial import Trial
from microsim.trials.trialset import TrialsetParallel, TrialsetSerial
from microsim.trials.trial_description import TrialDescription
from microsim.trials.logistic_regression_analysis import LogisticRegressionAnalysis
from microsim.trials.linear_regression_analysis import LinearRegressionAnalysis
from microsim.trials.outcome_assessor import OutcomeAssessor
from microsim.trials.attribute_outcome_assessor import AttributeOutcomeAssessor
from microsim.trials.attribute_outcome_assessor import AssessmentMethod
from microsim.trials.risk_filter import RiskFilter
from microsim.trials.trial_utils import get_analysis_name, randomizationSchema

In [3]:
#any microsim dir will work, just need to access the NHANES data
#the location can be overridden with the MICROSIM_DIR environment variable, so that the notebook
#can be run on another machine without editing this cell; the original path remains the default
microsimDir = os.environ.get(
    "MICROSIM_DIR",
    "/Users/deligkaris.1/OneDrive - The Ohio State University Wexner Medical Center/MICROSIM/CODE/microsim")
#os.chdir raises an exception (FileNotFoundError) if the directory does not exist
os.chdir(microsimDir)

In [4]:
#at the beginning turn on pandarallel if you are planning to use TrialsetSerial
#(pandarallel is switched off again further down, before the TrialsetParallel run)
pandarallel.initialize(verbose=1) #microsim by default now does not initialize pandarallel
simSettings.pandarallelFlag = True #with this flag all new population instances will be set to use pandarallel

In [5]:
#with pandarallel on these two steps are faster
#this population instance is later passed to the trialset as the population the trials use
#NOTE(review): the arguments are presumably the number of sampled persons (10000) and the NHANES year (1999) - confirm
pop = NHANESDirectSamplePopulation(10000, 1999)
#alive, df = pop.advance_vectorized(30)

In [6]:
def _logisticAnalysis(*outcomes):
    """Return a LogisticRegressionAnalysis on an OutcomeAssessor built from the given outcomes."""
    #a fresh list is created on every call, so no two assessors share the same list object
    return LogisticRegressionAnalysis(OutcomeAssessor(list(outcomes)))

def _linearAnalysis(attribute, method):
    """Return a LinearRegressionAnalysis on an AttributeOutcomeAssessor for the given attribute and assessment method."""
    return LinearRegressionAnalysis(AttributeOutcomeAssessor(attribute, method))

#logistic regression analyses, each one defined by the list of outcomes it assesses
anyEvent2 = _logisticAnalysis(OutcomeAssessor.DEATH,
                              OutcomeType.STROKE,
                              OutcomeType.MI,
                              OutcomeType.DEMENTIA,
                              OutcomeAssessor.CI)
cogEvent = _logisticAnalysis(OutcomeAssessor.CI,
                             OutcomeType.DEMENTIA)
vascularEventOrDeath = _logisticAnalysis(OutcomeAssessor.DEATH,
                                         OutcomeType.STROKE,
                                         OutcomeType.MI)
anyEvent = _logisticAnalysis(OutcomeAssessor.DEATH,
                             OutcomeType.STROKE,
                             OutcomeType.MI,
                             OutcomeType.DEMENTIA)
death = _logisticAnalysis(OutcomeAssessor.DEATH)

#linear regression analyses, each one defined by a person attribute and the way it is assessed
qalys = _linearAnalysis("_qalys", AssessmentMethod.SUM)
meanGCP = _linearAnalysis("_gcp", AssessmentMethod.MEAN)
lastGCP = _linearAnalysis("_gcp", AssessmentMethod.LAST)

In [7]:
#we will define how many processes to launch later; this check is only for validation,
#so that we do not set the number of processes to a number greater than the number of available cores
#note: os.sched_getaffinity is only available on some Unix platforms (it does not exist on macOS)
#print("code has access to cores ",len(os.sched_getaffinity(0)))

In [8]:
#these are usually set from the input file, but to make things easier here
#just set them on the notebook; no input script tests them for meaningfulness,
#so some basic sanity checks are done at the end of this cell
inputSampleSizes = [1000,2000,10000] #for quick tests
#inputSampleSizes = [100, 200, 500, 1000, 5000, 10000, 15000, 20000]
inputDurations = [1,2] #for quick tests
#inputDurations = [3,5,10,15,20]
#inputDemThresholds = [2.4845839854531493e-08, 0.00018576417292080007, 0.0012917270937081809, 0.005870510161620921, 0.025739443157677927]
#inputCvThresholds = [1.167603052003119e-06, 0.0008193743487641601, 0.0026191105926681, 0.006091251406939853, 0.0132184645579298]
#inputDemThresholds = [2.4845839854531493e-08,0.00018576417292080007] #for quick tests
#inputCvThresholds = [1.167603052003119e-06,0.0008193743487641601] #for quick tests
inputDemThresholds = [2.4845839854531493e-08] #for quick tests
inputCvThresholds = [1.167603052003119e-06] #for quick tests
inputTrialsetSize = 3 #passed to the trialset constructor (presumably the number of trials in each trialset - confirm)
inputProcesses = 3 #passed to TrialsetParallel only (number of processes to launch)

#basic sanity checks, fail early here instead of in the middle of a long run
assert len(inputSampleSizes) > 0 and all(isinstance(n, int) and n > 0 for n in inputSampleSizes), \
       "inputSampleSizes must be a non-empty list of positive integers"
assert len(inputDurations) > 0 and all(d > 0 for d in inputDurations), \
       "inputDurations must be a non-empty list of positive numbers"
#with an empty list of thresholds no trialset would be run at all
assert len(inputDemThresholds) > 0 and len(inputCvThresholds) > 0, \
       "inputDemThresholds and inputCvThresholds must not be empty"
assert isinstance(inputTrialsetSize, int) and inputTrialsetSize > 0, \
       "inputTrialsetSize must be a positive integer"
assert isinstance(inputProcesses, int) and inputProcesses > 0, \
       "inputProcesses must be a positive integer"

In [9]:
#if you try running a TrialsetParallel instance with pandarallel on, you will get an exception
#but these could be included in the TrialsetParallel.run() method
#both the global setting and the already created population instance need to be switched off
simSettings.pandarallelFlag = False #any new population instance will have pandarallel off 
pop.use_pandarallel(False) #for mp to pickle this instance that used pandarallel, must change its attributes

In [10]:
%%time
#a drawback of including these thresholds outside of the trialset instances is that these cannot be parallelized
#so in every iteration, I have to wait until all trials of the trialset are done in order to proceed to the next
#iteration/trialset (=time lost)
#the more thresholds I have, the greater the time lost
#but it is relative to how many trials the set has and how many cores I am using
#
#the results of every trialset are collected in a list and concatenated once after the loops,
#instead of growing a dataframe with pd.concat in every iteration
trialsetResults = []
for dem in inputDemThresholds:
    for cv in inputCvThresholds:
        #one risk filter for every combination of dementia and cardiovascular thresholds
        myRiskFilter = RiskFilter({OutcomeModelType.DEMENTIA : dem , OutcomeModelType.CARDIOVASCULAR : cv})
        #this trial description will be the same for all trials in the trial set
        desc = TrialDescription(sampleSizes=inputSampleSizes,
                                durations=inputDurations,
                                inclusionFilter=myRiskFilter.exceedsThresholds,
                                exclusionFilter=None,
                                analyses=[anyEvent2,
                                          cogEvent,
                                          death,
                                          anyEvent,
                                          vascularEventOrDeath,
                                          qalys,
                                          meanGCP,
                                          lastGCP],
                                treatment=SprintTreatment(),
                                randomizationSchema=randomizationSchema)
        #create the set
        trialset = TrialsetParallel(desc,
                                    pop,
                                    inputTrialsetSize,
                                    inputProcesses, #an extra argument for TrialsetParallel
                                    additionalLabels=myRiskFilter._outcomeRiskThresholds)
        #trialset = TrialsetSerial(desc,
        #                          pop,
        #                          inputTrialsetSize,
        #                          additionalLabels=myRiskFilter._outcomeRiskThresholds)
        if __name__ ==  '__main__': #launching processes with multiprocesses requires this
            trialsetResults.append(trialset.run())

#results remains None if no trialset was run
#the index is always reset, also when just a single trialset was run
results = pd.concat(trialsetResults).reset_index(drop=True) if trialsetResults else None

starting trial 0 now
starting trial 1 now
starting trial 2 now


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  totalBPMedsAddedCapped.loc[totalBPMedsAddedCapped >= BaseTreatmentStrategy.MAX_BP_MEDS] = BaseTreatmentStrategy.MAX_BP_MEDS
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  totalBPMedsAddedCapped.loc[totalBPMedsAddedCapped >= BaseTreatmentStrategy.MAX_BP_MEDS] = BaseTreatmentStrategy.MAX_BP_MEDS
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  totalBPMedsAddedCapped.loc[totalBPMedsAddedCapped >= BaseTreatmentStrategy.MAX_BP_MEDS] = BaseTreatmentStrategy.MAX_BP_MEDS
A va

ending trial 0 now
ending trial 2 now
ending trial 1 now
CPU times: user 301 ms, sys: 48.6 ms, total: 350 ms
Wall time: 1min 32s


In [None]:
#48 for all
#number of result rows that were obtained with a sample size of 2000
isSampleSize2000 = results["sampleSize"] == 2000
results.loc[isSampleSize2000, "sampleSize"].count()

In [None]:
#display the first rows of the combined results
results.head()

In [None]:
#display the first rows of the results that were obtained with a sample size of 10000
results[results["sampleSize"] == 10000].head()