In [1]:
import copy
import importlib.util
import logging
import os

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from pandarallel import pandarallel

pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)

In [2]:
from microsim.population import Population, NHANESDirectSamplePopulation, build_people_using_nhanes_for_sampling
from microsim.sim_settings import simSettings
from microsim.outcome_model_repository import OutcomeModelRepository
from microsim.qaly_assignment_strategy import QALYAssignmentStrategy
from microsim.cohort_risk_model_repository import CohortRiskModelRepository
from microsim.person import Person
from microsim.education import Education
from microsim.gender import NHANESGender
from microsim.race_ethnicity import NHANESRaceEthnicity
from microsim.smoking_status import SmokingStatus
from microsim.alcohol_category import AlcoholCategory

from typing import Callable

In [3]:
#pandarallel.initialize(verbose=1) #microsim by default now does not initialize pandarallel
#simSettings.pandarallelFlag = True #with this flag all new population instances will be set to use pandarallel

#if every person object will have their own rng stream, then perhaps this can be pushed to be initialized
#in every population
#care must be taken if the population is to advance using multiprocessing or not though
#in a similar way that is currently done with trial sets
#no entropy is passed to SeedSequence, so fresh OS entropy is drawn and every kernel restart
#gives a different stream; pass an integer to SeedSequence(...) to make runs reproducible
seedSequence = np.random.SeedSequence()
rngStream = np.random.default_rng(seed=seedSequence)

#location of the microsim checkout: set the MICROSIM_DIR environment variable to override the
#author's local default so that the notebook can run on other machines
microsimDir = os.environ.get(
    "MICROSIM_DIR",
    "/Users/deligkaris.1/OneDrive - The Ohio State University Wexner Medical Center/MICROSIM/CODE/microsim",
)
#relative paths used later in the notebook (eg "microsim/data/fullyImputedDataset.dta") are resolved from here
os.chdir(microsimDir)

In [4]:
#age is a risk factor so it will be included with the other risk factors and
#it needs a model 
class AgeModel:
    """Risk model for the "age" risk factor: the next value is the latest age plus one."""

    def estimate_next_risk(self, person, rng=None):
        """Return the next age for `person`.

        person: object whose `_age` sequence holds the age history, latest value last
        rng: accepted so that the signature matches calls that pass rng; not used here
        """
        latestAge = person._age[-1]
        return latestAge + 1
    
#cohortModule = importlib.import_module("microsim.cohort_risk_model_repository")

class myCohortRiskModelRepository(CohortRiskModelRepository):
    """CohortRiskModelRepository that additionally registers a model for the "age" risk factor."""

    def __init__(self):
        #let the parent build its own repository first, then add the age entry to it
        super().__init__()
        ageModel = AgeModel()
        self._repository["age"] = ageModel
        
#setattr(cohortModule, "CohortRiskModelRepository", myCohortRiskModelRepository)

In [5]:
#these are lists that do not change over the course of a population and are the same no matter the population
#treatment strategies
#I could argue that these might be better suited under the Person class, these are just the static attributes all
#person objects will have no matter the population
#the rules that dictate how advancement is done will vary from population to population so the population needs to be
#able to dictate them to person objects
#but since the person advance methods need these rules in order to work, person objects should have at least a default
#set of rules (now they do not)
#when person objects get these rules from a population class, memory pointers will be used (I think)
#but it is unclear to me how this would work with multiprocessing 
#if this can work with multiprocessing, an approach could be: each person object has default set of rules,
#and those are modified by the population, with each person object keeping a reference to the population rules
#as a person attribute
    
#also, I am dividing the person attributes into groups so that each group will have its own
#requirements (eg a dynamic risk factor needs to be associated with an estimate_next_risk method)
#and the set of all groups should ideally provide all, or at least most, of a person object's attributes

#I would also like to include the units for all attributes here in the source

#A person object is essentially: 
#.     1) its state which consists of both past and present, 
#.     2) rules for aging that state, 
#.     3) tools for analyzing/reporting the state
#A population object is a collection of person objects therefore is: 
#.     1) its state, which is really the state of its people, 
#.     2) rules for aging that state, which are rules for aging person objects, 
#.     3) tools for analyzing/reporting that state
#A trial is a collection of populations therefore is: 
#.     1) its state is the state of its populations
#.     2) rules for advancing those populations
#.     3) tools for analyzing/reporting that state

#I imagine that trial inclusion/exclusion criteria can be pushed to the population class, as filters in the pandas nhanes df
#as soon as that is read
#currently, imagine the worst case scenario with an extremely picky trial, we will need to create a very large population
#with a large memory cost, to get a small trial population

#also, the best case scenario for inferences would be to compare identical populations, so
#instead of creating different populations as part of trials, create just one and subject the two copies
#to different rules of advancing...

#module object for microsim.person, kept so that its Person attribute can be replaced with myPerson
#after the class definition
personModule = importlib.import_module("microsim.person")

class myPerson(Person):
    """Person subclass that knows how to advance itself, one wave per simulated year.

    The class-level lists name the groups of person attributes. For every name in
    riskFactorsDynamic and treatments the instance is expected to hold a history list
    under "_" + name, to which the advance_* methods add one value per wave.
    """

    #I would like to have something that includes all attributes of the object, but 
    stateDynamic = ["riskFactors","treatments", "outcomes", "qalys"]
    stateStatic = ["selfReportedData", "randomEffects", "rng"]
    state = stateDynamic + stateStatic
    #state = ["riskFactors","treatments", "outcomes", "qalys", "selfReportedData", "randomEffects", "rng"]

    #I assume there will be dynamic and static risk factors
    riskFactorsDynamic = [
    #self._riskFactorsDynamic = [
            "age",
            "sbp",
            "dbp",
            "a1c",
            "hdl",
            "ldl",
            "trig",
            "totChol",
            "bmi",
            "anyPhysicalActivity",
            "afib",
            "waist",
            "alcoholPerWeek",
            "creatinine",
            "pvd",
        ]
        # not sure why this was in the past perhaps included as a risk factor
        # , 'otherLipidLoweringMedicationCount']

    #I can see education, smokingStatus becoming dynamic risk factors
    riskFactorsStatic = ["raceEthnicity",
                         "education",
                         "gender",
                         "smokingStatus"]

    riskFactors = riskFactorsStatic+riskFactorsDynamic

    #should treatments be defined using their disease (eg hypertension), 
    #their effect (eg antihypertension), their drug class (eg statin), or what risk factors they affect (eg bp)
    treatments = ["antiHypertensiveCount", "statin"]
    #treatments = ["antiHypertensive", "statin", "otherLipidLoweringMedicationCount"]
    #self._treatments = ["antiHypertensiveCount", "statin"]

    #I am not sure this is needed now...
    #concatenation builds a new list, so the two source lists are left untouched
    timeVaryingCovariates = riskFactorsDynamic + treatments
    #self._timeVaryingCovariates = copy.copy(self._riskFactorsDynamic)
    #self._timeVaryingCovariates.extend(self._treatments)
    #self._timeVaryingCovariates.append("bpMedsAdded")

    selfReportedData = ["selfReportStrokeAge", "selfReportMIAge"]

    def __init__(
        self,
        age: int,
        gender: NHANESGender,
        raceEthnicity: NHANESRaceEthnicity,
        sbp: int,
        dbp: int,
        a1c: float,
        hdl: int,
        totChol: int,
        bmi: float,
        ldl: int,
        trig: int,
        waist: int,  # Waist circumference in cm
        anyPhysicalActivity: int,
        education: Education,
        smokingStatus: SmokingStatus,
        alcohol: AlcoholCategory,
        antiHypertensiveCount: int,
        statin: int,
        otherLipidLoweringMedicationCount: int,
        creatinine: float,
        initializeAfib: Callable,
        initializationRepository=None, #do we need this?
        selfReportStrokeAge=None,
        selfReportMIAge=None,
        randomEffects=None,
        rng=None,
        #risk_model_repository,
        #outcome_model_repository,
        #qaly_assignment_strategy,
        **kwargs,
    ) -> None:
        """Forward every argument unchanged to Person.__init__, then add per-person bookkeeping.

        After the base initializer returns, this sets
          _currentWave: number of waves this person has advanced (starts at 0)
          _rng: a generator private to this person
        """

        super().__init__(age,
            gender,
            raceEthnicity,
            sbp,
            dbp,
            a1c,
            hdl,
            totChol,
            bmi,
            ldl,
            trig,
            waist,  # Waist circumference in cm
            anyPhysicalActivity,
            education,
            smokingStatus,
            alcohol,
            antiHypertensiveCount,
            statin,
            otherLipidLoweringMedicationCount,
            creatinine,
            initializeAfib,
            initializationRepository,
            selfReportStrokeAge,
            selfReportMIAge,
            randomEffects,
            rng,
            **kwargs)

        #each person will advance on their own so keep track of this here
        self._currentWave = 0
        #need to double check that each person needs indeed their own stream
        #the rng argument is only handed to the base initializer; self._rng is always a fresh stream
        #seeded from OS entropy, so the caller's generator is not the one kept on the instance
        self._rng = np.random.default_rng(seed=np.random.SeedSequence())

    def is_alive(self):
        """Return the most recent entry of the _alive history."""
        return self._alive[-1]

    def get_next_treatment(self, treatment, treatmentRepository, rng=None):
        """Return the next value of `treatment` as estimated by its model in `treatmentRepository`."""
        model = treatmentRepository.get_model(treatment)
        return model.estimate_next_risk(self, rng=rng)

    def advance(self, years, treatmentStrategies, repositories):
        """Advance this person by up to `years` waves; waves where the person is not alive are skipped.

        years: number of waves to attempt
        treatmentStrategies: dict keyed by the names in `treatments`; a value is a strategy object or None
        repositories: dict with the keys "riskFactorsDynamic", "treatments", "outcomes" and "qalys"
        """
        for _ in range(years):
            if self.is_alive():
                self._currentWave += 1
                self.advance_risk_factors(repositories["riskFactorsDynamic"])
                self.advance_treatments(repositories["treatments"], treatmentStrategies)
                #NOTE: the original had a bare `self.update_risk_factors` expression here, which
                #called nothing (and raises AttributeError when the attribute does not exist);
                #strategy-driven risk factor updates are already applied inside advance_treatments
                self.advance_outcomes(repositories["outcomes"])
                self.advance_qalys(repositories["qalys"])

    #may need to fix alcohol because it needs to convert the risk to a category I think
    #may also need to implement the apply bounds functionality that is present in the current advance risk factors method
    #the rng=self._rng will eventually not be needed when estimate_next_risk functions utilize the person's own rng stream
    def advance_risk_factors(self, rfdRepository):
        """Add one new value to the history list of every dynamic risk factor.

        NOTE(review): values are stored one risk factor at a time, so a model evaluated later in
        the loop can already see this wave's value of an earlier factor (eg "age") - confirm
        that this is intended.
        """
        for rf in self.riskFactorsDynamic:
            attributeName = "_"+rf
            history = getattr(self, attributeName)
            #rfdRepository.get_model(rf).estimate_next_risk(self)])
            nextValue = self.get_next_risk_factor(rf, rfdRepository, rng=self._rng)
            #a new list is assigned instead of appending in place, as in the original
            setattr(self, attributeName, history+[nextValue])

    def advance_treatments(self, treatmentRepository, treatmentStrategies):
        """Add the next default value of every treatment, then let its strategy (if any) modify the person.

        treatmentStrategies must contain a key for every name in `treatments`; a None value
        means that no strategy is applied for that treatment.
        """
        for treatment in self.treatments:
            #applies the default treatments
            #it is not clear to me why treatment strategies affect the person attributes directly
            #whereas treatments affect the person attributes indirectly through the attribute models
            attributeName = "_"+treatment
            history = getattr(self, attributeName)
            nextValue = self.get_next_treatment(treatment, treatmentRepository, rng=self._rng)
            setattr(self, attributeName, history+[nextValue])
            #choice of words: get_next implies that it returns the final/next quantity, update implies that it modifies
            #that quantity in place
            #the vectorized bp treatment strategies are modifying the rows in place whereas the changes/absolute values are 
            #returned for person objects, the code is much more simple if the person is modified in place with treatment
            #strategies so do that for person objects
            #these two functions will need to be defined
            strategy = treatmentStrategies[treatment]
            if strategy is not None:
                strategy.update_next_treatment(self)
                #I want to make it explicit and more obvious that treatments update the risk factors
                strategy.update_next_risk_factors(self)
    
    
#monkeypatch: code that looks up microsim.person.Person at call time will now get myPerson;
#names already bound elsewhere with "from microsim.person import Person" are not rebound by this
setattr(personModule, "Person", myPerson)

In [14]:
#module object for microsim.population, kept so that its Population and Person attributes can be
#replaced after the class definition
popModule = importlib.import_module("microsim.population")

class myPopulation:
    """A collection of person objects plus the rules (repositories, treatment strategies) used to advance them."""

    def __init__(self, people):
        """Store `people` and set every repository and treatment strategy to None.

        people: iterable of person objects offering is_alive() and
                advance(years, treatmentStrategies=..., repositories=...)
        """
        self._people = people

        #maybe do not set this as an attribute, so that I can parallelize the advancement later
        #self._numberAlive = self.get_numberAlive()

        #self._ageStandards = {}
        # luciana tag: discuss with luciana...want to keep track of the sim wave that is currently running, while running
        # and also the total number of years advanced...need to think about how to do this is a way that will be safe
        # this approach has major risks if you forget to update one of these variables
        self._totalWavesAdvanced = 0
        #self._currentWave = 0

        #self.num_of_processes = 8

        #treatment strategies and the 3 repositories are the rules by which person objects can advance to the future
        #these will differ between populations and person objects need to obtain them from the population

        #every repository will need to have a model for each corresponding person attribute
        #eg the riskFactorDynamic repo will need to have a model for each item in the Person.riskFactorsDynamic list
        self._repositories = {"riskFactorsDynamic": None,
                              "treatments": None, #these are the default treatments in a population
                              "outcomes": None,
                              "qalys": None}

        #for all items in the Person.treatments list there can be in principle a treatmentStrategy 
        #I can imagine all of these being merged in a single data structure and passed on to the person objects
        #as a single argument, eg say pop._advancementRules
        #I can also imagine this as a class populationAdvancementRules
        #so eg NHANESDirectSamplePop will need to initialize 2 things, a population and the populationAdvancementRules
        #also, every item in the Person.treatments can have in principle a treatment strategy
        #the keys in this dictionary need to be the same as the self._treatments list
        self._treatmentStrategies = {"antiHypertensiveCount": None, 
                                     "statin": None}
        #self._treatmentStrategies = {"bp": None}
        #self._bpTreatmentStrategy = None

        #any repositories that are needed in a population method, will need to be included in the population class
        #perhaps in a default way, eg by not changing anything or doing nothing
        #currently, subclasses are the ones that define repositories that the population class actually needs

    def get_numberAlive(self):
        """Return how many people currently report is_alive() as true."""
        return sum(int(person.is_alive()) for person in self._people)

    def advance(self, years, rng=None):
        """Advance every person by `years` waves and add `years` to the wave counter.

        years: number of waves handed to each person's own advance method
        rng: accepted but not used; nothing in this method draws random numbers
        """
        firstWave = self._totalWavesAdvanced
        logging.info(f"processing years: {firstWave}-{firstWave+years}")
        #a plain loop: the calls are made only for their effect on each person
        for person in self._people:
            person.advance(years, treatmentStrategies=self._treatmentStrategies, repositories=self._repositories)
        self._totalWavesAdvanced += years #sampling from NHANES is wave 0
        
#monkeypatch microsim.population so that lookups of Population and Person through that module's
#namespace resolve to the notebook classes
#NOTE(review): presumably build_people_using_nhanes_for_sampling resolves Person this way - confirm
setattr(popModule, "Population", myPopulation)
setattr(popModule, "Person", myPerson)
#setattr(popModule, "CohortRiskModelRepository", myCohortRiskModelRepository)

In [15]:
class NHANESDirectSamplePopulation(myPopulation):
    """Simple base class to sample with replacement from 2015/2016 NHANES

    The people are built from the rows of the imputed NHANES dataset whose `year` column equals
    the requested year. This definition shadows the class of the same name imported from
    microsim.population.
    """

    def __init__(
        self,
        n,
        year,
        filter=None,
        generate_new_people=True,
        model_reposistory_type="cohort",
        random_seed=None,
        weights=None,
        rng=None,
    ):
        """Read the NHANES file, build the people and set up the model repositories.

        n: number of people requested from build_people_using_nhanes_for_sampling
        year: value of the NHANES `year` column to keep
        filter, random_seed, weights, rng: forwarded unchanged to build_people_using_nhanes_for_sampling
        generate_new_people: accepted for compatibility; not used by this implementation
        model_reposistory_type: "cohort" or "nhanes" (parameter name kept as is for callers)
        """

        #relative path: depends on the working directory having been set to the microsim checkout
        nhanes = pd.read_stata("microsim/data/fullyImputedDataset.dta")
        nhanes = nhanes.loc[nhanes.year == year]
        self._outcome_model_repository = OutcomeModelRepository()
        #rng = np.random.default_rng(rng)
        people = build_people_using_nhanes_for_sampling(
            nhanes,
            n,
            self._outcome_model_repository,
            filter=filter,
            random_seed=random_seed,
            weights=weights,
            rng=rng,
        )
        super().__init__(people)
        self._qaly_assignment_strategy = QALYAssignmentStrategy()
        self.n = n
        self.year = year
        self._initialize_risk_models(model_reposistory_type)

    def copy(self):
        """Return a new population of the same n and year whose people are a deep copy of this one's.

        Only n, year and the people carry over; every other attribute of the result is whatever
        a freshly constructed population has.
        """
        #keyword argument on purpose: the third positional parameter is `filter`, so a positional
        #False would be forwarded as the people filter instead of reaching generate_new_people
        newPop = NHANESDirectSamplePopulation(self.n, self.year, generate_new_people=False)
        newPop._people = copy.deepcopy(self._people)
        return newPop

    def _initialize_risk_models(self, model_repository_type):
        """Set self._risk_model_repository according to `model_repository_type`.

        Raises Exception when the type is neither "cohort" nor "nhanes".
        """
        if model_repository_type == "cohort":
            self._risk_model_repository = myCohortRiskModelRepository()
        elif model_repository_type == "nhanes":
            #imported here because the notebook never imports this name at the top
            #NOTE(review): module path assumed from the microsim package layout - confirm
            from microsim.nhanes_risk_model_repository import NHANESRiskModelRepository
            self._risk_model_repository = NHANESRiskModelRepository()
        else:
            #f-string so that a non-string argument is reported instead of failing on concatenation
            raise Exception(f"unknown risk model repository type: {model_repository_type}")

In [16]:
#from microsim.population import NHANESDirectSamplePopulation
#from popModule import NHANESDirectSamplePopulation
#small population for interactive exploration; 2017 selects the rows with nhanes.year == 2017
popSize = 10
#the notebook-level generator is passed as rng and forwarded to build_people_using_nhanes_for_sampling
pop = NHANESDirectSamplePopulation(popSize, 2017, rng=rngStream)

In [17]:
#adds one value to every dynamic risk factor history of the first person only; re-running this
#cell adds another value each time
pop._people.iloc[0].advance_risk_factors(pop._risk_model_repository)

In [18]:
pop._people.iloc[0]._sbp

[103.33333333333333, 105.23750931706877]

In [19]:
pop._people.iloc[0]._alcoholPerWeek

[<AlcoholCategory.SEVENTOTHIRTEEN: 2>, <AlcoholCategory.ONETOSIX: 1>]

In [23]:
#adds the next default treatment values for the first person; the risk model repository is passed
#as the treatment repository and every entry of pop._treatmentStrategies is None, so no strategy runs
pop._people.iloc[0].advance_treatments(pop._risk_model_repository, pop._treatmentStrategies)

In [24]:
pop._people.iloc[0]._antiHypertensiveCount

[0.0, 1, 0]

In [25]:
pop._people.iloc[0]._statin

[0, False, False]

In [None]:
myPerson.riskFactors

In [None]:
pop._risk_model_repository

In [None]:
#NOTE(review): myCohortRiskModelRepository.__init__ already stores an AgeModel under "age", so this
#looks redundant for a population built with the default "cohort" repository type
pop._risk_model_repository._repository["age"] = AgeModel()

In [None]:
pop._risk_model_repository._repository

In [13]:
pop._repositories

{'riskFactorsDynamic': None,
 'treatments': None,
 'outcomes': None,
 'qalys': None}