In [1]:
%matplotlib notebook
%matplotlib inline
%autosave 120

Autosaving every 120 seconds


In [2]:
import os
import pandas as pd
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)
from pandarallel import pandarallel
import multiprocessing as mp
import numpy as np
import matplotlib.pyplot as plt

In [3]:
from microsim.bp_treatment_strategies import *

from microsim.outcome_model_repository import OutcomeModelRepository
from microsim.outcome_model_type import OutcomeModelType
from microsim.person import Person

from microsim.population import NHANESDirectSamplePopulation

from microsim.bp_treatment_strategies import SprintTreatment

from microsim.sim_settings import simSettings

from microsim.trials.trialset import TrialsetParallel, TrialsetSerial
from microsim.trials.trial_description import TrialDescription
from microsim.trials.logistic_regression_analysis import LogisticRegressionAnalysis
from microsim.trials.linear_regression_analysis import LinearRegressionAnalysis
from microsim.trials.outcome_assessor import OutcomeAssessor
from microsim.trials.attribute_outcome_assessor import AttributeOutcomeAssessor
from microsim.trials.attribute_outcome_assessor import AssessmentMethod
from microsim.trials.risk_filter import RiskFilter
from microsim.trials.trial_utils import get_analysis_name, randomizationSchema

In [4]:
# Any microsim checkout will work; the notebook only needs it as the working directory
# in order to access the NHANES data.
# To run on another machine, set the MICROSIM_DIR environment variable instead of editing this cell;
# when it is not set, the original OSC location below is used.
#microsimDir = "/Users/deligkaris.1/OneDrive - The Ohio State University Wexner Medical Center/MICROSIM/CODE/microsim"
microsimDir = os.environ.get("MICROSIM_DIR", "/users/PAS2164/deligkaris/MICROSIM/CODE/microsim")
os.chdir(microsimDir)

In [5]:
# Turn pandarallel on up front if you are planning to use TrialsetSerial.
# The use_memory_fs flag does not seem to affect performance on OSC computers.
# NOTE(review): the initialize call comes before the flag is set; keep that order unless microsim documents otherwise.
pandarallel.initialize(verbose=1) # microsim no longer initializes pandarallel by default, so it is done here
simSettings.pandarallelFlag = True # with this flag all new population instances will be set to use pandarallel

In [6]:
%%time
# Build the population and advance it; with pandarallel on these two steps are faster.
# Timings: 12min with 10,000, 22min with 20,000, 61min with 60,000, 102min with 100,000
# osc: 27min with 10,000 (1 node)
# NOTE(review): presumably 10000 is the sample size (it matches the timings above) and 1999 the NHANES year,
# and 200 the number of steps to advance -- confirm against the microsim API.
# NOTE(review): no random seed is set in the visible cells; if the simulation is stochastic,
# reruns will not reproduce the same population -- confirm.
pop = NHANESDirectSamplePopulation(10000, 1999)
# alive and df are not referenced again in the visible cells; later cells read pop._people instead
alive, df = pop.advance_vectorized(200)

  yield data[chunk_]


CPU times: user 17min 9s, sys: 7min 47s, total: 24min 56s
Wall time: 27min 13s


In [None]:
def getDFForPersonCol(person):
    """Return one person's wave-state table with CV and dementia risk columns added.

    The table is built from ``person.get_final_wave_state_as_dict()``. For every row the
    repository's ``get_risk_for_person`` is called with ``(outcomeType, 1, True)`` and the
    result is stored in ``cvRisk`` and then ``demRisk``.
    NOTE(review): the meaning of the ``1`` and ``True`` arguments is not visible here --
    presumably a time horizon and a "row is a vectorized record" flag; confirm in microsim.
    """
    personWaves = pd.DataFrame(person.get_final_wave_state_as_dict())
    riskColumns = (
        ('cvRisk', OutcomeModelType.CARDIOVASCULAR),
        ('demRisk', OutcomeModelType.DEMENTIA),
    )
    for columnName, outcomeType in riskColumns:
        # a fresh repository is created for each risk column, exactly one per apply call
        riskForRow = OutcomeModelRepository().get_risk_for_person
        personWaves[columnName] = personWaves.apply(riskForRow, args=(outcomeType, 1, True), axis='columns')
    return personWaves

# one data frame per person, collected in a Series aligned with pop._people
dfs = pop._people.apply(getDFForPersonCol)

In [None]:
#pop._people.iloc[1].get_final_wave_state_as_dict()

In [None]:
# Stack every person's table into one long frame of person-ages.
# ignore_index=True gives the result a fresh 0..n-1 index; presumably each per-person frame carries its
# own small index, so without this the combined frame would have repeated row labels, which makes any
# later label-based alignment ambiguous. Nothing in the visible cells relies on the original labels.
allAgesDF = pd.concat(dfs.tolist(), ignore_index=True)

In [None]:
allAgesDF.head()

In [None]:
#allAgesDF.loc[allAgesDF.demRisk > 1, 'demRisk'].count(), allAgesDF.loc[allAgesDF.demRisk > 1, 'demRisk']

In [None]:
# cap dementia risk at 1: any value above 1 becomes 1, everything else is left as it is
allAgesDF['demRisk'] = allAgesDF['demRisk'].clip(upper=1)

In [None]:
allAgesDF.demRisk.describe(), allAgesDF.cvRisk.describe()

In [None]:
#if some points fall outside the min and max of these edges, qcut will assign them to NaN
#np.arange(increment, 1.0, increment)

In [None]:
#we need to have n+1 points in order to have n intervals
#len(np.arange(increment, 1.0, increment)),len(np.arange(0, 1.0, increment)),len(np.arange(0, 1.0+increment, increment))

In [None]:
nQuantiles = 25
increment = 1.0/nQuantiles
# n+1 evenly spaced probabilities in [0, 1] delimit the n quantile bins.
# linspace is used instead of np.arange(0, 1.0+increment, increment): with a float step arange can
# produce one element too many or a last edge slightly above 1.0 for some nQuantiles, whereas
# linspace always returns exactly nQuantiles+1 edges ending at 1.0.
quantileEdges = np.linspace(0.0, 1.0, nQuantiles + 1)
#qcut will move the left boundary to a value lower than the min, so that the left end (which is open)
#will include the min
allAgesDF['demRiskQuantile'] = pd.qcut(allAgesDF.demRisk, quantileEdges)
allAgesDF['cvRiskQuantile'] = pd.qcut(allAgesDF.cvRisk, quantileEdges)

In [None]:
#allAgesDF.shape

In [None]:
#there should not be any NaNs
allAgesDF['demRiskQuantile'].isna().sum(), allAgesDF['cvRiskQuantile'].isna().sum()

In [None]:
#allAgesDF['demRiskQuantile'].head()

In [None]:
# Mean risk inside every quantile bin. observed=False keeps the historical behaviour of
# returning every category of the categorical grouper.
meanDemRiskForQuantile = allAgesDF.groupby('demRiskQuantile', observed=False)['demRisk'].mean()
# fixed: the CV mean must be computed from the cvRisk column (it previously averaged demRisk)
meanCVRiskForQuantile = allAgesDF.groupby('cvRiskQuantile', observed=False)['cvRisk'].mean()
meanDemRiskForQuantileDict = meanDemRiskForQuantile.to_dict()
meanCVRiskForQuantileDict = meanCVRiskForQuantile.to_dict()

# Quantile number <-> interval lookups.
# The numbering follows the ordered categories produced by qcut, so number 0 is the lowest-risk bin and
# the last number is the highest-risk bin. (value_counts().index is ordered by frequency, and since qcut
# bins hold nearly equal counts that order is essentially arbitrary, which scrambled the quantile axes.)
demQuantileForNumber = dict(enumerate(allAgesDF['demRiskQuantile'].cat.categories))
cvQuantileForNumber = dict(enumerate(allAgesDF['cvRiskQuantile'].cat.categories))
demQuantileNumberForQuantile = {quantile : number for number, quantile in demQuantileForNumber.items()}
cvQuantileNumberForQuantile = {quantile : number for number, quantile in cvQuantileForNumber.items()}

In [None]:
#allAgesDF['demRiskQuantile'].value_counts().index

In [None]:
#meanDemRiskForQuantile, meanDemRiskForQuantileDict, demQuintileNumberForQuantile

In [None]:
# Translate each row's quantile interval into (a) the mean risk of that quantile and (b) the quantile number.
# Every column is assigned from an explicit .map(...) result. The previous pattern,
# allAgesDF[col].replace(..., inplace=True), mutates a temporary column selection; pandas warns about it
# and, with copy-on-write enabled, it leaves allAgesDF unchanged.
allAgesDF['meanDemRiskForQuantile'] = allAgesDF['demRiskQuantile'].map(meanDemRiskForQuantileDict).astype(float)
allAgesDF['meanCVRiskForQuantile'] = allAgesDF['cvRiskQuantile'].map(meanCVRiskForQuantileDict).astype(float)

# Plain integer quantile numbers, because later cells use them as numpy array indices.
# astype(int) raises if a row has no quantile, consistent with the "there should not be any NaNs" check.
allAgesDF['cvQuantileNum'] = allAgesDF['cvRiskQuantile'].map(cvQuantileNumberForQuantile).astype(int)

allAgesDF['demQuantileNum'] = allAgesDF['demRiskQuantile'].map(demQuantileNumberForQuantile).astype(int)

In [None]:
allAgesDF.head()

In [None]:
### next step is to see how this varies by age...
### if we have wide distribution, then we can just go through deciles or something like that for trial sampling!

In [None]:
#allAgesDF.groupby(['cvQuantileNum', 'demQuantileNum'])['age'].count()

In [None]:
# Person-age counts per (cv quantile number, dem quantile number) pair, grouped with sort=True.
# unstack(fill_value=0) followed by stack() makes pairs that never occur appear with a count of 0.
# NOTE: the next cell recomputes countByRisks with sort=False and overwrites this result.
countByRisks = (
    allAgesDF
    .groupby(['cvQuantileNum', 'demQuantileNum'], sort=True)['age']
    .count()
    .unstack(fill_value=0)
    .stack()
)
#countByRisks

In [None]:
# Person-age counts per (cv quantile number, dem quantile number) pair, grouped with sort=False.
# unstack(fill_value=0) followed by stack() makes pairs that never occur appear with a count of 0.
countByRisks = (
    allAgesDF
    .groupby(['cvQuantileNum', 'demQuantileNum'], sort=False)['age']
    .count()
    .unstack(fill_value=0)
    .stack()
)
#countByRisks

In [None]:
# Flat table of person-age counts, one row per (dem quantile number, cv quantile number) pair.
# The index levels are read by name: countByRisks is grouped by ['cvQuantileNum', 'demQuantileNum'],
# so level 0 is the CV quantile and level 1 the dementia quantile. Reading them by position had the
# 'demRisk' and 'cvRisk' columns swapped, which also mislabelled the plots and the exported intervals.
countByRisksDF = pd.DataFrame({'demRisk' : countByRisks.index.get_level_values('demQuantileNum'),
                              'cvRisk' : countByRisks.index.get_level_values('cvQuantileNum'),
                              'ageCount' : countByRisks.values})
countByRisksDF

In [None]:
#used in plots later
# coordinate grids of quantile numbers 0..nQuantiles-1 on both axes, used in plots later
quantileNumbers = np.arange(nQuantiles)
X, Y = np.meshgrid(quantileNumbers, quantileNumbers)
X.shape

In [None]:
#this is done just in case the quantile categories are not ordered properly
# 2D array of counts, indexed as [demRisk number, cvRisk number]; filled cell by cell from the table
countByRisksArray = np.zeros(X.shape)
riskColumns = (countByRisksDF[name].values for name in ("demRisk", "cvRisk", "ageCount"))
for demNumber, cvNumber, personAges in zip(*riskColumns):
    countByRisksArray[demNumber, cvNumber] = personAges
#countByRisksArray

In [None]:
# sum of all person-ages over every quantile pair
ageCountSum = countByRisksDF["ageCount"].sum()
# number of person-ages one quantile-quantile (qq) category would hold under a uniform distribution
ageCountSumPerQbox = ageCountSum / nQuantiles / nQuantiles
ageCountSum, ageCountSumPerQbox

In [None]:
# fraction of the uniform-distribution count a qq category must exceed in order to be kept
percentCutoff = 1.0
cutoff = percentCutoff * ageCountSumPerQbox
cutoff

In [None]:
# Keep the qq categories whose person-age count exceeds the cutoff.
# .copy() makes this an independent frame: a later cell adds interval columns to it, and assigning
# columns to a filtered selection of countByRisksDF triggers pandas' SettingWithCopyWarning.
qqcategoriesToInclude = countByRisksDF.loc[countByRisksDF["ageCount"] > cutoff].copy()
#qqcategoriesToInclude, 
qqcategoriesToInclude["ageCount"].sum()/ageCountSum #percent of person-ages included

In [None]:
#if demRisk and cvRisk are not ordered correctly, this does not work...
#countByRisksDF['ageCount'].to_numpy().reshape(nQuantiles-0, nQuantiles-0) >5000 

In [None]:
fig, ax = plt.subplots()

# heat map of the person-age counts: rows of countByRisksArray run along y, columns along x
mesh = ax.pcolormesh(X, Y, countByRisksArray, shading='nearest')
ax.set_title('dem risk quantiles vs. cv risk quantiles')
ax.set_xlabel("cv risk quantile")
ax.set_ylabel("dem risk quantile")
# restrict both axes to the range of quantile numbers present in the data
lastQuantile = nQuantiles - 1
ax.axis([0, lastQuantile, 0, lastQuantile])
fig.colorbar(mesh, ax=ax)

plt.show()

In [None]:
# decide on which quantiles to keep
# mask of qq categories to keep: 1 where the category passed the cutoff, 0 everywhere else
included = np.zeros(X.shape)

keptPairs = zip(qqcategoriesToInclude["demRisk"], qqcategoriesToInclude["cvRisk"])
for demNumber, cvNumber in keptPairs:
    included[demNumber, cvNumber] = 1
    
#included

In [None]:
fig, ax = plt.subplots()

# heat map of the keep/drop mask: rows of included run along y, columns along x
mesh = ax.pcolormesh(X, Y, included, shading='nearest')
ax.set_title('1.0 for categories to keep, 0 otherwise')
ax.set_xlabel("cv risk quantile")
ax.set_ylabel("dem risk quantile")
# restrict both axes to the range of quantile numbers present in the data
lastQuantile = nQuantiles - 1
ax.axis([0, lastQuantile, 0, lastQuantile])
fig.colorbar(mesh, ax=ax)

plt.show()

In [None]:
from matplotlib import cm
from mpl_toolkits.mplot3d import Axes3D

In [None]:
# NOTE(review): `fig` here is the figure left over from the previous plotting cell, and `ax` is rebound
# by the plt.subplots(...) call in the next cell, so this axes object looks unused -- confirm before removing.
ax = Axes3D(fig)

In [None]:
# 3D surface of the person-age counts over the quantile-number grid
fig, ax = plt.subplots(subplot_kw=dict(projection="3d"))
surf = ax.plot_surface(
    X, Y, countByRisksArray,
    cmap=cm.coolwarm,
    linewidth=0,
    antialiased=False,
)
fig.colorbar(surf, shrink=0.5, aspect=5)
plt.show()

In [None]:
from matplotlib.colors import LightSource

In [None]:
fig, ax = plt.subplots(subplot_kw={"projection": "3d"})

# Custom hillshading: compute the shaded face colours ourselves with LightSource.shade and hand them
# to plot_surface, with its built-in shading switched off.
lightSource = LightSource(270, 45)
shadedColors = lightSource.shade(countByRisksArray, cmap=cm.gist_earth, vert_exag=0.1, blend_mode='soft')
surf = ax.plot_surface(
    X, Y, countByRisksArray,
    rstride=1, cstride=1,
    facecolors=shadedColors,
    linewidth=0, antialiased=False, shade=False,
)

plt.show()

In [None]:
# Risk value at each quantile boundary (nQuantiles+1 probabilities from 0 to 1), exported as thresholds.
# linspace replaces np.arange(0, 1+increment, increment): with a float step arange can overshoot 1.0 or
# return one element too many for some nQuantiles, and Series.quantile rejects probabilities above 1.
quantileLevels = np.linspace(0.0, 1.0, nQuantiles + 1)
demQuantiles = allAgesDF.demRisk.quantile(quantileLevels)
cvQuantiles = allAgesDF.cvRisk.quantile(quantileLevels)
quantileDF = pd.DataFrame(index=demQuantiles.index, data={'dementia' : demQuantiles.values, 'cv' : cvQuantiles.values})
# NOTE(review): the file name says "Quintile" although nQuantiles bins are used; it is left unchanged
# because other code may read this exact path.
quantileDF.to_csv("~/Desktop/dementiaAndCVQuintileThresholds.csv")
quantileDF

In [None]:
allAgesDF.demRisk.describe(), allAgesDF.cvRisk.describe()

In [None]:
# attach the risk interval that each quantile number stands for; an unknown number raises KeyError
qqcategoriesToInclude["demRiskInterval"] = qqcategoriesToInclude["demRisk"].apply(demQuantileForNumber.__getitem__)
qqcategoriesToInclude["cvRiskInterval"] = qqcategoriesToInclude["cvRisk"].apply(cvQuantileForNumber.__getitem__)
#countByRisks
#allAgesDF.sample(50)

In [None]:
qqcategoriesToInclude

In [None]:
qqcategoriesToInclude.to_csv("~/Desktop/qqcategoriesToInclude.csv")