In [1]:
#!/usr/bin/python
%matplotlib inline

import numpy as np
import pandas as pd


In [2]:
#read in data from first cleaning file

df=pd.read_csv('data/clean1-out.csv')#,index=False)

df.columns

Index(['Unnamed: 0', 'master_ptid', 'enrollcohort', 'infsex',
       'haart_start_sregimen', 'ART_mo', 'age_yr', 'vload', 'cd4count',
       'total_hivdna_persist2020', 'intact_hivdna_persist2020',
       'tot_tcells_persist2020', 'cens_vl', 'cens_intact', 'intact_corrected',
       'short_id', 'co', 'log10VL', 'log10tot', 'log10int', 'log10def'],
      dtype='object')

In [3]:
## FIXED parameters

#thresholds VLs (I varied these for sensitivity analysis)
threshold_name=''
suppressVL_threshold = np.log10(1e3) #has to go down below this by 1yr
blipVL_threshold = np.log10(3e3)     #can't go back up above this (used 3e3 before)
suppressmo_threshold = 12            #suppress within 1 year
minn=0                               #the required number of intact time points? can be zero

#which cohorts?
cnames = ['0: OPH03', '1: OPH612-->OPH03', '2: IHE, not OPH612', '8: PAD-INK', '10: PAD only']

#df = df[df['enrollcohort']=='0: OPH03'] #restrict to OPH


In [4]:
#Number of OPH kids in whole analysis

N0 = len(df['master_ptid'].unique())
OPHids = df[df['co']=='OPH']['master_ptid'].unique()
Noph = len(OPHids)

print('total ppts=',N0,', cohorts=',len(df['enrollcohort'].unique()),', OPH ppts=',Noph)

total ppts= 120 , cohorts= 3 , OPH ppts= 120


In [5]:
#add easier to use variables

#change sex to 0 and 1 (male is 1)
f0m1 = np.zeros(len(df))
f0m1[np.where(df['infsex']=='Male')]=1
df['f/m']=f0m1

#numeral cohorts
enco = np.zeros(len(df))
for ico,co in enumerate(df['enrollcohort'].unique()):
    enco[np.where(df['enrollcohort']==co)]=ico
df['cohort']=enco

#numeral ART
ARTreg = np.zeros(len(df))
for iA,A in enumerate(df['haart_start_sregimen'].unique()):
    ARTreg[np.where(df['haart_start_sregimen']==A)]=iA
df['ARTregimen']=ARTreg
regimens = df['haart_start_sregimen'].unique() #the key

#get ratio
df['ratio']=df['log10int']-df['log10def']

#age when ART started
agel=[]
for p in df['master_ptid'].unique():
    tdf=df[df['master_ptid']==p]
    for i in range(len(tdf)):
        agel.append(tdf['age_yr'].min()*12)
df['agemo_at_ART']=agel

df.columns

Index(['Unnamed: 0', 'master_ptid', 'enrollcohort', 'infsex',
       'haart_start_sregimen', 'ART_mo', 'age_yr', 'vload', 'cd4count',
       'total_hivdna_persist2020', 'intact_hivdna_persist2020',
       'tot_tcells_persist2020', 'cens_vl', 'cens_intact', 'intact_corrected',
       'short_id', 'co', 'log10VL', 'log10tot', 'log10int', 'log10def', 'f/m',
       'cohort', 'ARTregimen', 'ratio', 'agemo_at_ART'],
      dtype='object')

In [6]:
#function to figure out when ppts are suppressed
#they need to suppress and if they blip we stop following
#if suppression happens late we can deal with that later using suppress mo

def get_suppress_times(tdf):
    tt=tdf['ART_mo']
    vl=tdf['log10VL']

    #look for first suppressed time point, then if goes back above, stop
    supp_mo=0; blip_mo=0; blip_VL=0
    supp=0 #set non
    suppress_index_l=[]
    for i in range(len(tdf)):
        #print(tdf['ART_mo'].iloc[i],tdf['log10VL'].iloc[i])
        vli = vl.iloc[i]
        ti=tt.iloc[i]
        
        if supp==0:
            if vli<suppressVL_threshold: #1000 threshold?
                supp=1
                suppress_index_l.append(i)
                supp_mo=ti
        elif supp==1:
            if vli<blipVL_threshold: #second threshold? usually set 3000 to allow for some small blips
                suppress_index_l.append(i)
            else:
                blip_mo=ti
                supp=2
                blip_VL=vli
    
    return supp_mo,blip_mo,blip_VL,suppress_index_l


In [7]:
#make a data frame with time, VL, cd4, intact, defective
#selecting only CWH who suppress VL by 1yr?
#where time ranges from 0 (note some negatives need to be dropped, or could impute t=0 from set point)
#up to the time of any blips

#now make the down selected list of ppts with a couple extra rules about reservoir measurements

all_suppress_df = pd.DataFrame() #bigdf for output

good_l=[] #list for people who achieve suppression

nl=[]
print('pid','n_int')
for p in OPHids:
    tdf=df[df['master_ptid']==p]
            
    supp_mo,blip_mo,blip_VL,suppress_index_l = get_suppress_times(tdf) #use function, if there are any!
        
    #plt.plot(tdf['ART_mo'],tdf['log10VL'],color='gray',alpha=0.1)#additional rules?
    
    #make sure there are any at all suppressed time points
    if suppress_index_l:
        if supp_mo<suppressmo_threshold: #needs to suppress within a year
            
            suppress_df = tdf.iloc[:np.max(suppress_index_l)].copy() #subset until
            
            suppress_df['suppress_mo']=np.ones(len(suppress_df))*supp_mo
            suppress_df['blip_mo']=np.ones(len(suppress_df))*blip_mo
            
            suppress_df = suppress_df.drop(axis=1,labels=['vload', 'total_hivdna_persist2020', 
                            'intact_hivdna_persist2020', 'tot_tcells_persist2020',])            
                
            stdf=suppress_df[['log10int','cens_intact']].dropna()

            print(p,len(stdf))
            #Must be more than NI intact measurements?
            if (len(stdf)>minn):
                #if (stdf['cens_intact']<1).any(): #make sure there are some that wasn't censored?
                nl.append(len(stdf))
                all_suppress_df = all_suppress_df._append(suppress_df)
                good_l.append(p)
                
Ng=len(good_l)


pid n_int
311000119 3
311000219 0
311000319 4
311000419 5
311000519 2
311000719 2
311001019 3
311001219 4
311001919 3
311002019 1
311002219 3
311002719 5
311002819 3
311002919 7
311003119 4
311003419 3
311003519 4
311003819 0
311004619 5
311004719 4
311004919 9
311005019 6
311005419 0
311005519 5
311005719 7
311006019 5
311006219 3
311006619 3
311006719 4
311006819 0
311007319 8
311007419 1
311007519 1
311007619 4
311007819 10
311008019 2
311008319 5
311008819 3
311009619 0
311009719 6
311009819 4
311010119 1
311010219 1
311010419 0
311010519 1
311010619 0
311010819 2
311010919 2
311011019 2
311011419 0
311011519 0
311011619 5
311011719 1
311011819 0
311011919 0
311012019 3
311012219 3
311012419 4
311012519 2
311012619 3
311012819 0
311012919 4
311013019 1
311013119 0
311013219 1
311013319 4
311013419 5
311013519 0
311013719 0
311013919 3


In [8]:
print('ppts=',Ng,'cohorts=',all_suppress_df['enrollcohort'].unique())

ppts= 55 cohorts= ['0: OPH03' '1: OPH612-->OPH03' '2: IHE, not OPH612']


In [10]:
#couple quick fixes to make analysis easier

all_suppress_df['log10CD4']=np.log10(all_suppress_df['cd4count'])

wideout = all_suppress_df[['short_id','agemo_at_ART','f/m','ARTregimen', 'ART_mo',
                         'log10VL', 'cens_vl' ,'log10CD4', 'log10int', 'cens_intact', 'log10def', 'ratio']]


wideout = wideout.rename(columns={'ART_mo': 'time_ART_mo'})
wideout = wideout.rename(columns={'ratio': 'log10ratio'})

## PRINT OUT CSV

wideout.to_csv('data/wideout'+threshold_name+'.csv')

wideout[wideout['time_ART_mo']<12].to_csv('data/wideout-pre1yr'+threshold_name+'.csv')
wideout[wideout['time_ART_mo']>12].to_csv('data/wideout-post1yr'+threshold_name+'.csv')