In [12]:
#!/usr/bin/python
%matplotlib inline

import numpy as np
import pandas as pd


In [13]:
#read in the data
df=pd.read_csv('data/tOPH_OPHX_longitudinal1_subset_05may2022-dbrclean.csv')#,index=False)


Index(['source', 'enrollcohort', 'master_ptid', 'enrolldt', 'infdob', 'infsex',
       'haart_start_sregimen', 'haart_start_date', 'visitdate', 'vload',
       'cd4percent', 'cd4count', 'total_hivdna_persist2020',
       'total_hivdnat_persist2020', 'intact_hivdna_persist2020',
       'intact_hivdnat_persist2020', 'visitcode', 'oph03infptid',
       'ophxinfptid', 'ophx_cohort', 'month_persist2020',
       'sampleid_persist2020', 'short_id_persist2020', 'inf_id_persist2020',
       'ophxinfptid_persist2020', 'type_persist2020', 'type_code_persist2020',
       'pct_sheared_persist2020', 'dsi_persist2020', 'dna_sheared_persist2020',
       'tot_pbmcs_persist2020', 'tot_tcells_persist2020',
       'intact_neg_persist2020', 'min_pbmcs_persist2020',
       'hiv_reps_persist2020', 'rpp30_reps_persist2020',
       'indata_reslist_persist2020', 'pbmc_ul_persist2020',
       'tcell_ul_persist2020', 'sample_avail_persist2020',
       'hivdna_nodata_persist2020', 'indata_padlist_persist2020',
   

In [14]:
#drop all empty rows
df = df[~np.isnan(df['vload'])]
df = df[pd.notnull(df['haart_start_date'])]


In [15]:
#convert dates to times
tdf1 = pd.to_datetime(df['visitdate'])
tdf2 = pd.to_datetime(df['haart_start_date'])
tdf3 = pd.to_datetime(df['infdob'])

#track the length of ART as well as age at each visit
ARTl=[];agel=[]
for i in range(len(df)):

    tdel1=tdf1.iloc[i]-tdf2.iloc[i]
    tdel2=tdf1.iloc[i]-tdf3.iloc[i]
    
    ARTl.append(tdel1.days/365*12)
    agel.append(tdel2.days/365)
    
df['ART_mo'] = ARTl #ART duration in mo
df['age_yr'] = agel #age in years

In [16]:
#simplify df and drop some columns
simplerdf=df[['master_ptid','enrollcohort','infsex','haart_start_sregimen','ART_mo','age_yr',       
       'vload','cd4count','total_hivdna_persist2020', 'intact_hivdna_persist2020','tot_tcells_persist2020']].copy()

#deal with censoring of both intacts and viral load
cens_l=[]
cens_Vl=[]
corr_l=[]
shortid_l=[]
coh_l=[]
for i in range(len(simplerdf)):
    
    tdf=simplerdf.iloc[i]
    
    #zero observed intacts
    if tdf['intact_hivdna_persist2020']==0:
        corr_l.append(1/tdf['tot_tcells_persist2020']*1e6/2)
        cens_l.append(1)
    else:
        cens_l.append(0)
        corr_l.append(tdf['intact_hivdna_persist2020'])

    #viral load below LOD
    if tdf['vload']<100:
        cens_Vl.append(1)
    else:
        cens_Vl.append(0)

    shortid=str(tdf['master_ptid'])[4:7]
    shortid_l.append(shortid)
    
    if 'OPH' in tdf['enrollcohort']:
        coh_l.append('OPH')
    else:
        coh_l.append('PAD')

simplerdf['cens_vl']=cens_Vl
simplerdf['cens_intact']=cens_l
simplerdf['intact_corrected']=corr_l
simplerdf['short_id']=shortid_l
simplerdf['co']=coh_l

#add some columns
simplerdf['log10VL']=np.log10(simplerdf['vload'])
simplerdf['log10tot']=np.log10(simplerdf['total_hivdna_persist2020'])
simplerdf['log10int']=np.log10(simplerdf['intact_corrected'])
simplerdf['log10def']=np.log10(simplerdf['total_hivdna_persist2020']-simplerdf['intact_corrected'])

simplerdf.head()

Unnamed: 0,master_ptid,enrollcohort,infsex,haart_start_sregimen,ART_mo,age_yr,vload,cd4count,total_hivdna_persist2020,intact_hivdna_persist2020,tot_tcells_persist2020,cens_vl,cens_intact,intact_corrected,short_id,co,log10VL,log10tot,log10int,log10def
4,311000119,0: OPH03,Female,3TC;AZT;NVP,1.380822,0.523288,12435.0,,2982.4739,1233.7377,301640.96,0,0,1233.7377,1,OPH,4.094646,3.474577,3.091223,3.242724
7,311000119,0: OPH03,Female,3TC;AZT;NVP,3.320548,0.684932,540.0,,,,,0,0,,1,OPH,2.732394,,,
10,311000119,0: OPH03,Female,3TC;AZT;NVP,5.523288,0.868493,430.0,1741.0,1433.6752,496.25262,167321.39,0,0,496.25262,1,OPH,2.633468,3.156451,2.695703,2.971935
13,311000119,0: OPH03,Female,3TC;AZT;NVP,8.284932,1.09863,435.0,,,,,0,0,,1,OPH,2.638489,,,
19,311000119,0: OPH03,Female,3TC;AZT;NVP,13.873973,1.564384,465.0,,,,,0,0,,1,OPH,2.667453,,,


In [8]:
#only choose OPH!

simplerdf=simplerdf[simplerdf['co']=='OPH']

In [9]:
simplerdf.to_csv('data/clean1-out.csv')

In [10]:
#Numbers

N0 = len(simplerdf['master_ptid'].unique())
OPHids = simplerdf[simplerdf['co']=='OPH']['master_ptid'].unique()
Noph = len(OPHids)

print('total ppts=',N0,', cohorts=',len(simplerdf['enrollcohort'].unique()),', OPH ppts=',Noph)

total ppts= 120 , cohorts= 3 , OPH ppts= 120


In [11]:
#output data for monolix

monolixdf = simplerdf[simplerdf['co']=='OPH'][['enrollcohort', 'infsex', 'haart_start_sregimen',
       'ART_mo', 'age_yr', 'cens_intact', 'short_id', 'vload',
       'log10int', 'log10def']] #pick some columns and restrict to OPH

#rename to help autoload
monolixdf = monolixdf.rename(columns={'short_id':'id', 
                                      'ART_mo':'time_months_ART', 
                                      'log10int':'obs_intact', 
                                      'log10def':'obs_defective', 
                                      'vload':'regressor_V'})

monolixdf[monolixdf['regressor_V']<100]['regressor_V']=0

monolixdf.to_csv('data/clean1-out-formonolix.csv')

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  monolixdf[monolixdf['regressor_V']<100]['regressor_V']=0
