This code is used to combine all clinical data for PPMI cohort (ignore GWAS data availability). It does not impute missing values, and will normalize values, and include baseline scores. Most code is copied from ppmi_manqi/preprocessed_code/combine_all_into_one.py. The results are used in RNAseq analysis planB step1: y~ beta1 * time+covariate(age, baseline, med, ..) (no GWAS) and step2: gene ~ beta2 * time+covariate

Results are: 
1) planB_clinical.csv. Normalized and unimputed clinical data for all PPMI samples (ignore GWAS data availability) that are PD, >1vist & >0.5yr gap, has RNA data
2) planB_meta.csv. Meta data with normalized ledd, pd_duration_time for PD RNAseq samples
3) planB_count.csv. Count data for PD.

In [1]:
import pandas as pd
import numpy as np
from extraction_function import extract_primary_diag
from cal_additional_scores import *
import scipy.stats as stats
from collections import Counter

In [2]:
#### data path
ppmi_data_path='../../ppmi_manqi/preprocessed_data/'

In [3]:
all_clinical_file=ppmi_data_path+'longitudinal_all_samples_remove_dupDates.csv'
all_biospecimen_file=ppmi_data_path+'biospecimen_all_samples.csv'
all_static_file=ppmi_data_path+'static_all_samples.csv'
patient_visit_file=ppmi_data_path+'patient_event_date.csv'
ledd_file=ppmi_data_path+'ledd.csv'

patient_visit_df=pd.read_csv(patient_visit_file)
ledd_df=pd.read_csv(ledd_file)
clinical_df=pd.read_csv(all_clinical_file)
biospecimen_df=pd.read_csv(all_biospecimen_file)

static_df=pd.read_csv(all_static_file)
diag_df=extract_primary_diag(path='../../ppmi_manqi/ppmi_data/')

visit_list = ['BL'] + ["V%02d" % i for i in range(1, 21)]

  clinical_df=pd.read_csv(all_clinical_file)


In [4]:
############# filter on static
### select static columns
static_keep_dict={'PATNO':'PATNO',
             'SXDT':'symptom_date','PDDXDT':'diagnosis_date',
             'ANYFAMPD':'is_family_pd','COHORT':'COHORT',
             'ENROLL_DATE':'ENROLL_DATE','ENROLL_STATUS':'ENROLL_STATUS','ENROLL_AGE':'ENROLL_AGE',
             'BIRTHDT':'BIRTHDT','SEX':'SEX','HANDED':'HANDED','EDUCYRS':'EDUCYRS', 'RAWHITE':'is_white'}
static_df=static_df[list(static_keep_dict.keys())]
static_df.rename(columns=static_keep_dict, inplace=True)
print('original data: '+str(static_df.shape[0])+' samples')
## remove withdraw patients
static_df=static_df[static_df.ENROLL_STATUS.isin(['Enrolled','Complete','Screened','Baseline'])]#2381
static_df.drop(columns=['ENROLL_STATUS'],inplace=True)
print('filter on enroll status: '+str(static_df.shape[0])+' samples')
## use PD samples only
static_df=static_df[static_df.COHORT==1]
print('keep PD cohort only: '+str(static_df.shape[0])+' samples')
## use PD diagnosis only
diag1=diag_df.loc[diag_df.PATNO.isin(static_df.PATNO)]
id1=list(set(list(diag1.loc[diag1.PRIMDIAG==1]['PATNO'])))#ids whose diagnosis is PD
static_df=static_df.loc[static_df.PATNO.isin(id1)]
print('keep PD diagnosis only: '+str(static_df.shape[0])+' samples')


original data: 3390 samples
filter on enroll status: 2381 samples
keep PD cohort only: 1034 samples
keep PD diagnosis only: 982 samples


In [5]:
## no need to filter sample ids
df1 = static_df
sample_ids=list(df1.PATNO)

In [6]:
############ Merge longitudinal
patient_visit_df=patient_visit_df.loc[patient_visit_df.EVENT_ID.isin(visit_list)]
patient_visit_df=patient_visit_df.loc[patient_visit_df.PATNO.isin(sample_ids)]

ledd_df=ledd_df.loc[ledd_df.EVENT_ID.isin(visit_list)]
ledd_df=ledd_df.loc[ledd_df.PATNO.isin(sample_ids)]

biospecimen_df=biospecimen_df.loc[biospecimen_df.EVENT_ID.isin(visit_list)]
biospecimen_df=biospecimen_df.loc[biospecimen_df.PATNO.isin(sample_ids)]

clinical_df=clinical_df.loc[clinical_df.PATNO.isin(sample_ids)]
clinical_df=clinical_df.loc[clinical_df.EVENT_ID.isin(visit_list)]

In [7]:
### Impute moca BL value
# Almost all moca are NA at BL, which makes further normalization infeasible. Here, we imput moca BL by moca SC 
all_clinical_file0=ppmi_data_path+'longitudinal_all_samples.csv'#data contains SC
all_clinical_df=pd.read_csv(all_clinical_file0)
all_clinical_df=all_clinical_df.loc[all_clinical_df.PATNO.isin(sample_ids)]
moca_sc_df=all_clinical_df.loc[all_clinical_df.EVENT_ID=='SC'][['PATNO','EVENT_ID','INFODT','MCATOT']]#moca SC df

moca_noSC_df=clinical_df[['PATNO','EVENT_ID','INFODT','MCATOT']]#non SC moca df
moca_sub_df=pd.concat([moca_noSC_df,moca_sc_df])# all moca df

# impute moca BL by SC
for index, row in moca_noSC_df.iterrows():
    patno = row['PATNO']
    event = row['EVENT_ID']
    if event == 'BL':
        if pd.isnull(row['MCATOT']):  # is BL is na
            ## impute by SC
            # bl_date = row['INFODT']
            one_sample = moca_sub_df.loc[moca_sub_df.PATNO == patno]
            if 'SC' in list(one_sample.EVENT_ID):#if has SC
                imputed_values = one_sample.loc[one_sample.EVENT_ID=='SC']['MCATOT']
                imputed_values.dropna(inplace=True)#remove na
                imputed_values = list(set(imputed_values))#remove duplicate
                if len(imputed_values)==1:
                    imputed_value=imputed_values[0]
                elif len(imputed_values)>1:
                    print('multiple value in SC')
                    imputed_value = np.nan
                else:
                    imputed_value = np.nan
            else:
                imputed_value=np.nan
            moca_noSC_df.at[index, 'MCATOT'] = imputed_value
        else:
            pass  # no need to impute
    else:
        pass

clinical_df['MCATOT_imputeBL']=moca_noSC_df['MCATOT']

  all_clinical_df=pd.read_csv(all_clinical_file0)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  imputed_values.dropna(inplace=True)#remove na


In [8]:
### calculate additional scores
## calculate tremor/pigd
tremor_items_columns=['NP2TRMR','NP3PTRMR','NP3PTRML','NP3KTRMR','NP3KTRML','NP3RTARU','NP3RTALU','NP3RTARL','NP3RTALL','NP3RTALJ','NP3RTCON']
pigd_items_columns=['NP2WALK','NP2FREZ','NP3GAIT','NP3FRZGT','NP3PSTBL']
tremor_scores,pigd_scores,ratio,phenotype=cal_tremor(clinical_df,tremor_items_columns,pigd_items_columns)
# append to df
clinical_df['tremor_scores']=tremor_scores
clinical_df['pigd_scores']=pigd_scores

## calculate GCO
gco_items=['NP1RTOT','NP1PTOT','NP2PTOT','NP3TOT','MSEADLG','MCATOT']
gco=cal_gco(clinical_df,gco_items)
clinical_df['gco']=gco

### select clinical total scores
clinical_df['updrs1']=clinical_df['NP1RTOT']+clinical_df['NP1PTOT']
clinical_keep_dict={'NP2PTOT':'updrs2','NP3TOT':'updrs3','NP4TOT':'updrs4','MSEADLG':'schwab',
                    'tremor_scores':'tremor_scores','pigd_scores':'pigd_scores',#'tremor_pigd_category':'tremor_pigd_category',
                    'MCATOT_imputeBL':'moca','JLO_TOTCALC':'benton','DVT_DELAYED_RECALL':'hvlt_delayed_recall','DVT_RECOG_DISC_INDEX':'hvlt_recog_disc_index',
                    'DVT_RETENTION':'hvlt_retention','DVT_TOTAL_RECALL':'hvlt_total_recall','DVS_LNS':'lns','DVT_SFTANIM':'semantic_fluency',
                    'DVT_SDM':'symbol_digit','geriatric_total':'gds','stai_total':'stai','scopa_total':'scopa','rem_total':'rem',
                    'ess_total':'ess','quip_total':'quip','updrs1':'updrs1','gco':'gco'}#'TOTAL_CORRECT':'upsit',
clinical_df=clinical_df[['PATNO','EVENT_ID','INFODT']+list(clinical_keep_dict.keys())]
clinical_df.rename(columns=clinical_keep_dict, inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_sub[c + '_impute'] = new
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_sub[c + '_impute'] = new
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_sub[c + '_impute'] = new
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instea

In [9]:
### select biomarker
biospecimen_df=biospecimen_df[['PATNO','EVENT_ID']+['alpha_syn', 'total_tau', 'abeta_42', 'p_tau181p']]

### merge longitudinal
pd_new = patient_visit_df.merge(clinical_df, on=['PATNO','EVENT_ID','INFODT'], how='outer')
pd_new = pd_new.merge(ledd_df, on=['PATNO','EVENT_ID','INFODT'], how='outer')
pd_new = pd_new.merge(biospecimen_df, on=['PATNO','EVENT_ID'], how='outer')

## remove INFODT is null
df2 = pd_new.loc[pd_new['INFODT'].notna()]
print('remove visits without date:'+str(len(set(df2.PATNO))))
df2.set_index(['PATNO','EVENT_ID','INFODT'], inplace=True)

remove visits without date:982


In [10]:
###### normalize
### 1. convert raw scores to the percentage of the maximum for that scale
### 2. normalized by population baseline mean and std

## step 1: scale
# trait maximum value
col_max=df2.max(axis=0)
# reverse moca and semantic_fluency
df2['moca']=30-df2['moca']
df2['semantic_fluency']=col_max['semantic_fluency']-df2['semantic_fluency']
df2_impute=df2/col_max
## step2: normalize
df2_impute_bl=df2_impute.loc[(slice(None), 'BL'), :]
bl_mean=df2_impute_bl.mean(axis=0)
bl_std=df2_impute_bl.std(axis=0)
df2_norm=(df2_impute-bl_mean)/bl_std
df2_norm=df2_norm.reset_index()

###### Merge static and longitudinal
df=df2_norm.merge(df1,on=['PATNO'],how='inner')

###### add additional covariated and normalize by z-score
## PD_duration, visit-diagnosis_date, month
pd_duration_days=pd.to_datetime(df.INFODT,format='%m/%Y')-pd.to_datetime(df.diagnosis_date,format='%m/%Y')
pd_duration = stats.zscore(pd_duration_days/np.timedelta64(1, 'M'))
pd_duration_norm = list(pd_duration)
df['pd_duration_norm']=pd_duration_norm

## PD symptom duration, visit-symptom_date, month
pd_duration_days=pd.to_datetime(df.INFODT,format='%m/%Y')-pd.to_datetime(df.symptom_date,format='%m/%Y')
symtom_duration=list(stats.zscore(pd_duration_days/np.timedelta64(1, 'M')))
df['symtom_duration_norm']=symtom_duration

## time, visit-baseline, year
time=[]
for i in range(df.shape[0]):
    patno=df.iloc[i,:]['PATNO']
    event=df.iloc[i,:]['EVENT_ID']
    date=df.iloc[i,:]['INFODT']
    bl=df.loc[(df.PATNO==patno) & (df.EVENT_ID=='BL')]['INFODT']
    diff=pd.to_datetime(date,format='%m/%Y')-pd.to_datetime(bl,format='%m/%Y')#event date - baseline date
    diff=diff/np.timedelta64(1, 'Y')
    time.append(diff.values[0])
df['time']=time

#age at onset, diagnosis date - birthdate, year, and z-score normalize
age_days=pd.to_datetime(df.diagnosis_date,format='%m/%Y')-pd.to_datetime(df.BIRTHDT,format='%m/%Y')
age=list(stats.zscore(age_days/np.timedelta64(1, 'Y')))
df['age_onset_norm']=age

# norm LEDD
ledd_norm=list(stats.zscore(df.LEDD))
df['ledd_norm']=ledd_norm



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df2['moca']=30-df2['moca']
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df2['semantic_fluency']=col_max['semantic_fluency']-df2['semantic_fluency']


In [11]:
print(df.columns)

Index(['PATNO', 'EVENT_ID', 'INFODT', 'updrs2', 'updrs3', 'updrs4', 'schwab',
       'tremor_scores', 'pigd_scores', 'moca', 'benton', 'hvlt_delayed_recall',
       'hvlt_recog_disc_index', 'hvlt_retention', 'hvlt_total_recall', 'lns',
       'semantic_fluency', 'symbol_digit', 'gds', 'stai', 'scopa', 'rem',
       'ess', 'quip', 'updrs1', 'gco', 'LEDD', 'alpha_syn', 'total_tau',
       'abeta_42', 'p_tau181p', 'symptom_date', 'diagnosis_date',
       'is_family_pd', 'COHORT', 'ENROLL_DATE', 'ENROLL_AGE', 'BIRTHDT', 'SEX',
       'HANDED', 'EDUCYRS', 'is_white', 'pd_duration_norm',
       'symtom_duration_norm', 'time', 'age_onset_norm', 'ledd_norm'],
      dtype='object')


In [12]:
## append baseline values as a new column
value_columns=['updrs2', 'updrs3', 'updrs4', 'schwab',
       'tremor_scores', 'pigd_scores', 'moca', 'benton', 'hvlt_delayed_recall',
       'hvlt_recog_disc_index', 'hvlt_retention', 'hvlt_total_recall', 'lns',
       'semantic_fluency', 'symbol_digit', 'gds', 'stai', 'scopa', 'rem',
       'ess', 'quip', 'updrs1', 'gco', 'LEDD', 'alpha_syn', 'total_tau','abeta_42', 'p_tau181p']#trait columns
for value_col in value_columns:
    bl_df = df[df['EVENT_ID'] == 'BL'][['PATNO', value_col]].rename(columns={value_col: f'{value_col}.BL'})
    df = df.merge(bl_df, on='PATNO', how='left')

In [17]:
df_sub=df.loc[df['time']<3.5]#select data within 3 yr follow-ups (to align with RNAseq data time)
filtered_df = df_sub.groupby('PATNO').filter(lambda x: (x['time'].size > 1) & (x['time'].max() > 0.5))#remove if only has 1 visit or visit gap < 0.5 yr

In [62]:
## read meta
meta_df=pd.read_csv('../data/meta.csv')
meta_df=meta_df.loc[meta_df['Group']=='PD']#select PD only
sample_ids=[int(i.split('-')[1]) for i in meta_df.PID]
print(meta_df.shape)
# select patients have RNAseq
filtered_df_sub=filtered_df.loc[filtered_df['PATNO'].isin(sample_ids)]

## append static value to meta
append_meta_static=filtered_df_sub[['PATNO','is_family_pd','age_onset_norm']]#columns that append to meta
append_meta_static['PID']=['PP-'+str(i) for i in append_meta_static['PATNO']]#reformat patient id to the meta_df format
append_meta_static.drop_duplicates(inplace=True,ignore_index=True)

meta_df=pd.merge(meta_df,append_meta_static,on=['PID'],how='left')
print(meta_df.shape)

(1695, 7)
(1695, 10)


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  append_meta_static['PID']=['PP-'+str(i) for i in append_meta_static['PATNO']]#reformat patient id to the meta_df format
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  append_meta_static.drop_duplicates(inplace=True,ignore_index=True)


In [63]:
def get_visit_month(df):
    time=[]
    for i in range(df.shape[0]):
        patno=df.iloc[i,:]['PATNO']
        date=df.iloc[i,:]['INFODT']
        bl=df.loc[(df.PATNO==patno) & (df.EVENT_ID=='BL')]['INFODT']
        if len(bl)>0:# if has BL. Some only have SC
            bl=bl.item()
            diff=int(date.split('/')[0])-int(bl.split('/')[0])+(int(date.split('/')[1])-int(bl.split('/')[1]))*12#event date - baseline date in month
            time.append(diff)
        else:
            time.append(None)
    df['visit_month']=time
    return(df)

In [64]:
## append longitudinal value to meta
append_meta_longi=filtered_df_sub[['PATNO','INFODT','EVENT_ID','pd_duration_norm','ledd_norm']]
append_meta_longi=get_visit_month(append_meta_longi)
append_meta_longi.drop_duplicates(inplace=True,ignore_index=True)
# create ind to merge
append_meta_longi['ind']=['PP-'+str(append_meta_longi.iloc[i,:].PATNO)+'|'+str(int(append_meta_longi.iloc[i,:].visit_month)) for i in range(append_meta_longi.shape[0])]
meta_df['ind']=[meta_df.iloc[i,:].PID+'|'+str(int(meta_df.iloc[i,:].Timepoint)) for i in range(meta_df.shape[0])]

meta_df=pd.merge(meta_df,append_meta_longi[['ind','pd_duration_norm','ledd_norm']],on=['ind'],how='left')
meta_df.drop_duplicates(inplace=True,ignore_index=True)#3791's BL and V01 are the same date so there is one duplicate
meta_df.drop(columns=['ind','PATNO'],inplace=True)
print(meta_df.shape)


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['visit_month']=time
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  append_meta_longi.drop_duplicates(inplace=True,ignore_index=True)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  append_meta_longi['ind']=['PP-'+str(append_meta_longi.iloc[i,:].PATNO)+'|'+str(int(append_meta_longi.iloc[i,:].visit_month)) for i in range(append_meta_longi.shape[0])]


(1695, 11)


In [67]:
#### select a subset of RNA count 
count=pd.read_csv('../data/data.csv',index_col=0)
sub_count=count[[i for i in list(count.columns) if i in list(meta_df['ID'])]]# select samples that have meta data
### select transcript of protein-coding genes only
protein_coding_genes = pd.read_csv("../data/uniprot_annotated_proteome_transcript-to-gene-id.csv")
keep=[i for i in list(count.index) if i.split('.')[0] in list(protein_coding_genes['ensembl_gene_id'])]
sub_count=sub_count.loc[keep]

In [68]:
filtered_df_sub.to_csv('../data/planB_clinical.csv',index=False)
meta_df.to_csv('../data/planB_meta.csv',index=False)
sub_count.to_csv('../data/planB_count.csv')