In [2]:
### This code is used to prepare RNA-seq count data and meta data (group, age, sex, race) used for DESeq2
import pandas as pd
from collections import Counter
import numpy as np

In [9]:
### read raw count
df=pd.read_csv('../../../../Downloads/releases_2021_v2-5release_0510_rnaseq_subread_feature-counts_matrix.featureCounts.tsv',sep='\t',index_col=0)#gene x sample
df_t=df.T#sample x gene
df_t.head()

Geneid,ENSG00000242268.2,ENSG00000259041.1,ENSG00000207641.1,ENSG00000270112.3,ENSG00000280143.1,ENSG00000269416.5,ENSG00000263642.1,ENSG00000225275.4,ENSG00000158486.13,ENSG00000283967.1,...,ENSG00000009694.13,ENSG00000238244.3,ENSG00000216352.1,ENSG00000123685.8,ENSG00000267117.1,ENSG00000265520.1,ENSG00000105063.18,ENSG00000181518.4,ENSG00000231119.2,ENSG00000178921.13
PP-3867-SVM6T1,1,0,0,0,6,0,9,0,4,0,...,530,1,0,95,0,0,815,0,2,97
PD-PDMZ724XCN-SVM18T1,0,0,0,0,9,1,43,0,1,0,...,194,0,0,39,0,2,929,0,0,130
PP-60044-SVM24T1,3,0,0,2,38,1,116,0,3,0,...,157,3,0,75,0,3,1792,0,2,186
PP-3604-SVM12T1,0,0,0,7,6,1,29,3,17,0,...,718,0,0,87,1,2,772,2,1,124
PP-3522-SVM12T1,1,0,0,1,27,4,94,0,10,0,...,805,2,0,99,0,0,2046,4,3,253


In [10]:
### split index into global patient id and visit time
patient_id=[]
visit_name=[]
visit_month=[]
cohort=[]
for i in df_t.index:
    seq=i.split('-') 
    # get patient id
    pid=seq[0]+'-'+seq[1]
    # get visit name
    suffix=seq[-1]
    ind1=suffix.find('M')
    ind2=suffix.find('T')
    v=suffix[ind1:ind2]
    # get visit number
    v_num=suffix[(ind1+1):ind2]
    if v_num=='0_5':
        v_num=0.5
    else:
        v_num=int(v_num)

    # append
    patient_id.append(pid)
    cohort.append(seq[0])
    visit_name.append(v)
    visit_month.append(v_num)

In [11]:
df_t.insert(loc=0, column='visit_month', value=visit_month)
#df_t.insert(loc=0, column='visit_name', value=visit_name)
df_t.insert(loc=0, column='cohort', value=cohort)
df_t.insert(loc=0, column='participant_id', value=patient_id)
df_t=df_t.sort_values(by=['participant_id','visit_month'])
df_t.reset_index(inplace=True)##index is participant id in RNA-seq
df_t.tail()

Geneid,index,participant_id,cohort,visit_month,ENSG00000242268.2,ENSG00000259041.1,ENSG00000207641.1,ENSG00000270112.3,ENSG00000280143.1,ENSG00000269416.5,...,ENSG00000009694.13,ENSG00000238244.3,ENSG00000216352.1,ENSG00000123685.8,ENSG00000267117.1,ENSG00000265520.1,ENSG00000105063.18,ENSG00000181518.4,ENSG00000231119.2,ENSG00000178921.13
8456,PP-92490-SVM12T1,PP-92490,PP,12.0,0,0,0,0,11,1,...,635,1,0,87,0,1,1798,0,0,176
8457,PP-92490-SVM24T1,PP-92490,PP,24.0,0,0,0,1,13,0,...,703,1,0,69,0,1,1408,0,0,124
8458,PP-92834-BLM0T1,PP-92834,PP,0.0,0,0,0,0,2,0,...,58,1,0,17,0,2,697,1,3,42
8459,PP-92834-SVM6T1,PP-92834,PP,6.0,1,0,0,0,20,1,...,200,1,0,57,0,6,1641,0,10,113
8460,PP-92834-SVM12T1,PP-92834,PP,12.0,0,0,0,0,7,0,...,179,1,0,38,0,3,1108,0,1,52


In [12]:
## extract PP cohort only
sub_df=df_t.loc[df_t.cohort=='PP']
#rm if only 1 visit; total time gap < 0.5yr
sub_df_pid=set(list(sub_df.participant_id))
sub_df_filter=pd.DataFrame()
for i in sub_df_pid:
    one_patient=sub_df.loc[sub_df.participant_id==i]
    if one_patient.shape[0]>1:#more than 1 visit
        time_gap=list(one_patient.visit_month)[-1]-list(one_patient.visit_month)[0]
        if time_gap>0.5:#>0.5yr time gap
            if sub_df_filter.shape[0]==0:
                sub_df_filter=one_patient
            else:
                sub_df_filter=sub_df_filter.append(one_patient, ignore_index=True)
        else:
            pass
    else:
        pass

  sub_df_filter=sub_df_filter.append(one_patient, ignore_index=True)


In [13]:
print(sub_df_filter.shape)
sub_df_filter.head()

(4159, 58784)


Geneid,index,participant_id,cohort,visit_month,ENSG00000242268.2,ENSG00000259041.1,ENSG00000207641.1,ENSG00000270112.3,ENSG00000280143.1,ENSG00000269416.5,...,ENSG00000009694.13,ENSG00000238244.3,ENSG00000216352.1,ENSG00000123685.8,ENSG00000267117.1,ENSG00000265520.1,ENSG00000105063.18,ENSG00000181518.4,ENSG00000231119.2,ENSG00000178921.13
0,PP-3666-BLM0T1,PP-3666,PP,0.0,0,0,0,0,9,0,...,345,1,0,68,0,2,1032,1,1,143
1,PP-3666-SVM6T1,PP-3666,PP,6.0,0,4,1,14,23,17,...,485,1,5,83,1,1,1570,9,4,210
2,PP-3666-SVM24T1,PP-3666,PP,24.0,3,0,1,7,8,3,...,198,2,1,52,0,1,787,7,0,103
3,PP-3666-SVM36T1,PP-3666,PP,36.0,0,0,0,0,9,2,...,298,2,0,54,0,4,870,0,0,111
4,PP-3607-BLM0T1,PP-3607,PP,0.0,0,0,0,2,19,2,...,1144,0,0,55,0,1,1474,1,1,218


In [3]:
#sub_df_filter.rename(columns={'index':'ID'}, inplace=True)#ID: participant id in RNAseq data
#sub_df_filter.to_csv('PP_rnaseq_filtered.csv',index=False)
sub_df_filter=pd.read_csv('../data/PP_rnaseq_filtered.csv')
sub_df_filter.head()

Unnamed: 0,ID,participant_id,cohort,visit_month,ENSG00000242268.2,ENSG00000259041.1,ENSG00000207641.1,ENSG00000270112.3,ENSG00000280143.1,ENSG00000269416.5,...,ENSG00000009694.13,ENSG00000238244.3,ENSG00000216352.1,ENSG00000123685.8,ENSG00000267117.1,ENSG00000265520.1,ENSG00000105063.18,ENSG00000181518.4,ENSG00000231119.2,ENSG00000178921.13
0,PP-3666-BLM0T1,PP-3666,PP,0.0,0,0,0,0,9,0,...,345,1,0,68,0,2,1032,1,1,143
1,PP-3666-SVM6T1,PP-3666,PP,6.0,0,4,1,14,23,17,...,485,1,5,83,1,1,1570,9,4,210
2,PP-3666-SVM24T1,PP-3666,PP,24.0,3,0,1,7,8,3,...,198,2,1,52,0,1,787,7,0,103
3,PP-3666-SVM36T1,PP-3666,PP,36.0,0,0,0,0,9,2,...,298,2,0,54,0,4,870,0,0,111
4,PP-3607-BLM0T1,PP-3607,PP,0.0,0,0,0,2,19,2,...,1144,0,0,55,0,1,1474,1,1,218


In [4]:
### read demographic data
demo=pd.read_csv('../../amp_pd_manqi/data/apm-pd-data/clinical/releases_2022_v3release_1115_clinical_Demographics.csv')
demo=demo[['participant_id','age_at_baseline','sex','race']]
### read enrollment data
version='releases_2022_v3release_1115_clinical_'
data_path='../../amp_pd_manqi/data/apm-pd-data/clinical/'
enroll=pd.read_csv(data_path+version+'Enrollment.csv')
## select a subset having RNAseq
sub_df_pid=set(list(sub_df_filter.participant_id))
sub_enroll=enroll.loc[enroll.participant_id.isin(list(sub_df_pid))]
sub_demo=demo.loc[demo.participant_id.isin(list(sub_df_pid))]

Unnamed: 0,participant_id,age_at_baseline,sex,race
0,BF-1001,55,Male,White
1,BF-1002,66,Female,White
2,BF-1003,61,Male,White
3,BF-1004,62,Male,White
4,BF-1005,61,Female,White


In [6]:
### read demo data from PPMI platform to double check if AMP-PD id and PPMI id are the same
pp_demo=pd.read_csv('../../ppmi_manqi/preprocessed_data/static_all_samples.csv')
pp_demo['participant_id']=['PP-'+str(i) for i in pp_demo['PATNO']]
sub_pp_demo=pp_demo.loc[pp_demo.participant_id.isin(list(sub_df_pid))]# select a subset

Unnamed: 0,PATNO,SXDT,PDDXDT,ANYFAMPD,BIOMOMPD,BIODADPD,FULSIBPD,HAFSIBPD,MAGPARPD,PAGPARPD,...,HANDED,HISPLAT,RAASIAN,RABLACK,RAHAWOPI,RAINDALS,RANOS,RAWHITE,EDUCYRS,participant_id
0,3001,08/2009,04/2010,0.0,0.0,0.0,0.0,,0.0,0.0,...,2.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,16.0,PP-3001
1,3002,02/2009,02/2010,0.0,0.0,0.0,0.0,,0.0,0.0,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,16.0,PP-3002
2,3003,05/2006,03/2009,0.0,0.0,0.0,0.0,,0.0,0.0,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,16.0,PP-3003
3,3005,08/2009,,0.0,0.0,1.0,0.0,,0.0,1.0,...,3.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,,PP-3005
4,3006,12/2009,11/2010,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,14.0,PP-3006


In [8]:
### check if the same IDs in different platforms have the same demographic information
male1=list(sub_pp_demo.loc[pp_demo.SEX==1]['participant_id'])
male2=list(sub_demo.loc[(sub_demo.sex=='Male') & (sub_demo.participant_id.isin(sub_pp_demo.participant_id))]['participant_id'])
male1.sort()
male2.sort()
male1==male2
# s = set(male2)
# temp3 = [x for x in male1 if x not in s]
# len(temp3)

True

In [9]:
### prepare meta data and count data
meta=[]#id,pid,time,group,age,sex,race
new_df=[]
n=0#not found that participant in enroll data
for i in range(sub_df_filter.shape[0]):
    one_record=sub_df_filter.iloc[i,:]
    all_name=one_record['ID']#the id in the rnaseq
    pid=one_record.participant_id
    time=one_record.visit_month
    one_enroll=enroll.loc[enroll.participant_id==pid]
    one_demo=demo.loc[demo.participant_id==pid]
    if one_enroll.shape[0]>0:
        one_enroll=one_enroll.study_arm.item()
        if one_enroll=='Healthy Control':
            meta.append([all_name,pid,time,'Control',one_demo['age_at_baseline'].item(),
                         one_demo['sex'].item(),one_demo['race'].item()])
            new_df.append(one_record.iloc[4:].to_list())
        elif one_enroll=='PD':
            meta.append([all_name,pid,time,'PD',one_demo['age_at_baseline'].item(),
                         one_demo['sex'].item(),one_demo['race'].item()])
            new_df.append(one_record.iloc[4:].to_list())
        else:
            pass
    else:
        n+=1

In [10]:
meta_df=pd.DataFrame(meta,columns=['ID','PID','Timepoint','Group','Age','Sex','Race'])

new_df=pd.DataFrame(new_df,columns=list(sub_df_filter.columns)[4:])
new_df['ID']=meta_df['ID']
new_df.set_index('ID',inplace=True)
new_df=new_df.T

In [12]:
meta_df.to_csv('../data/meta.csv',index=False)
new_df.to_csv('../data/data.csv')