In [1]:
import pandas as pd

#### Load data

In [2]:
# Lookup tables that translate dataset-specific identifiers into PTIDs:
#   well_map: WGS_SAMPLE_NUMBER -> ADNI_PTID
#   rid_map:  RID               -> PTID
well_map = pd.read_csv(r'../../raw_data/well_PTID_map.csv')
rid_map = pd.read_csv(r'../../raw_data/RID_PTID_map.csv')

# Build the dictionaries column-wise instead of calling .iloc[i][col] once per
# row (which constructs a full row Series for every key and every value).
# If a key appears on more than one row, the last row wins, exactly as before.
well_dict = dict(zip(well_map["WGS_SAMPLE_NUMBER"], well_map["ADNI_PTID"]))
rid_dict = dict(zip(rid_map["RID"], rid_map["PTID"]))

In [3]:
# Raw input tables. The two .tsv files are tab-delimited (read_table's default
# separator); the metabolomics files are comma-delimited.
cnv = pd.read_table(r'../../raw_data/cnv.tsv')
expr = pd.read_table(r'../../raw_data/gene_expression.tsv')
metab = pd.read_csv(r'../../raw_data/metabolomics.csv')
metab_dict = pd.read_csv(r'../../raw_data/metabolomics_dict.csv')

#### Create new dataframes with PTIDs only

Expression data

In [4]:
# For every column from position 2 onward, map the column name to the same name
# without its first character, then apply that mapping to the column labels.
# NOTE(review): presumably the first two columns are the "Probeid"/"Symbol"
# annotation columns and the stripped character is a one-letter prefix on the
# sample IDs -- confirm against the raw file.
expr_dict = {sample_col: sample_col[1:] for sample_col in expr.columns[2:]}

expr_new = expr.rename(columns=expr_dict)

Metabolomics data

In [5]:
# Exploratory check kept from the original notebook (not executed):
# metab_new.filter(regex='LOGTRANSFORMFLAG').drop_duplicates().transpose().drop_duplicates()

# Keep only the metabolomics rows whose RID has a PTID in rid_dict, and attach
# that PTID by looking up each row's own RID.
#
# The previous version collected the PTIDs in a separate Python loop and then
# assigned the list onto the result of an inner merge with rid_map. That only
# lines up when rid_map["RID"] is unique: a repeated RID duplicates rows in the
# merge and the positional assignment fails with a length mismatch. A direct
# lookup gives every surviving row exactly one PTID regardless.
metab_rids = metab["RID"].to_numpy()
has_ptid = metab["RID"].isin(list(rid_dict))
metab_ptids = metab.loc[has_ptid, "RID"].map(rid_dict).tolist()

# Drop RID (and any pre-existing PTID column, which the old code overwrote),
# renumber the rows from 0 as the merge used to, and place PTID as the second
# column -- the position the old column shuffle produced.
metab_new = (
    metab.loc[has_ptid]
    .drop(columns=["RID", "PTID"], errors="ignore")
    .reset_index(drop=True)
)
metab_new.insert(min(1, metab_new.shape[1]), "PTID", metab_ptids)


# Columns carried forward: the patient identifier plus the selected measurement
# columns. The last eight names are formed as <A>_<B> from two other names in
# the list. NOTE(review): presumably these are ratios -- confirm against
# metabolomics_dict.csv.
keep_cols = [
    "PTID",
    "CA", "CDCA", "DCA",
    "GCA", "GCDCA", "GDCA", "GLCA", "GUDCA",
    "TCA", "TCDCA", "TDCA", "TLCA", "TMCA_A_B", "TUDCA",
    "UDCA",
    "CA_CDCA", "DCA_CA", "GLCA_CDCA", "GDCA_CA",
    "GDCA_DCA", "TDCA_CA", "TLCA_CDCA", "TDCA_DCA",
]

# Keep rows whose SUBJECT_FLAG equals 0 and, in the same step, only the columns
# listed above.
metab_new = metab_new.loc[metab_new['SUBJECT_FLAG'].eq(0), keep_cols]

Copy number variation data

In [6]:
# Relabel CNV columns through well_dict (WGS_SAMPLE_NUMBER -> ADNI_PTID).
# Column names that are not keys of well_dict are left unchanged.
cnv_new = cnv.rename(well_dict, axis="columns")

#### Find intersection of PTIDs across all datasets

In [7]:
# PTIDs available in each dataset. Columns from position 6 onward in cnv_new
# and from position 2 onward in expr_new are taken to be per-patient columns.
cnv_ptids = set(cnv_new.columns[6:])
expr_ptids = set(expr_new.columns[2:])
metab_ptids = set(metab_new['PTID'])

# Patients present in all three datasets, as a one-column frame for merging.
final_ptids = pd.DataFrame({"PTID": list(expr_ptids & (cnv_ptids & metab_ptids))})

# Baseline diagnosis per patient: drop rows missing either field, then collapse
# exact (PTID, DX_bl) repeats.
# NOTE(review): a PTID with two different DX_bl values would still appear twice
# here and be duplicated by every later merge on PTID -- confirm this cannot occur.
labels = (
    pd.read_csv(r'../../raw_data/clinical_data.csv')[["PTID", "DX_bl"]]
    .dropna(axis=0)
    .drop_duplicates()
)

# Inner merge keeps only labelled patients that are in the three-way intersection.
intxn_ptid_labels = labels.merge(final_ptids, on="PTID", how='inner')
# Author's note: pd.unique(intxn_ptid_labels["PTID"]).shape[0] will return 681; all patients in intersection have at least one valid diagnosis


In [8]:
# Display the PTID / baseline-diagnosis table for the intersected patients
# (the rendered output below is truncated by pandas).
intxn_ptid_labels

Unnamed: 0,PTID,DX_bl
0,011_S_0021,CN
1,011_S_0023,CN
2,100_S_0035,CN
3,023_S_0042,LMCI
4,100_S_0047,CN
...,...,...
637,053_S_4578,CN
638,135_S_4598,CN
639,012_S_4643,CN
640,135_S_4676,AD


In [9]:
# NOTE(review): this whole cell is disabled code and executes nothing. It appears
# to be an earlier way of collecting PTIDs per dataset while recording RIDs that
# have no entry in rid_dict. Kept for reference; consider deleting once confirmed
# unused.
# unaccounted_metab_rids = set()
# 
# cnv_wells = cnv.columns[6:]
# expr_ptids = expr.columns[2:]
# metab_rids = metab["RID"].to_numpy()
# 
# for i in range(cnv_wells.shape[0]):
#     cnv_ptids.add(well_dict[cnv_wells[i]])
# 
# for i in range(metab_rids.shape[0]):
#     if metab_rids[i] in rid_dict.keys():
#         metab_ptids.add(rid_dict[metab_rids[i]])
#     else:
#         unaccounted_metab_rids.add(metab_rids[i])

In [10]:
# NOTE(review): disabled code; nothing in this cell runs. It sketches a check for
# PTIDs that occur on more than one row of intxn_ptid_labels.
# save PTIDs with different diagnoses
# dupe_idx = intxn_ptid_labels[["PTID"]].duplicated(keep=False)  # indices of duplicate entries
# dupe_idx

#### Transpose data and compress feature names

Expression data

In [11]:
# Map each row label of expr_new to the value in its "Symbol" column, so that
# after transposing the feature columns carry symbols instead of row labels.
idx_to_sym = dict(zip(expr_new.index, expr_new["Symbol"]))

# Transpose to one row per original column, relabel the feature columns, and
# drop the two annotation rows so that only per-patient rows remain.
expr_t = (
    expr_new.transpose()
    .rename(columns=idx_to_sym)
    .drop(labels=["Probeid", "Symbol"], axis=0)
)

# Expose the row labels (PTIDs) as the first column, inserted by position.
# The previous version appended the column and then re-selected all columns by
# label (expr_t[cols]); when a symbol occurs more than once, label selection
# returns every matching column for each occurrence and multiplies the
# duplicates. Inserting by position leaves the feature columns untouched.
expr_t.insert(0, "PTID", expr_t.index)

# Keep only patients in the cross-dataset intersection; rows are numbered from 0.
expr_t = expr_t.merge(intxn_ptid_labels[["PTID"]], how='inner', on="PTID")
expr_t = expr_t.reset_index(drop=True)


Copy number variation data

In [52]:
# TODO; retain feature names in final dataset

# Feature name for every CNV row: "cnv_<row label>_<QUAL value>".
cnv_feature_names = {
    row_label: "cnv_" + str(row_label) + "_" + qual
    for row_label, qual in zip(cnv_new.index, cnv_new["QUAL"])
}
cnv_t = cnv_new.transpose().rename(columns=cnv_feature_names)

# Expose the row labels as a leading "PTID" column.
cnv_t["PTID"] = cnv_t.index
cnv_t = cnv_t[["PTID", *cnv_t.columns.drop("PTID")]]

# The first six rows of the transposed frame come from the six leading columns
# of cnv_new; they are kept separately as a key. Every later row is
# a per-patient row and is restricted to the intersected PTIDs.
cnv_t_key = cnv_t.iloc[:6].drop(columns=["PTID"])
cnv_t_data = (
    cnv_t.iloc[6:]
    .merge(intxn_ptid_labels[["PTID"]], how='inner', on="PTID")
    .reset_index(drop=True)
)

In [46]:
# NOTE(review): writes the same frame to the same path as the "Save data" cell
# further down; looks like a leftover from iterating on the CNV cell -- confirm
# before removing.
cnv_t_data.to_csv(r'../../data/cnv.csv')

Metabolomics data

In [13]:
# Restrict the metabolomics rows to patients in the cross-dataset intersection.
# NOTE(review): a PTID that appears on several rows of either frame is repeated
# in the result -- confirm one row per patient is expected.
metab_final = metab_new.merge(
    intxn_ptid_labels[["PTID"]],
    on="PTID",
    how='inner',
)

#### Save data

In [14]:
# Write each aligned table to disk, in the same order as before. to_csv is
# called with its defaults, so the row index is written as the first column.
for table, destination in (
    (expr_t, r'../../data/gene_expression.csv'),
    (cnv_t_data, r'../../data/cnv.csv'),
    (metab_final, r'../../data/metabolomics.csv'),
    (intxn_ptid_labels, r'../../data/labels.csv'),
    (cnv_t_key, r'../../metadata/cnv_key.csv'),
):
    table.to_csv(destination)

In [56]:
# Diagnosis counts recorded by the author for the three-way intersection:
#   221 are CN
#   36 are AD
#   193 are LMCI
#   192 are EMCI

# (rows, columns) of the label table restricted to DX_bl == "AD".
intxn_ptid_labels.loc[intxn_ptid_labels["DX_bl"].eq("AD")].shape

(36, 2)

In [41]:
# Same AD count, but requiring only metabolomics and CNV data (no expression).
cnv_ptids_df = pd.DataFrame({"PTID": list(cnv_ptids)})

# Two successive inner merges on PTID: labels with metabolomics, then with CNV.
# NOTE(review): metab_new may hold several rows per PTID, in which case this
# counts rows rather than distinct patients -- confirm.
biomics = labels.merge(metab_new[["PTID"]], how='inner', on="PTID")
biomics = biomics.merge(cnv_ptids_df[["PTID"]], how='inner', on="PTID")
biomics.loc[biomics["DX_bl"].eq("AD")].shape


(43, 2)