In [3]:
import pandas as pd
import numpy as np
import os
import EpiClockInvasiveBRCA.src.util as epi_util
from EpiClockInvasiveBRCA.src.consts import consts

In [11]:
def getSample(x):
    """Return the first four '-'-separated fields of ``x``, re-joined with '-'.

    A string with fewer than four fields is returned unchanged.
    """
    fields = x.split('-')
    return '-'.join(fields[:4])

In [4]:
# 'TCGA' sub-folder of the configured 'official_indir' location; the header
# files and CNV table are read from here and the ID mapping is written back here.
TCGA_clinical_dir = os.path.join(
    consts['official_indir'],
    'TCGA',
)

In [23]:
def _read_header_samples(header_filename):
    """Return the sample IDs listed on the first line of a header file.

    The file is looked up inside ``TCGA_clinical_dir``. Only its first line
    is read; that line has its trailing newline and every double-quote
    character removed, is split on tabs, and each resulting field is
    shortened with ``getSample``.

    Parameters
    ----------
    header_filename : str
        File name relative to ``TCGA_clinical_dir``.

    Returns
    -------
    numpy.ndarray
        One shortened sample ID per tab-separated field, in file order.
    """
    with open(os.path.join(TCGA_clinical_dir, header_filename), 'r') as f:
        header_line = f.readline()
    fields = header_line.rstrip('\n').replace('"', '').split('\t')
    return np.array([getSample(field) for field in fields])


# Both headers go through the same parser, so the two arrays cannot drift
# apart through copy-paste edits, and no scratch variables (line, sample_list)
# are left behind in the kernel namespace.
methyl_samples = _read_header_samples('header_methyl.txt')
rna_samples = _read_header_samples('header_rna.txt')

In [5]:
# Load the tab-separated CNV table stored next to the header files.
CNV_df = pd.read_csv(os.path.join(TCGA_clinical_dir, 'cohort1.cnv.tsv'), sep='\t')

In [33]:
# Shorten every entry of the 'Sample' column with getSample, then keep the
# sorted set of distinct values (np.unique both de-duplicates and sorts).
CNV_samples = np.unique(
    CNV_df['Sample'].map(getSample).values
)

In [42]:
def _series_by_patient(samples, label):
    """Return a Series of ``samples`` named ``label``.

    The index holds ``epi_util.sampleToPatientID(samp)`` for each sample
    (presumably the patient-level ID for that sample -- confirm in epi_util).
    """
    patient_ids = [epi_util.sampleToPatientID(samp) for samp in samples]
    return pd.Series(data=samples, index=patient_ids, name=label)


methyl_ser = _series_by_patient(methyl_samples, 'methyl')
rna_ser = _series_by_patient(rna_samples, 'rna')
CNV_ser = _series_by_patient(CNV_samples, 'CNV')

In [43]:
# Each index label may occur only once per data type; otherwise the
# column-wise concat on the index below would not be well defined.
assert methyl_ser.index.is_unique
assert rna_ser.index.is_unique
assert CNV_ser.index.is_unique

In [58]:
# For every pair of data types: align the two Series on their shared index,
# keep only the rows present in both (dropna), report how many there are, and
# require that both data types carry the same sample ID on each of those rows.
for left_ser, right_ser in (
    (methyl_ser, rna_ser),
    (methyl_ser, CNV_ser),
    (CNV_ser, rna_ser),
):
    combined = pd.concat([left_ser, right_ser], axis=1).dropna()
    print(combined.shape)
    # Column labels of the concatenated frame are the Series names.
    assert (combined[left_ser.name] == combined[right_ser.name]).all()

(408, 2)
(407, 2)
(407, 2)


In [62]:
# Persist the methylation-derived mapping (index -> sample ID) as a
# tab-separated file; the value column is written under the name 'sampleID'.
(
    methyl_ser
    .rename('sampleID')
    .to_csv(os.path.join(TCGA_clinical_dir, 'patient_to_sample_IDs.txt'), sep='\t')
)