In [1]:
import os
import pandas as pd

# Data loading

In [2]:
# Root directory of the data files, given relative to the current working
# directory; the cells below read from and write to its "corpus" and
# "cancer_type" sub-directories.
data_dir = "../../data"

## Corpus

In [3]:
# Load the report corpus (columns: patient_filename, text).
corpus_csv_path = os.path.join(data_dir, "corpus", "TCGA_Reports.csv")
df_corpus = pd.read_csv(corpus_csv_path, sep=",", header=0)

In [4]:
# (rows, columns) read from TCGA_Reports.csv
df_corpus.shape

(9523, 2)

In [5]:
# Preview the first rows of the raw corpus
df_corpus.head()

Unnamed: 0,patient_filename,text
0,TCGA-BP-5195.25c0b433-5557-4165-922e-2c1eac9c26f0,Date of Recelpt: Clinical Diagnosis & History:...
1,TCGA-D7-8573.b7306a47-697d-4ed3-bbe1-81d49674a8f8,"Material: 1) Material: stomach, Method of coll..."
2,TCGA-EI-7004.13591eed-30e5-47a3-91be-7a370663d2d4,page 1 / 1. copy No. 3. Examination: Histopath...
3,TCGA-EB-A82B.23E186C6-739C-4EF1-8788-79AA89C6E87A,Patient ID: Gross Description: A mass is locat...
4,TCGA-A6-3808.e1505f65-72ef-438d-a5e1-93ed8bf6635d,SPECIMEN. Right colon. CLINICAL NOTES. PRE-OP ...


Extract the TCGA patient ID, i.e. the part of `patient_filename` before the first `.`:

In [6]:
# A patient_filename looks like "TCGA-BP-5195.25c0b433-..." (see the preview
# above): the TCGA patient ID, a '.', then a file identifier. Keep only the
# part before the first '.', using the vectorized string accessor instead of
# a Python-level lambda.
df_corpus["patient_id"] = df_corpus["patient_filename"].str.split(".", n=1).str[0]

In [7]:
# Every report file must appear exactly once in the corpus
assert df_corpus["patient_filename"].is_unique

In [8]:
# Each patient must have a single report, so patient_id can serve as a row key
assert df_corpus["patient_id"].is_unique

In [9]:
# Keep only the patient ID and the report text (patient_filename is dropped)
df_corpus = df_corpus.loc[:, ["patient_id", "text"]]

In [10]:
# Label the rows by patient ID; the patient_id column itself is kept
df_corpus = df_corpus.set_axis(df_corpus["patient_id"].values, axis=0)

In [11]:
# Preview the corpus once it is reduced to patient_id/text and indexed by patient ID
df_corpus.head()

Unnamed: 0,patient_id,text
TCGA-BP-5195,TCGA-BP-5195,Date of Recelpt: Clinical Diagnosis & History:...
TCGA-D7-8573,TCGA-D7-8573,"Material: 1) Material: stomach, Method of coll..."
TCGA-EI-7004,TCGA-EI-7004,page 1 / 1. copy No. 3. Examination: Histopath...
TCGA-EB-A82B,TCGA-EB-A82B,Patient ID: Gross Description: A mass is locat...
TCGA-A6-3808,TCGA-A6-3808,SPECIMEN. Right colon. CLINICAL NOTES. PRE-OP ...


## Cancer type

In [12]:
# Load the patient -> cancer type table (columns: patient_id, cancer_type).
cancer_type_csv_path = os.path.join(
    data_dir, "cancer_type", "tcga_patient_to_cancer_type.csv"
)
df_cancer_type = pd.read_csv(cancer_type_csv_path, sep=",", header=0)

In [13]:
# (rows, columns) read from tcga_patient_to_cancer_type.csv
df_cancer_type.shape

(11160, 2)

In [14]:
# Preview the first rows of the patient -> cancer type table
df_cancer_type.head()

Unnamed: 0,patient_id,cancer_type
0,TCGA-OR-A5J1,ACC
1,TCGA-OR-A5J2,ACC
2,TCGA-OR-A5J3,ACC
3,TCGA-OR-A5J4,ACC
4,TCGA-OR-A5J5,ACC


In [15]:
# A patient must map to exactly one cancer type
assert df_cancer_type["patient_id"].is_unique

In [16]:
# Label the rows by patient ID so cancer types can be looked up by label
df_cancer_type = df_cancer_type.set_axis(df_cancer_type["patient_id"].values, axis=0)

In [17]:
# Preview the table once it is indexed by patient ID
df_cancer_type.head()

Unnamed: 0,patient_id,cancer_type
TCGA-OR-A5J1,TCGA-OR-A5J1,ACC
TCGA-OR-A5J2,TCGA-OR-A5J2,ACC
TCGA-OR-A5J3,TCGA-OR-A5J3,ACC
TCGA-OR-A5J4,TCGA-OR-A5J4,ACC
TCGA-OR-A5J5,TCGA-OR-A5J5,ACC


# Data merging

We add each patient's cancer type to the corpus dataframe, after checking that every patient in the corpus has an entry in the cancer type table:

In [18]:
# Every patient in the corpus must be present in the cancer type table
has_cancer_type = df_corpus["patient_id"].isin(df_cancer_type["patient_id"])
assert has_cancer_type.all()

In [19]:
# Both frames are indexed by patient ID, so the cancer types are fetched by
# label, in the row order of the corpus.
cancer_type_by_patient = df_cancer_type["cancer_type"]
df_corpus["cancer_type"] = cancer_type_by_patient.loc[df_corpus.index]

In [20]:
# Preview the corpus with the new cancer_type column
df_corpus.head()

Unnamed: 0,patient_id,text,cancer_type
TCGA-BP-5195,TCGA-BP-5195,Date of Recelpt: Clinical Diagnosis & History:...,KIRC
TCGA-D7-8573,TCGA-D7-8573,"Material: 1) Material: stomach, Method of coll...",STAD
TCGA-EI-7004,TCGA-EI-7004,page 1 / 1. copy No. 3. Examination: Histopath...,READ
TCGA-EB-A82B,TCGA-EB-A82B,Patient ID: Gross Description: A mass is locat...,SKCM
TCGA-A6-3808,TCGA-A6-3808,SPECIMEN. Right colon. CLINICAL NOTES. PRE-OP ...,COAD


We add a column with the full name of each cancer type. The abbreviation-to-name table was downloaded from https://github.com/KatherLab/cancer-metadata/blob/main/tcga/tcga-tumor-types.csv

In [21]:
# Load the abbreviation -> study name table; this file is ';'-separated.
cancer_name_csv_path = os.path.join(data_dir, "cancer_type", "tcga-tumor-types.csv")
df_cancer_name = pd.read_csv(cancer_name_csv_path, sep=";", header=0)

In [22]:
# (rows, columns) read from tcga-tumor-types.csv
df_cancer_name.shape

(37, 2)

In [23]:
# Preview the first rows of the abbreviation -> study name table
df_cancer_name.head()

Unnamed: 0,Study Abbreviation,Study Name
0,LAML,Acute Myeloid Leukemia
1,ACC,Adrenocortical carcinoma
2,BLCA,Bladder Urothelial Carcinoma
3,LGG,Brain Lower Grade Glioma
4,BRCA,Breast invasive carcinoma


In [24]:
# Convert DataFrame into dictionary
assert not df_cancer_name["Study Abbreviation"].duplicated().any()
df_cancer_name.index = df_cancer_name["Study Abbreviation"].values
dict_cancer_name = df_cancer_name["Study Name"].to_dict()

In [25]:
# Every cancer type used in the corpus must have a full name available
known_abbreviations = list(dict_cancer_name)
assert df_corpus["cancer_type"].isin(known_abbreviations).all()

In [26]:
# Translate each abbreviation to its full study name with a vectorized dict
# lookup. Series.map yields NaN for keys missing from the dict (where
# indexing the dict directly would raise KeyError), so we verify explicitly that
# every row received a name.
df_corpus["cancer_type_name"] = df_corpus["cancer_type"].map(dict_cancer_name)
assert df_corpus["cancer_type_name"].notna().all(), "cancer type without a full name"

In [27]:
# Save the merged table. The index is not written because it only repeats
# the patient_id column.
output_csv_path = os.path.join(data_dir, "cancer_type", "tcga_reports_cancer_type.csv")
df_corpus.to_csv(output_csv_path, sep=",", index=False, header=True)