In [1]:
import os
import pandas as pd

# Data loading

In [2]:
# Root folder of the CSV files read and written below. The path is relative,
# so it is resolved against the kernel's current working directory.
data_dir = "../../data"

## Corpus

In [3]:
# Report corpus. read_csv already defaults to a comma separator and to using
# the first row as the header, so neither needs to be spelled out.
df_corpus = pd.read_csv(os.path.join(data_dir, "corpus", "TCGA_Reports.csv"))

In [4]:
df_corpus.shape

(9523, 2)

In [5]:
df_corpus.head()

Unnamed: 0,patient_filename,text
0,TCGA-BP-5195.25c0b433-5557-4165-922e-2c1eac9c26f0,Date of Recelpt: Clinical Diagnosis & History:...
1,TCGA-D7-8573.b7306a47-697d-4ed3-bbe1-81d49674a8f8,"Material: 1) Material: stomach, Method of coll..."
2,TCGA-EI-7004.13591eed-30e5-47a3-91be-7a370663d2d4,page 1 / 1. copy No. 3. Examination: Histopath...
3,TCGA-EB-A82B.23E186C6-739C-4EF1-8788-79AA89C6E87A,Patient ID: Gross Description: A mass is locat...
4,TCGA-A6-3808.e1505f65-72ef-438d-a5e1-93ed8bf6635d,SPECIMEN. Right colon. CLINICAL NOTES. PRE-OP ...


Extract TCGA patient IDs:

In [6]:
# Filenames look like "<patient barcode>.<identifier>"; keep everything before
# the first dot. The vectorised .str accessor replaces the per-row lambda and
# propagates a missing filename as NaN instead of raising AttributeError.
df_corpus["patient_id"] = df_corpus["patient_filename"].str.split('.', n=1).str[0]

In [7]:
# Every report file must appear exactly once.
assert df_corpus["patient_filename"].is_unique

In [8]:
# One report per patient: patient_id is used as a lookup key further down.
assert df_corpus["patient_id"].is_unique

In [9]:
# Keep only the patient key and the report text.
df_corpus = df_corpus.loc[:, ["patient_id", "text"]]

In [10]:
# Index rows by patient_id (the column itself is kept) so that rows can be
# fetched with .loc[<patient ids>].
df_corpus.index = df_corpus["patient_id"].to_numpy()

In [11]:
df_corpus.head()

Unnamed: 0,patient_id,text
TCGA-BP-5195,TCGA-BP-5195,Date of Recelpt: Clinical Diagnosis & History:...
TCGA-D7-8573,TCGA-D7-8573,"Material: 1) Material: stomach, Method of coll..."
TCGA-EI-7004,TCGA-EI-7004,page 1 / 1. copy No. 3. Examination: Histopath...
TCGA-EB-A82B,TCGA-EB-A82B,Patient ID: Gross Description: A mass is locat...
TCGA-A6-3808,TCGA-A6-3808,SPECIMEN. Right colon. CLINICAL NOTES. PRE-OP ...


## TNM staging

### Tumor size (T)

In [12]:
# T-stage annotations. Comma separator and first-row header are the read_csv
# defaults.
df_tnm_t = pd.read_csv(os.path.join(data_dir, "tnm_stage", "TCGA_T14_patients.csv"))

In [13]:
df_tnm_t.shape

(6966, 4)

In [14]:
df_tnm_t.head()

Unnamed: 0,ajcc_pathologic_t,project_id,case_id,case_submitter_id
0,T3,TCGA-BRCA,3af31fcf-ad0c-4fd9-a8e3-10f9176b5e9d,TCGA-AR-A5QQ
1,T1,TCGA-THCA,b0cb6ed7-3d9f-4d5e-8df5-edcc921c43b4,TCGA-FY-A2QD
2,T1,TCGA-BRCA,b5fba77b-1f50-4e71-95b2-566afba4bdd7,TCGA-A2-A04Q
3,T2,TCGA-THCA,ca65a38e-8b49-4b50-908a-478b95b95c02,TCGA-EL-A3GW
4,T2,TCGA-LUSC,3d6a027b-dfda-42b3-9323-407e42cf6d10,TCGA-66-2737


In [15]:
# Use the same key column name as the corpus table.
df_tnm_t = df_tnm_t.rename(columns={"case_submitter_id": "patient_id"})

In [16]:
# Each patient may carry at most one T annotation.
assert df_tnm_t["patient_id"].is_unique

In [17]:
# Index rows by patient_id (the column itself is kept) for .loc lookups.
df_tnm_t.index = df_tnm_t["patient_id"].to_numpy()

In [18]:
# Drop project_id / case_id; only the key and the raw T value are used.
df_tnm_t = df_tnm_t.loc[:, ["patient_id", "ajcc_pathologic_t"]]

In [19]:
df_tnm_t.head()

Unnamed: 0,patient_id,ajcc_pathologic_t
TCGA-AR-A5QQ,TCGA-AR-A5QQ,T3
TCGA-FY-A2QD,TCGA-FY-A2QD,T1
TCGA-A2-A04Q,TCGA-A2-A04Q,T1
TCGA-EL-A3GW,TCGA-EL-A3GW,T2
TCGA-66-2737,TCGA-66-2737,T2


In [20]:
len(set(df_tnm_t["ajcc_pathologic_t"].values))

24

In [21]:
df_tnm_t["ajcc_pathologic_t"].value_counts(dropna=False)

ajcc_pathologic_t
T2      1722
T3      1378
T1       628
T3a      441
T1b      429
T1a      363
T4a      311
T3b      289
T2a      250
T4       228
T1c      218
T2b      197
T2c      156
T4b      147
TX        76
T1b1      71
T1b2      30
T2a2      11
T2a1       7
T4d        5
T3c        4
T0         3
T4c        1
T4e        1
Name: count, dtype: int64

#### Preprocess labels

In [22]:
def label_t_stage(t_value):
    """Collapse a detailed AJCC pathologic T value into its main category.

    Sub-stage suffixes are dropped, so ``"T1b2"`` becomes ``"T1"``.

    Parameters
    ----------
    t_value : str
        Raw staging value such as ``"T2a"``.

    Returns
    -------
    str or None
        ``"T1"``, ``"T2"``, ``"T3"`` or ``"T4"``. ``None`` for any other
        value (e.g. ``"TX"``, ``"T0"``, ``"T"``, an empty string) and for
        non-string input such as NaN or ``None``.
    """
    # An empty CSV cell is loaded by pandas as float NaN, which has no
    # .startswith(); treat every non-string as "no usable label".
    if not isinstance(t_value, str):
        return None
    # Keep 'T' plus its first digit, and only for categories 1-4.
    if t_value.startswith('T') and len(t_value) > 1 and t_value[1] in ('1', '2', '3', '4'):
        return t_value[:2]
    return None

In [23]:
# Element-wise mapping of each raw T value to its collapsed label (or None).
df_tnm_t["t_label"] = df_tnm_t["ajcc_pathologic_t"].map(label_t_stage)

In [24]:
df_tnm_t.shape

(6966, 3)

In [25]:
# Discard patients whose T value could not be mapped to T1-T4.
df_tnm_t = df_tnm_t.dropna(subset=["t_label"])

In [26]:
df_tnm_t.shape

(6887, 3)

In [27]:
df_tnm_t["t_label"].value_counts(dropna=False)

t_label
T2    2343
T3    2112
T1    1739
T4     693
Name: count, dtype: int64

### Regional lymph node involvement (N)

In [28]:
# N-stage annotations. Comma separator and first-row header are the read_csv
# defaults.
df_tnm_n = pd.read_csv(os.path.join(data_dir, "tnm_stage", "TCGA_N03_patients.csv"))

In [29]:
df_tnm_n.shape

(5678, 4)

In [30]:
df_tnm_n.head()

Unnamed: 0,ajcc_pathologic_n,project_id,case_id,case_submitter_id
0,N1,TCGA-BRCA,3af31fcf-ad0c-4fd9-a8e3-10f9176b5e9d,TCGA-AR-A5QQ
1,N0,TCGA-THCA,b0cb6ed7-3d9f-4d5e-8df5-edcc921c43b4,TCGA-FY-A2QD
2,N0,TCGA-THCA,ca65a38e-8b49-4b50-908a-478b95b95c02,TCGA-EL-A3GW
3,N1,TCGA-LUSC,3d6a027b-dfda-42b3-9323-407e42cf6d10,TCGA-66-2737
4,N0,TCGA-BLCA,ccd65bc8-82ef-453e-b4bc-a005cc2262d5,TCGA-ZF-AA4V


In [31]:
# Use the same key column name as the corpus table.
df_tnm_n = df_tnm_n.rename(columns={"case_submitter_id": "patient_id"})

In [32]:
# Each patient may carry at most one N annotation.
assert df_tnm_n["patient_id"].is_unique

In [33]:
# Index rows by patient_id (the column itself is kept) for .loc lookups.
df_tnm_n.index = df_tnm_n["patient_id"].to_numpy()

In [34]:
# Drop project_id / case_id; only the key and the raw N value are used.
df_tnm_n = df_tnm_n.loc[:, ["patient_id", "ajcc_pathologic_n"]]

In [35]:
df_tnm_n.head()

Unnamed: 0,patient_id,ajcc_pathologic_n
TCGA-AR-A5QQ,TCGA-AR-A5QQ,N1
TCGA-FY-A2QD,TCGA-FY-A2QD,N0
TCGA-EL-A3GW,TCGA-EL-A3GW,N0
TCGA-66-2737,TCGA-66-2737,N1
TCGA-ZF-AA4V,TCGA-ZF-AA4V,N0


In [36]:
len(set(df_tnm_n["ajcc_pathologic_n"].values))

13

In [37]:
df_tnm_n["ajcc_pathologic_n"].value_counts(dropna=False)

ajcc_pathologic_n
N0     3329
N1     1056
N2      434
N1a     276
N2b     127
N1b     123
N3       97
N3a      92
N2a      84
N2c      48
N3b       6
N1c       5
N3c       1
Name: count, dtype: int64

#### Preprocess labels

In [38]:
def label_n_stage(n_value):
    """Collapse a detailed AJCC pathologic N value into its main category.

    Sub-stage suffixes are dropped, so ``"N1a"`` becomes ``"N1"``.

    Parameters
    ----------
    n_value : str
        Raw staging value such as ``"N2b"``.

    Returns
    -------
    str or None
        ``"N0"``, ``"N1"``, ``"N2"`` or ``"N3"``. ``None`` for any other
        value (e.g. ``"NX"``, ``"N"``, an empty string) and for non-string
        input such as NaN or ``None``.
    """
    # An empty CSV cell is loaded by pandas as float NaN, which has no
    # .startswith(); treat every non-string as "no usable label".
    if not isinstance(n_value, str):
        return None
    # Keep 'N' plus its first digit, and only for categories 0-3.
    if n_value.startswith('N') and len(n_value) > 1 and n_value[1] in ('0', '1', '2', '3'):
        return n_value[:2]
    return None

In [39]:
# Element-wise mapping of each raw N value to its collapsed label (or None).
df_tnm_n["n_label"] = df_tnm_n["ajcc_pathologic_n"].map(label_n_stage)

In [40]:
df_tnm_n.shape

(5678, 3)

In [41]:
# Discard patients whose N value could not be mapped to N0-N3.
df_tnm_n = df_tnm_n.dropna(subset=["n_label"])

In [42]:
df_tnm_n.shape

(5678, 3)

In [43]:
df_tnm_n["n_label"].value_counts(dropna=False)

n_label
N0    3329
N1    1460
N2     693
N3     196
Name: count, dtype: int64

### Distant metastasis (M)

In [44]:
# M-stage annotations. Comma separator and first-row header are the read_csv
# defaults.
df_tnm_m = pd.read_csv(os.path.join(data_dir, "tnm_stage", "TCGA_M01_patients.csv"))

In [45]:
df_tnm_m.shape

(4608, 4)

In [46]:
df_tnm_m.head()

Unnamed: 0,ajcc_pathologic_m,project_id,case_id,case_submitter_id
0,M0,TCGA-BRCA,3af31fcf-ad0c-4fd9-a8e3-10f9176b5e9d,TCGA-AR-A5QQ
1,M0,TCGA-BRCA,b5fba77b-1f50-4e71-95b2-566afba4bdd7,TCGA-A2-A04Q
2,M0,TCGA-THCA,ca65a38e-8b49-4b50-908a-478b95b95c02,TCGA-EL-A3GW
3,M0,TCGA-LUSC,3d6a027b-dfda-42b3-9323-407e42cf6d10,TCGA-66-2737
4,M0,TCGA-BLCA,ccd65bc8-82ef-453e-b4bc-a005cc2262d5,TCGA-ZF-AA4V


In [47]:
# Use the same key column name as the corpus table.
df_tnm_m = df_tnm_m.rename(columns={"case_submitter_id": "patient_id"})

In [48]:
# Each patient may carry at most one M annotation.
assert df_tnm_m["patient_id"].is_unique

In [49]:
# Index rows by patient_id (the column itself is kept) for .loc lookups.
df_tnm_m.index = df_tnm_m["patient_id"].to_numpy()

In [50]:
# Drop project_id / case_id; only the key and the raw M value are used.
df_tnm_m = df_tnm_m.loc[:, ["patient_id", "ajcc_pathologic_m"]]

In [51]:
df_tnm_m.head()

Unnamed: 0,patient_id,ajcc_pathologic_m
TCGA-AR-A5QQ,TCGA-AR-A5QQ,M0
TCGA-A2-A04Q,TCGA-A2-A04Q,M0
TCGA-EL-A3GW,TCGA-EL-A3GW,M0
TCGA-66-2737,TCGA-66-2737,M0
TCGA-ZF-AA4V,TCGA-ZF-AA4V,M0


In [52]:
len(set(df_tnm_m["ajcc_pathologic_m"].values))

5

In [53]:
df_tnm_m["ajcc_pathologic_m"].value_counts(dropna=False)

ajcc_pathologic_m
M0     4295
M1      283
M1a      19
M1b      10
M1c       1
Name: count, dtype: int64

#### Preprocess labels

In [54]:
def label_m_stage(m_value):
    """Collapse a detailed AJCC pathologic M value into its main category.

    Sub-stage suffixes are dropped, so ``"M1b"`` becomes ``"M1"``.

    Parameters
    ----------
    m_value : str
        Raw staging value such as ``"M1a"``.

    Returns
    -------
    str or None
        ``"M0"`` or ``"M1"``. ``None`` for any other value (e.g. ``"MX"``,
        ``"M"``, an empty string) and for non-string input such as NaN or
        ``None``.
    """
    # An empty CSV cell is loaded by pandas as float NaN, which has no
    # .startswith(); treat every non-string as "no usable label".
    if not isinstance(m_value, str):
        return None
    # Keep 'M' plus its first digit, and only for categories 0-1.
    if m_value.startswith('M') and len(m_value) > 1 and m_value[1] in ('0', '1'):
        return m_value[:2]
    return None

In [55]:
# Element-wise mapping of each raw M value to its collapsed label (or None).
df_tnm_m["m_label"] = df_tnm_m["ajcc_pathologic_m"].map(label_m_stage)

In [56]:
df_tnm_m.shape

(4608, 3)

In [57]:
# Discard patients whose M value could not be mapped to M0/M1.
df_tnm_m = df_tnm_m.dropna(subset=["m_label"])

In [58]:
df_tnm_m.shape

(4608, 3)

In [59]:
df_tnm_m["m_label"].value_counts(dropna=False)

m_label
M0    4295
M1     313
Name: count, dtype: int64

# Data merging

We combine all datasets into a single dataframe.

In [66]:
df_tnm_t.shape, df_tnm_n.shape, df_tnm_m.shape

((6887, 3), (5678, 3), (4608, 3))

We keep only the patients that have all three labels (T, N and M):

In [72]:
# Patient IDs present in all three label tables, as a sorted list.
arr_patient_id = sorted(
    set(df_tnm_t["patient_id"])
    & set(df_tnm_n["patient_id"])
    & set(df_tnm_m["patient_id"])
)

In [73]:
len(arr_patient_id)

3898

In [74]:
# Every selected patient must have a report in the corpus.
assert set(arr_patient_id) <= set(df_corpus["patient_id"])

In [75]:
# Reports of the selected patients, in arr_patient_id order. The explicit copy
# lets columns be added without touching df_corpus.
df_corpus_tnm = df_corpus.loc[arr_patient_id, :].copy()

In [78]:
# Look up each label in arr_patient_id order; taking the bare array assigns
# the values positionally rather than by index alignment.
df_corpus_tnm["t_label"] = df_tnm_t.loc[arr_patient_id, "t_label"].to_numpy()
df_corpus_tnm["n_label"] = df_tnm_n.loc[arr_patient_id, "n_label"].to_numpy()
df_corpus_tnm["m_label"] = df_tnm_m.loc[arr_patient_id, "m_label"].to_numpy()

In [79]:
df_corpus_tnm.shape

(3898, 5)

In [80]:
df_corpus_tnm.head()

Unnamed: 0,patient_id,text,t_label,n_label,m_label
TCGA-05-4244,TCGA-05-4244,Diagnosis: 1. Resection material from the righ...,T2,N2,M1
TCGA-05-4245,TCGA-05-4245,Diagnosis: 1. Atypical resection material from...,T2,N2,M0
TCGA-05-4249,TCGA-05-4249,Diagnosis: 2. and 3. Two-part resection materi...,T2,N0,M0
TCGA-05-4250,TCGA-05-4250,2. and 3. Resection material from the right lo...,T3,N1,M0
TCGA-05-4382,TCGA-05-4382,Main diagnosis/diagnoses: Primarily evidently ...,T2,N0,M0


In [83]:
# Write the merged table. The index is not written (patient_id is also a
# regular column); comma separator and header row are the to_csv defaults.
df_corpus_tnm.to_csv(
    os.path.join(data_dir, "tnm_stage", "tcga_reports_tnm_stage.csv"),
    index=False,
)