# Download the clinical data

In [1]:
# Fetch the ucec_tcga study archive from the cBioPortal datahub bucket into the working directory.
!wget https://cbioportal-datahub.s3.amazonaws.com/ucec_tcga.tar.gz

--2024-06-03 13:33:59--  https://cbioportal-datahub.s3.amazonaws.com/ucec_tcga.tar.gz
Resolving cbioportal-datahub.s3.amazonaws.com (cbioportal-datahub.s3.amazonaws.com)... 52.216.205.123, 52.217.70.92, 52.216.250.60, ...
Connecting to cbioportal-datahub.s3.amazonaws.com (cbioportal-datahub.s3.amazonaws.com)|52.216.205.123|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 120355875 (115M) [application/x-tar]
Saving to: ‘ucec_tcga.tar.gz’


2024-06-03 13:34:48 (2.38 MB/s) - ‘ucec_tcga.tar.gz’ saved [120355875/120355875]



In [2]:
import shutil
# Extract the downloaded archive into the current directory.
shutil.unpack_archive("./ucec_tcga.tar.gz", "./")

# Match slides

In [1]:
import os
import numpy as np
import pandas as pd

In [2]:
ca_type = "TCGA_UCEC"

# Locations of the whole-slide images, the extracted patches and the features.
slide_folder = f"/media/nfs/SURV/{ca_type}/Slides"
patch_folder = f"/media/nfs/SURV/{ca_type}/SP1024/patches"
feat_folder = f"/media/nfs/SURV/{ca_type}/Feats1024/"

# Report entries per folder and distinct cases, where a case is identified by
# the first 12 characters of the file name.
for caption, folder in (
    ("Total Nb of Slides/Cases: ", slide_folder),
    ("Total Nb of Patches/Cases: ", patch_folder),
):
    entries = os.listdir(folder)
    case_ids = np.unique([entry[:12] for entry in entries])
    print(caption, len(entries), len(case_ids))
# print("Total Nb of Features: ", [len(os.listdir(feat_folder+fe)) for fe in os.listdir(feat_folder)])
# feat_folder = feat_folder+"CTP/"

Total Nb of Slides/Cases:  1315 548
Total Nb of Patches/Cases:  1309 548


In [3]:
# Load the tab-separated patient-level clinical table; header=4 takes the column
# names from row index 4 and skips the rows above it.
df = pd.read_csv("./ucec_tcga/data_clinical_patient.txt", header=4, sep="\t")

def change_na_to_pd_na(texts, df):
    """Return a copy of ``df`` in which every value listed in ``texts`` is ``pd.NA``.

    Parameters
    ----------
    texts : list
        Placeholder values (e.g. ``'[Not Available]'``) to treat as missing.
    df : pandas.DataFrame
        Input frame; it is not modified.

    Returns
    -------
    pandas.DataFrame
        New frame with the same columns, placeholders replaced column by column.
    """
    return df.apply(lambda column: column.replace(texts, pd.NA))

# Turn the textual placeholders into real missing values.
na_placeholders = ['[Not Available]', '[Not Applicable]', '[Unknown]', '[Discrepancy]']
df = change_na_to_pd_na(na_placeholders, df)
print("All clinical data: ", len(df))

# Patients without overall-survival time cannot be used; count and remove them.
no_followup = df["OS_MONTHS"].isna()
print("Missing follow-up data: ", int(no_followup.sum()))

df = df.loc[~no_followup].reset_index(drop=True)
print("Available follow-up data:", len(df))

All clinical data:  548
Missing follow-up data:  1
Available follow-up data: 547


In [6]:
# Cast survival time to float, then show any negative entries.
df["OS_MONTHS"] = df["OS_MONTHS"].astype(float)
df.loc[df["OS_MONTHS"] < 0, "OS_MONTHS"]

185   -0.2
Name: OS_MONTHS, dtype: float64

In [7]:
# Negative follow-up times are invalid: blank them out, then drop those rows.
# .loc writes into `df` itself. The chained form df[col][mask] = ... assigns to
# an intermediate object, raises SettingWithCopyWarning, and has no effect under
# pandas Copy-on-Write.
df.loc[df["OS_MONTHS"] < 0, "OS_MONTHS"] = np.nan
df = df.drop(df.index[df["OS_MONTHS"].isna()]).reset_index(drop=True)
print("Available follow-up data:", len(df))

Available follow-up data: 546


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df["OS_MONTHS"][df["OS_MONTHS"]<0] = pd.NA


In [9]:
# Summary statistics of DFS_MONTHS over the non-missing entries.
df["DFS_MONTHS"].dropna().astype(float).describe()

count    505.000000
mean      34.752693
std       28.558460
min        0.070000
25%       15.640000
50%       25.990000
75%       49.110000
max      225.330000
Name: DFS_MONTHS, dtype: float64

In [10]:
def _split_dx_tx(names):
    """Split slide names into diagnostic ("DX") and other slides and print both tallies.

    A name counts as diagnostic when characters 20-21 are "DX"; everything else
    is reported as a tissue slide. Case ids are the first 12 characters.
    Returns ``(dx_files, tx_files, dx_cases, tx_cases)``.
    """
    dx = [name for name in names if name[20:22] == "DX"]
    tx = [name for name in names if name[20:22] != "DX"]
    dx_ids = np.unique([name[:12] for name in dx])
    tx_ids = np.unique([name[:12] for name in tx])
    print("\tDiagnostic slides/cases: ", len(dx), len(dx_ids))
    print("\tTissue slides/cases: ", len(tx), len(tx_ids))
    return dx, tx, dx_ids, tx_ids


# 1) Every slide on disk (file extension stripped).
files = [i.rsplit(".", 1)[0] for i in os.listdir(slide_folder)]
cases = np.unique([i[:12] for i in files])
print("Total files/cases: ", len(files), len(cases))
dx_files, tx_files, dx_cases, tx_cases = _split_dx_tx(files)

# 2) Slides for which patches were extracted.
files = [i.rsplit(".", 1)[0] for i in os.listdir(patch_folder)]
cases = np.unique([i[:12] for i in files])
print("Total files/cases: ", len(files), len(cases))
dx_files, tx_files, dx_cases, tx_cases = _split_dx_tx(files)

# 3) Patched slides whose patient is still in the clinical table.
# Sets give constant-time membership instead of scanning an array per item.
patients_with_followup = set(df["PATIENT_ID"].values)
cases = [i for i in cases if i in patients_with_followup]
kept_cases = set(cases)
files = [i for i in files if i[:12] in kept_cases]
print("Slides with available follow-up files/cases: ", len(files), len(cases))
dx_files, tx_files, dx_cases, tx_cases = _split_dx_tx(files)

Total files/cases:  1315 548
	Diagnostic slides/cases:  565 505
	Tissue slides/cases:  750 548
Total files/cases:  1309 548
	Diagnostic slides/cases:  565 505
	Tissue slides/cases:  744 545
Slides with available follow-up files/cases:  1306 546
	Diagnostic slides/cases:  564 504
	Tissue slides/cases:  742 543


In [11]:
# Patients in the clinical table that have no slide among the retained cases.
has_no_slide = ~df["PATIENT_ID"].isin(cases)
drop_ids = df.loc[has_no_slide, "PATIENT_ID"].tolist()
df = df[~df["PATIENT_ID"].isin(drop_ids)]
print(df.shape)
drop_ids

(546, 68)


[]

In [12]:
# Re-read the raw clinical table (placeholders untouched) and keep only the
# patients that are still in `df`.
sample_df = pd.read_csv("./ucec_tcga/data_clinical_patient.txt", sep="\t", header=4)
sample_df = sample_df[sample_df["PATIENT_ID"].isin(df["PATIENT_ID"].values)]

# Patients whose tumor tissue site is anything other than "Endometrial".
non_endometrial = sample_df["SITE_OF_TUMOR_TISSUE"] != "Endometrial"
other_site_tm = sample_df.loc[non_endometrial, "PATIENT_ID"].values
print(len(other_site_tm))
df = df[~df["PATIENT_ID"].isin(other_site_tm)]
print("After excluding other sites (files/cases): ")

# Restrict the slide lists to the remaining patients and recount.
remaining_ids = df["PATIENT_ID"].values
dx_files = [name for name in dx_files if name[:12] in remaining_ids]
tx_files = [name for name in tx_files if name[:12] in remaining_ids]
dx_cases = np.unique([name[:12] for name in dx_files])
tx_cases = np.unique([name[:12] for name in tx_files])
print("\tTotal (files/cases): ", len(dx_files) + len(tx_files), len(df))
print("\tdx (files/cases): ", len(dx_files), len(dx_cases))
print("\ttx (files/cases): ", len(tx_files), len(tx_cases))
sample_df.loc[non_endometrial, ["PATIENT_ID", "SITE_OF_TUMOR_TISSUE"]]

1
After excluding other sites (files/cases): 
	Total (files/cases):  1304 545
	dx (files/cases):  563 503
	tx (files/cases):  741 542


Unnamed: 0,PATIENT_ID,SITE_OF_TUMOR_TISSUE
543,TCGA-QS-A8F1,Other Specify


In [13]:
# Remaining slides, grouped by which slide types each case has.
dx_lookup, tx_lookup = set(dx_cases), set(tx_cases)
dx_tx_cases = [case for case in dx_cases if case in tx_lookup]
only_dx_cases = [case for case in dx_cases if case not in tx_lookup]
only_tx_cases = [case for case in tx_cases if case not in dx_lookup]

both_lookup, only_dx_lookup, only_tx_lookup = set(dx_tx_cases), set(only_dx_cases), set(only_tx_cases)
only_dx_files = [name for name in dx_files if name[:12] in only_dx_lookup]
dx_tx_files = [name for name in dx_files + tx_files if name[:12] in both_lookup]
only_tx_files = [name for name in tx_files if name[:12] in only_tx_lookup]

# Each ratio is the group's share of all remaining cases.
n_all_cases = len(dx_tx_cases) + len(only_tx_cases) + len(only_dx_cases)
for caption, group_files, group_cases in (
    ("dx+tx (files/cases/ratio):", dx_tx_files, dx_tx_cases),
    ("Only tx (files/cases/ratio):", only_tx_files, only_tx_cases),
    ("Only dx (files/cases/ratio):", only_dx_files, only_dx_cases),
):
    print(caption, len(group_files), len(group_cases), round(len(group_cases) / n_all_cases, 2))

dx+tx (files/cases/ratio): 1256 500 0.92
Only tx (files/cases/ratio): 45 42 0.08
Only dx (files/cases/ratio): 3 3 0.01


In [14]:
def _slides_with_clinical(slide_ids, clinical):
    """Build one row per slide and attach that case's clinical columns (inner merge on case_id)."""
    slides = pd.DataFrame({"slide_id": slide_ids})
    slides["case_id"] = slides["slide_id"].str[:12]
    return pd.merge(slides, clinical, on="case_id")


# The clinical table is keyed by PATIENT_ID; expose it as case_id for the merge.
clinical_by_case = df.rename(columns={"PATIENT_ID": "case_id"})
dx_tx_df = _slides_with_clinical(dx_tx_files, clinical_by_case)
only_tx_df = _slides_with_clinical(only_tx_files, clinical_by_case)
only_dx_df = _slides_with_clinical(only_dx_files, clinical_by_case)
dx_tx_df.shape, only_tx_df.shape, only_dx_df.shape

((1256, 69), (45, 69), (3, 69))

In [15]:
# group = 0.0 for cases that have a diagnostic slide, 1.0 for tissue-only cases.
for frame, flag in ((dx_tx_df, 0.0), (only_dx_df, 0.0), (only_tx_df, 1.0)):
    frame["group"] = np.full(len(frame), flag)
df = pd.concat([dx_tx_df, only_tx_df, only_dx_df])
df.isna().sum()

slide_id                   0
case_id                    0
OTHER_PATIENT_ID           0
FORM_COMPLETION_DATE       0
PROSPECTIVE_COLLECTION    10
                          ..
OS_STATUS                  0
OS_MONTHS                  0
DFS_STATUS                85
DFS_MONTHS                85
group                      0
Length: 70, dtype: int64

In [16]:
# Remove every column in which more than half of the rows are missing.
na_counts = df.isna().sum()
half_rows = len(df) * .5
drop_cols = [name for name in df.columns if na_counts[name] > half_rows]
print(len(drop_cols))
df = df.drop(columns=drop_cols)
df.shape

26


(1304, 44)

In [17]:
# Display the columns removed by the >50%-missing filter.
drop_cols

['HISTORY_MENOPAUSAL_HORMONE_THERAPY',
 'HISTORY_HORMONAL_CONTRACEPTIVES_USE',
 'HISTORY_TAMOXIFEN_USE',
 'HYPERTENSION_DIAGNOSIS',
 'DIABETES_DIAGNOSIS_INDICATOR',
 'PREGNANCIES_FULL_TERM_COUNT',
 'HISTORY_COLORECTAL_CANCER',
 'TREATMENT_OUTCOME_FIRST_COURSE',
 'RADIATION_TREATMENT_ADJUVANT',
 'PHARMACEUTICAL_TX_ADJUVANT',
 'PRIMARY_SITE_OTHER',
 'METHOD_OF_INITIAL_SAMPLE_PROCUREMENT_OTHER',
 'LYMPH_NODES_PELVIC_POS_BY_IHC',
 'LYMPH_NODES_AORTIC_POS_BY_IHC',
 'NEW_TUMOR_EVENT_AFTER_INITIAL_TREATMENT',
 'CLIN_M_STAGE',
 'CLIN_N_STAGE',
 'CLIN_T_STAGE',
 'DISEASE_CODE',
 'EXTRANODAL_INVOLVEMENT',
 'PATH_M_STAGE',
 'PATH_N_STAGE',
 'PATH_T_STAGE',
 'AJCC_PATHOLOGIC_TUMOR_STAGE',
 'PROJECT_CODE',
 'STAGE_OTHER']

In [18]:
# Columns that are not used as clinical features in this analysis.
drop_list = [
    'OTHER_PATIENT_ID',
    'FORM_COMPLETION_DATE',
    'SEX',
    'PROSPECTIVE_COLLECTION',
    'RETROSPECTIVE_COLLECTION',
    'HISTOLOGICAL_DIAGNOSIS',
    'INITIAL_PATHOLOGIC_DX_YEAR',
    'DAYS_TO_INITIAL_PATHOLOGIC_DIAGNOSIS',
    'ICD_10',
    'ICD_O_3_HISTOLOGY',
    'ICD_O_3_SITE',
    'INFORMED_CONSENT_VERIFIED',
    'AJCC_STAGING_EDITION',
    "TISSUE_SOURCE_SITE",
]
df = df.drop(columns=drop_list)
df.shape

(1304, 30)

# Inspect the clinical variables

In [19]:
# Normalise the key column name and row index, then take one row per case.
df = df.rename(columns={'PATIENT_ID': 'case_id'})
df = df.reset_index(drop=True)
case_df = df.drop_duplicates(subset='case_id')
(df.shape, case_df.shape)

((1304, 30), (545, 30))

## continuous vars

In [20]:
# Treat a column as continuous when it has more than 8 distinct values
# (a missing value counts as one distinct value here).
distinct_counts = {col: len(case_df[col].unique()) for col in case_df.columns}
cont_cols = [col for col, n_distinct in distinct_counts.items() if n_distinct > 8]
print([(col, distinct_counts[col]) for col in cont_cols])
cont_cols

[('slide_id', 545), ('case_id', 545), ('HEIGHT', 43), ('WEIGHT', 107), ('AGE', 60), ('METHOD_OF_INITIAL_SAMPLE_PROCUREMENT', 9), ('TUMOR_INVASION_PERCENT', 112), ('LYMPH_NODES_PELVIC_EXAMINED_COUNT', 56), ('LYMPH_NODES_PELVIC_POS_BY_HE', 13), ('LYMPH_NODES_PELVIC_POS_TOTAL', 13), ('LYMPH_NODES_AORTIC_EXAMINED_COUNT', 31), ('LYMPH_NODES_AORTIC_POS_BY_HE', 12), ('LYMPH_NODES_AORTIC_POS_TOTAL', 13), ('CLINICAL_STAGE', 16), ('OS_MONTHS', 492), ('DFS_MONTHS', 451)]


['slide_id',
 'case_id',
 'HEIGHT',
 'WEIGHT',
 'AGE',
 'METHOD_OF_INITIAL_SAMPLE_PROCUREMENT',
 'TUMOR_INVASION_PERCENT',
 'LYMPH_NODES_PELVIC_EXAMINED_COUNT',
 'LYMPH_NODES_PELVIC_POS_BY_HE',
 'LYMPH_NODES_PELVIC_POS_TOTAL',
 'LYMPH_NODES_AORTIC_EXAMINED_COUNT',
 'LYMPH_NODES_AORTIC_POS_BY_HE',
 'LYMPH_NODES_AORTIC_POS_TOTAL',
 'CLINICAL_STAGE',
 'OS_MONTHS',
 'DFS_MONTHS']

In [21]:
# Author's rationale: the lymph-node columns give prospective information for
# survival, which does not align with the scope of this research.
drop_cols = [
    f'LYMPH_NODES_{region}_{suffix}'
    for region in ('PELVIC', 'AORTIC')
    for suffix in ('EXAMINED_COUNT', 'POS_BY_HE', 'POS_TOTAL')
]
df = df.drop(columns=drop_cols)
case_df = df.drop_duplicates(subset='case_id')

In [22]:
# Keep only the truly continuous features: drop the removed lymph-node columns,
# the identifiers, and the two high-cardinality categorical columns.
# Filtering into a new list lets the cell be re-run safely; the earlier
# list.remove() comprehension raised ValueError on a second run.
not_continuous = set(
    drop_cols + ["slide_id", "case_id", "CLINICAL_STAGE", "METHOD_OF_INITIAL_SAMPLE_PROCUREMENT"]
)
cont_cols = [col for col in cont_cols if col not in not_continuous]
len(cont_cols)

6

In [23]:
# Per-case summary statistics for each continuous column, separated by a blank line.
for col in cont_cols:
    summary = case_df[col].dropna().astype(float).describe()
    print(summary, end="\n\n")

count    516.000000
mean     161.205426
std        8.148170
min       66.000000
25%      157.000000
50%      161.000000
75%      166.000000
max      183.000000
Name: HEIGHT, dtype: float64

count    523.000000
mean      87.598470
std       25.464711
min       44.000000
25%       67.000000
50%       84.000000
75%      102.500000
max      209.000000
Name: WEIGHT, dtype: float64

count    543.000000
mean      63.895028
std       11.127823
min       31.000000
25%       57.000000
50%       64.000000
75%       71.000000
max       90.000000
Name: AGE, dtype: float64

count    473.000000
mean      41.790169
std       32.334646
min        0.000000
25%       13.000000
50%       40.000000
75%       60.000000
max      280.000000
Name: TUMOR_INVASION_PERCENT, dtype: float64

count    545.000000
mean      37.676752
std       29.783718
min        0.070000
25%       16.920000
50%       29.930000
75%       52.500000
max      225.330000
Name: OS_MONTHS, dtype: float64

count    505.000000
mean      34.7

In [24]:
# Inspect implausibly small heights (< 140). The explicit .copy() makes
# height_df an independent frame, so the cast below no longer raises
# SettingWithCopyWarning.
height_df = df.dropna(subset=['HEIGHT']).copy()
height_df["HEIGHT"] = height_df["HEIGHT"].astype(float)
height_df[height_df["HEIGHT"] < 140]

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  height_df["HEIGHT"] = height_df["HEIGHT"].astype(float)


Unnamed: 0,slide_id,case_id,MENOPAUSE_STATUS,HEIGHT,WEIGHT,RACE,ETHNICITY,HISTORY_OTHER_MALIGNANCY,HISTORY_NEOADJUVANT_TRTYN,TUMOR_STATUS,...,TUMOR_INVASION_PERCENT,RESIDUAL_TUMOR,CLINICAL_STAGE,GRADE,SITE_OF_TUMOR_TISSUE,OS_STATUS,OS_MONTHS,DFS_STATUS,DFS_MONTHS,group
483,TCGA-BG-A0W1-01Z-00-DX1.8E06269C-1278-478B-A12...,TCGA-BG-A0W1,Post (prior bilateral ovariectomy OR >12 mo si...,130.0,56,WHITE,NOT HISPANIC OR LATINO,No,No,TUMOR FREE,...,45.0,R0,Stage II,G2,Endometrial,0:LIVING,52.79,0:DiseaseFree,52.79,0.0
484,TCGA-BG-A0W1-01A-01-TSA.b9956c37-9819-452f-880...,TCGA-BG-A0W1,Post (prior bilateral ovariectomy OR >12 mo si...,130.0,56,WHITE,NOT HISPANIC OR LATINO,No,No,TUMOR FREE,...,45.0,R0,Stage II,G2,Endometrial,0:LIVING,52.79,0:DiseaseFree,52.79,0.0
485,TCGA-BG-A0W1-01A-01-BSA.c7710ddc-5e2b-477c-b3b...,TCGA-BG-A0W1,Post (prior bilateral ovariectomy OR >12 mo si...,130.0,56,WHITE,NOT HISPANIC OR LATINO,No,No,TUMOR FREE,...,45.0,R0,Stage II,G2,Endometrial,0:LIVING,52.79,0:DiseaseFree,52.79,0.0
1084,TCGA-SJ-A6ZI-01Z-00-DX1.87B9D431-5067-4E93-911...,TCGA-SJ-A6ZI,,66.0,93,BLACK OR AFRICAN AMERICAN,NOT HISPANIC OR LATINO,No,No,TUMOR FREE,...,75.0,R0,Stage IB,G1,Endometrial,0:LIVING,20.73,0:DiseaseFree,20.73,0.0
1085,TCGA-SJ-A6ZI-01A-01-TS1.F9CE9E77-0EF0-491F-AEA...,TCGA-SJ-A6ZI,,66.0,93,BLACK OR AFRICAN AMERICAN,NOT HISPANIC OR LATINO,No,No,TUMOR FREE,...,75.0,R0,Stage IB,G1,Endometrial,0:LIVING,20.73,0:DiseaseFree,20.73,0.0
1300,TCGA-AX-A3FS-01A-01-TS1.27AE39D9-679D-42D4-AD3...,TCGA-AX-A3FS,Post (prior bilateral ovariectomy OR >12 mo si...,139.0,98,WHITE,NOT HISPANIC OR LATINO,No,No,,...,,,Stage IA,G3,Endometrial,0:LIVING,0.23,0:DiseaseFree,0.23,1.0


In [25]:
# Blank out the implausible HEIGHT entry '66' (the column still holds strings here).
# .loc assigns into `df` directly; the chained df[col][mask] = ... form writes
# to a temporary and is a no-op under pandas Copy-on-Write.
df.loc[df["HEIGHT"] == '66', "HEIGHT"] = pd.NA
case_df = df.drop_duplicates(['case_id'])
case_df["HEIGHT"].dropna().astype(float).describe()

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df["HEIGHT"][df["HEIGHT"] == '66'] = pd.NA


count    515.000000
mean     161.390291
std        6.989507
min      130.000000
25%      157.000000
50%      161.000000
75%      166.000000
max      183.000000
Name: HEIGHT, dtype: float64

In [26]:
# Look for invasion percentages above 100. The explicit .copy() makes tm_df an
# independent frame, so the cast below no longer raises SettingWithCopyWarning.
tm_df = df.dropna(subset=['TUMOR_INVASION_PERCENT']).copy()
tm_df["TUMOR_INVASION_PERCENT"] = tm_df["TUMOR_INVASION_PERCENT"].astype(float)
tm_df.loc[tm_df["TUMOR_INVASION_PERCENT"] > 100, "TUMOR_INVASION_PERCENT"]

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  tm_df["TUMOR_INVASION_PERCENT"] = tm_df["TUMOR_INVASION_PERCENT"].astype(float)


355    280.0
356    280.0
Name: TUMOR_INVASION_PERCENT, dtype: float64

In [27]:
# Blank out the impossible TUMOR_INVASION_PERCENT entry '280' (the column still
# holds strings here). .loc assigns into `df` directly instead of into a temporary.
df.loc[df["TUMOR_INVASION_PERCENT"] == '280', "TUMOR_INVASION_PERCENT"] = pd.NA
case_df = df.drop_duplicates(['case_id'])
case_df["TUMOR_INVASION_PERCENT"].dropna().astype(float).describe()

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df["TUMOR_INVASION_PERCENT"][df["TUMOR_INVASION_PERCENT"] == '280'] = pd.NA


count    472.000000
mean      41.285487
std       30.446978
min        0.000000
25%       13.000000
50%       40.000000
75%       60.000000
max      100.000000
Name: TUMOR_INVASION_PERCENT, dtype: float64

## categorical vars

In [31]:
# Everything that is neither continuous nor an identifier is treated as categorical.
non_categorical = cont_cols + ["case_id", "slide_id"]
cat_cols = [name for name in case_df.columns if name not in non_categorical]
len(cat_cols)

16

In [32]:
# Number of rows in case_df, the denominator used for the percentages below.
len(case_df)

545

In [33]:
# Frequency table for every categorical column, with each level's share of all cases.
for col in cat_cols:
    print()
    print(col)
    for level, count in case_df[col].value_counts().items():
        # Round the percentage itself. int(round(x, 2) * 100) truncates after the
        # float multiplication (0.29 * 100 == 28.999... -> 28).
        share = int(round(100 * int(count) / len(case_df)))
        print(level, count, f"({share}%)")


MENOPAUSE_STATUS
Post (prior bilateral ovariectomy OR >12 mo since LMP with no prior hysterectomy) 446 (82%)
Pre (<6 months since LMP AND no prior bilateral ovariectomy AND not on estrogen replacement) 35 (6%)
Indeterminate (neither Pre or Postmenopausal) 17 (3%)
Peri (6-12 months since last menstrual period) 17 (3%)

RACE
WHITE 372 (68%)
BLACK OR AFRICAN AMERICAN 108 (20%)
ASIAN 20 (4%)
NATIVE HAWAIIAN OR OTHER PACIFIC ISLANDER 9 (2%)
AMERICAN INDIAN OR ALASKA NATIVE 4 (1%)

ETHNICITY
NOT HISPANIC OR LATINO 374 (69%)
HISPANIC OR LATINO 15 (3%)

HISTORY_OTHER_MALIGNANCY
No 498 (91%)
Yes 47 (9%)

HISTORY_NEOADJUVANT_TRTYN
No 543 (100%)
Yes 2 (0%)

TUMOR_STATUS
TUMOR FREE 430 (79%)
WITH TUMOR 78 (14%)

METHOD_OF_INITIAL_SAMPLE_PROCUREMENT
Office Endometrial Biopsy 320 (59%)
Dilation and curettage procedure 137 (25%)
Other method, specify: 33 (6%)
Tumor resection 30 (6%)
Cytology (e.g. Peritoneal or pleural fluid) 10 (2%)
Excisional Biopsy 6 (1%)
Incisional Biopsy 3 (1%)
Fine needle aspi

In [34]:
# Drop variables with too little variation (only 3% are hispanic and 2 patients
# received neoadjuvant treatment).
# Tumor status is excluded as it was a follow-up observation rather than a primary observation.
uninformative_cols = ["ETHNICITY", "HISTORY_NEOADJUVANT_TRTYN", "TUMOR_STATUS", "SITE_OF_TUMOR_TISSUE"]
df = df.drop(columns=uninformative_cols)
case_df = df.drop_duplicates(subset='case_id')

In [35]:
# Binary menopause flag: post = 1, pre = 0; indeterminate and peri become missing.
# The result is assigned back to the column: Series.replace(..., inplace=True)
# on df[col] acts on an intermediate object and does not update `df` under
# pandas Copy-on-Write.
menopause_codes = {
    "Post (prior bilateral ovariectomy OR >12 mo since LMP with no prior hysterectomy)": 1,
    "Pre (<6 months since LMP AND no prior bilateral ovariectomy AND not on estrogen replacement)": 0,
    "Indeterminate (neither Pre or Postmenopausal)": pd.NA,
    "Peri (6-12 months since last menstrual period)": pd.NA
}
df["MENOPAUSE_STATUS"] = df["MENOPAUSE_STATUS"].replace(menopause_codes)
df = df.rename(columns={"MENOPAUSE_STATUS": "menopause"})
case_df = df.drop_duplicates(['case_id'])
case_df["menopause"].value_counts()


1    446
0     35
Name: menopause, dtype: int64

In [47]:
# Shorten one label and set the two rarest categories to missing.
# Assigned back explicitly: inplace replace on df[col] does not reach `df`
# under pandas Copy-on-Write.
race_codes = {
    "BLACK OR AFRICAN AMERICAN": "BLACK_OR_AA",
    "AMERICAN INDIAN OR ALASKA NATIVE": pd.NA,
    "NATIVE HAWAIIAN OR OTHER PACIFIC ISLANDER": pd.NA
}
df["RACE"] = df["RACE"].replace(race_codes)
case_df = df.drop_duplicates(['case_id'])
case_df["RACE"].value_counts()

WHITE          372
BLACK_OR_AA    108
ASIAN           20
Name: RACE, dtype: int64

In [37]:
# As the grading system is 3 tier for endometrial ca
# G3 and G4 will be merged as high grade
# https://www.cancer.org/cancer/types/endometrial-cancer/about/what-is-endometrial-cancer.html
grade_codes = {
    "G1": 0,
    "G2": 1,
    "G3": 2,
    "G4": 2,
    "GX": pd.NA,
    "GB": pd.NA,
    "High Grade": 2
}
# Assigned back explicitly: inplace replace on df[col] does not reach `df`
# under pandas Copy-on-Write.
df["GRADE"] = df["GRADE"].replace(grade_codes)
case_df = df.drop_duplicates(['case_id'])
case_df["GRADE"].value_counts()

2    324
1    122
0     99
Name: GRADE, dtype: int64

In [38]:
# Residual tumor as 0/1/2 for R0/R1/R2; RX becomes missing.
# Assigned back explicitly: inplace replace on df[col] does not reach `df`
# under pandas Copy-on-Write.
residual_codes = {
    "R0": 0,
    "RX": pd.NA,
    "R1": 1,
    "R2": 2,
}
df["RESIDUAL_TUMOR"] = df["RESIDUAL_TUMOR"].replace(residual_codes)
case_df = df.drop_duplicates(['case_id'])
case_df["RESIDUAL_TUMOR"].value_counts()

0    375
1     22
2     16
Name: RESIDUAL_TUMOR, dtype: int64

In [39]:
# Simplify stages: collapse every sub-stage onto its main stage (I=0, II=1, III=2, IV=3).
stage_codes = {
    "Stage IIC": 1,
    "Stage IIB": 1,
    "Stage IIA2": 1,
    "Stage IIA1": 1,
    "Stage IIA": 1,
    "Stage II": 1,

    "Stage IC": 0,
    "Stage IB2": 0,
    "Stage IB1": 0,
    "Stage IB": 0,
    "Stage IA2": 0,
    "Stage IA1": 0,
    "Stage IA": 0,
    "Stage I": 0,

    "Stage III": 2,
    "Stage IIIC": 2,
    "Stage IIIC1": 2,
    "Stage IIIC2": 2,
    "Stage IIIB": 2,
    "Stage IIIA": 2,

    "Stage IV": 3,
    "Stage IVA": 3,
    "Stage IVB": 3,
}
# Assigned back explicitly: inplace replace on df[col] does not reach `df`
# under pandas Copy-on-Write.
df["CLINICAL_STAGE"] = df["CLINICAL_STAGE"].replace(stage_codes)

case_df = df.drop_duplicates(['case_id'])
case_df["CLINICAL_STAGE"].value_counts()

0    341
2    122
1     52
3     30
Name: CLINICAL_STAGE, dtype: int64

In [40]:
# Binary stage: codes 0 and 1 -> 0, codes 2 and 3 -> 1.
# Built in a single assignment; the earlier copy-then-inplace-replace acted on
# an intermediate Series and does not update `df` under pandas Copy-on-Write.
df["stage_binary"] = df["CLINICAL_STAGE"].replace({
    0: 0,
    1: 0,
    2: 1,
    3: 1
})
case_df = df.drop_duplicates(['case_id'])
case_df["stage_binary"].value_counts()

0    393
1    152
Name: stage_binary, dtype: int64

In [41]:
# Collapse the procurement method into three labels: resection, excision, other.
procurement_labels = {
    'Tumor resection': "resection",
    'Other method, specify:': "other",
    'Fine needle aspiration biopsy': "other",
    'Cytology (e.g. Peritoneal or pleural fluid)': "other",
    'Incisional Biopsy': "other",
    'Excisional Biopsy': "excision",
    'Office Endometrial Biopsy': 'other',
    'Dilation and curettage procedure': 'other'
}
# Assigned back explicitly: inplace replace on df[col] does not reach `df`
# under pandas Copy-on-Write.
df["METHOD_OF_INITIAL_SAMPLE_PROCUREMENT"] = (
    df["METHOD_OF_INITIAL_SAMPLE_PROCUREMENT"].replace(procurement_labels)
)
df = df.rename(columns={"METHOD_OF_INITIAL_SAMPLE_PROCUREMENT": "biopsy"})
case_df = df.drop_duplicates(['case_id'])
case_df["biopsy"].value_counts()

other        504
resection     30
excision       6
Name: biopsy, dtype: int64

In [42]:
# Surgical approach as a binary flag: minimally invasive = 0, open = 1.
# Assigned back explicitly: inplace replace on df[col] does not reach `df`
# under pandas Copy-on-Write.
df["SURGICAL_APPROACH_AT_DIAGNOSIS"] = df["SURGICAL_APPROACH_AT_DIAGNOSIS"].replace({
    'Minimally Invasive': 0,
    "open": 1
})
df = df.rename(columns={"SURGICAL_APPROACH_AT_DIAGNOSIS": "open_sx"})
case_df = df.drop_duplicates(['case_id'])
case_df["open_sx"].value_counts()

1    319
0    202
Name: open_sx, dtype: int64

In [43]:
# Frame-wide recoding of the status and yes/no strings to 0/1.
binary_codes = {
    "1:DECEASED": 1,
    "0:LIVING": 0,
    "0:DiseaseFree": 0,
    "1:Recurred/Progressed": 1,
    "YES": 1,
    "NO": 0,
    "Yes": 1,
    "No": 0
}
df = df.replace(binary_codes)
os_counts = df["OS_STATUS"].value_counts()
dfs_counts = df["DFS_STATUS"].value_counts()
(os_counts, dfs_counts)

(0    1104
 1     200
 Name: OS_STATUS, dtype: int64,
 0    960
 1    259
 Name: DFS_STATUS, dtype: int64)

In [44]:
# List the columns that remain after the recoding steps.
df.columns

Index(['slide_id', 'case_id', 'menopause', 'HEIGHT', 'WEIGHT', 'RACE',
       'HISTORY_OTHER_MALIGNANCY', 'AGE', 'biopsy', 'open_sx',
       'PERITONEAL_WASHING', 'TUMOR_INVASION_PERCENT', 'RESIDUAL_TUMOR',
       'CLINICAL_STAGE', 'GRADE', 'OS_STATUS', 'OS_MONTHS', 'DFS_STATUS',
       'DFS_MONTHS', 'group', 'stage_binary'],
      dtype='object')

In [45]:
# Peritoneal washing as a binary flag; "[Not Evaluated]" becomes missing.
# Assigned back explicitly: inplace replace on df[col] does not reach `df`
# under pandas Copy-on-Write.
df["PERITONEAL_WASHING"] = df["PERITONEAL_WASHING"].replace({
    "negative": 0,
    "positive": 1,
    "[Not Evaluated]": pd.NA
})
case_df = df.drop_duplicates(['case_id'])
case_df["PERITONEAL_WASHING"].value_counts()

0    352
1     58
Name: PERITONEAL_WASHING, dtype: int64

In [48]:
# Final per-case frequency table for every column with fewer than 10 distinct values.
case_df = df.drop_duplicates(['case_id'])
for col in case_df.columns:
    if len(case_df[col].unique()) < 10:
        print()
        print(col)
        for level, count in case_df[col].value_counts().items():
            # Round the percentage itself. int(round(x, 2) * 100) truncates after
            # the float multiplication (0.29 * 100 == 28.999... -> 28).
            share = int(round(100 * int(count) / len(case_df)))
            print(level, count, f"({share}%)")


menopause
1 446 (82%)
0 35 (6%)

RACE
WHITE 372 (68%)
BLACK_OR_AA 108 (20%)
ASIAN 20 (4%)

HISTORY_OTHER_MALIGNANCY
0 498 (91%)
1 47 (9%)

biopsy
other 504 (92%)
resection 30 (6%)
excision 6 (1%)

open_sx
1 319 (59%)
0 202 (37%)

PERITONEAL_WASHING
0 352 (65%)
1 58 (11%)

RESIDUAL_TUMOR
0 375 (69%)
1 22 (4%)
2 16 (3%)

CLINICAL_STAGE
0 341 (63%)
2 122 (22%)
1 52 (10%)
3 30 (6%)

GRADE
2 324 (59%)
1 122 (22%)
0 99 (18%)

OS_STATUS
0 455 (83%)
1 90 (17%)

DFS_STATUS
0 396 (73%)
1 109 (20%)

group
0.0 503 (92%)
1.0 42 (8%)

stage_binary
0 393 (72%)
1 152 (28%)


## encoding

In [50]:
# Classify each feature column by its number of distinct non-missing values:
# < 3 binary, < 10 categorical (queued for dummy encoding), otherwise continuous.
encode_list = []
skip_cols = ["case_id", "slide_id", "OS_STATUS", "DFS_STATUS", "OS_MONTHS", "DFS_MONTHS"]
for col in df.columns:
    if col in skip_cols:
        continue
    levels = df[col].dropna().unique()
    n_levels = len(levels)
    if n_levels < 3:
        print("Binary:", col, levels)
    elif n_levels < 10:
        print("**Categ:", col, levels)
        encode_list.append(col)
    else:
        print("Cont:", col, df[col].dropna().astype(float).mean())

Binary: menopause [1 0]
Cont: HEIGHT 161.39102040816326
Cont: WEIGHT 87.46693548387097
**Categ: RACE ['WHITE' 'BLACK_OR_AA' 'ASIAN']
Binary: HISTORY_OTHER_MALIGNANCY [0 1]
Cont: AGE 63.786923076923074
**Categ: biopsy ['other' 'resection' 'excision']
Binary: open_sx [1 0]
Binary: PERITONEAL_WASHING [0 1]
Cont: TUMOR_INVASION_PERCENT 40.72038327526131
**Categ: RESIDUAL_TUMOR [0 2 1]
**Categ: CLINICAL_STAGE [0 1 2 3]
**Categ: GRADE [0 2 1]
Binary: group [0. 1.]
Binary: stage_binary [0 1]


In [51]:
def dummy_encode(df, col):
    """Replace column ``col`` of ``df`` with indicator columns named ``<col>_<value>``.

    The indicator columns are appended after the existing ones, the source
    column is removed, and the row index is reset. The input frame is left
    untouched; a new frame is returned.
    """
    indicators = pd.get_dummies(df[col], prefix=col)
    widened = pd.concat([df, indicators], axis=1)
    widened = widened.drop(columns=[col])
    return widened.reset_index(drop=True)
# Work on a copy so `df` keeps the un-encoded columns.
encoded_df = df.copy()
# Expand each column queued in encode_list into indicator columns, one at a time.
for col in encode_list:
    encoded_df = dummy_encode(encoded_df, col)
encoded_df.columns

  uniques = Index(uniques)


Index(['slide_id', 'case_id', 'menopause', 'HEIGHT', 'WEIGHT',
       'HISTORY_OTHER_MALIGNANCY', 'AGE', 'open_sx', 'PERITONEAL_WASHING',
       'TUMOR_INVASION_PERCENT', 'OS_STATUS', 'OS_MONTHS', 'DFS_STATUS',
       'DFS_MONTHS', 'group', 'stage_binary', 'RACE_ASIAN', 'RACE_BLACK_OR_AA',
       'RACE_WHITE', 'biopsy_excision', 'biopsy_other', 'biopsy_resection',
       'RESIDUAL_TUMOR_0', 'RESIDUAL_TUMOR_1', 'RESIDUAL_TUMOR_2',
       'CLINICAL_STAGE_0', 'CLINICAL_STAGE_1', 'CLINICAL_STAGE_2',
       'CLINICAL_STAGE_3', 'GRADE_0', 'GRADE_1', 'GRADE_2'],
      dtype='object')

In [52]:
# Remaining missing values per column after encoding.
encoded_df.isna().sum()

slide_id                      0
case_id                       0
menopause                   135
HEIGHT                       79
WEIGHT                       64
HISTORY_OTHER_MALIGNANCY      0
AGE                           4
open_sx                      56
PERITONEAL_WASHING          318
TUMOR_INVASION_PERCENT      156
OS_STATUS                     0
OS_MONTHS                     0
DFS_STATUS                   85
DFS_MONTHS                   85
group                         0
stage_binary                  0
RACE_ASIAN                    0
RACE_BLACK_OR_AA              0
RACE_WHITE                    0
biopsy_excision               0
biopsy_other                  0
biopsy_resection              0
RESIDUAL_TUMOR_0              0
RESIDUAL_TUMOR_1              0
RESIDUAL_TUMOR_2              0
CLINICAL_STAGE_0              0
CLINICAL_STAGE_1              0
CLINICAL_STAGE_2              0
CLINICAL_STAGE_3              0
GRADE_0                       0
GRADE_1                       0
GRADE_2 

In [53]:
# Lower-case every column name, save the processed table, and display it.
encoded_df = encoded_df.rename(columns=str.lower)
encoded_df.to_csv("./processed_clinical_data.csv", index=False)
encoded_df

Unnamed: 0,slide_id,case_id,menopause,height,weight,history_other_malignancy,age,open_sx,peritoneal_washing,tumor_invasion_percent,...,residual_tumor_0,residual_tumor_1,residual_tumor_2,clinical_stage_0,clinical_stage_1,clinical_stage_2,clinical_stage_3,grade_0,grade_1,grade_2
0,TCGA-BS-A0UM-01Z-00-DX1.1280476C-4274-4432-B37...,TCGA-BS-A0UM,1,160,90,0,64,1,0,22,...,1,0,0,1,0,0,0,1,0,0
1,TCGA-BS-A0UM-01A-01-BS1.09c675dc-fc1b-46d0-994...,TCGA-BS-A0UM,1,160,90,0,64,1,0,22,...,1,0,0,1,0,0,0,1,0,0
2,TCGA-BS-A0UM-01A-01-TS1.46f4a264-659e-4f14-b66...,TCGA-BS-A0UM,1,160,90,0,64,1,0,22,...,1,0,0,1,0,0,0,1,0,0
3,TCGA-BG-A0YU-01Z-00-DX1.02B228E1-6D67-4DCD-A87...,TCGA-BG-A0YU,0,157,107,0,37,1,,50,...,0,0,0,1,0,0,0,1,0,0
4,TCGA-BG-A0YU-01A-02-TSB.532f2d2b-50e6-421a-900...,TCGA-BG-A0YU,0,157,107,0,37,1,,50,...,0,0,0,1,0,0,0,1,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1299,TCGA-AJ-A5DV-01A-01-TS1.EB6105EF-8C73-4159-912...,TCGA-AJ-A5DV,1,160,147,0,65,,0,3,...,0,0,0,1,0,0,0,0,0,1
1300,TCGA-AX-A3FS-01A-01-TS1.27AE39D9-679D-42D4-AD3...,TCGA-AX-A3FS,1,139,98,0,82,1,0,,...,0,0,0,1,0,0,0,0,0,1
1301,TCGA-D1-A3JP-01Z-00-DX1.1EA8EA61-E887-44DF-943...,TCGA-D1-A3JP,1,159,59,1,61,1,0,0,...,1,0,0,1,0,0,0,0,0,1
1302,TCGA-BS-A0TC-01Z-00-DX1.78FBB6EA-2E49-4208-97B...,TCGA-BS-A0TC,1,147,68,0,69,1,0,9,...,1,0,0,1,0,0,0,1,0,0


In [54]:
# One table per endpoint: drop the other endpoint's columns and give the
# remaining status/time pair the common names event / survival_months.
df_os = (
    encoded_df
    .drop(columns=["dfs_status", "dfs_months"])
    .rename(columns={"os_status": "event", "os_months": "survival_months"})
)
df_dfs = (
    encoded_df
    .drop(columns=["os_status", "os_months"])
    .rename(columns={"dfs_status": "event", "dfs_months": "survival_months"})
)
df_os.shape, df_os.isna().sum()

((1304, 30),
 slide_id                      0
 case_id                       0
 menopause                   135
 height                       79
 weight                       64
 history_other_malignancy      0
 age                           4
 open_sx                      56
 peritoneal_washing          318
 tumor_invasion_percent      156
 event                         0
 survival_months               0
 group                         0
 stage_binary                  0
 race_asian                    0
 race_black_or_aa              0
 race_white                    0
 biopsy_excision               0
 biopsy_other                  0
 biopsy_resection              0
 residual_tumor_0              0
 residual_tumor_1              0
 residual_tumor_2              0
 clinical_stage_0              0
 clinical_stage_1              0
 clinical_stage_2              0
 clinical_stage_3              0
 grade_0                       0
 grade_1                       0
 grade_2                      

In [56]:
# Keep only rows with a known disease-free-survival event.
# .copy() detaches the filtered rows from the original frame, so later column
# assignments on df_dfs are unambiguous and raise no SettingWithCopyWarning.
df_dfs = df_dfs[df_dfs["event"].notna()].copy()
df_dfs.shape, df_dfs.isna().sum()

((1219, 30),
 slide_id                      0
 case_id                       0
 menopause                   133
 height                       73
 weight                       58
 history_other_malignancy      0
 age                           4
 open_sx                      51
 peritoneal_washing          289
 tumor_invasion_percent      148
 event                         0
 survival_months               0
 group                         0
 stage_binary                  0
 race_asian                    0
 race_black_or_aa              0
 race_white                    0
 biopsy_excision               0
 biopsy_other                  0
 biopsy_resection              0
 residual_tumor_0              0
 residual_tumor_1              0
 residual_tumor_2              0
 clinical_stage_0              0
 clinical_stage_1              0
 clinical_stage_2              0
 clinical_stage_3              0
 grade_0                       0
 grade_1                       0
 grade_2                      

In [63]:
# Keep only the first 23 characters of each slide id in both endpoint tables.
for endpoint_df in (df_os, df_dfs):
    endpoint_df["slide_id"] = endpoint_df["slide_id"].str.slice(stop=23)

In [64]:
# Write the two endpoint tables to the shared datasets folder.
for endpoint_df, csv_path in (
    (df_os, "../../datasets_csv/tcga_ucec_os.csv"),
    (df_dfs, "../../datasets_csv/tcga_ucec_dfs.csv"),
):
    endpoint_df.to_csv(csv_path, index=False)

In [65]:
# Save the case_id / slide_id pairs of the OS table as the slide list.
df_os[["case_id", "slide_id"]].to_csv("/media/nfs/SURV/TCGA_UCEC/slide_list.csv", index=False)

In [62]:
# Return the stored slide_id whose first 23 characters equal this barcode;
# .item() raises unless exactly one row matches.
df_os["slide_id"][df_os["slide_id"].str[:23] == "TCGA-BS-A0UM-01Z-00-DX1"].item()

'TCGA-BS-A0UM-01Z-00-DX1.1280476C-4274-4432-B379-41B18DF3DAB'

# Inspect the Genetic Data

In [42]:
# Study folder read by the genetic-data cells below.
# NOTE(review): this notebook downloads and processes ucec_tcga, but this section
# reads ov_tcga and writes tcga_ov_* files — confirm which study is intended.
dname = "ov_tcga"

In [43]:
# Sizes of the OS and DFS tables the genetic data will be matched against.
df_os.shape, df_dfs.shape

((1231, 21), (1036, 21))

## 1. Mut

In [47]:
df = pd.read_csv(f"./{dname}/data_mutations.txt", sep="\t")
# exclude silent mutations
df_woSilent = df[df['Variant_Classification'] != 'Silent']

# Sample x gene table holding the number of non-silent mutation records.
pivot_df = pd.pivot_table(df_woSilent, values='Variant_Classification', index=['Tumor_Sample_Barcode'], columns=['Hugo_Symbol'],
                          aggfunc='count', fill_value=0).reset_index()

# Reduce the counts to a mutated / not-mutated indicator.
pivot_df.iloc[:, 1:] = pivot_df.iloc[:, 1:].clip(upper=1)

mut_genes = list(pivot_df.columns[1:])

# case_id is the sample barcode without its last "-"-separated field.
# `n` is passed by keyword: pandas >= 2.0 rejects it as a positional argument.
pivot_df["case_id"] = pivot_df["Tumor_Sample_Barcode"].str.rsplit("-", n=1).str[0]
mut_df = pivot_df[["case_id"]+mut_genes].rename(columns={k: k+"_mut" for k in mut_genes}).reset_index(drop=True)

# cases with the follow-up data
mut_df = mut_df[mut_df["case_id"].isin(df_os["case_id"].values)]

# Sanity checks: each should print False (no duplicate cases, no duplicate
# column names, no missing values).
print(mut_df["case_id"].duplicated().any())
print(mut_df.columns.duplicated().any())
print(mut_df.isna().any().any())
mut_df

False
False
False


Hugo_Symbol,case_id,A1CF_mut,A2M_mut,A2ML1_mut,AACS_mut,AADACL4_mut,AAED1_mut,AAMP_mut,AARS_mut,AARS2_mut,...,ZWILCH_mut,ZXDA_mut,ZXDB_mut,ZXDC_mut,ZYG11B_mut,ZZEF1_mut,ZZZ3_mut,hsa-mir-7162_mut,uc003vym.2_mut,uc003vyo.2_mut
0,TCGA-04-1331,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,TCGA-04-1332,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,TCGA-04-1336,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,TCGA-04-1337,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,TCGA-04-1338,0,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
311,TCGA-61-2104,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
312,TCGA-61-2109,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
313,TCGA-61-2110,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
314,TCGA-61-2111,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [52]:
# Save the mutation table for the OS cohort, then the rows whose case also
# appears in the DFS table; both files are zip-compressed CSVs without an index.
mut_df.to_csv(f"../../datasets_csv/tcga_ov_os_mut.csv.zip", index=False, compression="zip")
mut_in_dfs = mut_df["case_id"].isin(df_dfs["case_id"].values)
mut_df_dfs = mut_df.loc[mut_in_dfs]
print(mut_df_dfs.shape)
mut_df_dfs.to_csv(f"../../datasets_csv/tcga_ov_dfs_mut.csv.zip", index=False, compression="zip")

(266, 8766)


## 2. CNV

In [50]:
# Build a case-by-gene copy-number table (`cnv_df`) with one column per gene symbol.
df = pd.read_csv(f"./{dname}/data_cna.txt", sep="\t")

# Flag every row whose gene symbol occurs more than once.
duplicated_genes = df["Hugo_Symbol"].duplicated(keep=False)

# Separate duplicates and non-duplicates
df_duplicated = df[duplicated_genes].copy()
df_non_duplicated = df[~duplicated_genes].copy()

# Collapse rows sharing a gene symbol to their per-column median
averaged_values = df_duplicated.groupby("Hugo_Symbol").median().reset_index()

df_final = pd.concat([df_non_duplicated, averaged_values], ignore_index=True)
df_final = df_final.sort_values("Hugo_Symbol").reset_index(drop=True)
df_final = df_final.drop(columns="Entrez_Gene_Id")

# Transpose so that rows are samples and columns are genes.
cnv_df = df_final.set_index("Hugo_Symbol").T.reset_index().rename(columns={"index": "case_id"})
# Case id = sample barcode without its last "-"-separated field.
# `n` is passed by keyword because pandas >= 2.0 rejects it as a positional argument.
cnv_df["case_id"] = cnv_df["case_id"].str.rsplit("-", n=1).str[0]

cnv_cols = list(cnv_df.columns[1:])
cnv_df = cnv_df[["case_id"]+ cnv_cols].rename(columns={col: col+"_cnv" for col in cnv_cols})

# Keep only cases present in the OS table.
cnv_df = cnv_df[cnv_df["case_id"].isin(df_os["case_id"].values)]
# Sanity checks: repeated case ids, repeated column names, missing values.
print(cnv_df["case_id"].duplicated().any())
print(cnv_df.columns.duplicated().any())
print(cnv_df.isna().any().any())
cnv_df

False
False
False


Hugo_Symbol,case_id,7SK|ENSG00000232512.2_cnv,7SK|ENSG00000249352.3_cnv,7SK|ENSG00000254144.2_cnv,7SK|ENSG00000260682.2_cnv,7SK|ENSG00000271765.1_cnv,7SK|ENSG00000271814.1_cnv,7SK|ENSG00000271818.1_cnv,A1BG_cnv,A1CF_cnv,...,snoZ185_cnv,snoZ247_cnv,snoZ278_cnv,snoZ40_cnv,snoZ5_cnv,snoZ6|ENSG00000252200.1_cnv,snoZ6|ENSG00000253067.1_cnv,snoZ6|ENSG00000264452.1_cnv,snoZ6|ENSG00000266692.1_cnv,snosnR66_cnv
0,TCGA-04-1331,0.0,-1.0,0.0,-1.0,-1.0,0.0,-1.0,0.0,0.0,...,0.0,0.0,-1.0,0.0,0.0,0.0,-1.0,0.0,0.0,-1.0
1,TCGA-04-1332,1.0,-1.0,1.0,-1.0,-1.0,1.0,-1.0,1.0,0.0,...,-1.0,0.0,0.0,1.0,0.0,1.0,0.0,1.0,1.0,1.0
2,TCGA-04-1335,-1.0,-1.0,1.0,0.0,-1.0,-1.0,-1.0,-1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,-1.0,0.0,-1.0,-1.0,-1.0
3,TCGA-04-1336,0.0,-1.0,2.0,-1.0,0.0,-1.0,-1.0,-1.0,0.0,...,0.0,1.0,-1.0,0.0,-1.0,0.0,0.0,0.0,0.0,0.0
4,TCGA-04-1337,0.0,1.0,0.0,-1.0,-1.0,0.0,1.0,0.0,0.0,...,1.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
564,TCGA-61-2614,1.0,1.0,1.0,-1.0,-1.0,-1.0,-1.0,2.0,-1.0,...,1.0,1.0,-1.0,1.0,-1.0,-1.0,1.0,1.0,1.0,-1.0
575,TCGA-OY-A56P,0.0,0.0,1.0,-1.0,-1.0,0.0,-1.0,-1.0,-1.0,...,1.0,1.0,-1.0,1.0,1.0,-1.0,0.0,0.0,0.0,1.0
576,TCGA-OY-A56Q,0.0,-1.0,1.0,1.0,0.0,0.0,-1.0,-1.0,1.0,...,1.0,1.0,-1.0,1.0,1.0,2.0,-1.0,0.0,0.0,-1.0
577,TCGA-VG-A8LO,0.0,-1.0,2.0,-1.0,0.0,1.0,0.0,0.0,0.0,...,0.0,0.0,2.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0


In [53]:
# Save the CNV table for the OS cohort, then the rows whose case also appears
# in the DFS table; both files are zip-compressed CSVs without an index.
cnv_df.to_csv(f"../../datasets_csv/tcga_ov_os_cnv.csv.zip", index=False, compression="zip")
cnv_in_dfs = cnv_df["case_id"].isin(df_dfs["case_id"].values)
cnv_df_dfs = cnv_df.loc[cnv_in_dfs]
print(cnv_df_dfs.shape)
cnv_df_dfs.to_csv(f"../../datasets_csv/tcga_ov_dfs_cnv.csv.zip", index=False, compression="zip")

(477, 24766)


## 3. mRNA-Seq

In [54]:
# Build a case-by-gene expression table (`rna_df`) with one column per gene symbol.
df = pd.read_csv(f"./{dname}/data_mrna_seq_v2_rsem.txt", sep="\t")
# Flag every row whose gene symbol occurs more than once.
duplicated_genes = df["Hugo_Symbol"].duplicated(keep=False)

# Separate duplicates and non-duplicates
df_duplicated = df[duplicated_genes].copy()
df_non_duplicated = df[~duplicated_genes].copy()

# Average the duplicated values
averaged_values = df_duplicated.groupby("Hugo_Symbol").mean().reset_index()

df_final = pd.concat([df_non_duplicated, averaged_values], ignore_index=True)
df_final = df_final.sort_values("Hugo_Symbol").reset_index(drop=True)

# Sample columns start at position 2 (after Hugo_Symbol and Entrez_Gene_Id);
# every sample whose barcode does not end in "-01" is dropped.
recurrent_tm = [i for i in df_final.columns[2:] if i.rsplit("-")[-1] != "01"]
df_final = df_final.drop(columns="Entrez_Gene_Id")
df_final = df_final.drop(columns=recurrent_tm)
# Rows without a gene symbol cannot become a column name, so remove them.
df_final = df_final.drop(df_final.index[df_final["Hugo_Symbol"].isna()])
# Transpose so that rows are samples and columns are genes.
df_cases = df_final.set_index("Hugo_Symbol").T.reset_index().rename(columns={"index": "case_id"})
# Case id = sample barcode without its last "-"-separated field.
# `n` is passed by keyword because pandas >= 2.0 rejects it as a positional argument.
df_cases["case_id"] = df_cases["case_id"].str.rsplit("-", n=1).str[0]
rna_genes = list(df_cases.columns[1:])
rna_df = df_cases[["case_id"]+ rna_genes].rename(columns={col: col+"_rna" for col in rna_genes})
# Keep only cases present in the OS table.
rna_df = rna_df[rna_df["case_id"].isin(df_os["case_id"].values)]

# Sanity checks: repeated case ids, repeated column names, missing values.
print(rna_df["case_id"].duplicated().any())
print(rna_df.columns.duplicated().any())
print(rna_df.isna().any().any())

rna_df

False
False
False


Hugo_Symbol,case_id,133K02_rna,5T4_rna,A-C1_rna,A1BG_rna,A1BG-AS1_rna,A1CF_rna,A2M_rna,A2M-AS1_rna,A2ML1_rna,...,ZWILCH_rna,ZWINT_rna,ZWS1_rna,ZXDA_rna,ZXDB_rna,ZXDC_rna,ZYG11A_rna,ZYG11B_rna,ZYX_rna,ZZZ3_rna
0,TCGA-04-1348,431.8365,331.8096,97.6069,66.4695,36.3243,0.000,5899.8279,118.4566,7.5289,...,928.9002,794.5684,373.7564,36.0312,235.2783,827.1041,5.6467,560.0968,15871.2019,475.9344
2,TCGA-04-1362,386.5598,1007.1073,90.6827,41.6412,23.2465,0.331,3350.4207,71.3613,5.6263,...,421.6182,605.3235,385.8978,93.3304,788.6746,1575.0325,56.5939,915.7627,6137.2982,803.8987
3,TCGA-04-1364,334.9785,345.5791,100.4936,187.0368,114.6008,0.000,1455.2316,67.8607,5.0883,...,913.9953,1079.9878,661.9006,13.9928,137.3836,1138.5030,13.1447,811.5809,5972.3706,444.8006
4,TCGA-04-1365,277.2337,483.9536,46.7078,23.9295,10.4957,0.000,3999.3792,52.1501,3.3148,...,646.5632,1882.7784,312.7919,35.8596,328.4617,1455.7782,7.2322,1031.7915,7211.9934,787.1026
5,TCGA-04-1514,316.0622,114.2775,24.1796,32.8123,20.1900,0.000,3224.5797,224.7582,3.7421,...,556.5630,1082.3258,483.0167,126.6552,959.1249,965.1698,20.4375,1082.6137,5867.2855,1149.1077
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
298,TCGA-61-2113,261.0291,2215.4276,49.2706,70.4394,106.1623,0.000,5365.7972,23.5625,2.4461,...,540.3477,458.1113,584.2579,23.4122,159.3431,1277.1905,117.0612,776.4480,6787.8047,972.1324
299,TCGA-OY-A56P,226.8219,514.5571,35.7999,43.1736,47.8864,0.000,8773.5936,84.2153,98.3162,...,325.1728,303.2307,981.5590,99.3849,637.4523,2368.4052,28.5865,2075.0596,10157.0254,2826.3236
300,TCGA-OY-A56Q,169.9434,376.8744,25.3249,33.6854,25.3482,0.000,10006.7877,582.3492,269.5768,...,392.0826,625.4582,744.4185,48.9837,298.2339,1854.3819,119.6268,1157.2809,10968.3439,2466.5112
301,TCGA-VG-A8LO,291.8109,168.7737,34.1412,27.9604,17.9853,0.000,1888.1746,59.8084,4.5092,...,824.4915,602.6250,493.1154,72.4696,492.1491,1916.4184,6.4417,1528.9476,8124.0035,1297.3669


In [55]:
# Save the expression table for the OS cohort, then the rows whose case also
# appears in the DFS table; both files are zip-compressed CSVs without an index.
rna_df.to_csv(f"../../datasets_csv/tcga_ov_os_rna.csv.zip", index=False, compression="zip")
rna_in_dfs = rna_df["case_id"].isin(df_dfs["case_id"].values)
rna_df_dfs = rna_df.loc[rna_in_dfs]
print(rna_df_dfs.shape)
rna_df_dfs.to_csv(f"../../datasets_csv/tcga_ov_dfs_rna.csv.zip", index=False, compression="zip")

(245, 20514)


## 4. RPPA-Exp

In [56]:
# Build a case-by-protein table (`pro_df`) from the RPPA data.
df = pd.read_csv(f"./{dname}/data_rppa.txt", sep="\t")
# True here would mean repeated protein identifiers that need collapsing first.
print(df["Composite.Element.REF"].duplicated(keep=False).any())
df_final = df.sort_values("Composite.Element.REF").reset_index(drop=True)

# "Composite.Element.REF" is the only identifier column used in this cell, so
# sample columns are scanned from position 1 (starting at 2 would silently
# skip the first sample). Every column not ending in "-01" is dropped.
recurrent_tm = [i for i in df_final.columns[1:] if i.rsplit("-")[-1] != "01"]
print(df_final.shape)
print("Recurrent tumors: ", len(recurrent_tm))
df_final = df_final.drop(columns=recurrent_tm)
print(df_final.shape)
df_final = df_final.drop(df_final.index[df_final["Composite.Element.REF"].isna()])
print(df_final.shape)
# dropna(thresh=k) keeps a row only if it has at least k non-missing values;
# with k = 80% of the column count, proteins with too many gaps are removed.
df_clean = df_final.dropna(thresh=int(df_final.shape[1]*.8))
print("After removing empty proteins in > 80%: ", df_clean.shape)
# Transpose so that rows are samples and columns are proteins.
df_cases = df_clean.set_index("Composite.Element.REF").T.reset_index().rename(columns={"index": "case_id"})
# Case id = sample barcode without its last "-"-separated field.
# `n` is passed by keyword because pandas >= 2.0 rejects it as a positional argument.
df_cases["case_id"] = df_cases["case_id"].str.rsplit("-", n=1).str[0]

pro_genes = list(df_cases.columns[1:])
pro_df = df_cases[["case_id"]+ pro_genes].rename(columns={col: col+"_pro" for col in pro_genes})
# Keep only cases present in the OS table.
pro_df = pro_df[pro_df["case_id"].isin(df_os["case_id"].values)]
print("Cases with cli data: ", pro_df.shape)
# Sanity checks: repeated case ids, repeated column names, missing values
# (missing values can remain because rows are only filtered by the threshold above).
print(pro_df["case_id"].duplicated().any())
print(pro_df.columns.duplicated().any())
print(pro_df.isna().any().any())

pro_df

False
(208, 437)
Recurrent tumors:  11
(208, 426)
(208, 426)
After removing empty proteins in > 80%:  (204, 426)
Cases with cli data:  (414, 205)
False
False
True


Composite.Element.REF,case_id,ACACA ACACB|ACC_pS79_pro,ACACA|ACC1_pro,ACVRL1|ACVRL1_pro,AKT1 AKT2 AKT3|Akt_pro,AKT1 AKT2 AKT3|Akt_pS473_pro,AKT1 AKT2 AKT3|Akt_pT308_pro,AKT1S1|PRAS40_pT246_pro,ANXA1|Annexin-1_pro,ANXA7|Annexin_VII_pro,...,XBP1|XBP1_pro,XRCC1|XRCC1_pro,XRCC5|Ku80_pro,YAP1|YAP_pro,YAP1|YAP_pS127_pro,YBX1|YB-1_pro,YBX1|YB-1_pS102_pro,YWHAB|14-3-3_beta_pro,YWHAE|14-3-3_epsilon_pro,YWHAZ|14-3-3_zeta_pro
0,TCGA-04-1335,0.050157,0.338735,0.028899,-0.362076,1.902311,1.241415,-0.128162,0.415967,0.045857,...,0.457778,-0.612430,0.273903,-0.246701,0.053472,0.234894,-0.110302,-0.008553,0.059854,-0.040743
1,TCGA-04-1336,-0.122685,-0.514174,-0.072171,0.124800,0.118800,0.036025,-0.068028,-0.600145,-0.070430,...,0.124803,-0.026779,-0.016853,-0.082328,0.424904,-0.322371,-0.106064,-0.115064,-0.075638,0.466381
2,TCGA-04-1338,-0.788799,-0.684709,0.265096,-0.116073,0.339617,0.454237,-0.418403,0.269226,0.147967,...,0.185344,-0.147648,-0.454878,0.415796,-0.049777,-0.204733,-0.283831,0.182304,0.255061,0.500384
3,TCGA-04-1341,0.544080,0.482856,0.474257,-0.160586,0.590164,0.227633,0.259963,-0.180727,0.128443,...,0.066478,0.025270,-0.266316,-0.361657,0.345984,0.413982,0.746081,0.184517,0.032904,0.126560
4,TCGA-04-1342,0.291188,0.317670,0.169391,-0.319387,1.022617,0.570207,-0.071549,-0.381018,0.073137,...,0.938900,-0.108158,-0.158992,-0.193000,0.300835,0.797873,-0.160503,-0.020604,-0.134749,-0.273026
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
420,TCGA-61-2614,0.294133,0.042560,-0.072756,-0.003160,0.807530,0.453940,0.137475,0.124468,-0.065395,...,-0.042419,-0.282149,0.052371,-0.333259,-0.507655,0.086028,-0.002333,-0.277144,-0.066282,-0.476036
421,TCGA-OY-A56P,0.678777,0.746679,-0.136984,-0.508459,-0.097865,-0.177417,0.148403,0.210075,-0.120179,...,0.034178,0.389113,-0.013918,0.588104,0.698385,0.463170,-0.131119,-0.140199,-0.069463,-0.157745
422,TCGA-OY-A56Q,0.069934,-0.019771,-0.088824,-0.042172,0.078807,-0.063589,0.235932,0.683164,0.330481,...,-0.053597,0.891833,0.170997,0.341120,0.716418,-0.088501,-0.041030,-0.085609,0.089297,0.261936
423,TCGA-VG-A8LO,0.116778,-0.085238,-0.027367,-0.176764,0.325561,0.058855,0.120482,0.470819,0.367324,...,0.270166,-0.315340,-0.263736,0.262682,0.413353,-0.274155,-0.104356,0.029152,0.069522,0.189983


In [57]:
# Save the RPPA table for the OS cohort, then the rows whose case also appears
# in the DFS table; both files are zip-compressed CSVs without an index.
pro_df.to_csv(f"../../datasets_csv/tcga_ov_os_pro.csv.zip", index=False, compression="zip")
pro_in_dfs = pro_df["case_id"].isin(df_dfs["case_id"].values)
pro_df_dfs = pro_df.loc[pro_in_dfs]
print(pro_df_dfs.shape)
pro_df_dfs.to_csv(f"../../datasets_csv/tcga_ov_dfs_pro.csv.zip", index=False, compression="zip")

(359, 205)


## 5. DNAm

In [58]:
# Build a case-by-gene methylation table (`dna_df`) with one column per gene symbol.
df = pd.read_csv(f"./{dname}/data_methylation_hm27.txt", sep="\t")
# Flag every row whose gene symbol occurs more than once.
duplicated_genes = df["Hugo_Symbol"].duplicated(keep=False)

# Separate duplicates and non-duplicates
df_duplicated = df[duplicated_genes].copy()
df_non_duplicated = df[~duplicated_genes].copy()

# Average the duplicated values
averaged_values = df_duplicated.groupby("Hugo_Symbol").mean().reset_index()

df_final = pd.concat([df_non_duplicated, averaged_values], ignore_index=True)
df_final = df_final.sort_values("Hugo_Symbol").reset_index(drop=True)
print(df_final.shape)

# dropna(thresh=k) keeps a row only if it has at least k non-missing values,
# here k = 80% of the column count.
df_clean = df_final.dropna(thresh=int(df_final.shape[1]*.8))
print(df_clean.shape)
# NOTE(review): starting at position 2 assumes two leading identifier columns
# (Hugo_Symbol plus one more, presumably Entrez_Gene_Id) — confirm. That second
# column is not dropped in this cell; after the transpose it becomes a row that
# only the df_os case filter below can remove.
recurrent_tm = [i for i in df_final.columns[2:] if i.rsplit("-")[-1] != "01"]
print(len(recurrent_tm))
df_clean = df_clean.drop(columns=recurrent_tm)
df_clean = df_clean.drop(df_clean.index[df_clean["Hugo_Symbol"].isna()])
print(df_clean.shape)

# Transpose so that rows are samples and columns are genes.
df_cases = df_clean.set_index("Hugo_Symbol").T.reset_index().rename(columns={"index": "case_id"})
print(df_cases.columns.duplicated().any())
# Case id = sample barcode without its last "-"-separated field.
# `n` is passed by keyword because pandas >= 2.0 rejects it as a positional argument.
df_cases["case_id"] = df_cases["case_id"].str.rsplit("-", n=1).str[0]
dna_genes = list(df_cases.columns[1:])
dna_df = df_cases[["case_id"]+ dna_genes].rename(columns={col: col+"_dna" for col in dna_genes})
# Keep only cases present in the OS table.
dna_df = dna_df[dna_df["case_id"].isin(df_os["case_id"].values)]
print("Cases with cli data: ", dna_df.shape)
# Sanity checks: repeated case ids, repeated column names, missing values
# (missing values can remain because rows are only filtered by the threshold above).
print(dna_df["case_id"].duplicated().any())
print(dna_df.columns.duplicated().any())
print(dna_df.isna().any().any())
dna_df

(14872, 593)
(14371, 593)
12
(14371, 581)
False
Cases with cli data:  (559, 14372)
False
False
True


Hugo_Symbol,case_id,A1BG_dna,A1CF_dna,A2BP1_dna,A2M_dna,A2ML1_dna,A4GALT_dna,A4GNT_dna,AAAS_dna,AACS_dna,...,ZSWIM1_dna,ZSWIM3_dna,ZSWIM7_dna,ZUFSP_dna,ZW10_dna,ZWILCH_dna,ZWINT_dna,ZYX_dna,ZZEF1_dna,ZZZ3_dna
0,TCGA-04-1331,0.981805,0.816419,0.196513,0.582584,0.469370,0.269578,0.806530,0.067518,0.098671,...,0.049296,0.448896,0.026618,0.034261,0.021411,0.048051,0.014654,0.012361,0.015735,0.018789
1,TCGA-04-1332,0.982110,0.565820,0.022767,0.552774,0.829701,0.296869,0.700986,0.057805,0.017430,...,0.038076,0.182496,0.015868,0.029626,0.022453,0.047327,0.013715,0.012677,0.011100,0.025092
2,TCGA-04-1335,0.957049,0.603784,0.681104,0.394409,0.669948,0.249889,0.820301,0.118457,0.011785,...,0.284387,0.393827,0.034591,0.134409,0.023022,0.046214,0.014255,0.008811,0.024669,0.072879
3,TCGA-04-1336,0.952807,,0.009456,0.466297,0.228725,0.386736,0.489553,0.083912,0.011477,...,0.347038,0.401215,0.032352,0.052731,0.025746,0.033919,0.014425,0.013426,0.014864,0.046425
4,TCGA-04-1337,0.986589,0.698739,0.170477,0.667076,0.896899,0.339014,0.928064,0.049442,0.034056,...,0.022741,0.387501,0.019475,0.022706,0.017513,0.034557,0.012591,0.012550,0.011353,0.016047
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
565,TCGA-61-2111,0.954911,0.268565,0.390809,0.805110,0.733029,0.080300,0.619909,0.150095,0.011890,...,0.037573,0.031485,0.022357,0.014230,0.010066,0.034627,0.014369,0.007447,0.015375,0.013467
566,TCGA-61-2113,0.949360,0.571589,0.131924,0.809480,0.430048,0.172873,0.375198,0.125191,0.087687,...,0.060467,0.137753,0.038985,0.018988,0.018274,0.045965,0.017367,0.010630,0.021257,0.025499
567,TCGA-61-2612,0.921891,0.815743,0.240453,0.902233,0.535340,0.331119,0.868159,0.114558,0.177753,...,0.066533,0.266707,0.026484,0.025178,0.037937,0.045864,0.034713,0.116062,0.040508,0.054314
568,TCGA-61-2613,0.928122,0.834670,0.071404,0.762165,0.549529,0.292376,0.895432,0.099741,0.014708,...,0.100164,0.036415,0.043217,0.038979,0.034316,0.032903,0.025793,0.162768,0.063007,0.045219


In [59]:
# Save the methylation table for the OS cohort, then the rows whose case also
# appears in the DFS table; both files are zip-compressed CSVs without an index.
dna_df.to_csv(f"../../datasets_csv/tcga_ov_os_dna.csv.zip", index=False, compression="zip")
dna_in_dfs = dna_df["case_id"].isin(df_dfs["case_id"].values)
dna_df_dfs = dna_df.loc[dna_in_dfs]
print(dna_df_dfs.shape)
dna_df_dfs.to_csv(f"../../datasets_csv/tcga_ov_dfs_dna.csv.zip", index=False, compression="zip")

(478, 14372)


In [60]:
# Rows of each survival table whose `group` equals 0.
# NOTE(review): the `_dx` suffix suggests these are the diagnostic slides — confirm.
df_os_dx = df_os.loc[df_os["group"].eq(0)]
df_dfs_dx = df_dfs.loc[df_dfs["group"].eq(0)]
print(df_os_dx.shape, df_dfs_dx.shape)

(303, 21) (221, 21)


In [63]:
# Number of distinct case ids in the OS table, then in the DFS table.
tuple(len(frame["case_id"].unique()) for frame in (df_os, df_dfs))

(574, 490)

In [69]:
# Restrict every omics table to the cases of the group-0 OS subset ...
os_dx_ids = df_os_dx["case_id"].values
dna_df_os_dx = dna_df.loc[dna_df["case_id"].isin(os_dx_ids)]
cnv_df_os_dx = cnv_df.loc[cnv_df["case_id"].isin(os_dx_ids)]
mut_df_os_dx = mut_df.loc[mut_df["case_id"].isin(os_dx_ids)]
rna_df_os_dx = rna_df.loc[rna_df["case_id"].isin(os_dx_ids)]
pro_df_os_dx = pro_df.loc[pro_df["case_id"].isin(os_dx_ids)]
print(mut_df_os_dx.shape, cnv_df_os_dx.shape, rna_df_os_dx.shape, pro_df_os_dx.shape, dna_df_os_dx.shape)

# ... and to the cases of the group-0 DFS subset.
dfs_dx_ids = df_dfs_dx["case_id"].values
dna_df_dfs_dx = dna_df.loc[dna_df["case_id"].isin(dfs_dx_ids)]
cnv_df_dfs_dx = cnv_df.loc[cnv_df["case_id"].isin(dfs_dx_ids)]
mut_df_dfs_dx = mut_df.loc[mut_df["case_id"].isin(dfs_dx_ids)]
rna_df_dfs_dx = rna_df.loc[rna_df["case_id"].isin(dfs_dx_ids)]
pro_df_dfs_dx = pro_df.loc[pro_df["case_id"].isin(dfs_dx_ids)]
print(mut_df_dfs_dx.shape, cnv_df_dfs_dx.shape, rna_df_dfs_dx.shape, pro_df_dfs_dx.shape, dna_df_dfs_dx.shape)

(61, 8766) (98, 24766) (69, 20514) (71, 205) (95, 14372)
(42, 8766) (73, 24766) (49, 20514) (54, 205) (70, 14372)


In [70]:
def _coverage_pct(case_counts, cohort_size):
    """Return each count as a rounded percentage of `cohort_size`."""
    return [round(count * 100 / cohort_size) for count in case_counts]


# Hard-coded row counts in the order mut, cnv, rna, pro, dna, each expressed
# as a percentage of a hard-coded cohort size.
os_cases = [312, 558, 299, 414, 559]
print(_coverage_pct(os_cases, 574))

dfs_cases = [266, 477, 245, 359, 478]
print(_coverage_pct(dfs_cases, 490))

os_dx_cases = [61, 98, 69, 71, 95]
print(_coverage_pct(os_dx_cases, 303))

dfs_dx_cases = [42, 73, 49, 54, 70]
print(_coverage_pct(dfs_dx_cases, 221))

[54, 97, 52, 72, 97]
[54, 97, 50, 73, 98]
[20, 32, 23, 23, 31]
[19, 33, 22, 24, 32]
