# Match slides

In [32]:
import os
import numpy as np
import pandas as pd

In [33]:
# The FFPE slide list was exported from the GDC website by adding a filter
# named "is_ffpe", because this flag could not be found among the
# downloaded data files.
ffpe_list = pd.read_csv("./tcga_ov/ov_ffpe_slides.csv")["slide_id"].values
len(ffpe_list)

105

In [34]:
# Cohort identifier used to build every storage path below.
ca_type = "TCGA_OV"

# Folders holding the slides, the patch files, and the extracted features.
slide_folder = f"/media/nfs/SURV/{ca_type}/Slides"
patch_folder = f"/media/nfs/SURV/{ca_type}/SP1024/patches"
feat_folder = f"/media/nfs/SURV/{ca_type}/Feats1024/"
# Sanity check: entry counts of the slide folder, the patch folder, and each of the CTP/SSL/Res feature sub-folders.
print(len(os.listdir(slide_folder)), len(os.listdir(patch_folder)), [len(os.listdir(feat_folder+fe)) for fe in ["CTP", "SSL", "Res"]])
# From here on `feat_folder` points at the "CTP" sub-folder only.
feat_folder = feat_folder+"CTP/"

1274 1250 [1250, 1250, 1250]


In [36]:
# Patient-level clinical table (tab-separated); header=4 uses row index 4 (0-based) as the column header.
df = pd.read_csv("./tcga_ov/tcga/data_clinical_patient.txt", header=4, sep="\t")

def change_na_to_pd_na(texts, df):
    """Return a copy of ``df`` in which every placeholder in ``texts`` is ``pd.NA``.

    Parameters
    ----------
    texts : list
        Placeholder values (e.g. ``'[Not Available]'``) to treat as missing.
    df : pandas.DataFrame
        Input table; it is not modified.

    Returns
    -------
    pandas.DataFrame
        A copy of ``df`` with the placeholders replaced column by column.
    """
    cleaned = df.copy()
    for name in list(cleaned.columns):
        column = cleaned[name]
        cleaned[name] = column.replace(texts, pd.NA)
    return cleaned

# Placeholder strings that the clinical table uses for missing values.
na_markers = ['[Not Available]', '[Not Applicable]', '[Unknown]', '[Discrepancy]']
df = change_na_to_pd_na(na_markers, df)
print("All clinical data: ", len(df))

# Patients without an overall-survival time cannot be used for survival analysis.
missing_followup = df["OS_MONTHS"].isna()
print("Missing follow-up data: ", len(df.index[missing_followup]))

df = df.drop(df.index[missing_followup]).reset_index(drop=True)
print("Available follow-up data:", len(df))

All clinical data:  587
Missing follow-up data:  5
Available follow-up data: 582


In [37]:
# Strip the extension with os.path.splitext rather than a fixed-width slice,
# so slide names stay intact whatever the length of the file extension.
files = [os.path.splitext(i)[0] for i in os.listdir(slide_folder)]
# The first 12 characters of a slide name are used as the case (patient) ID.
cases = np.unique([i[:12] for i in files])

print("Total files/cases: ", len(files), len(cases))
# Keep only the cases (and their slides) that still have follow-up data in `df`.
followup_ids = set(df["PATIENT_ID"].values)
cases = [i for i in cases if i in followup_ids]
case_ids = set(cases)
files = [i for i in files if i[:12] in case_ids]
print("Slides with available follow-up files/cases: ", len(files), len(cases))

# Slides listed in `ffpe_list` are FFPE; every other slide is treated as frozen.
ffpe_ids = set(ffpe_list)
ffpe_files = [i for i in files if i in ffpe_ids]
frozen_files = [i for i in files if i not in ffpe_ids]
ffpe_cases = np.unique([i[:12] for i in ffpe_files])
frozen_cases = np.unique([i[:12] for i in frozen_files])

print("\tFFPE files/cases: ", len(ffpe_files), len(ffpe_cases))
print("\tFrozen files/cases: ", len(frozen_files), len(frozen_cases))

Total files/cases:  1274 585
Slides with available follow-up files/cases:  1263 580
	FFPE files/cases:  104 104
	Frozen files/cases:  1159 580


In [40]:
# Strip the extension with os.path.splitext rather than a fixed-width slice,
# so names stay intact whatever the length of the patch-file extension.
seg_files = [os.path.splitext(i)[0] for i in os.listdir(patch_folder)] # segmented files
followup_files = set(files)
seg_files = [i for i in seg_files if i in followup_files] # with follow-up data
cases = np.unique([i[:12] for i in seg_files])
print("Segmented slides with available follow-up (files/cases): ", len(seg_files), len(cases))

# Slides listed in `ffpe_list` are FFPE; every other slide is treated as frozen.
ffpe_ids = set(ffpe_list)
ffpe_files = [i for i in seg_files if i in ffpe_ids]
seg_frozen_files = [i for i in seg_files if i not in ffpe_ids]
ffpe_cases = np.unique([i[:12] for i in ffpe_files])
frozen_cases = np.unique([i[:12] for i in seg_frozen_files])

print("\tFFPE (files/cases): ", len(ffpe_files), len(ffpe_cases))
print("\tFrozen (files/cases): ", len(seg_frozen_files), len(frozen_cases))

Segmented slides with available follow-up (files/cases):  1240 579
	FFPE (files/cases):  104 104
	Frozen (files/cases):  1136 579


In [41]:
# Patients that have follow-up data but no entry in `cases`
# (presumably the segmented-slide cases from the previous cell — relies on cell order).
drop_ids = [i for i in df["PATIENT_ID"].values if i not in cases]
df = df[~df["PATIENT_ID"].isin(drop_ids)]
print(df.shape)
# Display the excluded patient IDs.
drop_ids

(579, 53)


['TCGA-13-0764', 'TCGA-61-2610', 'TCGA-61-2611']

In [42]:
# Sample-level clinical table, restricted to the patients still present in `df`.
sample_df = pd.read_csv("./tcga_ov/tcga/data_clinical_sample.txt", sep="\t", header=4)
sample_df = sample_df[sample_df["PATIENT_ID"].isin(df["PATIENT_ID"].values)]
# Patients whose recorded tumor tissue site is anything other than "Ovary" are excluded.
other_site_tm = sample_df["PATIENT_ID"][sample_df["TUMOR_TISSUE_SITE"] != "Ovary"].values
print(len(other_site_tm))
df = df[~df["PATIENT_ID"].isin(other_site_tm)]
print("After excluding other sites (files/cases): ")

# Re-filter the slide lists to the remaining patients.
# NOTE: `frozen_files` is rebuilt here from `seg_frozen_files`, replacing its earlier value.
ffpe_files = [i for i in ffpe_files if i[:12] in df["PATIENT_ID"].values]
frozen_files = [i for i in seg_frozen_files if i[:12] in df["PATIENT_ID"].values]
ffpe_cases = np.unique([i[:12] for i in ffpe_files])
frozen_cases = np.unique([i[:12] for i in frozen_files])
print("\tTotal (files/cases): ", len(ffpe_files+frozen_files), len(df))
print("\tFFPE (files/cases): ", len(ffpe_files), len(ffpe_cases))
print("\tFrozen (files/cases): ", len(frozen_files), len(frozen_cases))
# Show the excluded patients together with their recorded tissue site.
sample_df[["PATIENT_ID", "TUMOR_TISSUE_SITE"]][sample_df["TUMOR_TISSUE_SITE"] != "Ovary"]

5
After excluding other sites (files/cases): 
	Total (files/cases):  1231 574
	FFPE (files/cases):  104 104
	Frozen (files/cases):  1127 574


Unnamed: 0,PATIENT_ID,TUMOR_TISSUE_SITE
283,TCGA-29-1766,Peritoneum ovary
451,TCGA-24-2290,Omentum
452,TCGA-24-2293,Omentum
511,TCGA-36-2529,Peritoneum ovary
543,TCGA-29-A5NZ,Omentum


In [45]:
# Remaining slides: split the cases by which slide types they have.
ffpe_frozen_cases = [i for i in ffpe_cases if i in frozen_cases]      # both FFPE and frozen
only_ffpe_cases = [i for i in ffpe_cases if i not in frozen_cases]    # FFPE only
only_frozen_cases = [i for i in frozen_cases if i not in ffpe_cases]  # frozen only

# Slide lists matching the three case groups above.
only_ffpe_files = [i for i in ffpe_files if i[:12] in only_ffpe_cases]
ffpe_frozen_files = [i for i in ffpe_files+frozen_files if i[:12] in ffpe_frozen_cases]
only_frozen_files = [i for i in frozen_files if i[:12] in only_frozen_cases]
# The ratios use (FFPE+frozen cases + frozen-only cases) as the denominator; FFPE-only cases are not counted.
print("FFPE+Frozen (files/cases/ratio):", len(ffpe_frozen_files), len(ffpe_frozen_cases), round(len(ffpe_frozen_cases)/(len(ffpe_frozen_cases)+len(only_frozen_cases)), 2))
print("Only Frozen (files/cases/ratio):", len(only_frozen_files), len(only_frozen_cases), round(len(only_frozen_cases)/(len(ffpe_frozen_cases)+len(only_frozen_cases)), 2))

FFPE+Frozen (files/cases/ratio): 301 104 0.18
Only Frozen (files/cases/ratio): 930 470 0.82


In [None]:
def _slides_with_clinical(slide_ids):
    """Build one row per slide (slide_id, case_id) merged with the clinical table on case_id."""
    slides = pd.DataFrame({"slide_id": slide_ids})
    # The first 12 characters of the slide name are used as the case ID.
    slides["case_id"] = slides["slide_id"].str[:12]
    clinical = df.rename(columns={"PATIENT_ID": "case_id"})
    return pd.merge(slides, clinical, on="case_id")


ffpe_frozen_df = _slides_with_clinical(ffpe_frozen_files)
only_frozen_df = _slides_with_clinical(only_frozen_files)
ffpe_frozen_df.shape, only_frozen_df.shape

((301, 54), (930, 54))

In [None]:
# Group label per slide: 0 for cases with both FFPE and frozen slides, 1 for frozen-only cases.
ffpe_frozen_df["group"] = np.zeros(len(ffpe_frozen_df))
only_frozen_df["group"] = np.ones(len(only_frozen_df))
# From here on `df` is the slide-level table (one row per slide); the original row labels are kept.
df = pd.concat([ffpe_frozen_df, only_frozen_df])
# Missing-value count per column.
df.isna().sum()

slide_id                                         0
case_id                                          0
OTHER_PATIENT_ID                                 0
FORM_COMPLETION_DATE                             0
PROSPECTIVE_COLLECTION                        1197
RETROSPECTIVE_COLLECTION                      1197
SEX                                              0
RACE                                            53
ETHNICITY                                      450
JEWISH_RELIGION_HERITAGE_INDICATOR            1177
HISTORY_OTHER_MALIGNANCY                      1211
HISTORY_NEOADJUVANT_TRTYN                        0
INITIAL_PATHOLOGIC_DX_YEAR                       0
METHOD_OF_INITIAL_SAMPLE_PROCUREMENT             8
METHOD_OF_INITIAL_SAMPLE_PROCUREMENT_OTHER    1231
TUMOR_STATUS                                   155
GRADE                                            7
RESIDUAL_TUMOR                                1113
VASCULAR_INVASION_INDICATOR                    881
LYMPHOVASCULAR_INVASION_INDICAT

In [48]:
# Drop columns with more than 50% missing values
# (the share is computed over slide-level rows, since `df` has one row per slide here).
drop_cols = [col for col in df.columns if df[col].isna().sum() > len(df)*.5]
print(len(drop_cols))
# In-place drop: re-running this cell recomputes `drop_cols` on the already-reduced table.
df.drop(columns=drop_cols, inplace=True)
df.shape

29


(1231, 26)

In [49]:
drop_cols

['PROSPECTIVE_COLLECTION',
 'RETROSPECTIVE_COLLECTION',
 'JEWISH_RELIGION_HERITAGE_INDICATOR',
 'HISTORY_OTHER_MALIGNANCY',
 'METHOD_OF_INITIAL_SAMPLE_PROCUREMENT_OTHER',
 'RESIDUAL_TUMOR',
 'VASCULAR_INVASION_INDICATOR',
 'LYMPHOVASCULAR_INVASION_INDICATOR',
 'KARNOFSKY_PERFORMANCE_SCORE',
 'ECOG_SCORE',
 'PERFORMANCE_STATUS_TIMING',
 'RADIATION_TREATMENT_ADJUVANT',
 'PHARMACEUTICAL_TX_ADJUVANT',
 'TREATMENT_OUTCOME_FIRST_COURSE',
 'DAYS_TO_TUMOR_PROGRESSION',
 'NEW_TUMOR_EVENT_AFTER_INITIAL_TREATMENT',
 'CLIN_M_STAGE',
 'CLIN_N_STAGE',
 'CLIN_T_STAGE',
 'DAYS_TO_PATIENT_PROGRESSION_FREE',
 'DISEASE_CODE',
 'EXTRANODAL_INVOLVEMENT',
 'PATH_M_STAGE',
 'PATH_N_STAGE',
 'PATH_T_STAGE',
 'AJCC_PATHOLOGIC_TUMOR_STAGE',
 'PROJECT_CODE',
 'STAGE_OTHER',
 'AJCC_STAGING_EDITION']

In [50]:
# Drop unrelated columns (identifiers, dates, and coding fields that are not used further in this notebook).
drop_list = ['OTHER_PATIENT_ID', 'FORM_COMPLETION_DATE',
       'SEX', 'INITIAL_PATHOLOGIC_DX_YEAR', 
       'DAYS_TO_INITIAL_PATHOLOGIC_DIAGNOSIS',
       'ICD_10', 'ICD_O_3_HISTOLOGY',
       'ICD_O_3_SITE', 'INFORMED_CONSENT_VERIFIED', "TISSUE_SOURCE_SITE"]
df = df.drop(drop_list, axis=1)
df.shape

(1231, 16)

# Inspect the clinical variables

In [52]:
# Reset the row labels to 0..n-1; the rename only has an effect if a 'PATIENT_ID' column is still present.
df = df.rename(columns={'PATIENT_ID': 'case_id'}).reset_index(drop=True)
# One row per case (the first slide row of each case is kept) for patient-level statistics.
case_df = df.drop_duplicates(['case_id'])
df.shape, case_df.shape

((1231, 16), (574, 16))

## continuous vars

In [53]:
# Heuristic: a column with more than 8 distinct values (NaN counts as one) is a candidate continuous variable.
cont_cols = [col for col in case_df.columns if len(case_df[col].unique()) > 8]
cont_cols

['slide_id', 'case_id', 'AGE', 'CLINICAL_STAGE', 'OS_MONTHS', 'DFS_MONTHS']

In [54]:
# Remove the columns that passed the cardinality heuristic but are not continuous:
# the clinical stage labels and the two identifier columns.
cont_cols.remove("CLINICAL_STAGE")
cont_cols.remove("slide_id")
cont_cols.remove("case_id")
len(cont_cols)

3

In [55]:
case_df["AGE"].describe()

count    574.000000
mean      59.716028
std       11.551139
min       26.000000
25%       51.000000
50%       59.000000
75%       68.000000
max       89.000000
Name: AGE, dtype: float64

## categorical vars

In [59]:
# Every column that is not in `cont_cols` is treated as categorical.
cat_cols = [col for col in case_df.columns if col not in cont_cols]
cat_cols

['slide_id',
 'case_id',
 'RACE',
 'ETHNICITY',
 'HISTORY_NEOADJUVANT_TRTYN',
 'METHOD_OF_INITIAL_SAMPLE_PROCUREMENT',
 'TUMOR_STATUS',
 'GRADE',
 'CLINICAL_STAGE',
 'HISTOLOGICAL_DIAGNOSIS',
 'OS_STATUS',
 'DFS_STATUS',
 'group']

In [60]:
# The identifier columns are not clinical variables, so drop them from the categorical list.
cat_cols.remove("slide_id")
cat_cols.remove("case_id")
len(cat_cols)

11

In [61]:
# Per-case frequency table for every categorical variable.
for col in cat_cols:
    print()
    print(col)
    for i, j in case_df[col].value_counts().items():
        # Round the percentage itself. The previous int(round(ratio, 2) * 100)
        # truncated after a float multiply (e.g. 0.29 * 100 == 28.999... -> 28).
        # The denominator is all cases, so missing values lower the shown shares.
        print(i, j, f"({int(round(100 * j / len(case_df)))}%)")


RACE
WHITE 491 (86%)
BLACK OR AFRICAN AMERICAN 33 (6%)
ASIAN 20 (3%)
AMERICAN INDIAN OR ALASKA NATIVE 3 (1%)
NATIVE HAWAIIAN OR OTHER PACIFIC ISLANDER 1 (0%)

ETHNICITY
NOT HISPANIC OR LATINO 338 (59%)
HISPANIC OR LATINO 11 (2%)

HISTORY_NEOADJUVANT_TRTYN
No 573 (100%)
Yes 1 (0%)

METHOD_OF_INITIAL_SAMPLE_PROCUREMENT
Tumor resection 460 (80%)
Cytology (e.g. Peritoneal or pleural fluid) 76 (13%)
Incisional Biopsy 13 (2%)
Fine needle aspiration biopsy 12 (2%)
Excisional Biopsy 7 (1%)
Other method, specify: 2 (0%)

TUMOR_STATUS
WITH TUMOR 362 (63%)
TUMOR FREE 145 (25%)

GRADE
G3 484 (84%)
G2 68 (12%)
GX 10 (2%)
G1 6 (1%)
GB 2 (0%)
G4 1 (0%)

CLINICAL_STAGE
Stage IIIC 405 (71%)
Stage IV 88 (15%)
Stage IIIB 24 (4%)
Stage IIC 21 (4%)
Stage IC 11 (2%)
Stage IIIA 8 (1%)
Stage IIB 5 (1%)
Stage IIA 4 (1%)
Stage IB 3 (1%)
Stage IA 2 (0%)

HISTOLOGICAL_DIAGNOSIS
Serous Cystadenocarcinoma 574 (100%)

OS_STATUS
1:DECEASED 341 (59%)
0:LIVING 233 (41%)

DFS_STATUS
1:Recurred/Progressed 355 (62%)
0:Di

In [67]:
# Drop columns that are not useful as covariates:
#   - ETHNICITY: only 2% of patients are Hispanic
#   - HISTORY_NEOADJUVANT_TRTYN: only 1 patient received neoadjuvant treatment
#   - TUMOR_STATUS: a follow-up observation rather than a primary observation
# HISTOLOGICAL_DIAGNOSIS is dropped as well.
df.drop(["ETHNICITY", "HISTORY_NEOADJUVANT_TRTYN", "TUMOR_STATUS", "HISTOLOGICAL_DIAGNOSIS"], axis=1, inplace=True)
# Refresh the one-row-per-case view after the column drop.
case_df = df.drop_duplicates(['case_id'])

In [65]:
# Shorten one label and set the two rarest categories to missing.
# The result is assigned back to the column: calling replace(..., inplace=True)
# on a column selection warns in pandas 2.x and does not modify `df` under Copy-on-Write.
df["RACE"] = df["RACE"].replace({
    "BLACK OR AFRICAN AMERICAN": "BLACK_OR_AA",
    "AMERICAN INDIAN OR ALASKA NATIVE": pd.NA,
    "NATIVE HAWAIIAN OR OTHER PACIFIC ISLANDER": pd.NA
})
case_df = df.drop_duplicates(['case_id'])
case_df["RACE"].value_counts()

WHITE          491
BLACK_OR_AA     33
ASIAN           20
Name: RACE, dtype: int64

In [68]:
# Map G1..G4 to the ordinal codes 0..3; GX and GB are set to missing.
# The result is assigned back to the column: calling replace(..., inplace=True)
# on a column selection warns in pandas 2.x and does not modify `df` under Copy-on-Write.
df["GRADE"] = df["GRADE"].replace({
    "G1": 0,
    "G2": 1,
    "G3": 2,
    "G4": 3,
    "GX": pd.NA,
    "GB": pd.NA
})
case_df = df.drop_duplicates(['case_id'])
case_df["GRADE"].value_counts()

2    484
1     68
0      6
3      1
Name: GRADE, dtype: int64

In [69]:
# Collapse the sub-stages into four ordinal codes: I -> 0, II -> 1, III -> 2, IV -> 3.
# The result is assigned back to the column: calling replace(..., inplace=True)
# on a column selection warns in pandas 2.x and does not modify `df` under Copy-on-Write.
df["CLINICAL_STAGE"] = df["CLINICAL_STAGE"].replace({
    "Stage IIC": 1,
    "Stage IIB": 1,
    "Stage IIA2": 1,
    "Stage IIA1": 1,
    "Stage IIA": 1,
    "Stage II": 1,

    "Stage IC": 0,
    "Stage IB2": 0,
    "Stage IB1": 0,
    "Stage IB": 0,
    "Stage IA2": 0,
    "Stage IA1": 0,
    "Stage IA": 0,
    "Stage I": 0,

    "Stage III": 2,
    "Stage IIIC": 2,
    "Stage IIIC1": 2,
    "Stage IIIC2": 2,
    "Stage IIIB": 2,
    "Stage IIIA": 2,

    "Stage IV": 3,
    "Stage IVA": 3,
    "Stage IVB": 3,
})

case_df = df.drop_duplicates(['case_id'])
case_df["CLINICAL_STAGE"].value_counts()

2    437
3     88
1     30
0     16
Name: CLINICAL_STAGE, dtype: int64

In [70]:
# Binary stage: codes 0/1 (stage I-II) -> 0, codes 2/3 (stage III-IV) -> 1.
# The recoded column is built in a single assignment: calling replace(..., inplace=True)
# on a column selection warns in pandas 2.x and does not modify `df` under Copy-on-Write.
df["stage_binary"] = df["CLINICAL_STAGE"].replace({
    0: 0,
    1: 0,
    2: 1,
    3: 1
})
case_df = df.drop_duplicates(['case_id'])
case_df["stage_binary"].value_counts()

1    525
0     46
Name: stage_binary, dtype: int64

In [71]:
# Collapse the procurement methods into three levels: resection, excision, other.
# The result is assigned back to the column: calling replace(..., inplace=True)
# on a column selection warns in pandas 2.x and does not modify `df` under Copy-on-Write.
df["METHOD_OF_INITIAL_SAMPLE_PROCUREMENT"] = df["METHOD_OF_INITIAL_SAMPLE_PROCUREMENT"].replace({
    'Tumor resection': "resection",
    'Other method, specify:': "other",
    'Fine needle aspiration biopsy': "other",
    'Cytology (e.g. Peritoneal or pleural fluid)': "other",
    'Incisional Biopsy': "other",
    'Excisional Biopsy': "excision"
})
df.rename(columns={"METHOD_OF_INITIAL_SAMPLE_PROCUREMENT": "biopsy"}, inplace=True)
case_df = df.drop_duplicates(['case_id'])
case_df["biopsy"].value_counts()

resection    460
other        103
excision       7
Name: biopsy, dtype: int64

In [72]:
# Recode the status / yes-no labels to 0/1.
# NOTE: this replace is applied to every column of `df`, not only to the status columns.
df = df.replace({
    "1:DECEASED": 1,
    "0:LIVING": 0,
    "0:DiseaseFree": 0,
    "1:Recurred/Progressed": 1,
    "YES": 1,
    "NO": 0,
    "Yes": 1,
    "No": 0
})
# These counts are per slide row of `df`, not per case.
df["OS_STATUS"].value_counts(), df["DFS_STATUS"].value_counts()

(1    743
 0    488
 Name: OS_STATUS, dtype: int64,
 1    752
 0    284
 Name: DFS_STATUS, dtype: int64)

In [73]:
# Per-case frequency table for every low-cardinality column after recoding.
case_df = df.drop_duplicates(['case_id'])
for col in case_df.columns:
    if len(case_df[col].unique()) < 10:
        print()
        print(col)
        for i, j in case_df[col].value_counts().items():
            # Round the percentage itself. The previous int(round(ratio, 2) * 100)
            # truncated after a float multiply (e.g. 0.29 * 100 == 28.999... -> 28).
            print(i, j, f"({int(round(100 * j / len(case_df)))}%)")


RACE
WHITE 491 (86%)
BLACK_OR_AA 33 (6%)
ASIAN 20 (3%)

biopsy
resection 460 (80%)
other 103 (18%)
excision 7 (1%)

GRADE
2 484 (84%)
1 68 (12%)
0 6 (1%)
3 1 (0%)

CLINICAL_STAGE
2 437 (76%)
3 88 (15%)
1 30 (5%)
0 16 (3%)

OS_STATUS
1 341 (59%)
0 233 (41%)

DFS_STATUS
1 355 (62%)
0 135 (24%)

group
1.0 470 (82%)
0.0 104 (18%)

stage_binary
1 525 (91%)
0 46 (8%)


In [77]:
df["OS_MONTHS"].astype(float).describe()

count    1231.000000
mean       38.168855
std        30.399401
min         0.260000
25%        14.910000
50%        32.060000
75%        54.135000
max       180.060000
Name: OS_MONTHS, dtype: float64

In [79]:
df["DFS_MONTHS"].dropna().astype(float).describe()

count    1036.000000
mean       21.142432
std        21.772104
min         0.530000
25%         9.000000
50%        14.680000
75%        25.330000
max       180.060000
Name: DFS_MONTHS, dtype: float64

In [81]:
df.isna().sum()

slide_id            0
case_id             0
RACE               63
biopsy              8
GRADE              31
AGE                 0
CLINICAL_STAGE      7
OS_STATUS           0
OS_MONTHS           0
DFS_STATUS        195
DFS_MONTHS        195
group               0
stage_binary        7
dtype: int64

## encoding

In [83]:
# Identifier and outcome columns are never encoded.
skip_cols = ["case_id", "slide_id", "OS_STATUS", "DFS_STATUS", "OS_MONTHS", "DFS_MONTHS"]

# Classify the remaining columns by their number of distinct non-missing values:
# fewer than 3 -> binary, fewer than 10 -> categorical (to be one-hot encoded), otherwise continuous.
encode_list = []
for col in df.columns:
    if col in skip_cols:
        continue
    levels = df[col].dropna().unique()
    n_levels = len(levels)
    if n_levels < 3:
        print("Binary:", col, levels)
    elif n_levels < 10:
        print("**Categ:", col, levels)
        encode_list.append(col)
    else:
        print("Cont:", col, df[col].mean())

**Categ: RACE ['WHITE' 'BLACK_OR_AA' 'ASIAN']
**Categ: biopsy ['resection' 'other' 'excision']
**Categ: GRADE [2 1 0 3]
Cont: AGE 59.82615759545085
**Categ: CLINICAL_STAGE [2 3 1 0]
Binary: group [0. 1.]
Binary: stage_binary [1 0]


In [84]:
def dummy_encode(df, col, dummy_na=False):
    """One-hot encode ``col`` and return a new frame without the original column.

    Parameters
    ----------
    df : pandas.DataFrame
        Input table; it is not modified.
    col : str
        Name of the categorical column to expand.
    dummy_na : bool, default False
        If True, add a ``<col>_nan`` indicator column for missing values.
        With the default, rows whose value is missing get 0 in every
        indicator column, so the missingness is not recorded.

    Returns
    -------
    pandas.DataFrame
        ``df`` without ``col``, followed by one ``<col>_<level>`` indicator
        column per level, with the row index reset to 0..n-1.
    """
    # Pin the dtype: the get_dummies default changed from uint8 to bool in
    # pandas 2.0, which would write True/False instead of 0/1 to the CSV export.
    import numpy as np

    indicators = pd.get_dummies(df[col], prefix=col, dummy_na=dummy_na, dtype=np.uint8)
    encoded = pd.concat([df.drop(columns=col), indicators], axis=1)
    return encoded.reset_index(drop=True)
# Work on a copy so `df` keeps the un-encoded categorical columns.
encoded_df = df.copy()
# One-hot encode each column collected in `encode_list`.
for col in encode_list:
    encoded_df = dummy_encode(encoded_df, col)
encoded_df.columns

  uniques = Index(uniques)
  uniques = Index(uniques)


Index(['slide_id', 'case_id', 'AGE', 'OS_STATUS', 'OS_MONTHS', 'DFS_STATUS',
       'DFS_MONTHS', 'group', 'stage_binary', 'RACE_ASIAN', 'RACE_BLACK_OR_AA',
       'RACE_WHITE', 'biopsy_excision', 'biopsy_other', 'biopsy_resection',
       'GRADE_0', 'GRADE_1', 'GRADE_2', 'GRADE_3', 'CLINICAL_STAGE_0',
       'CLINICAL_STAGE_1', 'CLINICAL_STAGE_2', 'CLINICAL_STAGE_3'],
      dtype='object')

In [85]:
encoded_df.isna().sum()

slide_id              0
case_id               0
AGE                   0
OS_STATUS             0
OS_MONTHS             0
DFS_STATUS          195
DFS_MONTHS          195
group                 0
stage_binary          7
RACE_ASIAN            0
RACE_BLACK_OR_AA      0
RACE_WHITE            0
biopsy_excision       0
biopsy_other          0
biopsy_resection      0
GRADE_0               0
GRADE_1               0
GRADE_2               0
GRADE_3               0
CLINICAL_STAGE_0      0
CLINICAL_STAGE_1      0
CLINICAL_STAGE_2      0
CLINICAL_STAGE_3      0
dtype: int64

In [87]:
# Lower-case every column name, then save the slide-level clinical table.
lowercase_names = {name: name.lower() for name in encoded_df.columns}
encoded_df = encoded_df.rename(columns=lowercase_names)
encoded_df.to_csv("./processed_clinical_data.csv", index=False)
encoded_df

Unnamed: 0,slide_id,case_id,age,os_status,os_months,dfs_status,dfs_months,group,stage_binary,race_asian,...,biopsy_other,biopsy_resection,grade_0,grade_1,grade_2,grade_3,clinical_stage_0,clinical_stage_1,clinical_stage_2,clinical_stage_3
0,TCGA-23-1120-01Z-00-DX1.59367B12-17F1-41AA-A6F...,TCGA-23-1120,60,0,4.27,0,4.27,0.0,1,0,...,0,1,0,0,1,0,0,0,1,0
1,TCGA-23-1120-01A-02-BS2.4cbf84a4-9b8f-4448-a06...,TCGA-23-1120,60,0,4.27,0,4.27,0.0,1,0,...,0,1,0,0,1,0,0,0,1,0
2,TCGA-23-1120-01A-01-BS1.7a9f0eb0-59d5-46ee-ae3...,TCGA-23-1120,60,0,4.27,0,4.27,0.0,1,0,...,0,1,0,0,1,0,0,0,1,0
3,TCGA-25-1312-01Z-00-DX1.733EC7A7-0FC8-4DDC-B36...,TCGA-25-1312,69,1,1.02,,,0.0,1,0,...,0,1,0,0,1,0,0,0,0,1
4,TCGA-25-1312-01A-01-TS1.9d91008f-8c12-4e4e-aef...,TCGA-25-1312,69,1,1.02,,,0.0,1,0,...,0,1,0,0,1,0,0,0,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1226,TCGA-04-1654-01A-01-BS1.f232c15b-e101-4cce-b78...,TCGA-04-1654,69,1,47.67,1,26.54,1.0,1,0,...,1,0,0,1,0,0,0,0,1,0
1227,TCGA-30-1860-01A-01-BS1.3f2f52d2-e19d-4b63-9bf...,TCGA-30-1860,58,1,44.88,1,13.04,1.0,1,0,...,0,1,0,0,1,0,0,0,1,0
1228,TCGA-13-1511-01A-01-TS1.4fb08454-15fa-40f8-9d2...,TCGA-13-1511,52,1,54.2,1,15.08,1.0,1,1,...,1,0,0,0,1,0,0,0,0,1
1229,TCGA-13-1511-01A-01-BS1.1db24a7e-5116-4fad-a83...,TCGA-13-1511,52,1,54.2,1,15.08,1.0,1,1,...,1,0,0,0,1,0,0,0,0,1


In [91]:
# Build one table per endpoint with shared column names ("event", "survival_months"):
# df_os uses overall survival, df_dfs uses disease-free survival.
df_os = encoded_df.drop(["dfs_status", "dfs_months"], axis=1).copy().rename(columns={"os_status": "event", "os_months": "survival_months"})
df_dfs = encoded_df.drop(["os_status", "os_months"], axis=1).copy().rename(columns={"dfs_status": "event", "dfs_months": "survival_months"})
df_os.shape, df_os.isna().sum()

((1231, 21),
 slide_id            0
 case_id             0
 age                 0
 event               0
 survival_months     0
 group               0
 stage_binary        7
 race_asian          0
 race_black_or_aa    0
 race_white          0
 biopsy_excision     0
 biopsy_other        0
 biopsy_resection    0
 grade_0             0
 grade_1             0
 grade_2             0
 grade_3             0
 clinical_stage_0    0
 clinical_stage_1    0
 clinical_stage_2    0
 clinical_stage_3    0
 dtype: int64)

In [92]:
# Keep only the rows that have a disease-free-survival event label.
df_dfs = df_dfs[~df_dfs["event"].isna()]
df_dfs.isna().sum()

slide_id            0
case_id             0
age                 0
event               0
survival_months     0
group               0
stage_binary        5
race_asian          0
race_black_or_aa    0
race_white          0
biopsy_excision     0
biopsy_other        0
biopsy_resection    0
grade_0             0
grade_1             0
grade_2             0
grade_3             0
clinical_stage_0    0
clinical_stage_1    0
clinical_stage_2    0
clinical_stage_3    0
dtype: int64

In [93]:
# Write the two endpoint-specific slide-level tables.
df_os.to_csv("../datasets_csv/tcga_ov_os.csv", index=False)
df_dfs.to_csv("../datasets_csv/tcga_ov_dfs.csv", index=False)

# Inspect the Genetic Data

In [105]:
dname = "tcga_ov/tcga"

In [106]:
df_os.shape, df_dfs.shape

((1231, 21), (1036, 21))

## 1. Mut

In [107]:
df = pd.read_csv(f"./{dname}/data_mutations.txt", sep="\t")
# exclude silent mutations
df_woSilent = df[df['Variant_Classification'] != 'Silent']

# One row per tumor sample, one column per gene; each cell counts the non-silent mutations.
pivot_df = pd.pivot_table(df_woSilent, values='Variant_Classification', index=['Tumor_Sample_Barcode'], columns=['Hugo_Symbol'],
                          aggfunc='count', fill_value=0).reset_index()

# Binarize: 1 if the gene has at least one non-silent mutation in the sample, else 0.
pivot_df.iloc[:, 1:] = pivot_df.iloc[:, 1:].clip(upper=1)

mut_genes = list(pivot_df.columns[1:])

# Drop the last "-"-separated field of the sample barcode to get the case ID.
# `n` is passed by keyword because it is keyword-only in pandas >= 2.0.
pivot_df["case_id"] = pivot_df["Tumor_Sample_Barcode"].str.rsplit("-", n=1).str[0]
mut_df = pivot_df[["case_id"]+mut_genes].rename(columns={k: k+"_mut" for k in mut_genes}).reset_index(drop=True)

# cases with the follow-up data
mut_df = mut_df[mut_df["case_id"].isin(df_os["case_id"].values)]

# Sanity checks: duplicated cases, duplicated columns, missing values.
print(mut_df["case_id"].duplicated().any())
print(mut_df.columns.duplicated().any())
print(mut_df.isna().any().any())
mut_df

False
False
False


Hugo_Symbol,case_id,A1CF_mut,A2M_mut,A2ML1_mut,AACS_mut,AADACL4_mut,AAED1_mut,AAMP_mut,AARS_mut,AARS2_mut,...,ZWILCH_mut,ZXDA_mut,ZXDB_mut,ZXDC_mut,ZYG11B_mut,ZZEF1_mut,ZZZ3_mut,hsa-mir-7162_mut,uc003vym.2_mut,uc003vyo.2_mut
0,TCGA-04-1331,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,TCGA-04-1332,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,TCGA-04-1336,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,TCGA-04-1337,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,TCGA-04-1338,0,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
311,TCGA-61-2104,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
312,TCGA-61-2109,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
313,TCGA-61-2110,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
314,TCGA-61-2111,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [109]:
# Save the mutation table for the OS cohort, then the subset of cases that are also in the DFS cohort.
mut_df.to_csv(f"../datasets_csv/tcga_ov_os_mut.csv.zip", compression="zip", index=False)
mut_df_dfs = mut_df[mut_df["case_id"].isin(df_dfs["case_id"].values)]
mut_df_dfs.to_csv(f"../datasets_csv/tcga_ov_dfs_mut.csv.zip", compression="zip", index=False)

## 2. CNV

In [110]:
df = pd.read_csv(f"./{dname}/data_cna.txt", sep="\t")

duplicated_genes = df["Hugo_Symbol"].duplicated(keep=False)

# Separate duplicates and non-duplicates
df_duplicated = df[duplicated_genes].copy()
df_non_duplicated = df[~duplicated_genes].copy()

# Collapse duplicated gene symbols to one row each using the per-sample median
averaged_values = df_duplicated.groupby("Hugo_Symbol").median().reset_index()

df_final = pd.concat([df_non_duplicated, averaged_values], ignore_index=True)
df_final = df_final.sort_values("Hugo_Symbol").reset_index(drop=True)
df_final.drop(columns="Entrez_Gene_Id", inplace=True)
# Transpose to one row per sample and one column per gene.
cnv_df = df_final.set_index("Hugo_Symbol").T.reset_index().rename(columns={"index": "case_id"})
# Drop the last "-"-separated field of the sample barcode to get the case ID.
# `n` is passed by keyword because it is keyword-only in pandas >= 2.0.
cnv_df["case_id"] = cnv_df["case_id"].str.rsplit("-", n=1).str[0]

cnv_cols = list(cnv_df.columns[1:])
cnv_df = cnv_df[["case_id"]+ cnv_cols].rename(columns={col: col+"_cnv" for col in cnv_cols})

# cases with the follow-up data
cnv_df = cnv_df[cnv_df["case_id"].isin(df_os["case_id"].values)]
# Sanity checks: duplicated cases, duplicated columns, missing values.
print(cnv_df["case_id"].duplicated().any())
print(cnv_df.columns.duplicated().any())
print(cnv_df.isna().any().any())
cnv_df

False
False
False


Hugo_Symbol,case_id,7SK|ENSG00000232512.2_cnv,7SK|ENSG00000249352.3_cnv,7SK|ENSG00000254144.2_cnv,7SK|ENSG00000260682.2_cnv,7SK|ENSG00000271765.1_cnv,7SK|ENSG00000271814.1_cnv,7SK|ENSG00000271818.1_cnv,A1BG_cnv,A1CF_cnv,...,snoZ185_cnv,snoZ247_cnv,snoZ278_cnv,snoZ40_cnv,snoZ5_cnv,snoZ6|ENSG00000252200.1_cnv,snoZ6|ENSG00000253067.1_cnv,snoZ6|ENSG00000264452.1_cnv,snoZ6|ENSG00000266692.1_cnv,snosnR66_cnv
0,TCGA-04-1331,0.0,-1.0,0.0,-1.0,-1.0,0.0,-1.0,0.0,0.0,...,0.0,0.0,-1.0,0.0,0.0,0.0,-1.0,0.0,0.0,-1.0
1,TCGA-04-1332,1.0,-1.0,1.0,-1.0,-1.0,1.0,-1.0,1.0,0.0,...,-1.0,0.0,0.0,1.0,0.0,1.0,0.0,1.0,1.0,1.0
2,TCGA-04-1335,-1.0,-1.0,1.0,0.0,-1.0,-1.0,-1.0,-1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,-1.0,0.0,-1.0,-1.0,-1.0
3,TCGA-04-1336,0.0,-1.0,2.0,-1.0,0.0,-1.0,-1.0,-1.0,0.0,...,0.0,1.0,-1.0,0.0,-1.0,0.0,0.0,0.0,0.0,0.0
4,TCGA-04-1337,0.0,1.0,0.0,-1.0,-1.0,0.0,1.0,0.0,0.0,...,1.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
564,TCGA-61-2614,1.0,1.0,1.0,-1.0,-1.0,-1.0,-1.0,2.0,-1.0,...,1.0,1.0,-1.0,1.0,-1.0,-1.0,1.0,1.0,1.0,-1.0
575,TCGA-OY-A56P,0.0,0.0,1.0,-1.0,-1.0,0.0,-1.0,-1.0,-1.0,...,1.0,1.0,-1.0,1.0,1.0,-1.0,0.0,0.0,0.0,1.0
576,TCGA-OY-A56Q,0.0,-1.0,1.0,1.0,0.0,0.0,-1.0,-1.0,1.0,...,1.0,1.0,-1.0,1.0,1.0,2.0,-1.0,0.0,0.0,-1.0
577,TCGA-VG-A8LO,0.0,-1.0,2.0,-1.0,0.0,1.0,0.0,0.0,0.0,...,0.0,0.0,2.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0


In [111]:
# Save the CNV table for the OS cohort, then the subset of cases that are also in the DFS cohort.
cnv_df.to_csv(f"../datasets_csv/tcga_ov_os_cnv.csv.zip", compression="zip", index=False)
cnv_df_dfs = cnv_df[cnv_df["case_id"].isin(df_dfs["case_id"].values)]
cnv_df_dfs.to_csv(f"../datasets_csv/tcga_ov_dfs_cnv.csv.zip", compression="zip", index=False)

## 3. mRNA-Seq

In [112]:
df = pd.read_csv(f"./{dname}/data_mrna_seq_v2_rsem.txt", sep="\t")
duplicated_genes = df["Hugo_Symbol"].duplicated(keep=False)

# Separate duplicates and non-duplicates
df_duplicated = df[duplicated_genes].copy()
df_non_duplicated = df[~duplicated_genes].copy()

# Average the duplicated values
averaged_values = df_duplicated.groupby("Hugo_Symbol").mean().reset_index()

df_final = pd.concat([df_non_duplicated, averaged_values], ignore_index=True)
df_final = df_final.sort_values("Hugo_Symbol").reset_index(drop=True)

# Sample columns start after Hugo_Symbol and Entrez_Gene_Id; every sample whose
# barcode does not end in "-01" is dropped (stored in `recurrent_tm`).
recurrent_tm = [i for i in df_final.columns[2:] if i.rsplit("-")[-1] != "01"]
df_final.drop(columns="Entrez_Gene_Id", inplace=True)
df_final = df_final.drop(columns=recurrent_tm)
df_final = df_final.drop(df_final.index[df_final["Hugo_Symbol"].isna()])
# Transpose to one row per sample and one column per gene.
df_cases = df_final.set_index("Hugo_Symbol").T.reset_index().rename(columns={"index": "case_id"})
# Drop the last "-"-separated field of the sample barcode to get the case ID.
# `n` is passed by keyword because it is keyword-only in pandas >= 2.0.
df_cases["case_id"] = df_cases["case_id"].str.rsplit("-", n=1).str[0]
rna_genes = list(df_cases.columns[1:])
rna_df = df_cases[["case_id"]+ rna_genes].rename(columns={col: col+"_rna" for col in rna_genes})
# cases with the follow-up data
rna_df = rna_df[rna_df["case_id"].isin(df_os["case_id"].values)]

# Sanity checks: duplicated cases, duplicated columns, missing values.
print(rna_df["case_id"].duplicated().any())
print(rna_df.columns.duplicated().any())
print(rna_df.isna().any().any())

rna_df

False
False
False


Hugo_Symbol,case_id,133K02_rna,5T4_rna,A-C1_rna,A1BG_rna,A1BG-AS1_rna,A1CF_rna,A2M_rna,A2M-AS1_rna,A2ML1_rna,...,ZWILCH_rna,ZWINT_rna,ZWS1_rna,ZXDA_rna,ZXDB_rna,ZXDC_rna,ZYG11A_rna,ZYG11B_rna,ZYX_rna,ZZZ3_rna
0,TCGA-04-1348,431.8365,331.8096,97.6069,66.4695,36.3243,0.000,5899.8279,118.4566,7.5289,...,928.9002,794.5684,373.7564,36.0312,235.2783,827.1041,5.6467,560.0968,15871.2019,475.9344
2,TCGA-04-1362,386.5598,1007.1073,90.6827,41.6412,23.2465,0.331,3350.4207,71.3613,5.6263,...,421.6182,605.3235,385.8978,93.3304,788.6746,1575.0325,56.5939,915.7627,6137.2982,803.8987
3,TCGA-04-1364,334.9785,345.5791,100.4936,187.0368,114.6008,0.000,1455.2316,67.8607,5.0883,...,913.9953,1079.9878,661.9006,13.9928,137.3836,1138.5030,13.1447,811.5809,5972.3706,444.8006
4,TCGA-04-1365,277.2337,483.9536,46.7078,23.9295,10.4957,0.000,3999.3792,52.1501,3.3148,...,646.5632,1882.7784,312.7919,35.8596,328.4617,1455.7782,7.2322,1031.7915,7211.9934,787.1026
5,TCGA-04-1514,316.0622,114.2775,24.1796,32.8123,20.1900,0.000,3224.5797,224.7582,3.7421,...,556.5630,1082.3258,483.0167,126.6552,959.1249,965.1698,20.4375,1082.6137,5867.2855,1149.1077
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
298,TCGA-61-2113,261.0291,2215.4276,49.2706,70.4394,106.1623,0.000,5365.7972,23.5625,2.4461,...,540.3477,458.1113,584.2579,23.4122,159.3431,1277.1905,117.0612,776.4480,6787.8047,972.1324
299,TCGA-OY-A56P,226.8219,514.5571,35.7999,43.1736,47.8864,0.000,8773.5936,84.2153,98.3162,...,325.1728,303.2307,981.5590,99.3849,637.4523,2368.4052,28.5865,2075.0596,10157.0254,2826.3236
300,TCGA-OY-A56Q,169.9434,376.8744,25.3249,33.6854,25.3482,0.000,10006.7877,582.3492,269.5768,...,392.0826,625.4582,744.4185,48.9837,298.2339,1854.3819,119.6268,1157.2809,10968.3439,2466.5112
301,TCGA-VG-A8LO,291.8109,168.7737,34.1412,27.9604,17.9853,0.000,1888.1746,59.8084,4.5092,...,824.4915,602.6250,493.1154,72.4696,492.1491,1916.4184,6.4417,1528.9476,8124.0035,1297.3669


In [113]:
# Save the mRNA table for the OS cohort, then the subset of cases that are also in the DFS cohort.
rna_df.to_csv(f"../datasets_csv/tcga_ov_os_rna.csv.zip", compression="zip", index=False)
rna_df_dfs = rna_df[rna_df["case_id"].isin(df_dfs["case_id"].values)]
rna_df_dfs.to_csv(f"../datasets_csv/tcga_ov_dfs_rna.csv.zip", compression="zip", index=False)

## 4. RPPA-Exp

In [114]:
df = pd.read_csv(f"./{dname}/data_rppa.txt", sep="\t")
print(df["Composite.Element.REF"].duplicated(keep=False).any())
df_final = df.sort_values("Composite.Element.REF").reset_index(drop=True)

# "Composite.Element.REF" is the only non-sample column used in this cell (there is
# no Entrez_Gene_Id to skip), so every other column is checked. The previous
# `columns[2:]` slice skipped the first sample column.
sample_cols = [c for c in df_final.columns if c != "Composite.Element.REF"]
# Every sample whose barcode does not end in "-01" is dropped (stored in `recurrent_tm`).
recurrent_tm = [i for i in sample_cols if i.rsplit("-")[-1] != "01"]
print(df_final.shape)
print("Recurrent tumors: ", len(recurrent_tm))
df_final = df_final.drop(columns=recurrent_tm)
print(df_final.shape)
df_final = df_final.drop(df_final.index[df_final["Composite.Element.REF"].isna()])
print(df_final.shape)
# dropna(thresh=k) keeps the rows with at least k non-missing values, so this
# removes the proteins that are missing in more than ~20% of the columns.
df_clean = df_final.dropna(thresh=int(df_final.shape[1]*.8))
print("After removing proteins missing in > 20% of samples: ", df_clean.shape)
# Transpose to one row per sample and one column per protein.
df_cases = df_clean.set_index("Composite.Element.REF").T.reset_index().rename(columns={"index": "case_id"})
# Drop the last "-"-separated field of the sample barcode to get the case ID.
# `n` is passed by keyword because it is keyword-only in pandas >= 2.0.
df_cases["case_id"] = df_cases["case_id"].str.rsplit("-", n=1).str[0]

pro_genes = list(df_cases.columns[1:])
pro_df = df_cases[["case_id"]+ pro_genes].rename(columns={col: col+"_pro" for col in pro_genes})
# cases with the follow-up data
pro_df = pro_df[pro_df["case_id"].isin(df_os["case_id"].values)]
print("Cases with cli data: ", pro_df.shape)
# Sanity checks: duplicated cases, duplicated columns, missing values.
print(pro_df["case_id"].duplicated().any())
print(pro_df.columns.duplicated().any())
print(pro_df.isna().any().any())

pro_df

False
(208, 437)
Recurrent tumors:  11
(208, 426)
(208, 426)
After removing empty proteins in > 80%:  (204, 426)
Cases with cli data:  (414, 205)
False
False
True


Composite.Element.REF,case_id,ACACA ACACB|ACC_pS79_pro,ACACA|ACC1_pro,ACVRL1|ACVRL1_pro,AKT1 AKT2 AKT3|Akt_pro,AKT1 AKT2 AKT3|Akt_pS473_pro,AKT1 AKT2 AKT3|Akt_pT308_pro,AKT1S1|PRAS40_pT246_pro,ANXA1|Annexin-1_pro,ANXA7|Annexin_VII_pro,...,XBP1|XBP1_pro,XRCC1|XRCC1_pro,XRCC5|Ku80_pro,YAP1|YAP_pro,YAP1|YAP_pS127_pro,YBX1|YB-1_pro,YBX1|YB-1_pS102_pro,YWHAB|14-3-3_beta_pro,YWHAE|14-3-3_epsilon_pro,YWHAZ|14-3-3_zeta_pro
0,TCGA-04-1335,0.050157,0.338735,0.028899,-0.362076,1.902311,1.241415,-0.128162,0.415967,0.045857,...,0.457778,-0.612430,0.273903,-0.246701,0.053472,0.234894,-0.110302,-0.008553,0.059854,-0.040743
1,TCGA-04-1336,-0.122685,-0.514174,-0.072171,0.124800,0.118800,0.036025,-0.068028,-0.600145,-0.070430,...,0.124803,-0.026779,-0.016853,-0.082328,0.424904,-0.322371,-0.106064,-0.115064,-0.075638,0.466381
2,TCGA-04-1338,-0.788799,-0.684709,0.265096,-0.116073,0.339617,0.454237,-0.418403,0.269226,0.147967,...,0.185344,-0.147648,-0.454878,0.415796,-0.049777,-0.204733,-0.283831,0.182304,0.255061,0.500384
3,TCGA-04-1341,0.544080,0.482856,0.474257,-0.160586,0.590164,0.227633,0.259963,-0.180727,0.128443,...,0.066478,0.025270,-0.266316,-0.361657,0.345984,0.413982,0.746081,0.184517,0.032904,0.126560
4,TCGA-04-1342,0.291188,0.317670,0.169391,-0.319387,1.022617,0.570207,-0.071549,-0.381018,0.073137,...,0.938900,-0.108158,-0.158992,-0.193000,0.300835,0.797873,-0.160503,-0.020604,-0.134749,-0.273026
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
420,TCGA-61-2614,0.294133,0.042560,-0.072756,-0.003160,0.807530,0.453940,0.137475,0.124468,-0.065395,...,-0.042419,-0.282149,0.052371,-0.333259,-0.507655,0.086028,-0.002333,-0.277144,-0.066282,-0.476036
421,TCGA-OY-A56P,0.678777,0.746679,-0.136984,-0.508459,-0.097865,-0.177417,0.148403,0.210075,-0.120179,...,0.034178,0.389113,-0.013918,0.588104,0.698385,0.463170,-0.131119,-0.140199,-0.069463,-0.157745
422,TCGA-OY-A56Q,0.069934,-0.019771,-0.088824,-0.042172,0.078807,-0.063589,0.235932,0.683164,0.330481,...,-0.053597,0.891833,0.170997,0.341120,0.716418,-0.088501,-0.041030,-0.085609,0.089297,0.261936
423,TCGA-VG-A8LO,0.116778,-0.085238,-0.027367,-0.176764,0.325561,0.058855,0.120482,0.470819,0.367324,...,0.270166,-0.315340,-0.263736,0.262682,0.413353,-0.274155,-0.104356,0.029152,0.069522,0.189983


In [115]:
# Write the full protein table, then the rows whose case_id also appears in
# df_dfs, as zip-compressed CSV files (row index omitted).
pro_in_dfs = pro_df["case_id"].isin(df_dfs["case_id"].values)
pro_df.to_csv("../datasets_csv/tcga_ov_os_pro.csv.zip", index=False, compression="zip")
pro_df_dfs = pro_df.loc[pro_in_dfs]
pro_df_dfs.to_csv("../datasets_csv/tcga_ov_dfs_pro.csv.zip", index=False, compression="zip")

## 5. DNAm

In [116]:
# Load the tab-separated HM27 methylation table. Rows are keyed by
# "Hugo_Symbol"; the other columns are treated as per-sample values below.
df = pd.read_csv(f"./{dname}/data_methylation_hm27.txt", sep="\t")
duplicated_genes = df["Hugo_Symbol"].duplicated(keep=False)

# Separate duplicates and non-duplicates
df_duplicated = df[duplicated_genes].copy()
df_non_duplicated = df[~duplicated_genes].copy()

# Average the duplicated values (one row per repeated gene symbol)
averaged_values = df_duplicated.groupby("Hugo_Symbol").mean().reset_index()

df_final = pd.concat([df_non_duplicated, averaged_values], ignore_index=True)
df_final = df_final.sort_values("Hugo_Symbol").reset_index(drop=True)
print(df_final.shape)

# Keep only rows with at least 80% non-missing entries. The threshold is
# computed over every column of df_final, i.e. before the non-"01" sample
# columns are removed below.
df_clean = df_final.dropna(thresh=int(df_final.shape[1]*.8))
print(df_clean.shape)

# Pick sample columns by name instead of by position (`columns[2:]`), so that
# no sample column is skipped when the file has fewer than two leading
# annotation columns. "Entrez_Gene_Id" is listed in case the file carries that
# annotation column; it is harmless if absent.
annotation_cols = ("Hugo_Symbol", "Entrez_Gene_Id")
sample_cols = [i for i in df_final.columns if i not in annotation_cols]

# Columns whose last "-"-separated field is not "01" are dropped
# (presumably non-primary-tumour samples; confirm against the barcode scheme).
recurrent_tm = [i for i in sample_cols if i.rsplit("-")[-1] != "01"]
print(len(recurrent_tm))
df_clean = df_clean.drop(columns=recurrent_tm)
df_clean = df_clean.drop(df_clean.index[df_clean["Hugo_Symbol"].isna()])
print(df_clean.shape)

# Transpose so that each row is a sample and each column a gene symbol.
df_cases = df_clean.set_index("Hugo_Symbol").T.reset_index().rename(columns={"index": "case_id"})
print(df_cases.columns.duplicated().any())
# Strip the trailing "-<suffix>" from the sample id to get the case id.
# `n` must be passed by keyword: it is keyword-only in pandas >= 2.0.
df_cases["case_id"] = df_cases["case_id"].str.rsplit("-", n=1).str[0]
dna_genes = list(df_cases.columns[1:])
# Tag every gene column with a "_dna" suffix.
dna_df = df_cases[["case_id"]+ dna_genes].rename(columns={col: col+"_dna" for col in dna_genes})
# Keep only cases that are present in df_os.
dna_df = dna_df[dna_df["case_id"].isin(df_os["case_id"].values)]
print("Cases with cli data: ", dna_df.shape)
# Sanity checks: duplicated cases, duplicated columns, remaining missing values.
print(dna_df["case_id"].duplicated().any())
print(dna_df.columns.duplicated().any())
print(dna_df.isna().any().any())
dna_df

(14874, 593)
(14373, 593)
12
(14373, 581)
False
Cases with cli data:  (559, 14374)
False
False
True


Hugo_Symbol,case_id,A1BG_dna,A1CF_dna,A2BP1_dna,A2M_dna,A2ML1_dna,A4GALT_dna,A4GNT_dna,AAAS_dna,AACS_dna,...,ZSWIM1_dna,ZSWIM3_dna,ZSWIM7_dna,ZUFSP_dna,ZW10_dna,ZWILCH_dna,ZWINT_dna,ZYX_dna,ZZEF1_dna,ZZZ3_dna
0,TCGA-04-1331,0.981805,0.816419,0.196513,0.582584,0.469370,0.269578,0.806530,0.067518,0.098671,...,0.049296,0.448896,0.026618,0.034261,0.021411,0.048051,0.014654,0.012361,0.015735,0.018789
1,TCGA-04-1332,0.982110,0.565820,0.022767,0.552774,0.829701,0.296869,0.700986,0.057805,0.017430,...,0.038076,0.182496,0.015868,0.029626,0.022453,0.047327,0.013715,0.012677,0.011100,0.025092
2,TCGA-04-1335,0.957049,0.603784,0.681104,0.394409,0.669948,0.249889,0.820301,0.118457,0.011785,...,0.284387,0.393827,0.034591,0.134409,0.023022,0.046214,0.014255,0.008811,0.024669,0.072879
3,TCGA-04-1336,0.952807,,0.009456,0.466297,0.228725,0.386736,0.489553,0.083912,0.011477,...,0.347038,0.401215,0.032352,0.052731,0.025746,0.033919,0.014425,0.013426,0.014864,0.046425
4,TCGA-04-1337,0.986589,0.698739,0.170477,0.667076,0.896899,0.339014,0.928064,0.049442,0.034056,...,0.022741,0.387501,0.019475,0.022706,0.017513,0.034557,0.012591,0.012550,0.011353,0.016047
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
565,TCGA-61-2111,0.954911,0.268565,0.390809,0.805110,0.733029,0.080300,0.619909,0.150095,0.011890,...,0.037573,0.031485,0.022357,0.014230,0.010066,0.034627,0.014369,0.007447,0.015375,0.013467
566,TCGA-61-2113,0.949360,0.571589,0.131924,0.809480,0.430048,0.172873,0.375198,0.125191,0.087687,...,0.060467,0.137753,0.038985,0.018988,0.018274,0.045965,0.017367,0.010630,0.021257,0.025499
567,TCGA-61-2612,0.921891,0.815743,0.240453,0.902233,0.535340,0.331119,0.868159,0.114558,0.177753,...,0.066533,0.266707,0.026484,0.025178,0.037937,0.045864,0.034713,0.116062,0.040508,0.054314
568,TCGA-61-2613,0.928122,0.834670,0.071404,0.762165,0.549529,0.292376,0.895432,0.099741,0.014708,...,0.100164,0.036415,0.043217,0.038979,0.034316,0.032903,0.025793,0.162768,0.063007,0.045219


In [117]:
# Write the full methylation table, then the rows whose case_id also appears
# in df_dfs, as zip-compressed CSV files (row index omitted).
dna_in_dfs = dna_df["case_id"].isin(df_dfs["case_id"].values)
dna_df.to_csv("../datasets_csv/tcga_ov_os_dna.csv.zip", index=False, compression="zip")
dna_df_dfs = dna_df.loc[dna_in_dfs]
dna_df_dfs.to_csv("../datasets_csv/tcga_ov_dfs_dna.csv.zip", index=False, compression="zip")