In [1]:
import pandas as pd
import numpy as np

In [2]:
# Load the four per-modality tables: clinical, genomic, CT and H&E.
# 'Patient ID' is parsed as str up front. If pandas inferred an integer dtype
# for an all-numeric ID column, leading zeros (IDs such as '017' occur in this
# cohort) would be stripped at parse time and a later .astype(str) could not
# restore them, so those patients would silently fail to match on merge.
DATAFRAME_DIR = "Data/data/dataframes"
ID_DTYPE = {"Patient ID": str}

clin_df = pd.read_csv(f"{DATAFRAME_DIR}/clin_df.csv", dtype=ID_DTYPE)
geno_df = pd.read_csv(f"{DATAFRAME_DIR}/genomic_df.csv", dtype=ID_DTYPE)
ct_df = pd.read_csv(f"{DATAFRAME_DIR}/ct_df.csv", dtype=ID_DTYPE)
hne_df = pd.read_csv(f"{DATAFRAME_DIR}/hne_df.csv", dtype=ID_DTYPE)

In [3]:
# (rows, columns) of each raw table, in load order.
tuple(frame.shape for frame in (clin_df, geno_df, ct_df, hne_df))

((444, 13), (383, 29), (338, 13), (283, 3))

### Clean clinical data

In [4]:
# Clinical covariates carried into the merged table.
clin_cols = [
    'Complete gross resection',
    'stage',
    'age',
    'Type of surgery',
    'adnexal_lesion',
    'omental_lesion',
    'Received PARPi',
]

In [5]:
# Key the clinical table by Patient ID, stored as str so it lines up with the
# other tables when merging on the index.
clin_df = clin_df.astype({'Patient ID': str}).set_index('Patient ID')

### Clean Genomic data

In [7]:
# Key the genomic table by Patient ID, stored as str so it lines up with the
# other tables when merging on the index.
geno_df = geno_df.astype({'Patient ID': str}).set_index('Patient ID')

In [8]:
# Derive a boolean `hrd_status` column from the categorical 'HRD status' label
# ('HRP' -> False, 'HRD' -> True).
# Rows whose label is missing, or is anything other than those two values, are
# dropped. Such rows come out of .map() as NaN, and casting NaN to bool yields
# True, which would silently flag a sample with no usable call as HRD.
HRD_LABEL_TO_FLAG = {'HRP': False, 'HRD': True}

geno_df = (
    geno_df
    .assign(hrd_status=geno_df['HRD status'].map(HRD_LABEL_TO_FLAG))
    .dropna(subset=['hrd_status'])
    .astype({'hrd_status': bool})
)

### Clean CT data

In [18]:
# Key the CT table by Patient ID, stored as str so it lines up with the other
# tables when merging on the index.
ct_df = ct_df.astype({'Patient ID': str}).set_index('Patient ID')

### Clean H&E data

In [19]:
# Key the H&E table by Patient ID, stored as str so it lines up with the other
# tables when merging on the index.
hne_df = hne_df.astype({'Patient ID': str}).set_index('Patient ID')

### Merge dataframes

In [9]:
# Start the merged table from the selected clinical covariates only.
main_df = clin_df.loc[:, clin_cols]
main_df.shape

(444, 7)

In [10]:
# Preview the clinical-only table before any modality is merged in.
main_df.head(n=5)

Unnamed: 0_level_0,Complete gross resection,stage,age,Type of surgery,adnexal_lesion,omental_lesion,Received PARPi
Patient ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
1,True,IV,61.160849,NACT-IDS,True,True,True
2,True,III,87.838467,NACT-IDS,,,False
3,True,III,60.251882,NACT-IDS,True,True,False
4,True,IV,64.566735,NACT-IDS,True,True,False
5,True,IV,74.737851,NACT-IDS,True,True,False


In [11]:
# Number of distinct Patient IDs; compare with the row count shown earlier to
# confirm there is one row per patient.
main_df.index.nunique(dropna=True)

444

In [13]:
# Attach the HRD flag by Patient ID. Left join: every clinical row is kept and
# patients absent from geno_df get a missing hrd_status.
main_df = main_df.join(geno_df[['hrd_status']], how='left')

In [14]:
# (rows, columns) after adding hrd_status.
len(main_df.index), len(main_df.columns)

(444, 8)

In [15]:
# Preview the table now that hrd_status has been merged in.
main_df.head(n=5)

Unnamed: 0_level_0,Complete gross resection,stage,age,Type of surgery,adnexal_lesion,omental_lesion,Received PARPi,hrd_status
Patient ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
1,True,IV,61.160849,NACT-IDS,True,True,True,False
2,True,III,87.838467,NACT-IDS,,,False,False
3,True,III,60.251882,NACT-IDS,True,True,False,True
4,True,IV,64.566735,NACT-IDS,True,True,False,False
5,True,IV,74.737851,NACT-IDS,True,True,False,False


In [17]:
# Rows whose hrd_status is neither True nor False, i.e. rows that ended up
# without an HRD call after the left merge with geno_df.
has_hrd_call = main_df['hrd_status'].isin([True, False])
main_df.loc[~has_hrd_call]

Unnamed: 0_level_0,Complete gross resection,stage,age,Type of surgery,adnexal_lesion,omental_lesion,Received PARPi,hrd_status
Patient ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
017,True,IV,72.109514,NACT-IDS,False,True,False,
033,True,IV,69.541410,NACT-IDS,True,True,False,
040,True,IV,77.286790,NACT-IDS,True,True,False,
041,True,IV,71.906913,NACT-IDS,True,True,False,
087,True,IV,74.661191,NACT-IDS,True,True,False,
...,...,...,...,...,...,...,...,...
TCGA-61-2018,,I,62.000000,,True,False,,
TCGA-61-2087,,I,49.000000,,True,True,,
TCGA-OY-A56P,,III,48.000000,,,,,
TCGA-OY-A56Q,,II,78.000000,,True,False,,


In [20]:
# Attach the CT columns by Patient ID. Left join: patients without a CT row
# keep their clinical data and get missing values in the CT columns.
main_df = pd.merge(
    main_df,
    ct_df,
    how='left',
    left_index=True,
    right_index=True,
)

In [21]:
# Attach the H&E columns by Patient ID (left join). Column names present on
# both sides receive pandas' default suffixes, spelled out here: '_x' for the
# columns already in main_df and '_y' for those coming from hne_df.
main_df = pd.merge(
    main_df,
    hne_df,
    how='left',
    left_index=True,
    right_index=True,
    suffixes=('_x', '_y'),
)

In [23]:
# Preview the table after the CT and H&E columns have been merged in.
main_df.head(n=5)

Unnamed: 0_level_0,Complete gross resection,stage,age,Type of surgery,adnexal_lesion,omental_lesion,Received PARPi,hrd_status,CT Site,CT Vend.,...,mA,Segmenting Radiologist,R_ovary,L_ovary,Omentum,image_path_x,segmentation_path,windowed_image_path,image_path_y,n_foreground_tiles
Patient ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,True,IV,61.160849,NACT-IDS,True,True,True,False,External,GE,...,range,EA,True,True,True,data/ct/mskcc/347437/ctimg.mhd,data/ct/mskcc/347437/segmentation.mha,data/ct/windowed_scans/347437/windowed_ctimg.mhd,data/hne/mskcc/689141.svs,9941.0
2,True,III,87.838467,NACT-IDS,,,False,False,,,...,,,,,,,,,data/hne/mskcc/613631.svs,3781.0
3,True,III,60.251882,NACT-IDS,True,True,False,True,MSKCC,GE,...,range,EA,True,True,True,data/ct/mskcc/347439/ctimg.mhd,data/ct/mskcc/347439/segmentation.mha,data/ct/windowed_scans/347439/windowed_ctimg.mhd,data/hne/mskcc/548343.svs,56634.0
4,True,IV,64.566735,NACT-IDS,True,True,False,False,External,GE,...,range,EA,False,True,True,data/ct/mskcc/347440/ctimg.mhd,data/ct/mskcc/347440/segmentation.mha,data/ct/windowed_scans/347440/windowed_ctimg.mhd,data/hne/mskcc/4172404.svs,17466.0
5,True,IV,74.737851,NACT-IDS,True,True,False,False,External,GE,...,range,EA,False,True,True,data/ct/mskcc/347441/ctimg.mhd,data/ct/mskcc/347441/segmentation.mha,data/ct/windowed_scans/347441/windowed_ctimg.mhd,data/hne/mskcc/4172430.svs,55151.0


In [25]:
# Replace the merge-generated suffixes on the two image path columns with the
# modality each path belongs to.
image_path_names = {
    "image_path_x": "image_path_ct",
    "image_path_y": "image_path_hne",
}
main_df = main_df.rename(columns=image_path_names)

In [26]:
# Spot-check ten random rows of the merged table. A fixed random_state keeps
# the displayed rows the same on every Restart & Run All.
main_df.sample(10, random_state=0)

Unnamed: 0_level_0,Complete gross resection,stage,age,Type of surgery,adnexal_lesion,omental_lesion,Received PARPi,hrd_status,CT Site,CT Vend.,...,mA,Segmenting Radiologist,R_ovary,L_ovary,Omentum,image_path_ct,segmentation_path,windowed_image_path,image_path_hne,n_foreground_tiles
Patient ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
242,True,IV,42.477755,PDS,True,True,True,False,External,Toshiba,...,172.0,YL,True,True,True,data/ct/mskcc/337190/ctimg.mhd,data/ct/mskcc/337190/segmentation.mha,data/ct/windowed_scans/337190/windowed_ctimg.mhd,data/hne/mskcc/3411777.svs,6130.0
164,True,III,78.614648,NACT-IDS,True,True,False,False,External,GE,...,644.0,IN,True,True,True,data/ct/mskcc/333027/ctimg.mhd,data/ct/mskcc/333027/segmentation.mha,data/ct/windowed_scans/333027/windowed_ctimg.mhd,,
073,True,III,73.848049,NACT-IDS,,,False,False,,,...,,,,,,,,,data/hne/mskcc/4068362.svs,28604.0
194,,IV,67.863107,PDS,True,False,True,True,MSKCC,GE,...,380.0,YL,True,True,False,data/ct/mskcc/330685/ctimg.mhd,data/ct/mskcc/330685/segmentation.mha,data/ct/windowed_scans/330685/windowed_ctimg.mhd,,
198,True,III,67.56742,PDS,True,True,False,False,External,GE,...,99.0,YL,True,False,True,data/ct/mskcc/330709/ctimg.mhd,data/ct/mskcc/330709/segmentation.mha,data/ct/windowed_scans/330709/windowed_ctimg.mhd,data/hne/mskcc/1263900.svs,124901.0
TCGA-57-1586,,III,66.0,,,,,,,,...,,,,,,,,,data/hne/tcga/TCGA-57-1586-01Z-00-DX1.5C85A9B1...,11019.0
149,True,IV,58.108145,PDS,True,True,False,True,MSKCC,GE,...,342.0,YL,True,True,True,data/ct/mskcc/330699/ctimg.mhd,data/ct/mskcc/330699/segmentation.mha,data/ct/windowed_scans/330699/windowed_ctimg.mhd,,
114,False,IV,77.002053,NACT-IDS,True,True,False,False,External,GE,...,219.0,YL,True,True,True,data/ct/mskcc/380926/ctimg.mhd,data/ct/mskcc/380926/segmentation.mha,data/ct/windowed_scans/380926/windowed_ctimg.mhd,data/hne/mskcc/923864.svs,62470.0
030,False,IV,63.701574,NACT-IDS,True,True,False,False,MSKCC,GE,...,379.0,EA,True,True,True,data/ct/mskcc/347467/ctimg.mhd,data/ct/mskcc/347467/segmentation.mha,data/ct/windowed_scans/347467/windowed_ctimg.mhd,data/hne/mskcc/394140.svs,2336.0
TCGA-57-1583,,III,57.0,,,,,False,,,...,,,,,,,,,data/hne/tcga/TCGA-57-1583-01Z-00-DX1.88ACB045...,20483.0
