In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from collections import Counter
from pathlib import Path

In [2]:
status = pd.read_csv('../data/TCGA_OV_HRDstatus.txt', sep='\t')
status.head()

Unnamed: 0,PatientID,Signature.1,Signature.2,Signature.3,Signature.5,Signature.8,Signature.13,Microhomology2,Microhomology2ratio,Del/ins-ratio,...,SBS39,SBS40,SBS41,ID1,ID2,ID4,ID8,HRDetect,BRCA-status_detailed,BRCA-status
0,TCGA-04-1331,30.1,0.0,185.5,85.5,21.0,0.0,9,0.428571,1.909091,...,0.0,0.0,48.603,0.0,0.0,0.0,32.0,0.974783,BRCA2_somatic_mutated+LOH,BRCA2_deficient
1,TCGA-04-1332,21.9,10.4,40.4,87.3,0.0,0.0,1,0.090909,2.285714,...,0.0,25.438,38.938,0.0,0.0,0.0,6.0,0.161715,intact,intact
2,TCGA-04-1336,13.6,0.0,80.9,49.4,0.0,0.0,8,0.888889,2.615385,...,0.0,44.129,0.0,1.515,0.0,0.0,11.485,0.974439,BRCA2_germline_mutated+LOH,BRCA2_deficient
3,TCGA-04-1341,0.0,0.0,50.7,42.7,0.0,0.0,0,0.0,1.2,...,0.0,0.0,31.444,0.0,0.0,0.0,11.0,0.088818,BRCA1_germline_mutated(UNK)+LOH,intact
4,TCGA-04-1342,0.0,0.0,545.4,419.9,0.0,0.0,3,0.0625,1.066667,...,0.0,0.0,0.0,9.176,0.0,0.0,83.824,0.168023,BRCA2_somatic_mutated(UNK)+LOH,intact


In [3]:
filtered_status = status[['PatientID', 'HRDetect', 'BRCA-status']].copy()
filtered_status.head()

Unnamed: 0,PatientID,HRDetect,BRCA-status
0,TCGA-04-1331,0.974783,BRCA2_deficient
1,TCGA-04-1332,0.161715,intact
2,TCGA-04-1336,0.974439,BRCA2_deficient
3,TCGA-04-1341,0.088818,intact
4,TCGA-04-1342,0.168023,intact


In [4]:
filtered_status.isna().sum()

PatientID      0
HRDetect       0
BRCA-status    0
dtype: int64

In [5]:
Counter(filtered_status['BRCA-status'])

Counter({'intact': 268,
         'BRCA1_deficient': 101,
         'BRCA2_deficient': 39,
         'quiescent': 17})

In [6]:
len(np.unique(filtered_status['PatientID'])), len(filtered_status['PatientID'])

(425, 425)

In [7]:
paths = list(Path('../data/tcga').glob('*.svs'))
paths[:5]

[PosixPath('../data/tcga/TCGA-29-2428-01A-01-BS1.e5b6420f-0b66-4b78-b396-6bf8a075f55b.svs'),
 PosixPath('../data/tcga/TCGA-24-1463-01A-01-BS1.18fd488c-a7e7-4254-9e9f-2fd4b0cf1496.svs'),
 PosixPath('../data/tcga/TCGA-09-2045-11A-01-BS1.893d9759-2b40-4b68-9e57-2e80dffe9a58.svs'),
 PosixPath('../data/tcga/TCGA-09-1659-01B-01-BS1.e8c4fbf1-6540-4e8e-b163-58d009066a2e.svs'),
 PosixPath('../data/tcga/TCGA-10-0930-01A-01-TS1.3ec1ee42-18e6-4a31-96de-f5852709af5b.svs')]

In [8]:
slide_info = np.array([['-'.join(path.name.split('-')[:3]), path.name.split('.')[0][-3:]] for path in paths])
slide_names, slide_types = slide_info[:, 0], slide_info[:, 1]

print(slide_types[:3])
print(slide_names[:3])

['BS1' 'BS1' 'BS1']
['TCGA-29-2428' 'TCGA-24-1463' 'TCGA-09-2045']


In [9]:
Counter(slide_types)

Counter({'BS1': 663, 'TS1': 652, 'DX1': 106, 'BS2': 54, 'TSA': 5, 'DX2': 1})

In [10]:
len(slide_names), len(np.unique(slide_names))

(1481, 590)

In [11]:
# NOTE: Check how many slides are in the status file
len(set(slide_names).intersection(set(filtered_status['PatientID'])))

425

In [12]:
# NOTE: check how many slides has `DX` in their name
filtered_slide_names = [name for name, slide_type in zip(slide_names, slide_types) if 'DX' in slide_type]
len(filtered_slide_names)

107

In [13]:
# NOTE: check how many filtered slides are in the status file
len(set(filtered_slide_names).intersection(set(filtered_status['PatientID'])))

72

In [28]:
# NOTE: update the dataframe with the slide names corresponding to the status file
filtered_status = status[['PatientID', 'HRDetect', 'BRCA-status']].copy()

filtered_status['slide_paths'] = [None] * len(filtered_status)
filtered_status['slide_types'] = [None] * len(filtered_status)

for ind, row in filtered_status.iterrows():
    filtered_status.loc[ind, 'slide_paths'] = ','.join([str(path).split('/')[-1].replace('.svs', '') for path in paths if row['PatientID'] in str(path)])
    filtered_status.loc[ind, 'slide_types'] = ','.join([slide_type for name, slide_type in zip(slide_names, slide_types) if row['PatientID'] in name])

In [29]:
filtered_status

Unnamed: 0,PatientID,HRDetect,BRCA-status,slide_paths,slide_types
0,TCGA-04-1331,0.974783,BRCA2_deficient,TCGA-04-1331-01A-01-BS1.27aaf831-a80b-4a55-a23...,"BS1,TS1"
1,TCGA-04-1332,0.161715,intact,TCGA-04-1332-01A-01-TS1.7d1ae336-5e74-4d12-904...,"TS1,BS1"
2,TCGA-04-1336,0.974439,BRCA2_deficient,TCGA-04-1336-01A-01-BS1.d029552a-1480-4083-846...,"BS1,TS1"
3,TCGA-04-1341,0.088818,intact,TCGA-04-1341-01A-01-BS1.85fe6a7e-fd57-4f26-a84...,"BS1,TS1"
4,TCGA-04-1342,0.168023,intact,TCGA-04-1342-01A-01-TS1.66421418-fc94-4215-9ab...,"TS1,BS1"
...,...,...,...,...,...
420,TCGA-61-2614,0.844115,intact,TCGA-61-2614-01A-01-TS1.9649012a-36a2-4680-919...,"TS1,BS1"
421,TCGA-OY-A56P,0.146777,intact,TCGA-OY-A56P-01A-01-TS1.8A88DA85-D084-483A-955...,"TS1,DX1"
422,TCGA-OY-A56Q,0.009659,intact,TCGA-OY-A56Q-01Z-00-DX1.F1556F26-8845-4962-900...,"DX1,TS1"
423,TCGA-VG-A8LO,0.990892,BRCA1_deficient,TCGA-VG-A8LO-01A-02-DX2.9B58474C-DAC0-4D45-B13...,"DX2,DX1"


In [30]:
# NOTE: do the same for the whole status file
updated_status = status.copy()

updated_status['slide_types'] = [None] * len(updated_status)
updated_status['slide_paths'] = [None] * len(updated_status)
updated_status['FFPE'] = [None] * len(updated_status)

for ind, row in updated_status.iterrows():
    updated_status.loc[ind, 'slide_types'] = ','.join([slide_type for name, slide_type in zip(slide_names, slide_types) if row['PatientID'] in name])
    updated_status.loc[ind, 'slide_paths'] = ','.join([str(path).split('/')[-1].replace('.svs', '')  for path in paths if row['PatientID'] in str(path)])

    updated_status.loc[ind, 'FFPE'] = True if 'DX1' in updated_status.loc[ind, 'slide_types'] or 'DX2' in updated_status.loc[ind, 'slide_types'] else False

In [31]:
updated_status[['PatientID', 'BRCA-status', 'slide_types', 'slide_paths', 'FFPE']].loc[updated_status['FFPE'] == True]

Unnamed: 0,PatientID,BRCA-status,slide_types,slide_paths,FFPE
142,TCGA-13-A5FU,intact,"DX1,TSA",TCGA-13-A5FU-01Z-00-DX1.9AD9E4B9-3F87-4879-BC0...,True
152,TCGA-23-1021,BRCA2_deficient,"TS1,BS1,DX1",TCGA-23-1021-01B-01-TS1.80c2d0cc-5647-4f11-973...,True
153,TCGA-23-1022,BRCA1_deficient,"DX1,BS2,TS1,BS1",TCGA-23-1022-01Z-00-DX1.AF9E523E-CB0F-4AB5-AD4...,True
154,TCGA-23-1027,BRCA1_deficient,"BS2,DX1,BS1",TCGA-23-1027-01A-02-BS2.08a84ac3-d0b4-4056-b4d...,True
155,TCGA-23-1029,BRCA2_deficient,"BS1,DX1,TS1",TCGA-23-1029-01B-01-BS1.f0bd82cb-50ca-4cf3-bd2...,True
...,...,...,...,...,...
367,TCGA-5X-AA5U,intact,"TSA,DX1",TCGA-5X-AA5U-01A-01-TSA.9D54B87F-491D-4482-A0D...,True
421,TCGA-OY-A56P,intact,"TS1,DX1",TCGA-OY-A56P-01A-01-TS1.8A88DA85-D084-483A-955...,True
422,TCGA-OY-A56Q,intact,"DX1,TS1",TCGA-OY-A56Q-01Z-00-DX1.F1556F26-8845-4962-900...,True
423,TCGA-VG-A8LO,BRCA1_deficient,"DX2,DX1",TCGA-VG-A8LO-01A-02-DX2.9B58474C-DAC0-4D45-B13...,True


In [32]:
updated_status[['PatientID', 'BRCA-status', 'HRDetect', 'slide_types', 'slide_paths', 'FFPE']].loc[updated_status['FFPE'] == False]

Unnamed: 0,PatientID,BRCA-status,HRDetect,slide_types,slide_paths,FFPE
0,TCGA-04-1331,BRCA2_deficient,0.974783,"BS1,TS1",TCGA-04-1331-01A-01-BS1.27aaf831-a80b-4a55-a23...,False
1,TCGA-04-1332,intact,0.161715,"TS1,BS1",TCGA-04-1332-01A-01-TS1.7d1ae336-5e74-4d12-904...,False
2,TCGA-04-1336,BRCA2_deficient,0.974439,"BS1,TS1",TCGA-04-1336-01A-01-BS1.d029552a-1480-4083-846...,False
3,TCGA-04-1341,intact,0.088818,"BS1,TS1",TCGA-04-1341-01A-01-BS1.85fe6a7e-fd57-4f26-a84...,False
4,TCGA-04-1342,intact,0.168023,"TS1,BS1",TCGA-04-1342-01A-01-TS1.66421418-fc94-4215-9ab...,False
...,...,...,...,...,...,...
416,TCGA-61-2111,intact,0.055567,"TS1,BS1,BS1,TS1",TCGA-61-2111-11A-01-TS1.9160fa41-6476-47cf-b93...,False
417,TCGA-61-2113,intact,0.009278,"TS1,BS1",TCGA-61-2113-01A-01-TS1.91aa5d6b-d95f-4465-b1a...,False
418,TCGA-61-2612,intact,0.763266,"BS1,TS1,TS1,BS1",TCGA-61-2612-01A-01-BS1.09aaa7ac-b1d2-4859-898...,False
419,TCGA-61-2613,intact,0.994690,"BS1,TS1,TS1,BS1",TCGA-61-2613-01A-01-BS1.a03e1b6b-eb1f-4956-8ed...,False


In [33]:
# NOTE:
# negative samples: quiescent
# positive samples: (BRCA1_deficient OR BRCA2_deficient) AND HRDetect >0.7
# intact   - 0
# negative - 1
# positive - 2

dataset = []
for _, row in filtered_status.iterrows():
    for slide_id, slide_type in zip(row['slide_paths'].split(','), row['slide_types'].split(',')):
        if row['BRCA-status'] == 'BRCA1_deficient' or row['BRCA-status'] == 'BRCA2_deficient':
            if row['HRDetect'] > 0.7:
                    brca_status = 'positive'
        elif row['BRCA-status'] == 'quiescent':
                    brca_status = 'negative'
        elif row['BRCA-status'] == 'intact':
                    brca_status = 'intact'
        sample = {
            'case_id': row['PatientID'],
            'slide_id': slide_id,
            'slide_type': slide_type,
            'brca_status': brca_status,
            'hrd_score': row['HRDetect']
        }

        dataset.append(sample)

dataset = pd.DataFrame(dataset)
dataset = dataset.sample(frac=1, random_state=137).reset_index(drop=True)
dataset.head()

Unnamed: 0,case_id,slide_id,slide_type,brca_status,hrd_score
0,TCGA-36-2532,TCGA-36-2532-01A-01-BS1.acb0df6f-8ce4-4af7-a5d...,BS1,negative,0.005919
1,TCGA-30-1856,TCGA-30-1856-01A-01-TS1.81cdd81e-5115-405e-875...,TS1,intact,0.142597
2,TCGA-57-1994,TCGA-57-1994-01Z-00-DX1.A0798185-30C5-4C32-B1A...,DX1,intact,0.418119
3,TCGA-29-1695,TCGA-29-1695-01A-01-BS1.06ff1b46-dadd-4aa2-a60...,BS1,intact,0.035185
4,TCGA-61-2092,TCGA-61-2092-01A-01-TS1.060203fd-43c4-4e4c-8b3...,TS1,intact,0.002039


In [27]:
dataset.to_csv('../data/brca_dataset.csv', index=False)