# Downloading individual files off AWS
To get the data for a single id `3A_001`, we used the following AWS CLI commands:

```bash
# List files
aws s3 ls --no-sign-request s3://chimera-challenge/v2/task3/data/3A_001/
# Copy the Clinical Data
aws s3 cp --no-sign-request s3://chimera-challenge/v2/task3/data/3A_001/3A_001_CD.json ./
# Copy the RNASeq Data
aws s3 cp --no-sign-request s3://chimera-challenge/v2/task3/data/3A_001/3A_001_RNA.json ./
```
We also copied the `3A_001_HE.tif` file from the same bucket subfolder, but the download took too long, so working with it is beyond the scope of this proof of concept.

In [5]:
import pandas as pd 
import random

In [6]:
# Load the metadata table and report how many rows it contains.
metadata = pd.read_csv("metadata.csv")
print(metadata.shape[0])

176


In [7]:
# Preview the first 5 rows of every cohort group.
metadata.groupby("cohort").head(5)

Unnamed: 0,chimera_id_t3,cohort,tumor_type,wsi_quality,holes,tumor
0,3A_001,A,Primary,Good,Yes,Yes
1,3A_002,A,Primary,Good,No,Yes
2,3A_003,A,Primary,Good,No,Yes
3,3A_004,A,Primary,Good,Yes,Yes
4,3A_005,A,Primary,Good,Yes,Yes
126,3B_208,B,Primary,Good,Yes,Yes
127,3B_217,B,Primary,Good,Yes,Yes
128,3B_225,B,Primary,Poor,Yes,Yes
129,3B_227,B,Primary,Good,No,Yes
130,3B_229,B,Primary,Good,Yes,Yes


In [8]:
# Map each cohort label to the sub-DataFrame holding that cohort's rows.
# Iterating a GroupBy yields (label, frame) pairs, which dict() consumes directly.
metadata_split_by_cohort = dict(tuple(metadata.groupby("cohort")))

In [9]:
# Print the cohort A frame followed by its row count, one per line.
print(
    metadata_split_by_cohort['A'],
    len(metadata_split_by_cohort['A']),
    sep="\n",
)

    chimera_id_t3 cohort tumor_type wsi_quality holes tumor
0          3A_001      A    Primary        Good   Yes   Yes
1          3A_002      A    Primary        Good    No   Yes
2          3A_003      A    Primary        Good    No   Yes
3          3A_004      A    Primary        Good   Yes   Yes
4          3A_005      A    Primary        Good   Yes   Yes
..            ...    ...        ...         ...   ...   ...
121        3A_168      A    Primary        Good   Yes   Yes
122        3A_169      A    Primary        Good   Yes   Yes
123        3A_186      A    Primary        Good    No   Yes
124        3A_190      A    Primary        Good    No   Yes
125        3A_191      A    Primary        Good    No   Yes

[126 rows x 6 columns]
126


In [10]:
# Print the cohort B frame followed by its row count, one per line.
print(
    metadata_split_by_cohort['B'],
    len(metadata_split_by_cohort['B']),
    sep="\n",
)

    chimera_id_t3 cohort tumor_type wsi_quality holes tumor
126        3B_208      B    Primary        Good   Yes   Yes
127        3B_217      B    Primary        Good   Yes   Yes
128        3B_225      B    Primary        Poor   Yes   Yes
129        3B_227      B    Primary        Good    No   Yes
130        3B_229      B    Primary        Good   Yes   Yes
131        3B_230      B    Primary        Good   Yes   Yes
132        3B_250      B    Primary        Good   Yes   Yes
133        3B_262      B    Primary        Good   Yes   Yes
134        3B_266      B    Primary        Good   Yes   Yes
135        3B_267      B    Primary        Good   Yes   Yes
136        3B_277      B    Primary        Good    No   Yes
137        3B_281      B    Primary        Good   Yes   Yes
138        3B_288      B    Primary        Good   Yes   Yes
139        3B_292      B    Primary        Good   Yes   Yes
140        3B_302      B    Primary        Good   Yes   Yes
141        3B_303      B    Primary     

In [15]:
# Collect the cohort A ids as a plain list and sort it in place.
cohortA = metadata_split_by_cohort['A']['chimera_id_t3'].tolist()
cohortA.sort()
# Print the ids in two slices: the first 100, then the remainder.
print(cohortA[:100])
print(cohortA[100:])

['3A_001', '3A_002', '3A_003', '3A_004', '3A_005', '3A_006', '3A_007', '3A_008', '3A_009', '3A_010', '3A_011', '3A_012', '3A_013', '3A_014', '3A_015', '3A_016', '3A_017', '3A_018', '3A_019', '3A_020', '3A_021', '3A_022', '3A_023', '3A_024', '3A_025', '3A_026', '3A_027', '3A_028', '3A_029', '3A_030', '3A_031', '3A_033', '3A_034', '3A_035', '3A_036', '3A_037', '3A_038', '3A_039', '3A_040', '3A_041', '3A_042', '3A_043', '3A_044', '3A_045', '3A_046', '3A_047', '3A_049', '3A_050', '3A_052', '3A_053', '3A_055', '3A_056', '3A_057', '3A_058', '3A_059', '3A_060', '3A_061', '3A_062', '3A_063', '3A_064', '3A_066', '3A_067', '3A_068', '3A_070', '3A_071', '3A_072', '3A_073', '3A_074', '3A_075', '3A_076', '3A_077', '3A_087', '3A_088', '3A_089', '3A_091', '3A_092', '3A_093', '3A_094', '3A_095', '3A_097', '3A_098', '3A_100', '3A_105', '3A_108', '3A_110', '3A_111', '3A_113', '3A_114', '3A_115', '3A_116', '3A_123', '3A_124', '3A_125', '3A_126', '3A_127', '3A_129', '3A_130', '3A_134', '3A_135', '3A_136']

In [13]:
# Print the cohort B ids in sorted order.
cohort_b_ids = metadata_split_by_cohort['B']['chimera_id_t3']
print(sorted(cohort_b_ids.tolist()))

['3B_208', '3B_217', '3B_225', '3B_227', '3B_229', '3B_230', '3B_250', '3B_262', '3B_266', '3B_267', '3B_277', '3B_281', '3B_288', '3B_292', '3B_302', '3B_303', '3B_304', '3B_309', '3B_310', '3B_319', '3B_321', '3B_322', '3B_328', '3B_337', '3B_338', '3B_342', '3B_351', '3B_354', '3B_357', '3B_361', '3B_362', '3B_365', '3B_367', '3B_370', '3B_385', '3B_389', '3B_390', '3B_397', '3B_399', '3B_408', '3B_410', '3B_411', '3B_413', '3B_415', '3B_417', '3B_418', '3B_426', '3B_428', '3B_429', '3B_431']


Split the data by cohort: cohort A (n=126) is assigned to hospital X, and cohort B (n=50) is assigned to hospital Y.


In [16]:
# Load the clinical data table from a CSV file in the working directory.
clinical_data = pd.read_csv("clinical_data.csv")

In [17]:
# For every column of the clinical table, list the distinct values it takes.
for col, column_values in clinical_data.items():
    unique_values = column_values.unique()
    print(f"Column '{col}' unique values: {unique_values}")

Column 'age' unique values: [72 59 73 63 66 64 57 69 74 77 70 53 55 65 80 83 76 79 71 86 75 67 78 68
 49 61 56 60 88 81 47 62 45 85 54 52 37 82 84 58 87]
Column 'sex' unique values: ['Male' 'Female']
Column 'smoking' unique values: ['No' 'Yes' '-1']
Column 'tumor' unique values: ['Primary']
Column 'stage' unique values: ['T1HG' 'TaHG']
Column 'substage' unique values: ['T1m' 'T1e' '-1']
Column 'grade' unique values: ['G3' 'G2']
Column 'reTUR' unique values: ['Yes' 'No']
Column 'LVI' unique values: ['No' 'Yes' '-1']
Column 'variant' unique values: ['UCC' 'UCC + Variant']
Column 'EORTC' unique values: ['Highest risk' 'High risk']
Column 'no_instillations' unique values: [24. 27. 15.  6. 23.  9. 30. 17. 32. 12. 46. 26. 41. 20. 21. 25.  8. 11.
 18. 14. 10. 45. 39. 36. 16. 33. 38.  5. 42. 19. -1.]
Column 'BRS' unique values: ['BRS2' 'BRS3' 'BRS1']
Column 'progression' unique values: [0 1]
Column 'Time_to_prog_or_FUend' unique values: [110 126  18 155  34 114 108  39  59 124   5  10  93  48 