# Datasets

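This notebook loads the GPL570 microarray and TCGA datasets, keeps the cancer types that have enough samples in both, and saves the resulting smaller subsets to disk.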

## Set working directory

In [1]:
import os
from pathlib import Path

# Move to the project root so that the relative paths used in the rest of the
# notebook ("config.toml", "data/...") resolve.
# Only step up one level when the current directory does not already contain
# config.toml: this keeps the cell idempotent, so re-running it does not climb
# one directory further on every execution.
project_root = Path.cwd()
if not (project_root / "config.toml").is_file():
    project_root = project_root.parent
os.chdir(project_root)
print(os.getcwd())

/homes/dwiersma/Desktop/internship


## Load GPL570 microarray data

In [2]:
import tomllib
from src.data import Data

# read the project settings; tomllib only accepts a binary file handle
with open("config.toml", "rb") as config_file:
    config = tomllib.load(config_file)

# build the loader from the settings (config.toml selects the microarray files)
data = Data(config)

# fetch the frame, then swap spaces for underscores in the covariate names
ma_data = data.get_mm_with_tt()
ma_data.columns = ma_data.columns.str.replace(" ", "_")
print(f"ma_data shape: {ma_data.shape}")

2024-01-10 20:51:03,796:src.log_manager:INFO:Loading data...
2024-01-10 20:51:54,095:src.log_manager:INFO:Data loaded in 50.2944 seconds


ma_data shape: (15403, 9710)


## Load TCGA data

In [3]:
# point the loader at the TCGA mixing matrix and its sample annotation file
config["data"]["locations"].update(
    mixing_matrix="data/corrected_mixing_matrix.tsv",
    tumor_types="data/TCGA__Sample_To_TumorType_with_common_cancer_type_mapping_GEO_TCGA.tsv",
)

# the TCGA annotation file uses its own column names for sample id and label
config["data"]["columns"]["tumor_types"].update(sample_name="ID2", response="TYPE3")

# NOTE: `config` is modified in place, so from here on it describes the TCGA data
data = Data(config)

# use "." instead of "-" as separator in the sample names, so that they match
# the sample names in the mixing matrix and annotation data
data.replace_sample_sep(".")

# fetch the frame, then swap spaces for underscores in the covariate names
tcga_data = data.get_mm_with_tt()
tcga_data.columns = tcga_data.columns.str.replace(" ", "_")
print(f"tcga_data shape: {tcga_data.shape}")

2024-01-10 20:51:59,274:src.log_manager:INFO:Loading data...
2024-01-10 20:52:32,092:src.log_manager:INFO:Data loaded in 32.8152 seconds


tcga_data shape: (8862, 9710)


## Datasets limited to selected cancer types

### Find cancer types that have sufficient samples

In [4]:
# minimum number of samples a cancer type needs in *each* dataset; types below
# this are dropped since a model will likely struggle to classify them correctly
min_count = 100

# some TCGA cancer type names end in whitespace; strip it before counting
# NOTE(review): the GPL570 labels are not stripped here - confirm they are clean
tcga_data["response"] = tcga_data["response"].str.rstrip()

# number of samples per cancer type in each dataset
tcga_counts = tcga_data.value_counts("response")
ma_counts = ma_data.value_counts("response")

# put both counts side by side; merge defaults to an inner join, so only
# cancer types that occur in both datasets remain
counts = tcga_counts.to_frame().merge(ma_counts, on="response", suffixes=["_tcga", "_ma"])

# leave out the "Normal" label, since the definition of "normal" may differ
counts = counts.drop(index="Normal", errors="ignore")

# keep the cancer types that reach min_count in both datasets
enough_tcga = counts["count_tcga"] >= min_count
enough_ma = counts["count_ma"] >= min_count
tt_to_keep = counts.loc[enough_tcga & enough_ma].index
counts = counts.loc[counts.index.isin(tt_to_keep)]

# append the combined sample count per cancer type and order by it, largest first
counts = counts.assign(total=counts.sum(axis=1)).sort_values(by=["total"], ascending=False)

print(counts)

                            count_tcga  count_ma  total
response                                               
Colorectal adenocarcinoma          573      2350   2923
Acute myeloid leukemia             173      2094   2267
Breast cancer- ER+/HER2-           554      1549   2103
Lung adenocarcinoma                517       988   1505
HNSCC                              522       329    851
Cutaneous melanoma                 472       358    830
Breast cancer- TNBC                146       661    807
Renal clear cell carcinoma         534       224    758
Gastric adenocarcinoma             415       326    741
Prostate carcinoma                 498       215    713
Hepatocellular carcinoma           373       332    705
Lower grade glioma                 530       144    674
Breast cancer- ER+/HER2+           140       468    608
Glioblastoma multiforme            166       355    521
Ovarian carcinoma                  307       154    461


### Subset with a selected number of cancer types

In [10]:
# number of cancer types to keep, taken from the top of `counts`
# (which is sorted by total sample count, largest first)
n_tt = 15

# A slice past the end of the index is silently truncated, and a zero or
# negative value selects the wrong rows. Either way n_tt would no longer match
# the number of cancer types actually selected, while the output file names are
# built from n_tt - so fail loudly instead.
if not 1 <= n_tt <= len(counts):
    raise ValueError(
        f"n_tt must be between 1 and {len(counts)} "
        f"(the number of available cancer types), got {n_tt}"
    )

selected_tt = counts.index[:n_tt].to_list()

# keep only the samples whose cancer type is among the selected ones
selected_ma_data = ma_data.loc[ma_data["response"].isin(selected_tt)]
selected_tcga_data = tcga_data.loc[tcga_data["response"].isin(selected_tt)]

print("selected GPL570 data:", selected_ma_data.shape)
print("selected TCGA data:", selected_tcga_data.shape)

selected GPL570 data: (10547, 9710)
selected TCGA data: (5920, 9710)


### Save subsets to disk

In [11]:
# the file names record the number of cancer types and the minimum sample count
output_dir = "data/subsets_sorted"
ma_path = f"{output_dir}/ma_{n_tt}ct_min{min_count}s.csv"
tcga_path = f"{output_dir}/tcga_{n_tt}ct_min{min_count}s.csv"

# to_csv does not create missing directories, so make sure the target exists
Path(output_dir).mkdir(parents=True, exist_ok=True)

selected_ma_data.to_csv(ma_path)
selected_tcga_data.to_csv(tcga_path)