# DeepPATH02a

This notebook prepares the CSV file that step `02a` of DeepPATH uses to sort the tiles.

In [2]:
# dependencies
from pathlib import Path
import pandas as pd

In [2]:
# Full path to the output csv file
csv_path = r'E:\Project\External Cohort\classes.csv'
# Directory that is searched for .svs files
glob_dir = r'E:\Project\External Cohort\Images\Raw'
# Matches .svs files exactly one folder level below glob_dir. '/' is accepted
# as a separator by pathlib on every OS, whereas '\' only separates path
# components on Windows.
glob_pattern = '*/*.svs'
# glob() gives no ordering guarantee; sorting keeps the row order of the
# generated csv reproducible between runs.
svs_paths = sorted(Path(glob_dir).glob(glob_pattern))
svs_paths[0:10]

[WindowsPath('E:/Project/External Cohort/Images/Raw/CMB-BRCA/CMB-BRCA_MSB-00156_MSB-00156-01-05.svs'),
 WindowsPath('E:/Project/External Cohort/Images/Raw/CMB-BRCA/CMB-BRCA_MSB-00587_MSB-00587-01-01.svs'),
 WindowsPath('E:/Project/External Cohort/Images/Raw/CMB-BRCA/CMB-BRCA_MSB-01798_MSB-01798-02-02.svs'),
 WindowsPath('E:/Project/External Cohort/Images/Raw/CMB-BRCA/CMB-BRCA_MSB-02137_MSB-02137-01-02.svs'),
 WindowsPath('E:/Project/External Cohort/Images/Raw/CMB-BRCA/CMB-BRCA_MSB-02664_MSB-02664-01-05.svs'),
 WindowsPath('E:/Project/External Cohort/Images/Raw/CMB-BRCA/CMB-BRCA_MSB-02664_MSB-02664-01-10.svs'),
 WindowsPath('E:/Project/External Cohort/Images/Raw/CMB-BRCA/CMB-BRCA_MSB-04856_MSB-04856-01-02.svs'),
 WindowsPath('E:/Project/External Cohort/Images/Raw/CMB-BRCA/CMB-BRCA_MSB-05650_MSB-05650-01-02.svs'),
 WindowsPath('E:/Project/External Cohort/Images/Raw/CMB-BRCA/CMB-BRCA_MSB-05960_MSB-05960-01-01.svs'),
 WindowsPath('E:/Project/External Cohort/Images/Raw/CMB-BRCA/CMB-BRCA_MSB

In [3]:
# TCGA abbreviation -> class index (position in the tuple below).
# The trailing comment names the cohort prefix that converter() maps onto the
# abbreviation; 'none' marks abbreviations converter() never returns.
TCGA_dict = {
    abbreviation: class_index
    for class_index, abbreviation in enumerate((
        'BLCA',  # none
        'BRCA',  # CMB-BRCA
        'CESC',  # none
        'COAD',  # CPTAC-COAD
        'LUAD',  # CPTAC-LUAD
        'LUSC',  # CPTAC-LSCC
        'PRAD',  # CMB-PCA
        'SKCM',  # CPTAC-CM
        'STAD',  # none
        'UCEC',  # CPTAC-UCEC
    ))
}
# converter function >> converts file name to labels
def converter(filename):
    """Translate a slide file name into its TCGA abbreviation.

    The cohort prefixes are tried in the order listed below and the first one
    contained in ``filename`` decides the result.

    Args:
        filename: name of the slide file (anything supporting ``in``).

    Returns:
        The TCGA abbreviation of the matching cohort, or the string ``'NULL'``
        (after printing a message) when no cohort prefix is contained.
    """
    cohort_to_tcga = (
        ('CMB-BRCA', 'BRCA'),
        ('CMB-PCA', 'PRAD'),
        ('CPTAC-CM', 'SKCM'),
        ('CPTAC-COAD', 'COAD'),
        ('CPTAC-LSCC', 'LUSC'),
        ('CPTAC-LUAD', 'LUAD'),
        ('CPTAC-UCEC', 'UCEC'),
    )
    for cohort, tcga in cohort_to_tcga:
        if cohort in filename:
            return tcga

    # no cohort prefix matched
    print(f"{filename} is not properly translated, appending NULL.")
    return 'NULL'

# function to create pd.df from a list of Path objects
def df_paths(paths_list: list[Path]):
    """Build the sample/label/index table for a list of slide paths.

    Args:
        paths_list: paths of the slides; only the file stem of each is used.

    Returns:
        pd.DataFrame with one row per path and the columns ``sample`` (file
        stem), ``label`` (result of ``converter``) and ``index`` (the label's
        value in ``TCGA_dict``, or -1 when the label is not a key of it).
    """
    records = []
    for svs in paths_list:
        filename = svs.stem
        label = converter(filename)
        # converter() returns 'NULL' for unrecognised names, which is not a
        # key of TCGA_dict; fall back to -1 instead of raising KeyError so the
        # row is still appended, as the printed message announces.
        index = TCGA_dict.get(label, -1)
        records.append((filename, label, index))

    return pd.DataFrame(records, columns=['sample', 'label', 'index'])


In [4]:
# Build the class table from the collected slides and write it to csv_path.
class_df = df_paths(svs_paths)
# make sure the output folder exists before writing
Path(csv_path).parent.mkdir(exist_ok=True, parents=True)
# space-separated rows, without a header line and without the row index
class_df.to_csv(csv_path, sep=' ', header=False, index=False)

# preview the first rows of the table
class_df.head(10)

Unnamed: 0,sample,label,index
0,CMB-BRCA_MSB-00156_MSB-00156-01-05,BRCA,1
1,CMB-BRCA_MSB-00587_MSB-00587-01-01,BRCA,1
2,CMB-BRCA_MSB-01798_MSB-01798-02-02,BRCA,1
3,CMB-BRCA_MSB-02137_MSB-02137-01-02,BRCA,1
4,CMB-BRCA_MSB-02664_MSB-02664-01-05,BRCA,1
5,CMB-BRCA_MSB-02664_MSB-02664-01-10,BRCA,1
6,CMB-BRCA_MSB-04856_MSB-04856-01-02,BRCA,1
7,CMB-BRCA_MSB-05650_MSB-05650-01-02,BRCA,1
8,CMB-BRCA_MSB-05960_MSB-05960-01-01,BRCA,1
9,CMB-BRCA_MSB-06586_MSB-06586-03-01,BRCA,1


In [5]:
# Use this block to read the csv back and check what was written.
# The path is handed to pandas directly so the file is decoded with pandas'
# default utf-8 encoding, the same default to_csv() uses when writing; the
# built-in open() in text mode would use the platform's locale encoding.
read_df = pd.read_csv(csv_path, header=None, sep=' ')

read_df

Unnamed: 0,0
0,CMB-BRCA_MSB-00156_MSB-00156-01-05 BRCA 1
1,CMB-BRCA_MSB-00587_MSB-00587-01-01 BRCA 1
2,CMB-BRCA_MSB-01798_MSB-01798-02-02 BRCA 1
3,CMB-BRCA_MSB-02137_MSB-02137-01-02 BRCA 1
4,CMB-BRCA_MSB-02664_MSB-02664-01-05 BRCA 1
...,...
126,CPTAC-UCEC_C3N-01537_C3N-01537-22 UCEC 9
127,CPTAC-UCEC_C3N-02027_C3N-02027-23 UCEC 9
128,CPTAC-UCEC_C3N-02253_C3N-02253-27 UCEC 9
129,CPTAC-UCEC_C3N-02595_C3N-02595-25 UCEC 9


### GTEX

In [3]:
# Full path to the output csv file
csv_path = r'F:\Data\GTEX\meta.csv'
# Directory that is searched for .svs files
glob_dir = r'F:\Data\GTEX'
# Matches .svs files exactly one folder level below glob_dir. '/' is accepted
# as a separator by pathlib on every OS, whereas '\' only separates path
# components on Windows.
glob_pattern = '*/*.svs'
# glob() gives no ordering guarantee; sorting keeps the row order of the
# generated csv reproducible between runs.
svs_paths = sorted(Path(glob_dir).glob(glob_pattern))
svs_paths[0:10]

[WindowsPath('F:/Data/GTEX/Bladder/GTEX-N7MS-2126.svs'),
 WindowsPath('F:/Data/GTEX/Bladder/GTEX-OOBJ-1926.svs'),
 WindowsPath('F:/Data/GTEX/Bladder/GTEX-OXRK-1926.svs'),
 WindowsPath('F:/Data/GTEX/Bladder/GTEX-PW2O-1026.svs'),
 WindowsPath('F:/Data/GTEX/Bladder/GTEX-PWN1-1926.svs'),
 WindowsPath('F:/Data/GTEX/Bladder/GTEX-QLQW-0826.svs'),
 WindowsPath('F:/Data/GTEX/Bladder/GTEX-QV31-0926.svs'),
 WindowsPath('F:/Data/GTEX/Bladder/GTEX-S4UY-0926.svs'),
 WindowsPath('F:/Data/GTEX/Bladder/GTEX-S7SE-2326.svs'),
 WindowsPath('F:/Data/GTEX/Bladder/GTEX-S95S-0626.svs')]

In [9]:
# TCGA abbreviation -> class index (position in the tuple below).
# The trailing comment names the folder keyword that converter_GTEX_parent()
# maps onto the abbreviation; 'none' marks abbreviations it never returns.
TCGA_dict = {
    abbreviation: class_index
    for class_index, abbreviation in enumerate((
        'BLCA',  # Bladder
        'BRCA',  # Breast
        'CESC',  # Cervix
        'COAD',  # Colon
        'LUAD',  # Lung
        'LUSC',  # none
        'PRAD',  # Prostate
        'SKCM',  # Skin
        'STAD',  # Stomach
        'UCEC',  # Uterus
    ))
}

# converter function >> converts file name to labels
def converter_GTEX_parent(parent_name):
    """Translate the name of a slide's parent folder into a TCGA abbreviation.

    The name is lower-cased, then the tissue keywords are tried in the order
    listed below and the first one contained in the name decides the result.

    Args:
        parent_name: name of the folder that contains the slide.

    Returns:
        The TCGA abbreviation of the matching tissue, or the string ``'NULL'``
        (after printing a message with the lower-cased name) when no keyword
        is contained.
    """
    lowered = parent_name.lower()
    tissue_to_tcga = (
        ('bladder', 'BLCA'),
        ('breast', 'BRCA'),
        ('cervix', 'CESC'),
        ('colon', 'COAD'),
        ('lung', 'LUAD'),
        ('prostate', 'PRAD'),
        ('skin', 'SKCM'),
        ('stomach', 'STAD'),
        ('uterus', 'UCEC'),
    )
    for tissue, tcga in tissue_to_tcga:
        if tissue in lowered:
            return tcga

    # no tissue keyword matched
    print(f"{lowered} is not properly translated, appending NULL.")
    return 'NULL'

# function to create pd.df from a list of Path objects
def df_paths_GTEX(paths_list: list[Path]):
    """Build the sample/label/index table for slides labelled by parent folder.

    Args:
        paths_list: paths of the slides; the file stem becomes the sample name
            and the name of the containing folder decides the label.

    Returns:
        pd.DataFrame with one row per path and the columns ``sample`` (file
        stem), ``label`` (result of ``converter_GTEX_parent``) and ``index``
        (the label's value in ``TCGA_dict``, or -1 when the label is not a key
        of it).
    """
    records = []
    for svs in paths_list:
        filename = svs.stem
        # .name is the full folder name; .stem would cut it at the last dot
        parent = svs.parent.name
        label = converter_GTEX_parent(parent)
        # converter_GTEX_parent() returns 'NULL' for unrecognised folders,
        # which is not a key of TCGA_dict; fall back to -1 instead of raising
        # KeyError so the row is still appended, as the printed message
        # announces.
        index = TCGA_dict.get(label, -1)
        records.append((filename, label, index))

    return pd.DataFrame(records, columns=['sample', 'label', 'index'])


In [10]:
# Build the class table from the collected slides and write it to csv_path.
class_df = df_paths_GTEX(svs_paths)
# make sure the output folder exists before writing
Path(csv_path).parent.mkdir(exist_ok=True, parents=True)
# space-separated rows, without a header line and without the row index
class_df.to_csv(csv_path, sep=' ', header=False, index=False)

# preview the first rows of the table
class_df.head(10)

Unnamed: 0,sample,label,index
0,GTEX-N7MS-2126,BLCA,0
1,GTEX-OOBJ-1926,BLCA,0
2,GTEX-OXRK-1926,BLCA,0
3,GTEX-PW2O-1026,BLCA,0
4,GTEX-PWN1-1926,BLCA,0
5,GTEX-QLQW-0826,BLCA,0
6,GTEX-QV31-0926,BLCA,0
7,GTEX-S4UY-0926,BLCA,0
8,GTEX-S7SE-2326,BLCA,0
9,GTEX-S95S-0626,BLCA,0
