In [1]:
import sys
import os
import random
import numpy as np
import pandas as pd
import requests
import pickle

sys.path.insert(0, '../')

from pathlib import Path

# TensorFlow imports
import tensorflow as tf
import tensorflow_probability as tfp

%matplotlib inline

In [2]:
# Select strata
# Tumor-type codes to keep (presumably TCGA study abbreviations, matching the
# TCGA files downloaded below); entries that are commented out are excluded.
# NOTE(review): no other cell visible in this notebook reads TUMOR_TYPES - the
# cohort-grouping cell further down hard-codes the same eight codes, so the two
# lists have to be kept in sync by hand.
TUMOR_TYPES = [
    #"BLCA",
    "BRCA",
    #"CESC",
    "COAD",
    #"DLBC",
    "GBM",
    #"HNSC",
    "KICH",
    "KIRC",
    "KIRP",
    #"LAML",
    "LGG",
    #"LIHC",
    #"LUAD",
    #"LUSC",
    #"OV",
    #"PRAD",
    "READ",
    #"SKCM",
    #"STAD",
    #"THCA",
    #"UCEC",
]

In [3]:
# %% Specify relative paths.
# All locations are relative to the notebook's working directory.
FILE_DIR = './'
DATA_DIR = os.path.join(FILE_DIR, "SourceData")
RSUBREAD_FOLDER = os.path.join(DATA_DIR, "rsubread")

# exist_ok=True makes this a no-op when the folder is already present and avoids
# the check-then-create race of a separate os.path.exists() test.
os.makedirs(RSUBREAD_FOLDER, exist_ok=True)

# Specify paths.
# Local file names under which the three downloaded source tables are stored.
clinical_variables_path = os.path.join(RSUBREAD_FOLDER, "clinical_variables.txt.gz")
cancer_type_path = os.path.join(RSUBREAD_FOLDER, "cancer_types.txt.gz")
rsubread_gene_counts_path = os.path.join(RSUBREAD_FOLDER, "gene_counts.txt.gz")

In [4]:
# %% Download data.
def _download_if_missing(url, destination, label, timeout=60, chunk_size=1 << 20):
    """Download ``url`` to ``destination`` unless that file already exists.

    The payload is streamed into ``destination + ".part"`` and only renamed to
    ``destination`` once the transfer has finished, so an interrupted or failed
    download never leaves a file behind that a later run would mistake for
    complete data.

    Parameters
    ----------
    url : str
        Address of the file to fetch.
    destination : str
        Local path the file is stored under.
    label : str
        Human-readable name used in the progress messages.
    timeout : float
        Seconds passed to ``requests.get`` as its ``timeout`` argument.
    chunk_size : int
        Number of bytes requested per streamed chunk.

    Returns
    -------
    bool
        ``True`` if the file was downloaded, ``False`` if it was already there.

    Raises
    ------
    requests.HTTPError
        If the server answers with an error status (via ``raise_for_status``).
    """
    if os.path.exists(destination):
        print("Raw data exists. Skipping Download.")
        return False

    print(f"Started Download of {label}...")
    partial_path = destination + ".part"
    try:
        with requests.get(url, stream=True, timeout=timeout) as response:
            # Fail loudly instead of saving an HTTP error page as a .gz file.
            response.raise_for_status()
            with open(partial_path, "wb") as f:
                for chunk in response.iter_content(chunk_size=chunk_size):
                    f.write(chunk)
        os.replace(partial_path, destination)
    except BaseException:
        # Also covers KeyboardInterrupt: remove the incomplete file, then re-raise.
        if os.path.exists(partial_path):
            os.remove(partial_path)
        raise

    print("Done.")
    return True


print("____ Downloading data ____ \n")

# Clinical Variables
clinical_variables_url = r"https://ftp.ncbi.nlm.nih.gov/geo/series/GSE62nnn/GSE62944/suppl/GSE62944%5F06%5F01%5F15%5FTCGA%5F24%5F548%5FClinical%5FVariables%5F9264%5FSamples%2Etxt%2Egz"
_download_if_missing(clinical_variables_url, clinical_variables_path, "Clinical Variables")

# Cancer types.
cancer_type_url = r"https://ftp.ncbi.nlm.nih.gov/geo/series/GSE62nnn/GSE62944/suppl/GSE62944%5F06%5F01%5F15%5FTCGA%5F24%5FCancerType%5FSamples%2Etxt%2Egz"
_download_if_missing(cancer_type_url, cancer_type_path, "Cancer Types")

# Gene counts.
rsubread_gene_counts_url = r"https://ftp.ncbi.nlm.nih.gov/geo/samples/GSM1536nnn/GSM1536837/suppl/GSM1536837%5F06%5F01%5F15%5FTCGA%5F24%2Etumor%5FRsubread%5FFeatureCounts%2Etxt%2Egz"
_download_if_missing(rsubread_gene_counts_url, rsubread_gene_counts_path, "Gene Counts")

____ Downloading data ____ 

Started Download of Clinical Variables...
Done.
Started Download of Cancer Types...
Done.
Started Download of Gene Counts...


KeyboardInterrupt: 

In [5]:
# %% Open downloaded data.

print("Opening downloaded data...")

# All three source files are read as gzip-compressed, tab-separated tables.
_tsv_gz_options = dict(sep="\t", compression="gzip")

clinical_variables = pd.read_csv(
    clinical_variables_path, low_memory=False, **_tsv_gz_options
)

# The first row of the cancer-type file is consumed as the header and its
# labels are overridden with explicit column names.
cancer_types = pd.read_csv(
    cancer_type_path,
    header=0,
    names=["patient_id", "tumor_type"],
    **_tsv_gz_options,
)

gene_counts = pd.read_csv(rsubread_gene_counts_path, **_tsv_gz_options)


Opening downloaded data...


In [None]:
# %% Clinical Variables
# Reshape the raw table: the "Unnamed: 0" column holds the variable names, so
# after indexing by it and transposing, every row describes one patient.
clinical_variables = clinical_variables.drop(columns=["Unnamed: 1", "Unnamed: 2"])
clinical_variables = clinical_variables.set_index("Unnamed: 0")
clinical_variables = clinical_variables.loc[
    ["vital_status", "last_contact_days_to", "death_days_to"], :
].T
clinical_variables = clinical_variables.dropna(subset=["vital_status"])
clinical_variables = clinical_variables.dropna(
    subset=["last_contact_days_to", "death_days_to"]
)
clinical_variables = clinical_variables.loc[
    clinical_variables["vital_status"] != "[Not Available]", :
]

# Survival time: days to death for deceased patients, days to last contact for
# living ones, and the placeholder -1 for any other vital status.
# Built positionally with np.where and attached with .assign() instead of the
# former chained assignment (frame.time.loc[mask] = ...), which pandas does not
# guarantee to write back and which is a no-op under Copy-on-Write.
is_dead = (clinical_variables["vital_status"] == "Dead").to_numpy()
is_alive = (clinical_variables["vital_status"] == "Alive").to_numpy()
raw_time = np.where(
    is_dead,
    clinical_variables["death_days_to"].to_numpy(dtype=object),
    np.where(
        is_alive,
        clinical_variables["last_contact_days_to"].to_numpy(dtype=object),
        -1,
    ),
)
clinical_variables = clinical_variables.assign(time=raw_time)

# Drop all not usable data points.
unusable_time_markers = ["[Not Available]", "[Discrepancy]", "[Completed]"]
clinical_variables = clinical_variables.loc[
    ~clinical_variables["time"].isin(unusable_time_markers)
]

# Drop non-positive survival times (this also removes the -1 placeholders).
clinical_variables = clinical_variables.assign(
    time=pd.to_numeric(clinical_variables["time"])
)
clinical_variables = clinical_variables.loc[clinical_variables["time"] > 0]

# Set event indicator. Person died := event == True
# Only "Dead"/"Alive" rows are left at this point (every other status carried
# the -1 placeholder and was filtered out above), so a single comparison is
# enough and yields a clean boolean column.
clinical_variables = clinical_variables.assign(
    event=clinical_variables["vital_status"] == "Dead"
)

clinical_variables = (
    clinical_variables.loc[:, ["time", "event"]]
    .reset_index()
    .rename(columns={"index": "patient_id"})
)

print("Done.")
# Merge with cancer types.
print("Merging with cancer types.")

patients = pd.merge(cancer_types, clinical_variables, on=["patient_id"])

print("Done.")

# Merge with gene Counts.
print("Merging with gene counts.")

# Same reshaping as for the clinical table: index by the "Unnamed: 0" column,
# transpose, and expose the former column labels as "patient_id".
gene_counts.set_index("Unnamed: 0", inplace=True)
gene_counts = gene_counts.T
gene_counts = gene_counts.reset_index().rename(columns={"index": "patient_id"})

print("Done.")
# Data frame with all possible tumor types.
print("Merging all together.")

full_data = pd.merge(patients, gene_counts, on=["patient_id"])
print("Done.")

print("Saving merged data...")
full_data.to_pickle(os.path.join(RSUBREAD_FOLDER, "complete_data_merged.pickle"))

In [None]:
def Stacking(DataSet):
    """Return ``DataSet`` in long format.

    The frame is stacked, the resulting index levels are turned back into
    ordinary columns, and the column labelled ``0`` is renamed to
    ``'GeneCount'``.
    """
    long_form = DataSet.stack()
    long_form = long_form.reset_index()
    return long_form.rename(columns={0: 'GeneCount'})

In [None]:
# Plain name binding, not a copy: TotalData and full_data refer to the same
# DataFrame object, so in-place edits made through one name are visible
# through the other as well.
TotalData = full_data

In [None]:
# Combined cohorts kept for the analysis, in sorted order.
TUMOR_TYPE_COMBINATION = sorted(["COLO", "BRCA", "GLIOMA", "KIPAN"])

# Relabel individual tumor types with the combined cohort they are merged into.
for _single_type, _combined_type in (
    ("GBM", "GLIOMA"),
    ("LGG", "GLIOMA"),
    ("KIRP", "KIPAN"),
    ("KICH", "KIPAN"),
    ("KIRC", "KIPAN"),
    ("COAD", "COLO"),
    ("READ", "COLO"),
):
    TotalData.loc[TotalData['tumor_type'] == _single_type, 'tumor_type'] = _combined_type

_in_selected_cohort = TotalData['tumor_type'].isin(TUMOR_TYPE_COMBINATION)
TotalData = TotalData[_in_selected_cohort].copy()


## Variables which have small values should be removed
# Column-wise statistics over everything after the four leading columns.
RemCheckVar = TotalData.iloc[:, 4:].var()
RemCheckSum = TotalData.iloc[:, 4:].sum()

TotalData_copy = TotalData.copy()

# TotalData: drop the columns whose sum lies below the 0.5 quantile of all sums.
_sum_cutoff = RemCheckSum.quantile(0.5)
RemList = RemCheckSum[RemCheckSum < _sum_cutoff].index.to_list()
TotalData = TotalData.drop(columns=RemList)

# TotalData1: same filter with the 0.995 quantile, applied to the untouched copy.
_sum_cutoff_strict = RemCheckSum.quantile(0.995)
RemList1 = RemCheckSum[RemCheckSum < _sum_cutoff_strict].index.to_list()
TotalData1 = TotalData_copy.drop(columns=RemList1)


In [None]:
# log2(value + 1) of every column after the first four; the four leading
# columns are carried over unchanged with a fresh 0..n-1 index so that both
# parts line up row by row when concatenated side by side.
_value_column_labels = TotalData.columns[4:]
LogTotalData = pd.concat(
    [
        TotalData.iloc[:, :4].reset_index(drop=True),
        pd.DataFrame(
            np.log2(TotalData.iloc[:, 4:].values + 1),
            columns=_value_column_labels,
        ),
    ],
    axis=1,
)

In [None]:
# Min-max scale the log-transformed values separately for every tumor type.
# Each tumor type uses ONE minimum and ONE maximum taken over all of its value
# columns and rows together (not a per-column range).

# Fixed seed so that the row shuffle below is reproducible across runs.
SHUFFLE_SEED = 42

_scaled_groups = []
for Type in LogTotalData['tumor_type'].unique():
    DataSub = LogTotalData[LogTotalData['tumor_type'] == Type].copy()

    SubMin = np.min(DataSub.iloc[:, 4:].values)
    SubMax = np.max(DataSub.iloc[:, 4:].values)

    ## Normalization
    DataSub.iloc[:, 4:] = (DataSub.iloc[:, 4:] - SubMin) / (SubMax - SubMin)
    _scaled_groups.append(DataSub)

# DataFrame.append was removed in pandas 2.0; collecting the pieces and calling
# pd.concat once also avoids re-copying the growing frame on every iteration.
TotalData = pd.concat(_scaled_groups)

# Shuffle the rows and renumber them 0..n-1.
TotalData = TotalData.sample(frac=1, random_state=SHUFFLE_SEED).reset_index(drop=True)
gene_counts = TotalData.iloc[:, 4:]

# Log-transformed (unscaled) rows in the same order as the shuffled table,
# obtained by joining on the four leading columns.
LogAnalData = pd.merge(
    TotalData[['patient_id', 'tumor_type', 'time', 'event']],
    LogTotalData,
    on=['patient_id', 'tumor_type', 'time', 'event'],
)

In [12]:
# TTE selection and generating the distance matrix
TTE = TotalData['time'].values.astype('float32')

# Outer product of the survival times (entry [i, j] = TTE[i] * TTE[j]).
TTEXY = np.matmul(TTE[:, None], TTE[None])

# Pairwise distance between survival times. For scalars, sqrt(a^2 + b^2 - 2ab)
# is just |a - b|; the expanded form loses precision in float32 because the
# large squared terms cancel, so the difference is taken directly. The lower
# bound sqrt(1e-7) is the floor the expanded formula applied before.
_min_distance = np.float32(np.sqrt(1e-7))
DisimInd = np.maximum(np.abs(TTE[:, None] - TTE[None]), _min_distance)

Event = TotalData['event'].values.astype('int32')

# Re-index the rows of the value table starting at 1 instead of 0.
GeneCount = gene_counts.reset_index(drop=True)
GeneCount.index = GeneCount.index + 1


# generating gene to int map
# Column labels are replaced by integer codes starting at 1; IntToGene is the
# inverse lookup from code back to the original label.
GeneList = GeneCount.columns.to_list()
GeneToInt = {gene: code for code, gene in enumerate(GeneList, start=1)}
IntToGene = dict(enumerate(GeneList, start=1))
GeneCount.columns = GeneToInt.values()

# Long-format version of the integer-labelled table.
StakedgData = Stacking(GeneCount)


In [16]:
# Output folder for the processed artefacts, relative to the working directory.
folder_name = "processedData"

# exist_ok=True makes this a no-op when the folder already exists and avoids
# the check-then-create race of a separate os.path.exists() test.
os.makedirs(folder_name, exist_ok=True)

In [17]:
# Write every artefact into the folder named by `folder_name`. The paths were
# previously spelled './processedData/...' for the pickle but './ProcessedData/...'
# for the .npy files; on a case-sensitive file system the latter folder does not
# exist, so np.save failed. Building all paths from `folder_name` keeps them
# consistent with the folder that is actually created.
TotalData1.to_pickle(os.path.join(folder_name, 'TotalData1.pkl'))
np.save(os.path.join(folder_name, 'GeneCount.npy'), GeneCount)
np.save(os.path.join(folder_name, 'TTE.npy'), TTE)
np.save(os.path.join(folder_name, 'EVENT.npy'), Event)