In [18]:
import os
import re
import sys
from pathlib import Path

import h5py
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import scanpy as sc
import skimage
from skimage import io
from sklearn import preprocessing
from tqdm.notebook import tqdm, trange
import anndata as ad
import cv2
# import scanorama
from sklearn.model_selection import train_test_split

In [19]:
# Import spatial omics library
#import athena as ath
from spatialOmics import SpatialOmics

# import default graph builder parameters
#from athena.graph_builder.constants import GRAPH_BUILDER_DEFAULT_PARAMS

In [20]:
# Check pytorch: report whether CUDA is usable and, if so, which device is active.
import torch

cuda_ok = torch.cuda.is_available()
print(cuda_ok)
if cuda_ok:
    # Device queries are only attempted when CUDA is available, so the cell
    # also runs to completion on a CPU-only machine.
    print(torch.cuda.current_device())
    print(torch.cuda.device(0))
    print(torch.cuda.device_count())
    print(torch.cuda.get_device_name(0))
else:
    print('CUDA is not available; skipping device queries.')


True
0
<torch.cuda.device object at 0x0000017CEEDCBE50>
1
NVIDIA RTX A2000 12GB


In [21]:
import torch_geometric.utils

# Root of the project on the lab share. The two data folders below are built as
# plain backslash-separated strings (str, not pathlib.Path objects).
p_dir = 'Y:\\coskun-lab\\Thomas\\11_snowflakes'
# Folder holding the SpatialOmics .hdf5 files that are read later on.
spatial_omics_folder = f'{p_dir}\\data\\spatial_omics_graph_external'
# Folder under which the exported torch graph data is written.
process_path = f'{p_dir}\\data\\torch_graph_data'

In [22]:
from torch_geometric.transforms import LocalCartesian, Cartesian, Polar

# Breast Cancer

In [23]:
### Read clinical data
# Tab-separated clinical table; it is filtered on its 'METABRIC.ID' column further down.
path = (
    r'Y:\coskun-lab\Thomas\graph\Breast IMC 2\clinical data\41586_2019_1007_MOESM7_ESM.txt'
)
df_info = pd.read_csv(path, delimiter="\t")

In [33]:
# Display the column labels of df_info.
df_info.columns

Index(['METABRIC.ID', 'MATCHED.NORMAL.METABRIC.ID', 'Cohort',
       'Age.At.Diagnosis', 'Breast.Tumour.Laterality', 'Date.Of.Diagnosis',
       'Last.Followup.Status', 'NPI', 'ER.Status', 'Inferred.Menopausal.State',
       'Lymph.Nodes.Positive', 'Breast.Surgery', 'CT', 'HT', 'RT', 'Grade',
       'Size', 'Histological.Type', 'Stage', 'DeathBreast', 'Death', 'T',
       'TLR', 'LR', 'TDR', 'DR', 'Complete.Rec.History'],
      dtype='object')

In [25]:
# Load the SpatialOmics object for the breast dataset from its .hdf5 file.
data_name = 'Breast_IMC_2'
# spatial_omics_folder is a plain string, so the file name is appended with a backslash.
spatial_path = f'{spatial_omics_folder}\\{data_name}.hdf5'
spadata = SpatialOmics.from_h5py(spatial_path)



KeyboardInterrupt



In [34]:
# Display the observation table stored under key '2' of spadata.obs.
spadata.obs['2']

Unnamed: 0,metabricId,core_id,ImageNumber,ObjectNumber,Location_Center_X,Location_Center_Y,SOM_nodes,pg_cluster,description,meta_id,y,x
1,MB-0433,1,2,1,138.555560,2.388889,138,31,HR+ CK7-,6,2.388889,138.555556
2,MB-0433,1,2,2,42.799999,2.400000,5,22,T cells,20,2.400000,42.800000
3,MB-0433,1,2,3,145.937500,4.250000,124,28,HRlow CKlow,13,4.250000,145.937500
4,MB-0433,1,2,4,109.352940,2.970588,218,48,HR+ CK7-,6,2.970588,109.352941
5,MB-0433,1,2,5,131.217390,5.695652,154,31,HR+ CK7-,6,5.695652,131.217391
...,...,...,...,...,...,...,...,...,...,...,...,...
862,MB-0433,1,2,862,377.626010,556.024410,99,54,HR- CK7-,10,556.024390,377.626016
863,MB-0433,1,2,863,127.469390,556.285710,115,54,HR- CK7-,10,556.285714,127.469388
864,MB-0433,1,2,864,179.000000,556.666690,100,54,HR- CK7-,10,556.666667,179.000000
865,MB-0433,1,2,865,322.128570,557.228580,97,54,HR- CK7-,10,557.228571,322.128571


In [12]:
# `identifier` names the spadata.spl column that is iterated to get sample keys;
# `status`, `grade` and `stage` name the clinical-table columns read per sample.
identifier, status, grade, stage = 'ImageNumber', 'ER.Status', 'Grade', 'Stage'

In [13]:
# Maps the textual status value to the integer graph label.
er_dict = dict(pos=1, neg=0)

In [11]:
# Export one torch_geometric graph per breast sample and index the saved files in a csv.
#
# NOTE(review): the last lines rebind `df_info` from the clinical table to the
# saved-graph index, so the clinical table has to be re-read before this cell
# can be run a second time.

# Output locations. Path(...) makes the `/` joins work even when process_path
# is a plain string.
save_path = Path(process_path) / 'Breast'
pt_path = save_path / 'pt'
pt_path.mkdir(parents=True, exist_ok=True)

# Create csv for all graph info
csv_path = save_path / 'info.csv'

data_names = []
foll_ids = []
paths = []

# Polar transform to be added to the data; it does not depend on the sample,
# so it is built once outside the loop.
pos_transform = Polar()

# Loop over every sample key listed in spadata.spl[identifier].
# (The loop variable keeps its original name `id`.)
for id in spadata.spl[identifier]:
    key = str(id)
    obs = spadata.obs[key]
    meta_id = obs.iloc[0].metabricId
    df_id = df_info[df_info['METABRIC.ID'] == meta_id]
    try:
        # .item() raises ValueError unless exactly one clinical row matches;
        # er_dict raises KeyError for any status value it does not map.
        er = df_id[status].item()
        er_label = er_dict[er]
    except (ValueError, KeyError):
        # Skip samples without a single, usable status value.
        continue

    # Generate torch data
    G = spadata.G[key]['contact']
    nodes = np.array(G.nodes())
    # Positions, cell classes and features are all selected by the graph's
    # node labels so that row i of each array describes the same node.
    pos = obs[['x', 'y']].loc[nodes].to_numpy()
    classes = obs['description'].loc[nodes].to_numpy()

    df_int = spadata.X[key]
    X = df_int.loc[nodes].values

    data = torch_geometric.utils.from_networkx(G)
    data.x = torch.tensor(X).float()
    data.pos = torch.tensor(pos).float()
    data.node_types = classes

    data.label = torch.tensor(er_label)
    data.stage = torch.tensor(df_id[stage].item())
    data.grade = torch.tensor(df_id[grade].item())

    data = pos_transform(data)

    # Save data and info
    path = pt_path / f'{meta_id}_{id}.pt'
    torch.save(data, path)
    data_names.append(meta_id)
    foll_ids.append(id)
    paths.append(path)

df_info = pd.DataFrame({'Dataset': data_names, 'Foll': foll_ids, 'Path': paths,})
df_info.to_csv(csv_path)

In [12]:
# Keep only the final path component (the .pt file name) of every saved graph.
# Path(...).name does not depend on which separator str(path) happens to use,
# unlike splitting the string on a literal backslash.
df_info['fileName'] = df_info['Path'].map(lambda x: Path(x).name)

In [13]:
# Write the current df_info to csv_path.
df_info.to_csv(csv_path)

# Lung

In [30]:
### Read clinical data
# Excel workbook with the clinical table for the lung dataset.
path = (
    r'Y:\coskun-lab\Thomas\graph\Covid IMC 2\41586_2021_3475_MOESM3_ESM.xlsx'
)
df_info = pd.read_excel(io=path)

In [31]:
# Display the first rows of df_info.
df_info.head()

Unnamed: 0,Sample ID,Disease,Disease subdivision,age,gender,race,smoker,race:Black,race:Hispanic,race:White,...,PLTpermL,Ddimer_mgperL,Ddimer_mgperL_max,WBC,Lypct,PMNpct,ESR,CRP,Procalcitonin,IL6
0,NL6699,Healthy,Healthy,75.0,Male,White,Former,False,False,True,...,,,,,,,,,,
1,NL1933A,Healthy,Healthy,74.0,Female,Hispanic,No,False,True,False,...,,,,,,,,,,
2,NL1915A,Healthy,Healthy,66.0,Male,Black,No,True,False,False,...,,,,,,,,,,
3,NL_ARCHOI114,Healthy,Healthy,,,,,False,False,False,...,,,,,,,,,,
4,FLU_205,FLU,Flu,52.0,Female,Hispanic,No,False,True,False,...,,,,,,,,,,


In [32]:
# Distinct values found in the 'Disease' column.
df_info['Disease'].unique()

array(['Healthy', 'FLU', 'ARDS', 'COVID19'], dtype=object)

In [33]:
# Load the SpatialOmics object for the lung dataset from its .hdf5 file.
data_name = 'Covid_IMC_2'
# os.path.join accepts both str and Path folders and returns a str, so this
# no longer fails with "unsupported operand type(s) for /" when
# spatial_omics_folder is a plain string.
spatial_path = os.path.join(spatial_omics_folder, f'{data_name}.hdf5')
spadata = SpatialOmics.from_h5py(spatial_path)

In [52]:
# `identifier` names the spadata.spl column that is iterated to get sample keys;
# `status` names the disease column of the clinical table.
identifier, status = 'roi', 'Disease'

In [53]:
# Maps each disease name to its integer graph label (0..3, in the order listed).
er_dict = {
    disease: code
    for code, disease in enumerate(('Healthy', 'FLU', 'ARDS', 'COVID19'))
}

In [55]:
# Display the observation table of one sample. The sample is chosen explicitly
# (first entry of spadata.spl[identifier]) instead of relying on an `id`
# variable left behind by a previously executed loop.
example_id = next(iter(spadata.spl[identifier]))
spadata.obs[str(example_id)]

Unnamed: 0,roi,sample,disease,phenotypes,acquisition_id,acquisition_date,obj_id,cluster_1.0,cluster_label,metacluster_label,meta_id,y,x
2,20200609_ARDS_1921-01,20200609_ARDS_1921,ARDS,ARDS,ARDS_1921,7.305364,2,32,"32 - Proliferating cells (Ki67+, MPOdim, Histo...",Proliferating cells,15,6.714286,701.000000
3,20200609_ARDS_1921-01,20200609_ARDS_1921,ARDS,ARDS,ARDS_1921,7.305364,3,6,06 - Fibroblasts (CollagenTypeI+),Fibroblasts,8,7.472727,121.363636
5,20200609_ARDS_1921-01,20200609_ARDS_1921,ARDS,ARDS,ARDS_1921,7.305364,5,21,"21 - Fibroblasts (CollagenTypeI+, CD56+, pSTAT...",Fibroblasts,8,7.310345,637.068966
6,20200609_ARDS_1921-01,20200609_ARDS_1921,ARDS,ARDS,ARDS_1921,7.305364,6,5,05 - Endothelial cells (CD31+),Endothelial cells,6,6.702703,185.364865
7,20200609_ARDS_1921-01,20200609_ARDS_1921,ARDS,ARDS,ARDS_1921,7.305364,7,21,"21 - Fibroblasts (CollagenTypeI+, CD56+, pSTAT...",Fibroblasts,8,6.870968,196.903226
...,...,...,...,...,...,...,...,...,...,...,...,...,...
1839,20200609_ARDS_1921-01,20200609_ARDS_1921,ARDS,ARDS,ARDS_1921,7.305364,1839,5,05 - Endothelial cells (CD31+),Endothelial cells,6,989.866667,569.733333
1841,20200609_ARDS_1921-01,20200609_ARDS_1921,ARDS,ARDS,ARDS_1921,7.305364,1841,11,"11 - CD4 T-cells (CD3+, CD4+)",CD4 T-cells,1,990.623188,986.847826
1844,20200609_ARDS_1921-01,20200609_ARDS_1921,ARDS,ARDS,ARDS_1921,7.305364,1844,11,"11 - CD4 T-cells (CD3+, CD4+)",CD4 T-cells,1,992.269841,768.396825
1845,20200609_ARDS_1921-01,20200609_ARDS_1921,ARDS,ARDS,ARDS_1921,7.305364,1845,5,05 - Endothelial cells (CD31+),Endothelial cells,6,993.096774,7.290323


In [56]:
# Export one torch_geometric graph per lung sample and index the saved files in a csv.
#
# NOTE(review): the last lines rebind `df_info` from the clinical table to the
# saved-graph index.

# Output locations. Path(...) makes the `/` joins work even when process_path
# is a plain string.
save_path = Path(process_path) / 'Lung_Covid'
pt_path = save_path / 'pt'
pt_path.mkdir(parents=True, exist_ok=True)

# Create csv for all graph info
csv_path = save_path / 'info.csv'

data_names = []
foll_ids = []
paths = []

# Polar transform to be added to the data; it does not depend on the sample,
# so it is built once outside the loop.
pos_transform = Polar()

# Loop over every sample key listed in spadata.spl[identifier].
# (The loop variable keeps its original name `id`.)
for id in spadata.spl[identifier]:
    key = str(id)
    obs = spadata.obs[key]
    # The disease of the first row is used both as label source and file-name prefix.
    meta_id = obs.iloc[0]['disease']
    try:
        er = obs.iloc[0]['disease']
        er_label = er_dict[er]
    except KeyError:
        # Skip samples whose disease value has no entry in er_dict.
        continue

    # Generate torch data
    G = spadata.G[key]['knn']
    nodes = np.array(G.nodes())
    # Positions, cell classes and features are all selected by the graph's
    # node labels so that row i of each array describes the same node.
    pos = obs[['x', 'y']].loc[nodes].to_numpy()
    classes = obs['metacluster_label'].loc[nodes].to_numpy()

    # Drops the first column and the last seven columns of the feature table.
    # NOTE(review): presumably these are non-marker columns - confirm.
    df_int = spadata.X[key].iloc[:, 1:-7]
    X = df_int.loc[nodes].values

    data = torch_geometric.utils.from_networkx(G)
    data.x = torch.tensor(X).float()
    data.pos = torch.tensor(pos).float()
    data.node_types = classes

    data.label = torch.tensor(er_label)

    data = pos_transform(data)

    # Save data and info
    path = pt_path / f'{meta_id}_{id}.pt'
    torch.save(data, path)
    data_names.append(meta_id)
    foll_ids.append(id)
    paths.append(path)

df_info = pd.DataFrame({'Dataset': data_names, 'Foll': foll_ids, 'Path': paths,})
df_info.to_csv(csv_path)

In [57]:
# Keep only the final path component (the .pt file name) of every saved graph.
# Path(...).name does not depend on which separator str(path) happens to use,
# unlike splitting the string on a literal backslash.
df_info['fileName'] = df_info['Path'].map(lambda x: Path(x).name)

In [58]:
# Write the current df_info to csv_path.
df_info.to_csv(csv_path)