# Installation

In [1]:
import numpy as np
import pandas as pd
import anndata as ad

## Read in Clinical Data

In [4]:
clinical = pd.read_csv(r'clinicalData/yale_clinical_data.csv')
clinical = clinical.rename(columns= {'case_id':"patient_id"})

nsclc_predtab        = pd.read_csv(r'clinicalData/yale_results_predictions.csv')
nsclc_probabalities  = pd.read_csv(r'clinicalData/yale_results_probabilities.csv')

In [5]:
len(clinical['patient_id'].unique())

195

In [6]:
set(nsclc_predtab['cohort'])

{404, 471, 496, 523}

In [7]:
nsclc_pred     = nsclc_predtab[nsclc_predtab.cohort.isin([404, 471])].reset_index(drop=True)
nsclc_probs    = nsclc_probabalities[nsclc_probabalities.cohort.isin([404, 471])].reset_index(drop=True)
nsclc_clinical = clinical[clinical.cohort.isin([404, 471])].reset_index(drop=True)

In [8]:
clinical.keys()

Index(['Unnamed: 0', 'slide_id', 'inclusion', 'patient_id', 'file_name',
       'cohort', 'tumor_type', 'pre_post', 'histology_type', 'biopsy_site',
       'BOR', 'OR', 'CB6', 'PORvsFPD', 'pfs.T', 'pfs.E', 'os.T', 'os.E',
       'pfs.6months', 'pfs.12months', 'OS.18months', 'OS.24months'],
      dtype='object')

In [9]:
# Cohorts in the dataset
len(set(nsclc_clinical['patient_id']))

117

In [15]:
# Import cell morphology measures
nsclc_quant  = pd.read_csv(r'clinicalData/yale_nsclc_quantification_tables_6_6_2024_filtered.csv')

renamedict = {d: d.replace('_', '') for d in nsclc_quant.columns}
nsclc_quant = nsclc_quant.rename(columns=renamedict)

In [17]:
# Import patient metadata
nsclc_pfeat = pd.read_csv(r'clinicalData/yale nsclc patient features v05 filtered.csv')

  nsclc_pfeat = pd.read_csv(r'clinicalData/yale nsclc patient features v05 filtered.csv')


In [18]:
# Overall neighborhood features
nsclc_pneigh = pd.read_csv(r'clinicalData/yale nsclc patient neighbor features v05 filtered.csv')

## Change Cell Type Names

In [23]:
renamedict = {d: d.replace('_', '') for d in nsclc_pred.columns}
nsclc_pred = nsclc_pred.rename(columns=renamedict)

renamedict = {d: d.replace('_', '') for d in nsclc_probs.columns}
nsclc_probs = nsclc_probs.rename(columns=renamedict)

renamedict = {d: d.replace('_', '') for d in nsclc_clinical.columns}
nsclc_clinical = nsclc_clinical.rename(columns=renamedict)

## Reassign UUIDs to Match the Measurement Files

In [24]:
nsclc_quant_temp = []

for _, pat in nsclc_quant.groupby(by='slideid'):
    pat['slideid'] = _
    pat['uuid'] = pat['slideid'] + '_' + np.array(list(range(1, len(pat)+1)), dtype=str)
    nsclc_quant_temp.append(pat)

nsclc_quant = pd.concat(nsclc_quant_temp)

## Merging them together

In [25]:
# clean cols
featurecolstokeep = ['patient_id', 'feature_name', 'feature_value', 'n', 'feature_family']

nsclc_pfeat = nsclc_pfeat[featurecolstokeep]
nsclc_pneigh = nsclc_pneigh[featurecolstokeep]

# Rename columns
qmetacols = ['nuclear_area_um2', 'membrane_area_um2', 'nuclear_eccentricity', 'membrane_eccentricity',
             'nuclear_perimeter', 'membrane_perimeter', 'nuclear_solidity', 'membrane_solidity', 
             'nuclear_major_axis_length', 'membrane_major_axis_length', 'nuclear_minor_axis_length', 
             'membrane_minor_axis_length', 'nuclear_feret_diameter_max', 'membrane_feret_diameter_max']
qmetacols = [a.replace('_', '') for a in qmetacols]
qcols =  ['cd45ro_nuclear', 'hlaa_nuclear', 'cd4_nuclear', 'ecadherin_nuclear', 'cd20_nuclear', 'cd68_nuclear', 'cd8_nuclear', 'cd14_nuclear', 'cd11c_nuclear', 'cd44_nuclear', 'vimentin_nuclear', 'cd45_nuclear', 'granzymeb_nuclear', 'cd34_nuclear', 'cd3e_nuclear', 'lag3_nuclear', 'cd31_nuclear', 'pdl1_nuclear', 'icos_nuclear', 'nakatpase_nuclear', 'foxp3_nuclear', 'cd19_nuclear', 'cd163_nuclear', 'pax5_nuclear', 'g6pd_nuclear', 'pd1_nuclear', 'cd21_nuclear', 'sma_nuclear', 'cd11b_nuclear', 'ido1_nuclear', 'pnrf2_nuclear', 'cd57_nuclear', 'col4_nuclear', 'atpa5_nuclear', 'citratesynthase_nuclear', 'asct2_nuclear', 'sdha_nuclear', 'hexokinase1_nuclear', 'idh2_nuclear', 'glut1_nuclear', 'ldha_nuclear', 'ki67_nuclear', 'pancytokeratin_nuclear', 'cpt1a_nuclear', 'cd45ro_membrane', 'hlaa_membrane', 'cd4_membrane', 'ecadherin_membrane', 'cd20_membrane', 'cd68_membrane', 'cd8_membrane', 'cd14_membrane', 'cd11c_membrane', 'cd44_membrane', 'vimentin_membrane', 'cd45_membrane', 'granzymeb_membrane', 'cd34_membrane', 'cd3e_membrane', 'lag3_membrane', 'cd31_membrane', 'pdl1_membrane', 'icos_membrane', 'nakatpase_membrane', 'foxp3_membrane', 'cd19_membrane', 'cd163_membrane', 'pax5_membrane', 'g6pd_membrane', 'pd1_membrane', 'cd21_membrane', 'sma_membrane', 'cd11b_membrane', 'ido1_membrane', 'pnrf2_membrane', 'cd57_membrane', 'col4_membrane', 'atpa5_membrane', 'citratesynthase_membrane', 'asct2_membrane', 'sdha_membrane', 'hexokinase1_membrane', 'idh2_membrane', 'glut1_membrane', 'ldha_membrane', 'ki67_membrane', 'pancytokeratin_membrane', 'cpt1a_membrane']
qcols = [a.replace('_', '') for a in qcols]

#select NSCLC samples
nsclcq = nsclc_quant[qcols]
nsclcqmeta= nsclc_quant[qmetacols]

## Merge on  UUIDs

In [26]:
def removemod(x):
   x = x.replace('nuclear','')
   return x

In [27]:
nucs = (list(nsclcq.filter(like = 'nuclear', axis=1 )))
mems = (list(nsclcq.filter(like = 'membrane', axis=1 )))
col_pairs = list(map(list, zip(nucs, mems)))

# in a list comprehension iterate through each column pair, get the mean, and concat the results into a dataframe
nsclcq_means = pd.concat([nsclcq[pairs].mean(axis=1) for pairs in col_pairs], axis=1)

# in a list comprehension create column header names with a string join 
nsclcq_means.columns = nucs#[' & '.join(pair) for pair in col_pairs]
nsclcq_means.rename(columns=removemod, inplace=True)

In [28]:
combined_nsclc = nsclc_pred.merge(nsclc_quant, left_on='uuid', right_on='uuid')
nsclc_probs.columns = [str(col) + '_prob' if col != 'uuid' else str(col) for col in nsclc_probs.columns ]
combined_nsclc = combined_nsclc.merge(nsclc_probs, left_on='uuid', right_on='uuid')

## Generate Marker Columns

In [29]:
nsclc = pd.DataFrame()
col_ids = ['asct2', 'asct2_high','atpa5', 'atpa5_high', 'cd11b', 'cd11c', 'cd14', 'cd163', 'cd19','cd20', 'cd21', 'cd31', 'cd34', 'cd3e', 'cd4', 'cd44', 'cd44_high',
       'cd45', 'cd45ro', 'cd57', 'cd68', 'cd8', 'citratesynthase', 'citratesynthase_high', 'col4', 'cpt1a', 'cpt1a_high', 'ecadherin','foxp3', 'g6pd', 'g6pd_high', 'glut1', 'glut1_high', 'granzymeb',
       'hexokinase1', 'hexokinase1_high', 'hlaa', 'hlaa_high', 'icos', 'idh2', 'idh2_high', 'ido1', 'ido1_high', 'ki67', 'lag3', 'ldha', 'ldha_high',
       'nakatpase', 'pancytokeratin', 'pax5', 'pd1', 'pdl1', 'pdl1_high', 'pnrf2', 'pnrf2_high', 'sdha', 'sdha_high', 'sma', 'vimentin']
col_ids = [a.replace('_', '') for a in col_ids]
nsclc.X = combined_nsclc[col_ids + [col+'_prob' for col in col_ids]].copy()

nsclc.meta = combined_nsclc[['slideid_x', 'uuid', 'X', 'Y', 'celltypes_x', 'newcelltypes', 'clusterid_x', 'neighborhoods_x', 'arearegion_x']].copy()

nsclc.meta = nsclc.meta.rename(columns= {'X':"x", 'Y':"y", 'slideid_x': 'slideid', 'celltypes_x': 'celltypes', 'clusterid_x': 'clusterid', 'neighborhoods_x': 'neighborhoods', 'arearegion_x': 'arearegion'})
nsclc.meta = nsclc.meta.merge(nsclc_clinical, on='slideid').reset_index(drop=True)

adata = ad.AnnData(nsclcq_means)
adata.obs = nsclc.meta

spatial=pd.DataFrame(adata.obs[['x','y']])
spatial=spatial.to_numpy()
adata.obsm["spatial"]=spatial

adata.uns['nsclc_frac_features'], adata.uns['nsclc_neighbour_features'] = nsclc_pfeat, nsclc_pneigh
adata.obs['Tissue'] = 'NSCLC'
adata.obsm['marker_positivity'] = nsclc.X

adata.obsm['marker_positivity'].index = adata.obsm['marker_positivity'].index.astype('str')
adata.obs.index = adata.obs.index.astype('str')
nsclcadata = adata
nsclcadata

  nsclc.X = combined_nsclc[col_ids + [col+'_prob' for col in col_ids]].copy()
  nsclc.meta = combined_nsclc[['slideid_x', 'uuid', 'X', 'Y', 'celltypes_x', 'newcelltypes', 'clusterid_x', 'neighborhoods_x', 'arearegion_x']].copy()


AnnData object with n_obs × n_vars = 272027 × 44
    obs: 'slideid', 'uuid', 'x', 'y', 'celltypes', 'newcelltypes', 'clusterid', 'neighborhoods', 'arearegion', 'Unnamed: 0', 'inclusion', 'patientid', 'filename', 'cohort', 'tumortype', 'prepost', 'histologytype', 'biopsysite', 'BOR', 'OR', 'CB6', 'PORvsFPD', 'pfs.T', 'pfs.E', 'os.T', 'os.E', 'pfs.6months', 'pfs.12months', 'OS.18months', 'OS.24months', 'Tissue'
    uns: 'nsclc_frac_features', 'nsclc_neighbour_features'
    obsm: 'spatial', 'marker_positivity'

## Filter the Histologies

In [30]:
rmap = {'histologytype': {'Adenocarcinoma' :'Adenocarcinoma',
 'Adenosquamous':'Adenosquamous',
 'Large-cell carcinoma':'LCC',
 'NOS':'NOS',
 'SCLC':'Squamous',
 'Squamous':'Squamous'}}

nsclcadata.obs.replace(rmap, inplace=True)

nsclcadata = nsclcadata[nsclcadata.obs.histologytype.isin(['Adenocarcinoma', 'Adenosquamous', 'Squamous'])]

In [31]:
nsclcadata

View of AnnData object with n_obs × n_vars = 189854 × 44
    obs: 'slideid', 'uuid', 'x', 'y', 'celltypes', 'newcelltypes', 'clusterid', 'neighborhoods', 'arearegion', 'Unnamed: 0', 'inclusion', 'patientid', 'filename', 'cohort', 'tumortype', 'prepost', 'histologytype', 'biopsysite', 'BOR', 'OR', 'CB6', 'PORvsFPD', 'pfs.T', 'pfs.E', 'os.T', 'os.E', 'pfs.6months', 'pfs.12months', 'OS.18months', 'OS.24months', 'Tissue'
    uns: 'nsclc_frac_features', 'nsclc_neighbour_features'
    obsm: 'spatial', 'marker_positivity'

In [34]:
len(nsclcadata.obs['patientid'].unique())

82

## Convert Cell Type Labels

In [35]:
nsclcadata.obs['celltypes'].value_counts()

celltypes
tumor_cells          78655
macrophages          26488
cd4_t_cells          16302
fibroblasts          12382
myofibroblasts       11872
cd8_t_cells           8614
endothilial_cells     6893
immune_nos            5666
other                 5636
granulocytes          4781
cd4_tregs             3661
plasma_cells          3216
b_cells               3130
myeloid_nos           2466
artifact_cells          92
Name: count, dtype: int64

In [36]:
celltypeconversion = {
     'artifact_cells': 'artifactcells',
     'b_cells': 'bcells',
     'cd4_t_cells': 'cd4tcells',
     'cd4_tregs': 'cd4tregcells',
     'cd8_t_cells': 'cd8tcells',
     'endothilial_cells': 'endothilialcells',
     'fibroblasts': 'fibroblastcells',
     'granulocytes': 'granulocytecells',
     'immune_nos': 'immunenoscells',
     'macrophages': 'macrophagecells',
     'myeloid_nos': 'myeloidnoscells',
     'myofibroblasts': 'myofibroblastcells',
     'other': 'othercells',
     'plasma_cells': 'plasmacells',
     'tumor_cells': 'tumorcells'}

nsclcadata.obs.celltypes = nsclcadata.obs.celltypes.map(celltypeconversion)

  self[name] = value


In [37]:
nsclcadata.write(r'AnnDataFiles/1-output_nsclc_ytma_raw_preandpost.h5ad')