In [1]:
id_ = 'core'
backup_dir = '/root/datos/maestria/netopaas/luca/data/atlas/'
ikarus_dir = '/root/datos/maestria/netopaas/ikarus'

In [2]:
import gdown
import anndata as ad
import pandas as pd
import os

import scanpy as sc
from ikarus import classifier, utils, data

ikarus is a stepwise machine learning pipeline that tries to cope with a task of distinguishing tumor cells from normal cells. Leveraging multiple annotated single cell datasets it can be used to define a gene set specific to tumor cells. First, the latter gene set is used to rank cells and then to train a logistic classifier for the robust classification of tumor and normal cells. Finally, sensitivity is increased by propagating the cell labels based on a custom cell-cell network. 

In [3]:
url = "https://raw.githubusercontent.com/BIMSBbioinfo/ikarus/master/tutorials/out/signatures.gmt"
signatures_path = f"{ikarus_dir}/signatures.gmt"
gdown.download(url, signatures_path, quiet=False) if not os.path.exists(signatures_path) else None
print(pd.read_csv(signatures_path, sep="\t", header=None))

model_path = f"{ikarus_dir}/core_model.joblib"

     0       1               2             3          4       5       6     \
0  Normal  ikarus    RP11-128M1.1       TRAV8-2  PTPRD-AS1   MEOX2  CXCL12   
1   Tumor  ikarus  RP11-277P12.10  RP13-895J2.6       BNC1  MAGEA6     ISX   

          7              8       9     ...      1305     1306 1307    1308  \
0  KLRC4-KLRK1          BCAS1  SCNN1A  ...  C22ORF15  CYP4F11  AK8  LRRC18   
1       MAGEA3  RP13-614K11.2    CDH7  ...       NaN      NaN  NaN     NaN   

   1309     1310    1311   1312   1313     1314  
0  LMO2  COL12A1  ITGA11  EGFL6  RGS11  PCDHB15  
1   NaN      NaN     NaN    NaN    NaN      NaN  

[2 rows x 1315 columns]


In [4]:
model = classifier.Ikarus(signatures_gmt=signatures_path, out_dir="out",
                          adapt_signatures=True # This is important, we are working with a reduced gen set, 
# so the model won't work if the intesrection betwen the siganture and the avialbe genes is too small..
# that is why the non-overlapping genes mus be removed from the signate. This flag does that automatically
                         )
model.load_core_model(model_path)

In [21]:
adata = sc.read_h5ad(f'{backup_dir}/{id_}.h5ad')
adata

AnnData object with n_obs × n_vars = 892296 × 17811
    obs: 'sample', 'uicc_stage', 'ever_smoker', 'age', 'donor_id', 'origin', 'dataset', 'ann_fine', 'cell_type_predicted', 'doublet_status', 'leiden', 'n_genes_by_counts', 'total_counts', 'total_counts_mito', 'pct_counts_mito', 'ann_coarse', 'cell_type_tumor', 'tumor_stage', 'EGFR_mutation', 'TP53_mutation', 'ALK_mutation', 'BRAF_mutation', 'ERBB2_mutation', 'KRAS_mutation', 'ROS_mutation', 'origin_fine', 'study', 'platform', 'cell_type_major', 'suspension_type', 'assay_ontology_term_id', 'cell_type_ontology_term_id', 'development_stage_ontology_term_id', 'disease_ontology_term_id', 'self_reported_ethnicity_ontology_term_id', 'is_primary_data', 'organism_ontology_term_id', 'sex_ontology_term_id', 'tissue_ontology_term_id', 'tissue_type', 'cell_type', 'assay', 'disease', 'organism', 'sex', 'tissue', 'self_reported_ethnicity', 'development_stage', 'observation_joinid'
    var: 'is_highly_variable', 'mito', 'n_cells_by_counts', 'mean_cou

In [22]:
adata.obs.origin

001C_AAACCTGCATCGGGTC-0           normal
001C_AAACCTGTCAACACCA-0           normal
001C_AAACGGGAGACTAAGT-0           normal
001C_AAACGGGAGGCTCATT-0           normal
001C_AAACGGGAGGGAACGG-0           normal
                               ...      
bcBOGU_25-21               tumor_primary
bcHDOT_25-21               tumor_primary
bcIFTF_25-21               tumor_primary
bcBDLT_25-21               tumor_primary
bcBNMZ_25-21               tumor_primary
Name: origin, Length: 892296, dtype: category
Categories (5, object): ['nan', 'normal', 'normal_adjacent', 'tumor_metastasis', 'tumor_primary']

In [6]:
adata_tumor = adata[adata.obs.origin == 'tumor_primary']

In [7]:
adata_tumor.var.index = adata_tumor.var.feature_name
# The predict function works with de gene_symbol column so assign it
adata_tumor.var['gene_symbol'] = adata_tumor.var.index

  adata_tumor.var['gene_symbol'] = adata_tumor.var.index


In [11]:
from scipy.sparse import save_npz, load_npz

conn_path = 'tumor.npz'
# save_npz(conn_path, adata_tumor.obsp['connectivities'])

In [8]:
del adata

In [10]:
# adata = data.preprocess_adata(adata)

In [11]:
adata_tumor.obs.origin

AAACCCAAGAGCCATG-1_0-1    tumor_primary
AAACCCAAGATTAGAC-1_0-1    tumor_primary
AAACCCAAGGTCGCCT-1_0-1    tumor_primary
AAACCCAAGTCATTGC-1_0-1    tumor_primary
AAACCCACACGGATCC-1_0-1    tumor_primary
                              ...      
bcBOGU_25-21              tumor_primary
bcHDOT_25-21              tumor_primary
bcIFTF_25-21              tumor_primary
bcBDLT_25-21              tumor_primary
bcBNMZ_25-21              tumor_primary
Name: origin, Length: 378360, dtype: category
Categories (1, object): ['tumor_primary']

In [9]:
from pympler import asizeof
asizeof.asizeof(adata_tumor.obsp['connectivities'])/(1024**2)


43.550621032714844

In [12]:
# With around 800mil cells it needs ~ 2 TB, maybe if we use the connectivities from scArches
# We edited the package to use sparse dot product from scipy
_ = model.predict(adata_tumor, "tumor",
                  connectivities_path=conn_path, # If we use the connectivites from scVI the number of predicted cells changes a lot
                  # save=True
                 )

Less than 80% of signature genes are available in data set. A temporary signature is stored where non-overlapping genes are removed. It is proceeded with the temporary signature.


In [13]:
preds = model.results["final_pred"].values
preds

array(['Normal', 'Tumor', 'Normal', ..., 'Normal', 'Normal', 'Normal'],
      dtype=object)

In [20]:
preds_df = pd.DataFrame({'final_pred':preds})
preds_df.index = adata_tumor.obs.index
preds_df.to_csv(f'{ikarus_dir}/{id_}_core.csv')

In [19]:

(preds == 'Tumor').sum()/preds.shape

array([0.17421239])