In [9]:
filters_path = '../metadata/dsets.csv'
dataset_str = 'Zuani_2024_NSCLC'
adatas_path = '/root/datos/maestria/netopaas/luca/data/'


data_dir = '/root/datos/maestria/netopaas/lung_scRNA'
backup_dir = '/root/datos/maestria/netopaas/luca_explore'
ref_model_path = f'{data_dir}/HCA_Lung/HLCA_reference_model'

In [88]:
import gdown
import gzip
import shutil

import pandas as pd
import numpy as np
import seaborn as sns
from scipy import sparse

import anndata as ad
import scanpy as sc

sc.settings.set_figure_params(dpi=200, frameon=False)
sc.set_figure_params(dpi=200)
sc.set_figure_params(figsize=(4, 4))

import scarches as sca

import sys, os
sys.path.append(os.path.join(os.getcwd(), '../utils'))
from functions import remove_repeated_var_inds, join_map_mart

In [11]:
filters = pd.read_csv(filters_path)
id_row = filters[filters.id == dataset_str]
# file_str = '/'.join(id_row.input_adata.iloc[0].split('/')[1:])
file_str = id_row.input_adata.iloc[0]

thresholds = {}
thresholds['min_counts']  = int(id_row.min_counts)
thresholds['max_counts'] = int(id_row.max_counts)
thresholds['min_genes'] = int(id_row.min_genes)
thresholds['max_genes'] = int(id_row.max_genes)
thresholds['max_pct_mito'] = int(id_row.max_pct_mito)

adata = ad.read_h5ad(file_str)

  utils.warn_names_duplicates("obs")


In [None]:
#papermill_description=FILTERS

# very basic gene filtering - genes with 0 cells cause some downstream processes to fail.
print("Filtering genes")
print(f"    Before: {adata.shape[1]}")
sc.pp.filter_genes(adata, min_counts=2)
print(f"    After: {adata.shape[1]}")

# %%
# Apply thresholds
print("Filter by min_counts")
print(f"    Before: {adata.shape[0]}")
sc.pp.filter_cells(adata, min_counts=thresholds["min_counts"])
print(f"    After: {adata.shape[0]}")


print("Filter by max_counts")
print(f"    Before: {adata.shape[0]}")
sc.pp.filter_cells(adata, max_counts=thresholds["max_counts"])
print(f"    After: {adata.shape[0]}")


print("Filter by min_genes")
print(f"    Before: {adata.shape[0]}")
sc.pp.filter_cells(adata, min_genes=thresholds["min_genes"])
print(f"    After: {adata.shape[0]}")


print("Filter by max_genes")
print(f"    Before: {adata.shape[0]}")
sc.pp.filter_cells(adata, max_genes=thresholds["max_genes"])
print(f"    After: {adata.shape[0]}")

# %%
if "mito" not in adata.var.columns:
    adata.var["mito"] = adata.var_names.str.lower().str.startswith("mt-")

# %%
sc.pp.calculate_qc_metrics(
    adata, qc_vars=("mito",), log1p=False, inplace=True, percent_top=None
)

print("Filter by max_pct_mito")
print(f"    Before: {adata.shape[0]}")
adata = adata[adata.obs["pct_counts_mito"] < thresholds["max_pct_mito"]].copy()
print(f"    After: {adata.shape[0]}")

Filtering genes
    Before: 33538


  utils.warn_names_duplicates("obs")


    After: 29955
Filter by min_counts
    Before: 312502


  utils.warn_names_duplicates("obs")


    After: 270340
Filter by max_counts
    Before: 270340


In [None]:
adata.write_h5ad(f'{backup_dir}/surgeries/filtered_{dataset_str}.h5ad')

In [None]:
p1 = sns.displot(adata.obs["total_counts"], bins=100, kde=False)

In [1]:
import torch
torch.cuda.is_available()

False

In [18]:
import rpy2.rinterface_lib.callbacks
import logging
from rpy2.robjects import pandas2ri
import rpy2.robjects as robjects

# Set the graphics device
robjects.r('options(bitmapType="cairo")')
import anndata2ri

# Ignore R warning messages
#Note: this can be commented out to get more verbose R output
rpy2.rinterface_lib.callbacks.logger.setLevel(logging.ERROR)

# Automatically convert rpy2 outputs to pandas dataframes
pandas2ri.activate()
anndata2ri.activate()
%load_ext rpy2.ipython

  anndata2ri.activate()


In [16]:
gene_names = list(adata.var.index)

['MIR1302-2HG',
 'AL627309.1',
 'AL627309.3',
 'AL627309.4',
 'AL732372.1',
 'AC114498.1',
 'AL669831.2',
 'AL669831.5',
 'FAM87B',
 'LINC00115',
 'FAM41C',
 'AL645608.7',
 'AL645608.3',
 'AL645608.5',
 'AL645608.1',
 'SAMD11',
 'NOC2L',
 'KLHL17',
 'PLEKHN1',
 'PERM1',
 'AL645608.8',
 'HES4',
 'ISG15',
 'AL645608.2',
 'AGRN',
 'AL645608.9',
 'RNF223',
 'C1orf159',
 'LINC01342',
 'AL390719.2',
 'TTLL10-AS1',
 'TTLL10',
 'TNFRSF18',
 'TNFRSF4',
 'SDF4',
 'B3GALT6',
 'C1QTNF12',
 'AL162741.1',
 'UBE2J2',
 'LINC01786',
 'SCNN1D',
 'ACAP3',
 'PUSL1',
 'INTS11',
 'CPTP',
 'TAS1R3',
 'DVL1',
 'MXRA8',
 'AURKAIP1',
 'CCNL2',
 'MRPL20',
 'AL391244.3',
 'ANKRD65',
 'AL391244.2',
 'TMEM88B',
 'LINC01770',
 'VWA1',
 'ATAD3C',
 'ATAD3B',
 'ATAD3A',
 'TMEM240',
 'SSU72',
 'AL645728.1',
 'FNDC10',
 'AL691432.2',
 'MIB2',
 'MMP23B',
 'CDK11B',
 'FO704657.1',
 'SLC35E2B',
 'CDK11A',
 'SLC35E2A',
 'NADK',
 'GNB1',
 'AL109917.1',
 'CALML6',
 'TMEM52',
 'CFAP74',
 'AL391845.2',
 'GABRD',
 'AL391845.1',
 

In [46]:
%%R
library(biomaRt)
ensembl38 <- useEnsembl(biomart = "genes", dataset = "hsapiens_gene_ensembl",mirror = "useast"
                        # version='GRCh37'
                       )
# filters = listFilters(ensembl)
# print(filters[1:10,])

In [47]:
%%R -i gene_names -o ens38
# extids <- c("AL627309.1","MIR1302-2HG","ENSG00000243485")
extids <- gene_names
ens38 <- getBM(attributes = c('ensembl_gene_id','external_gene_name'),
      filters = 'external_gene_name',
      values = extids, 
      mart = ensembl38)
# ens37

In [38]:
adata.var['symbol'] = adata.var.index

In [96]:
miau = pd.merge(adata.var.loc[:,['symbol']], ens37, left_on='symbol', right_on='external_gene_name', how='left')
miau = miau.drop_duplicates('symbol')
miau.index = miau.symbol
adata.var['ensembl_gene_id1'] = miau['ensembl_gene_id'].copy()

In [95]:
miau = pd.merge(adata.var.loc[:,['symbol']], ens38, left_on='symbol', right_on='external_gene_name', how='left')
miau = miau.drop_duplicates('symbol')
miau.index = miau.symbol
adata.var['ensembl_gene_id2'] = miau['ensembl_gene_id'].copy()

In [97]:
adata.var['ensembl_gene_id'] = adata.var['ensembl_gene_id2'].combine_first(adata.var['ensembl_gene_id1'])

In [98]:
adata.var['ensembl_gene_id1'].isna().sum()

9515

In [99]:
adata.var['ensembl_gene_id2'].isna().sum()

8990

In [100]:
zuani_ensembl = adata.var.loc[:,['symbol', 'ensembl_gene_id']]
zuani_ensembl.to_csv('/root/datos/maestria/netopaas/zuani_ensembl.csv')

In [87]:
zuani_ensembl

Unnamed: 0,symbol,external_gene_name
MIR1302-2HG,MIR1302-2HG,MIR1302-2HG
AL627309.1,AL627309.1,AL627309.1
AL627309.3,AL627309.3,
AL627309.4,AL627309.4,
AL732372.1,AL732372.1,AL732372.1
...,...,...
AC004556.1,AC004556.1,
AC233755.2,AC233755.2,
AC233755.1,AC233755.1,
AC240274.1,AC240274.1,


In [103]:
adata.var_names

Index(['MIR1302-2HG', 'AL627309.1', 'AL627309.3', 'AL627309.4', 'AL732372.1',
       'AC114498.1', 'AL669831.2', 'AL669831.5', 'FAM87B', 'LINC00115',
       ...
       'AC007325.1', 'AC007325.4', 'AC007325.2', 'AL354822.1', 'AC023491.2',
       'AC004556.1', 'AC233755.2', 'AC233755.1', 'AC240274.1', 'FAM231C'],
      dtype='object', length=29955)

In [114]:
adata.var = adata.var.filter(regex='^(?!external).*')
adata.var = adata.var.filter(regex='^(?!ensembl).*')
adata.var.drop(columns=['symbol'], inplace=True)

In [117]:
adata.var = adata.var.loc[:, ['mito']]

In [119]:
adata.var

Unnamed: 0,mito
MIR1302-2HG,False
AL627309.1,False
AL627309.3,False
AL627309.4,False
AL732372.1,False
...,...
AC004556.1,False
AC233755.2,False
AC233755.1,False
AC240274.1,False


In [118]:
import scarches as sca

ref_path = '/root/datos/maestria/netopaas/lung_scRNA/LUCA_model/full_atlas_hvg_integrated_scvi_scanvi_model/'

adata_query = sca.models.SCANVI.prepare_query_anndata(
    adata = adata,
    # return_reference_var_names=True,
    reference_model = ref_path,
    inplace=False)

[34mINFO    [0m File                                                                                                      
         [35m/root/datos/maestria/netopaas/lung_scRNA/LUCA_model/full_atlas_hvg_integrated_scvi_scanvi_model/[0m[95mmodel.pt[0m  
         already downloaded                                                                                        
[34mINFO    [0m Found [1;36m97.43333333333334[0m% reference vars in query data.                                                    


InvalidIndexError: Reindexing only valid with uniquely valued Index objects