In [None]:
import scanpy as sc
import os
import scanpy as sc
import pandas as pd
import pyranges as pr

import seaborn as sns
import matplotlib.pyplot as plt
import polars as pl

In [None]:
## scvi-tools-1

## Read adata

In [None]:
data_path = '/data/ceph/hdd/project/node_08/QNA/scborzoi/submission_data'

In [None]:
# Open the OneK1K expression matrix with backed='r' instead of loading it eagerly;
# it is brought fully into memory further down via adata.to_memory().
adata = sc.read(
    os.path.join(data_path, 'OneK1K_cohort_gene_expression_matrix_14_celltypes.h5ad'), backed='r') # sanger matrix

In [None]:
# Add a 'gene' column holding GeneSymbol with every space character removed.
adata.var['gene'] = adata.var.GeneSymbol.str.replace(' ', '')

In [None]:
# Keep only cells whose cell_label is neither 'Platelets' nor 'Erythrocytes'.
adata = adata[~adata.obs.cell_label.isin(['Platelets', 'Erythrocytes'])]

## Compute highly variable genes

In [17]:
adata = adata.to_memory()

In [18]:
# Observation names are split on '-': the second field, prefixed with 'pool', becomes
# the sample id; the first field, re-suffixed with '-1', becomes the per-sample barcode.
name_fields = adata.obs_names.str.split('-')
pool_field = name_fields.str[1]
barcode_field = name_fields.str[0]
adata.obs['sample'] = 'pool' + pool_field
adata.obs['barcode_sample'] = barcode_field + '-1'

In [19]:
# Flag the top 3000 highly variable genes with the 'seurat_v3' flavor, treating each
# 'sample' value as a separate batch.
# NOTE(review): scanpy documents 'seurat_v3' as expecting count data — confirm adata.X holds raw counts.
sc.pp.highly_variable_genes(adata, flavor='seurat_v3', n_top_genes=3000, batch_key='sample')

In [23]:
adata.write(os.path.join(data_path, 'onek1k_training_data', 'OneK1K_only_immune.h5ad'))

## Check which highly variable genes overlap with val/test genes

In [52]:
import pyranges as pr
import numpy as np

In [138]:
adata = sc.read(os.path.join(data_path, 'onek1k_training_data','OneK1K_only_immune.h5ad'))

In [139]:
cellxgene = sc.read(os.path.join(data_path, 'onek1k_cellxgene.h5ad'), backed='r') # cellxgene matrix

In [140]:
# Replacement table for cell-type ontology term ids: each key is rewritten to its value
# (np.nan where no replacement is given) when the table is applied to
# adata.obs['cell_type_ontology_term_id'].
# NOTE(review): presumably the keys are terms absent from the friendly-name / qtl_group
# tables and the values are the closest terms that are present — confirm.
missing_ontology_map = {
    'CL:0000895': 'CL:0000624',
    'CL:0000788': 'CL:0000236',
    'CL:0000900': 'CL:0000625',
    'CL:0000818': 'CL:0000236',
    'CL:0000815': 'CL:0002677',
    'CL:0001054': 'CL:0002057',
    'CL:0001065': np.nan,
    'CL:0000990': 'CL:0000451',
    'CL:2000001': np.nan,
    'CL:0000232': np.nan,
}

In [141]:
# Attach the cellxgene 'cell_type_ontology_term_id' column to adata.obs, matched on the obs index.
# NOTE(review): assumes both objects use the same obs names — confirm.
adata.obs = adata.obs.join(cellxgene.obs['cell_type_ontology_term_id'])

In [142]:
# Rewrite the ontology ids listed in missing_ontology_map; every other value passes through unchanged.
adata.obs['cell_type_ontology_term_id'] = adata.obs['cell_type_ontology_term_id'].apply(
    lambda term_id: missing_ontology_map.get(term_id, term_id)
)

In [143]:
friendly_names = pd.read_csv(os.path.join(data_path, 'onek1k_training_data', 'friendly_names.tsv'), sep='\t')

In [144]:
# Replace every '_' with ':' in the ontology ids.
# NOTE(review): presumably this converts 'CL_…' ids to the 'CL:…' form used in adata.obs — confirm.
friendly_names['tissue_ontology_id'] = friendly_names['tissue_ontology_id'].str.replace('_', ':')

In [145]:
tissue_ontology_mapping = pd.read_csv(os.path.join(data_path, 'onek1k_training_data', 'tissue_ontology_mapping.tsv'), sep='\t').query("study=='OneK1K'")

In [146]:
# Replace every '_' with ':' in the ontology ids, mirroring the same normalisation applied to friendly_names.
tissue_ontology_mapping['tissue_ontology_id'] = tissue_ontology_mapping['tissue_ontology_id'].str.replace('_', ':')

In [147]:
# Annotate every cell with the friendly-name table and the qtl_group table, both matched on
# the cell-type ontology id with left merges (cells without a match are kept).
# reset_index() turns the obs index into a column; set_index('barcode') assumes that column
# is called 'barcode', and .loc[adata.obs_names] restores the original cell order.
# NOTE(review): both right-hand tables carry 'tissue_ontology_id', so pandas suffixes the
# duplicated column. Only tissue_ontology_mapping is de-duplicated here — confirm that
# friendly_names has unique ids, otherwise the first merge multiplies rows.
test = adata.obs.reset_index().merge(
    friendly_names, 
    left_on='cell_type_ontology_term_id', 
    right_on='tissue_ontology_id', 
    how='left').merge(
    tissue_ontology_mapping[['qtl_group', 'tissue_ontology_id']].drop_duplicates('tissue_ontology_id'), 
    left_on='cell_type_ontology_term_id', 
    right_on='tissue_ontology_id', 
    how='left').set_index('barcode').loc[adata.obs_names]

In [149]:
adata.obs = test

In [153]:
adata.write(os.path.join(data_path, 'onek1k_training_data','OneK1K_only_immune.h5ad'))

## Write out genes

In [11]:
# Index entries of the genes flagged as highly variable in adata.var.
is_highly_variable = adata.var['highly_variable'] == 1
hv_genes = adata.var.index[is_highly_variable]

In [39]:
# Input files, all resolved relative to data_path.
# gtf_file: gene annotation (GTF); bed_file: fold-labelled sequence windows; fasta_file: genome FASTA.
# NOTE(review): fasta_file is not referenced anywhere else in the visible part of this notebook.
gtf_file = os.path.join(data_path, 'gencode.v32.annotation.sorted.gtf.gz') 
bed_file =  os.path.join(data_path, 'scooby_training_data', 'sequences.bed')
fasta_file = os.path.join(data_path, 'scooby_training_data', 'genome_human.fa')

In [42]:
# gtf file
gtf = pr.read_gtf(gtf_file)

In [43]:
genes = gtf[gtf.Feature=='gene'].df

In [15]:
# read sequences
sequences = pr.read_bed(bed_file)

In [16]:
genes = pr.PyRanges(genes)

In [17]:
# Partition the sequence windows by their fold label: 'fold4' is the validation set,
# 'fold3' is the test set, and every remaining window belongs to the training set.
in_val_fold = sequences.Name.isin(['fold4'])
in_test_fold = sequences.Name.isin(['fold3'])
sequences_val = sequences[in_val_fold]
sequences_test = sequences[in_test_fold]
sequences_train = sequences[~(in_val_fold | in_test_fold)]

In [27]:
# Assign a gene to a split when it overlaps at least one window of that split
# (invert=False keeps the overlapping genes). A gene spanning windows of several splits
# can appear in more than one set at this point.
val_genes = genes.overlap(sequences_val, invert=False)
test_genes = genes.overlap(sequences_test, invert=False)
train_genes = genes.overlap(sequences_train, invert=False)

In [28]:
# Gene counts per split, in the order validation, test, train.
for split_genes in (val_genes, test_genes, train_genes):
    print(len(split_genes))

6671
6276
44988


In [29]:
# Make the splits mutually exclusive (invert=True keeps only genes WITHOUT an overlap).
# validation genes: drop any gene that also overlaps a training window
val_genes = val_genes.overlap(sequences_train, invert=True)
# test genes: drop any gene that also overlaps a training or a validation window
test_genes = test_genes.overlap(sequences_train, invert=True).overlap(sequences_val, invert=True)
# training genes: drop any gene that also overlaps a test or a validation window
train_genes = train_genes.overlap(sequences_test, invert=True).overlap(sequences_val, invert=True)

In [30]:
# Gene counts per split after removing cross-split overlaps (validation, test, train).
for split_genes in (val_genes, test_genes, train_genes):
    print(len(split_genes))

6663
6261
44969


In [31]:
# Convert each PyRanges object to a pandas DataFrame (.df) and keep a single row per gene_id.
val_genes = val_genes.df.drop_duplicates(subset='gene_id')
test_genes = test_genes.df.drop_duplicates(subset='gene_id')
train_genes = train_genes.df.drop_duplicates(subset='gene_id')

In [32]:
# Keep only the records whose chromosome name contains the substring 'chr'.
val_genes = val_genes.loc[val_genes.Chromosome.str.contains('chr')]
test_genes = test_genes.loc[test_genes.Chromosome.str.contains('chr')]
train_genes = train_genes.loc[train_genes.Chromosome.str.contains('chr')]

In [33]:
# Gene counts per split after the chromosome filter (validation, test, train).
for split_genes in (val_genes, test_genes, train_genes):
    print(len(split_genes))

6663
6261
44969


In [38]:
# Strip the version suffix: keep only the part of each gene_id before the first '.'.
# NOTE(review): ids that differ only after the first '.' become identical here, and
# duplicates are not re-checked after this step.
val_genes = val_genes.assign(gene_id=val_genes['gene_id'].str.split('.').str[0])
test_genes = test_genes.assign(gene_id=test_genes['gene_id'].str.split('.').str[0])
train_genes = train_genes.assign(gene_id=train_genes['gene_id'].str.split('.').str[0])

In [39]:
# Keep only the genes whose gene_id also appears in the adata.var index.
adata_gene_ids = adata.var.index
val_genes = val_genes[val_genes.gene_id.isin(adata_gene_ids)]
test_genes = test_genes[test_genes.gene_id.isin(adata_gene_ids)]
train_genes = train_genes[train_genes.gene_id.isin(adata_gene_ids)]

In [40]:
# Gene counts per split after restricting to genes present in adata (validation, test, train).
for split_genes in (val_genes, test_genes, train_genes):
    print(len(split_genes))

3539
3129
23783


In [41]:
# Per split, the subset of genes whose gene_id is among the highly variable genes (hv_genes).
# The full per-split tables (val_genes, test_genes, train_genes) are left unchanged.
val_genes_hv = val_genes[val_genes.gene_id.isin(hv_genes)]
test_genes_hv = test_genes[test_genes.gene_id.isin(hv_genes)]
train_genes_hv = train_genes[train_genes.gene_id.isin(hv_genes)]

In [42]:
# Highly variable gene counts per split (validation, test, train).
for split_genes_hv in (val_genes_hv, test_genes_hv, train_genes_hv):
    print(len(split_genes_hv))

324
273
2272


In [44]:
# Persist the per-split gene tables as CSV (DataFrame index included) under onek1k_training_data.
val_genes.to_csv(os.path.join(data_path, 'onek1k_training_data', 'val_genes.csv'))
test_genes.to_csv(os.path.join(data_path, 'onek1k_training_data', 'test_genes.csv'))
train_genes.to_csv(os.path.join(data_path, 'onek1k_training_data', 'train_genes.csv'))

In [45]:
# Persist the highly-variable subsets of each split as CSV (DataFrame index included).
val_genes_hv.to_csv(os.path.join(data_path, 'onek1k_training_data', 'val_genes_hv.csv'))
test_genes_hv.to_csv(os.path.join(data_path, 'onek1k_training_data', 'test_genes_hv.csv'))
train_genes_hv.to_csv(os.path.join(data_path, 'onek1k_training_data', 'train_genes_hv.csv'))

### Write gene sequences

In [13]:
from scooby.utils.transcriptome import Transcriptome

In [14]:
# Reload the per-split gene tables from CSV; index_col=0 restores the first CSV column as the index.
val_genes = pd.read_csv(os.path.join(data_path, 'onek1k_training_data', 'val_genes.csv'), index_col=0)
test_genes = pd.read_csv(os.path.join(data_path, 'onek1k_training_data', 'test_genes.csv'), index_col=0)
train_genes = pd.read_csv(os.path.join(data_path, 'onek1k_training_data', 'train_genes.csv'), index_col=0)

In [15]:
transcriptome = Transcriptome(gtf_file, use_geneid=True)

In [17]:
context_length = 524288

In [18]:
def get_gene_span(transcriptome, gene_name, context_length=524288, seq_out_len=6144*32):
    """Return the fixed-length output window centred on a gene.

    The window is positioned as if an input context of ``context_length`` bp were
    centred on the midpoint of the gene span and then cropped symmetrically to its
    central ``seq_out_len`` bp.

    Parameters
    ----------
    transcriptome
        Object exposing ``genes[gene_name]``; the gene must provide ``span()``
        (start and end as its first two items), ``chrom`` and ``strand``.
    gene_name
        Key used to look the gene up in ``transcriptome.genes``.
    context_length
        Length in bp of the full input context. Defaults to 524288.
    seq_out_len
        Length in bp of the returned window. Defaults to ``6144 * 32``.

    Returns
    -------
    dict
        ``{'Chromosome', 'Start', 'End', 'gene_name', 'Strand'}`` with ``Start``
        and ``End`` as Python ints and ``End - Start == seq_out_len``.

    Raises
    ------
    KeyError
        If ``transcriptome.genes`` is a mapping without ``gene_name``.
    """
    gene = transcriptome.genes[gene_name]
    # Query the span once instead of three times.
    span = gene.span()
    span_start, span_end = span[0], span[1]
    center_pos = span_start + (span_end - span_start) // 2  # midpoint of the gene span
    context_start = center_pos - context_length // 2
    # Integer arithmetic throughout: no float round-trip before the int() casts.
    seq_out_start = context_start + (context_length - seq_out_len) // 2
    # NOTE: Start is not clamped, so it can be negative for genes close to the
    # beginning of a chromosome.
    return {
        'Chromosome': gene.chrom,
        'Start': int(seq_out_start),
        'End': int(seq_out_start + seq_out_len),
        'gene_name': gene_name,
        'Strand': gene.strand,
    }

In [21]:
# One output window per validation gene, converted to polars and written as a
# headerless tab-separated file.
val_windows = [get_gene_span(transcriptome, gene_id) for gene_id in val_genes.gene_id]
val_bed_df = pl.DataFrame(pd.DataFrame(val_windows))

print(val_bed_df.shape)
val_bed_df.write_csv(
    os.path.join(data_path,'onek1k_training_data', 'val_gene_sequences.csv'),
    separator="\t",
    include_header=False,
)

(3539, 5)


In [22]:
# One output window per test gene, converted to polars and written as a
# headerless tab-separated file.
test_windows = [get_gene_span(transcriptome, gene_id) for gene_id in test_genes.gene_id]
test_bed_df = pl.DataFrame(pd.DataFrame(test_windows))
print(test_bed_df.shape)
test_bed_df.write_csv(
    os.path.join(data_path,'onek1k_training_data', 'test_gene_sequences.csv'),
    separator="\t",
    include_header=False,
)

(3129, 5)


In [23]:
# One output window per training gene, converted to polars and written as a
# headerless tab-separated file.
train_windows = [get_gene_span(transcriptome, gene_id) for gene_id in train_genes.gene_id]
train_bed_df = pl.DataFrame(pd.DataFrame(train_windows))
print(train_bed_df.shape)
train_bed_df.write_csv(
    os.path.join(data_path,'onek1k_training_data', 'train_gene_sequences.csv'),
    separator="\t",
    include_header=False,
)

(23783, 5)


In [24]:
all_bed_df = pl.concat([train_bed_df, val_bed_df, test_bed_df])

In [25]:
all_bed_df.write_csv(os.path.join(data_path,'onek1k_training_data', 'train_val_test_gene_sequences.csv'), separator="\t", include_header=False)

### Write highly variable (hv) gene sequences

In [26]:
# Reload the highly-variable gene tables of each split; index_col=0 restores the first CSV column as the index.
val_genes_hv = pd.read_csv(os.path.join(data_path, 'onek1k_training_data', 'val_genes_hv.csv'), index_col=0)
test_genes_hv = pd.read_csv(os.path.join(data_path, 'onek1k_training_data', 'test_genes_hv.csv'), index_col=0)
train_genes_hv = pd.read_csv(os.path.join(data_path, 'onek1k_training_data', 'train_genes_hv.csv'), index_col=0)

In [27]:
# One output window per highly variable validation gene, collected into a pandas frame.
val_hv_windows = [get_gene_span(transcriptome, gene_id) for gene_id in val_genes_hv.gene_id]
val_bed_df_hv = pd.DataFrame(val_hv_windows)

In [28]:
val_bed_df_hv = pl.DataFrame(val_bed_df_hv)

In [29]:
print(val_bed_df_hv.shape)
val_bed_df_hv.write_csv(os.path.join(data_path,'onek1k_training_data', 'val_gene_hv_sequences.csv'), separator="\t", include_header=False)

(324, 5)


In [30]:
# One output window per highly variable test gene, collected into a pandas frame.
test_hv_windows = [get_gene_span(transcriptome, gene_id) for gene_id in test_genes_hv.gene_id]
test_bed_df_hv = pd.DataFrame(test_hv_windows)
print(test_bed_df_hv.shape)

(273, 5)


In [31]:
test_bed_df_hv = pl.DataFrame(test_bed_df_hv)

In [32]:
test_bed_df_hv.write_csv(os.path.join(data_path,'onek1k_training_data', 'test_gene_hv_sequences.csv'), separator="\t", include_header=False)

In [33]:
# One output window per highly variable training gene, collected into a pandas frame.
train_hv_windows = [get_gene_span(transcriptome, gene_id) for gene_id in train_genes_hv.gene_id]
train_bed_df_hv = pd.DataFrame(train_hv_windows)
print(train_bed_df_hv.shape)

(2272, 5)


In [34]:
train_bed_df_hv = pl.DataFrame(train_bed_df_hv)

In [35]:
train_bed_df_hv.write_csv(os.path.join(data_path,'onek1k_training_data', 'train_gene_hv_sequences.csv'), separator="\t", include_header=False)

In [36]:
train_bed_df_hv.shape

(2272, 5)

In [37]:
all_bed_df_hv = pl.concat([train_bed_df_hv, val_bed_df_hv, test_bed_df_hv])

In [38]:
all_bed_df_hv.write_csv(os.path.join(data_path,'onek1k_training_data', 'train_val_test_gene_hv_sequences.csv'), separator="\t", include_header=False)

# Run scPoli on the filtered adata

In [8]:
from scarches.models.scpoli import scPoli
import pandas as pd
import anndata as ad
import scanpy as sc
import os
import numpy as np

In [2]:
from lightning.pytorch.loggers import WandbLogger
import wandb

In [17]:
adata = sc.read(os.path.join(data_path, 'onek1k_training_data', 'OneK1K_only_immune.h5ad'))

In [18]:
val_genes = pd.read_csv(os.path.join(data_path, 'onek1k_training_data', 'val_genes.csv'), index_col=0)
test_genes = pd.read_csv(os.path.join(data_path, 'onek1k_training_data', 'test_genes.csv'), index_col=0)

In [19]:
# Exclude every gene that belongs to the validation or test gene set.
held_out_gene_ids = val_genes.gene_id.to_list() + test_genes.gene_id.to_list()
is_held_out = adata.var_names.isin(held_out_gene_ids)
adata_train = adata[:, ~is_held_out]

In [20]:
# Restrict to the genes flagged highly_variable and materialise the result with .copy()
# so adata_train is no longer a view.
adata_train = adata_train[:, adata_train.var.highly_variable].copy()

In [21]:
adata_train 

View of AnnData object with n_obs × n_vars = 1267768 × 2403
    obs: 'orig.ident', 'nCount_RNA', 'nFeature_RNA', 'pool', 'individual', 'percent.mt', 'latent', 'nCount_SCT', 'nFeature_SCT', 'cell_type', 'cell_label', 'sex', 'age', 'sample', 'barcode_sample'
    var: 'GeneSymbol', 'features', 'highly_variable', 'highly_variable_rank', 'means', 'variances', 'variances_norm', 'highly_variable_nbatches'
    uns: 'hvg'

In [48]:
wandb_logger = WandbLogger(project="onek1k", entity='lauradm')

In [44]:
# Build an scPoli model on the training genes with 'sample' as the condition key,
# 'cell_label' as the cell-type key and the 'nb' reconstruction loss.
# All other hyperparameters are left at the library defaults.
scpoli_model = scPoli(
    adata=adata_train,
    condition_keys='sample',
    cell_type_keys='cell_label',
    recon_loss='nb',
)

Embedding dictionary:
 	Num conditions: [75]
 	Embedding dim: [10]
Encoder Architecture:
	Input Layer in, out and cond: 2403 50 10
	Mean/Var Layer in/out: 50 10
Decoder Architecture:
	First Layer in, out and cond:  10 50 10
	Output Layer in/out:  50 2403 



In [49]:

# Early-stopping / learning-rate schedule passed to the scPoli trainer: monitor
# 'val_prototype_loss' (lower is better) with a patience of 20, and a separate
# lr_patience of 13 with lr_factor 0.1 when reduce_lr is enabled.
early_stopping_kwargs = {
    "early_stopping_metric": "val_prototype_loss",
    "mode": "min",
    "threshold": 0,
    "patience": 20,
    "reduce_lr": True,
    "lr_patience": 13,
    "lr_factor": 0.1,
}

# Train for 50 epochs, 40 of them pretraining, logging to Weights & Biases.
# NOTE(review): only 10 epochs follow pretraining while patience is 20 and lr_patience
# is 13 — confirm early stopping / LR reduction can actually trigger with these settings.
# NOTE(review): eta is presumably the weight of the prototype loss — confirm against the scPoli docs.
scpoli_model.train(
    n_epochs=50,
    pretraining_epochs=40,
    early_stopping_kwargs=early_stopping_kwargs,
    eta=5, logger=wandb_logger
)

INFO:scarches.trainers.scpoli.trainer:GPU available: True, GPU used: True


Initializing dataloaders
Starting training
 |████████████████████| 100.0%  - val_loss:  220.08 - val_cvae_loss:  209.78 - val_prototype_loss:   10.30 - val_labeled_loss:    2.06


In [21]:
data_path

'/data/ceph/hdd/project/node_09/semi_supervised_multispecies/Downstream/single_cell/data/onek1k_bam_files'

In [50]:
# Save the trained model under onek1k_training_data.
# NOTE(review): scArches' save() presumably treats this path as a directory — confirm the '.pkl' suffix is intended.
scpoli_model.save(f"{data_path}/onek1k_training_data/scpoli_model_default.pkl")

In [54]:
# Replace the in-memory model with one loaded from disk, attached to adata_train.
# NOTE(review): this loads 'scpoli_model.pkl', not the 'scpoli_model_default.pkl' written by
# the save cell, and the architecture printed on load (hidden size 1024, embedding dim 5)
# differs from the model trained in this notebook (hidden size 50, embedding dim 10) —
# confirm this is the intended checkpoint.
scpoli_model = scpoli_model.load(f"{data_path}/onek1k_training_data/scpoli_model.pkl", adata=adata_train)

  model_state_dict = torch.load(model_path, map_location=map_location)


AnnData object with n_obs × n_vars = 1267768 × 2403
    obs: 'orig.ident', 'nCount_RNA', 'nFeature_RNA', 'pool', 'individual', 'percent.mt', 'latent', 'nCount_SCT', 'nFeature_SCT', 'cell_type', 'cell_label', 'sex', 'age', 'sample', 'barcode_sample', 'conditions_combined'
    var: 'GeneSymbol', 'features', 'highly_variable', 'highly_variable_rank', 'means', 'variances', 'variances_norm', 'highly_variable_nbatches'
    uns: 'hvg'
Embedding dictionary:
 	Num conditions: [75]
 	Embedding dim: [5]
Encoder Architecture:
	Input Layer in, out and cond: 2403 1024 5
	Mean/Var Layer in/out: 1024 10
Decoder Architecture:
	First Layer in, out and cond:  10 1024 5
	Output Layer in/out:  1024 2403 



In [55]:
# Embed all cells of adata_train with the loaded scPoli model.
# NOTE(review): mean=True presumably returns the posterior mean rather than a sample — confirm against the scPoli docs.
X_emb = scpoli_model.get_latent(
    adata_train,
    mean=True
)

In [31]:
adata_train.obsm['X_scpoli'] = X_emb

In [32]:
# `X_umap` is never defined in this notebook, so `np.stack(X_umap['umap'])` only worked
# with leftover kernel state and raises NameError on a fresh run.
# Compute the UMAP from the scPoli latent space instead: sc.pp.neighbors builds the
# neighbourhood graph on obsm['X_scpoli'], and sc.tl.umap stores the resulting
# coordinates in adata_train.obsm['X_umap'].
sc.pp.neighbors(adata_train, use_rep='X_scpoli')
sc.tl.umap(adata_train)

In [None]:
# sc.pp.neighbors(adata_train, use_rep='X_scpoli')

# sc.tl.umap(adata_train)

# sc.pl.umap(adata_train, color='cell_label')

# Save embedding and neighborhood graph

In [74]:
import pickle
import scipy.sparse
import pandas as pd
import numpy as np

In [70]:
# One row per cell: the scPoli latent vector (one array per row) next to its obs name.
embedding = pd.DataFrame({'embedding':list(adata_train.obsm['X_scpoli']), 'obs_names': adata_train.obs_names})

In [71]:
embedding.to_parquet(os.path.join(data_path, 'onek1k_training_data', 'embedding_no_val_genes.pq'))

In [73]:
# One row per cell: the UMAP coordinates (one array per row) next to its obs name.
# NOTE(review): unlike `embedding`, this frame is not written to disk anywhere in the visible notebook.
umap = pd.DataFrame({'umap':list(adata_train.obsm['X_umap']), 'obs_names': adata_train.obs_names})

## Write cell_type_index

In [169]:
# One record per annotated tissue label: the label plus the positional indices of its cells.
# Missing labels are removed with dropna() instead of an `is not np.nan` identity check,
# which only recognises the np.nan singleton and would let any other NaN object through.
tissue_labels = adata.obs.tissue_label
cellindex = [
    {'celltype': cell_type,
     'cellindex': list(np.where(tissue_labels == cell_type)[0])}
    for cell_type in tissue_labels.dropna().unique()
]

In [171]:
# Store the label -> cell-index table as parquet directly under data_path.
pd.DataFrame(cellindex).to_parquet(os.path.join(data_path, 'celltype_fixed.pq'))

## Write empty neighbors

In [8]:
from scipy import sparse

In [9]:
# Square n_cells x n_cells CSR matrix with no stored entries, i.e. a neighbour graph without edges.
empty_csr = sparse.csr_matrix((adata.shape[0], adata.shape[0])) 

In [10]:
sparse.save_npz(os.path.join(data_path, f"no_neighbors.npz"), empty_csr)