In [2]:
import scanpy as sc
import numpy as np
import pandas as pd

In [3]:
# Logging level: 0 = errors, 1 = warnings, 2 = info, 3 = hints.
sc.settings.verbosity = 3
sc.logging.print_version_and_date()
# A single call configures both the low inline dpi (small figures) and the
# default 12x12 figure size used by every plot below.
sc.settings.set_figure_params(dpi=80, figsize=(12, 12))

In [4]:
# `snakemake` is not defined anywhere in this notebook; presumably it is injected
# by the Snakemake rule that executes it. The first declared input is the .h5ad file.
input_h5ad = snakemake.input[0]
adata = sc.read_h5ad(input_h5ad)

In [8]:
# Sparsity filters, applied in this order: first cells by number of detected
# genes, then genes by number of (remaining) cells expressing them.
MIN_GENES_PER_CELL = 500
MIN_CELLS_PER_GENE = 5
sc.pp.filter_cells(adata, min_genes=MIN_GENES_PER_CELL)
sc.pp.filter_genes(adata, min_cells=MIN_CELLS_PER_GENE)

In [10]:
# Bind a second name to the same AnnData object. This is an alias, not a copy:
# anything done to `data` in place is also visible through `adata`.
data = adata

In [12]:
# Flag mitochondrial genes by symbol prefix and compute per-cell QC metrics.
#
# The trailing hyphen is required: a bare "MT"/"mt" prefix also matches nuclear
# genes such as MTOR, MTHFR or the MT1*/MT2* metallothioneins, which inflates
# pct_counts_mt. `na=False` keeps genes with a missing symbol unflagged; without
# it the NaN result would be turned into True by astype(bool).
# NOTE(review): assumes var["gene_symbols"] uses the "MT-" / "mt-" naming for
# mitochondrial genes -- confirm for this annotation.
MITO_PREFIXES = ('MT-', 'mt-')
data.var['mt'] = (
    data.var["gene_symbols"]
    .str.startswith(MITO_PREFIXES, na=False)
    .astype(bool)
)
# inplace=True stores the metrics on `data`; the 'mt' entry in qc_vars is what
# yields the 'pct_counts_mt' column read by the plots and filters further down.
sc.pp.calculate_qc_metrics(data, qc_vars=['mt'], percent_top=None, log1p=False, inplace=True)

In [13]:
# QC distributions per cell, before the count / mitochondrial thresholds are applied.
# NOTE(review): no visible cell creates obs['n_counts'] before this point --
# presumably it is already present in the input file; confirm.
qc_keys = ['n_genes', 'n_counts', 'pct_counts_mt']
sc.pl.violin(data, qc_keys, jitter=0.4, multi_panel=True)

In [14]:
# Plot each QC metric against per-cell counts to help choose filter thresholds.
for y_key in ('pct_counts_mt', 'n_genes'):
    sc.pl.scatter(data, x='n_counts', y=y_key)

In [15]:
# QC thresholds for the cell filter (all bounds are exclusive).
MAX_PCT_MT = 15      # upper bound on pct_counts_mt
MIN_COUNTS = 1e3     # lower bound on n_counts
MAX_COUNTS = 25e3    # upper bound on n_counts

# Combine the three conditions into one boolean mask; this selects exactly the
# cells the three successive subsets selected.
keep_cells = (
    (data.obs['pct_counts_mt'] < MAX_PCT_MT)
    & (data.obs['n_counts'] > MIN_COUNTS)
    & (data.obs['n_counts'] < MAX_COUNTS)
)
# Subsetting an AnnData returns a view. The later in-place steps (normalisation,
# log1p, ...) would otherwise modify a view of a view of a view and rely on an
# implicit copy, so materialise an independent object here.
data = data[keep_cells, :].copy()

# Same diagnostic scatter plots as before the filter, now on the retained cells.
for y_key in ('pct_counts_mt', 'n_genes'):
    sc.pl.scatter(data, x='n_counts', y=y_key)

In [16]:
# QC distributions again, after the threshold filter; `save` is passed so the
# figure is also written out with the "_post_filter.pdf" suffix.
post_filter_keys = ['n_genes', 'n_counts', 'pct_counts_mt']
sc.pl.violin(
    data,
    post_filter_keys,
    jitter=0.4,
    multi_panel=True,
    save="_post_filter.pdf",
)

In [17]:
# Bare last expression: displays the object's repr as the cell output
# (a quick look at what is left after filtering).
data

In [18]:
# Average of the per-cell 'n_counts' column over the cells currently in `data`.
mean_n_counts = data.obs["n_counts"].mean()
mean_n_counts

In [19]:
# Rescale every cell to the same total (one million counts), then plot the ten
# genes with the highest expression, labelled via the 'gene_symbols' column.
# NOTE(review): scanpy documents normalize_per_cell as deprecated in favour of
# normalize_total -- consider migrating once the side effects are confirmed equal.
COUNTS_PER_CELL_TARGET = 1e6
sc.pp.normalize_per_cell(data, counts_per_cell_after=COUNTS_PER_CELL_TARGET)
sc.pl.highest_expr_genes(data, n_top=10, gene_symbols='gene_symbols')

In [20]:
# Log-transform the normalised matrix; the return value is discarded, so the
# result is expected to be stored on `data` itself.
sc.pp.log1p(data)

In [21]:
# Annotate the most variable genes; the number kept is the only tuned parameter.
N_TOP_HVG = 3000
sc.pp.highly_variable_genes(data, n_top_genes=N_TOP_HVG)

In [22]:
# Diagnostic plot of the highly-variable-gene selection computed in the previous step.
sc.pl.highly_variable_genes(data)

In [23]:
# PCA with scanpy's default settings; the result is kept on `data`.
# NOTE(review): presumably restricted to the genes flagged as highly variable
# above -- confirm against the installed scanpy version's defaults.
sc.tl.pca(data)

In [24]:
# Colour the PCA embedding by each QC metric to check that no leading component
# is driven by a technical covariate.
pca_color_keys = ['n_counts', 'n_genes', 'pct_counts_mt']
sc.pl.pca_scatter(data, color=pca_color_keys)

In [25]:
# Build the neighbour graph on the first 30 principal components, then run
# Louvain clustering and the UMAP embedding on that graph.
N_PCS = 30
sc.pp.neighbors(data, n_pcs=N_PCS)
sc.tl.louvain(data)
sc.tl.umap(data)

In [26]:
# UMAP coloured by the QC metrics and by sample of origin.
# NOTE(review): assumes obs['sample_id'] comes from the input file -- it is not
# created anywhere in this notebook.
umap_color_keys = ['n_counts', 'n_genes', 'pct_counts_mt', 'sample_id']
sc.pl.umap(data, color=umap_color_keys, s=50)

In [27]:
# Persist the processed object to the rule output named 'preprocessed'.
preprocessed_path = snakemake.output['preprocessed']
data.write(preprocessed_path)