In [2]:
import scanpy as sc
import numpy as np
import pandas as pd

In [3]:
# Session setup: logging level, version banner, and figure defaults.
sc.settings.verbosity = 3  # verbosity: errors (0), warnings (1), info (2), hints (3)
sc.logging.print_version_and_date()
# A single call configures both the inline dpi (low dpi yields small inline
# figures) and the default figure size.
sc.set_figure_params(dpi=80, figsize=(12, 12))

In [4]:
# Load the filtered feature-barcode matrix (10x HDF5) from the aggregate run.
# Snakemake-driven alternative, currently disabled:
#adata = sc.read_h5ad(snakemake.input[0])
counts_h5_path = "data/tmp/singlecell/quant/aggregate/cellranger/all_samples/outs/count/filtered_feature_bc_matrix.h5"
adata = sc.read_10x_h5(counts_h5_path)

In [5]:
# Keep the variable names exactly as loaded in a column *before* they are
# de-duplicated, then make the var_names unique.
adata.var['gene_symbols'] = adata.var_names
adata.var_names_make_unique()

In [6]:
# Coarse first-pass filtering thresholds.
min_genes_per_cell = 500  # drop cells with fewer detected genes than this
min_cells_per_gene = 5    # drop genes detected in fewer cells than this

sc.pp.filter_cells(adata, min_genes=min_genes_per_cell)
sc.pp.filter_genes(adata, min_cells=min_cells_per_gene)

In [7]:
#adata = adata[~adata.obs['predicted_doublets'],:]

In [8]:
# Sanity check: shape of the per-cell (row-wise) sum of the count matrix.
adata.X.sum(axis=1).shape

In [9]:
# Total counts per cell.
# Summing a scipy sparse matrix along axis=1 yields an (n_cells, 1) np.matrix
# rather than a 1-D array; flatten it so the obs column holds plain scalars.
# np.asarray(...).ravel() is also a no-op for an already 1-D dense result.
adata.obs["n_counts"] = np.asarray(adata.X.sum(axis=1)).ravel()

In [10]:
# `data` is bound to the same object as `adata` (plain assignment, no copy),
# so in-place changes made through `data` are also visible through `adata`
# until `data` is rebound to something else.
data = adata

In [11]:
# Annotate the group of mitochondrial genes as 'mt': genes whose symbol starts
# with this prefix.
mito_prefix = 'MT-'
is_mito = data.var["gene_symbols"].str.startswith(mito_prefix)
data.var['mt'] = is_mito.astype(bool)

# Compute QC metrics in place, with 'mt' as an extra QC variable group.
sc.pp.calculate_qc_metrics(
    data,
    qc_vars=['mt'],
    percent_top=None,
    log1p=False,
    inplace=True,
)

In [12]:
# Preview the first rows of the per-cell annotation table.
data.obs.head()

In [13]:
# Violin plots of the three QC covariates before threshold filtering.
qc_keys = ['n_genes', 'n_counts', 'pct_counts_mt']
sc.pl.violin(data, qc_keys, jitter=0.4, multi_panel=True)

In [14]:
# Total counts against mitochondrial percentage, then against detected genes.
for y_key in ('pct_counts_mt', 'n_genes'):
    sc.pl.scatter(data, x='n_counts', y=y_key)

In [15]:
# Cell-level QC filter: keep cells with < 15 % mitochondrial counts and a
# total count strictly between 1e3 and 25e3.
keep_cells = (
    (data.obs['pct_counts_mt'] < 15)
    & (data.obs['n_counts'] > 1e3)
    & (data.obs['n_counts'] < 25e3)
)
# Subsetting an AnnData returns a view of the parent object. The following
# cells modify `data` in place (normalisation, log1p, ...), so materialise the
# subset with .copy() instead of relying on an implicit copy-on-modify. This
# also detaches `data` from `adata`.
data = data[keep_cells, :].copy()

# Re-draw the QC scatter plots on the filtered cells.
sc.pl.scatter(data, x='n_counts', y='pct_counts_mt')
sc.pl.scatter(data, x='n_counts', y='n_genes')

In [16]:
# Same QC violin plots after filtering; the figure is also saved by scanpy
# using the given filename suffix.
post_filter_keys = ['n_genes', 'n_counts', 'pct_counts_mt']
sc.pl.violin(
    data,
    post_filter_keys,
    jitter=0.4,
    multi_panel=True,
    save="_post_filter.pdf",
)

In [17]:
# Display the summary repr of the filtered object.
data

In [18]:
# Mean of the per-cell total counts over the cells that remain.
data.obs["n_counts"].mean()

In [19]:
# Scale every cell to a total of 1e6 counts (counts per million).
# normalize_total replaces the deprecated normalize_per_cell. The per-cell
# totals that normalize_per_cell would store under obs['n_counts'] are already
# present in that column from an earlier cell, so nothing is lost.
sc.pp.normalize_total(data, target_sum=1e6)

# Show the 10 genes with the highest fraction of counts per cell, labelled
# with the 'gene_symbols' column.
sc.pl.highest_expr_genes(data, n_top=10, gene_symbols='gene_symbols')

In [20]:
# Apply the log1p transform to the normalised matrix.
sc.pp.log1p(data)

In [21]:
# Number of highly variable genes to flag.
n_hvg = 3000
sc.pp.highly_variable_genes(data, n_top_genes=n_hvg)

In [22]:
# Plot the result of the highly-variable-gene selection.
sc.pl.highly_variable_genes(data)

In [23]:
# Run PCA on `data` with scanpy's default settings.
sc.tl.pca(data)

In [24]:
# PCA scatter coloured by each QC covariate, to check whether technical
# factors dominate the leading components.
pca_color_keys = ['n_counts', 'n_genes', 'pct_counts_mt']
sc.pl.pca(data, color=pca_color_keys)

In [25]:
# Neighbourhood graph on the first principal components, followed by Louvain
# clustering and a UMAP embedding.
n_pcs_for_graph = 30
sc.pp.neighbors(data, n_pcs=n_pcs_for_graph)
sc.tl.louvain(data)
sc.tl.umap(data)

In [26]:
# UMAP coloured by the QC covariates (point size 50).
umap_qc_keys = ['n_counts', 'n_genes', 'pct_counts_mt']
sc.pl.umap(data, color=umap_qc_keys, s=50)

In [27]:
# Write a small text marker to the first Snakemake output path.
# NOTE(review): a later cell calls data.write(snakemake.output[0]) on this
# same path, which overwrites the marker - confirm whether this cell is still
# needed.
marker_path = snakemake.output[0]
with open(marker_path, 'w+') as marker_file:
    marker_file.write("done")


In [28]:
# Replace the var index with the stored 'gene_symbols' column and name the
# index 'index' (rather than inheriting the column name 'gene_symbols').
# NOTE(review): 'gene_symbols' was captured before var_names_make_unique() was
# called, so this may reintroduce duplicate names - confirm that is intended.
data.var.index = pd.Index(data.var['gene_symbols'], name='index')

In [29]:
# Write the processed object to the first Snakemake output path.
data.write(snakemake.output[0])

In [30]:
# Load the Azimuth prediction table (tab-separated; first column is the row
# index, first row is the header) and preview it.
azimuth_tsv_path = "data/tmp/singlecell/quant/aggregate/cellranger/azimuth_pred.tsv"
az = pd.read_csv(
    azimuth_tsv_path,
    sep='\t',
    index_col=0,
    header=0,
)
az.head()


In [31]:
# Distinct prefixes of the prediction-table index: the part of each label
# before the first "-".
test = {label.split("-")[0] for label in az.index}

In [32]:
# Number of distinct index prefixes in the prediction table.
len(test)

In [33]:
# Number of rows (cells) in data.obs, for comparison with the prefix counts.
len(data.obs)

In [34]:
# Distinct prefixes of the obs index, built the same way as for the
# prediction table (text before the first "-").
testobs = {label.split("-")[0] for label in data.obs.index}

In [35]:
# Number of distinct index prefixes among the cells in `data`.
len(testobs)

In [36]:
# Preview data.obs (index format and columns) before joining the predictions.
data.obs.head()

In [38]:
# Attach the Azimuth predictions to the per-cell annotations (left join on the
# obs index; cells missing from `az` get NaN in the new columns).
# Only columns not already in data.obs are joined, so re-running this cell
# does not fail with an overlapping-columns error.
# NOTE(review): this join happens after data.write(...) in an earlier cell, so
# the predictions are not part of the written file - confirm that is intended.
new_prediction_cols = [col for col in az.columns if col not in data.obs.columns]
data.obs = data.obs.join(az[new_prediction_cols])

In [40]:
# UMAP coloured by the joined 'predicted.celltype.l2' column.
sc.pl.umap(data, color=['predicted.celltype.l2'], s=50)