filter...
- samples
- cells
- genes

tutorial: https://satijalab.org/seurat/articles/pbmc3k_tutorial.html

to look for:
- The number of unique genes detected in each cell.
  - Low-quality cells or empty droplets will often have very few genes
  - Cell doublets or multiplets may exhibit an aberrantly high gene count
- The percentage of reads that map to the mitochondrial genome
  - Low-quality / dying cells often exhibit extensive mitochondrial contamination
  - We use the set of all genes starting with MT- as a set of mitochondrial genes


In [1]:
import logging

import helpers
import numpy as np
import pandas as pd
import plotly.express as px
import plotly.figure_factory as ff
import plotly.graph_objects as go
import plotly.io

In [3]:
uri = "https://www.ncbi.nlm.nih.gov/geo/download/?acc=GSE115978&format=file&file=GSE115978%5Fcell%2Eannotations%2Ecsv%2Egz"

metadata = pd.read_csv(
    # "gs://liulab/ftp/GSE115978/GSE115978_cell.annotations.csv",
    uri,
    compression="gzip",
    na_values={"cell.types": "?"},
)

In [4]:
metadata

Unnamed: 0,cells,samples,cell.types,treatment.group,Cohort,no.of.genes,no.of.reads
0,cy78_CD45_neg_1_B04_S496_comb,Mel78,Mal,post.treatment,Tirosh,8258,357919
1,cy79_p4_CD45_neg_PDL1_neg_E11_S1115_comb,Mel79,Mal,treatment.naive,Tirosh,2047,5727
2,CY88_5_B10_S694_comb,Mel88,Mal,post.treatment,Tirosh,5375,139218
3,cy79_p1_CD45_neg_PDL1_pos_AS_C1_R1_F07_S67_comb,Mel79,Mal,treatment.naive,Tirosh,5648,73996
4,cy78_CD45_neg_3_H06_S762_comb,Mel78,Mal,post.treatment,Tirosh,7409,380341
...,...,...,...,...,...,...,...
7181,CY75_1_CD45_CD8_3__S168_comb_BCD8,Mel75,T.CD8,post.treatment,Tirosh,3530,137245
7182,CY75_1_CD45_CD8_8__S338_comb_BCD8,Mel75,T.CD8,post.treatment,Tirosh,3872,106432
7183,monika_D7_S132_comb_BCD8_3,Mel75,T.CD8,post.treatment,Tirosh,4589,908173
7184,CY75_1_CD45_CD8_8__S289_comb_BCD8,Mel75,T.CD8,post.treatment,Tirosh,4614,140903


In [5]:
import altair as alt

alt.Chart(metadata).mark_bar().encode(
    alt.X("no.of.genes:Q", bin=True),
    y="count()",
)

MaxRowsError: The number of rows in your dataset is greater than the maximum allowed (5000). For information on how to plot larger datasets in Altair, see the documentation

alt.Chart(...)

In [None]:
sc_rnaseq.groupby("cell_id").sum("gene_is_nonzero")

In [None]:
import pathlib

figure_path = pathlib.Path("figures-9a")
figure_path.mkdir(parents=True, exist_ok=True)
figure_path

In [None]:
handler = logging.StreamHandler()
formatter = logging.Formatter("%(asctime)s - %(name)s - %(levelname)s - %(message)s")
handler.setFormatter(formatter)
logging.getLogger().addHandler(handler)

In [None]:
logger = logging.getLogger(__name__)
logger.setLevel("DEBUG")

In [None]:
rng = np.random.default_rng(seed=0)

In [None]:
%%time
sc_jerby_arnon, sc_metadata_jerby_arnon = helpers.datasets.load_jerby_arnon(
    ref_genome="hg19", units="tpm"
)
sc_jerby_arnon *= 1_000_000 / sc_jerby_arnon.sum()