In [2]:
from IPython.display import Markdown as md
from pathlib import Path
import pandas as pd
import seaborn as sns
import pegasus as pg
import plotly.express as px
import logging, sys, os

# Make the project's `src` package importable. The workflow directory is
# addressed relative to the current working directory: one level up when the
# `istest` config flag is set, directly below it otherwise.
workflow_dir = "../workflow" if snakemake.config["istest"] else "workflow"
sys.path.append(os.path.abspath(workflow_dir))

from src.plot_utils import pretty_table

# Only surface warnings and above from the pegasus loggers.
for logger_name in ("pegasus", "pegasusio"):
    logging.getLogger(logger_name).setLevel(logging.WARNING)

In [None]:
# Render the report heading; the feature name comes from the Snakemake wildcard.
report_title = f"# STARsolo report for {snakemake.wildcards.soloFeatures}"
md(report_title)

In [4]:
# Load the tab-separated run sheet whose path is given in the workflow config.
runsheet_path = snakemake.config["runsheet"]
runsheet = pd.read_csv(runsheet_path, sep="\t")

## Metrics

In [5]:
def read_metrics(fn: str):
    """Read and clean a STARsolo metrics file, return a pandas dataframe.

    The file is parsed as a header-less CSV whose first column holds the
    metric name. The table is transposed so that every metric becomes a
    column and the values form a single row.

    Several metric labels embed the name of the quantified feature
    (``Gene``, ``GeneFull`` or a ``GeneFull_*`` variant). Those feature
    names are normalised to ``Genes`` so that reports for different
    features share one set of column names. Labels that already read
    ``Genes`` are left untouched.

    Parameters
    ----------
    fn
        Path to the metrics file, or anything else ``pandas.read_csv``
        accepts (e.g. an open text buffer).

    Returns
    -------
    pandas.DataFrame
        One-row dataframe with one column per metric. The count-like
        metrics listed in ``count_metrics`` below are cast to ``int``;
        all other columns keep the dtype produced by ``read_csv``.

    Raises
    ------
    KeyError
        If one or more of the count-like metrics is absent after the
        labels have been normalised. The message lists every missing
        metric together with ``fn``.
    """
    import re  # local import: only this function needs it

    # Whole-word match of the feature name. The word boundaries keep an
    # existing "Genes" from becoming "Geness", and the optional "_suffix"
    # covers GeneFull variants such as "GeneFull_Ex50pAS".
    feature_name = re.compile(r"\bGene(?:Full(?:_\w+)?)?\b")

    # Metrics that are counts and should therefore be integers.
    count_metrics = [
        "Estimated Number of Cells",
        "Unique Reads in Cells Mapped to Genes",
        "Number of Reads",
        "UMIs in Cells",
        "Total Genes Detected",
        "Median UMI per Cell",
        "Mean UMI per Cell",
        "Median Genes per Cell",
        "Mean Genes per Cell",
        "Median Reads per Cell",
        "Mean Reads per Cell",
    ]

    metrics = (
        pd.read_csv(fn, header=None)
        .set_index(0)
        .transpose()
        .rename(columns=lambda label: feature_name.sub("Genes", label))
    )

    # Fail once, with the full list, instead of on the first absent column.
    missing = [c for c in count_metrics if c not in metrics.columns]
    if missing:
        raise KeyError(f"{fn}: missing expected STARsolo metrics: {missing}")

    return metrics.astype({c: int for c in count_metrics})

In [None]:
# Reshape the Snakemake inputs into one dataframe per input kind, each with a
# `run` column (name of the run directory) and a `file` column (the input path).
samples = {}
for kind in ["raw", "filtered", "summary"]:
    runs, files = [], []
    for f in snakemake.input[kind]:
        # Summary files sit two directory levels below the run directory;
        # raw/filtered inputs sit one level deeper.
        run_dir = Path(f).parent.parent
        if kind != "summary":
            run_dir = run_dir.parent
        runs.append(run_dir.name)
        files.append(f)
    samples[kind] = pd.DataFrame({"run": runs, "file": files})

In [None]:
# Read every summary file, tag its single row with the run it belongs to,
# and stack the rows into one table indexed by run.
summ = pd.concat(
    [
        read_metrics(s.file).assign(run_id=s.run)
        for s in samples["summary"].itertuples()
    ]
).set_index(["run_id"])

In [None]:
# Continuous "flare" colormap handed to the project's table helper
# together with the per-run metrics.
cm = sns.color_palette(palette="flare", as_cmap=True)
pretty_table(summ, cm)

In [None]:
# Sample sheets passed to `pg.aggregate_matrices`: one row per run with the
# run name in `Sample` and the input path in `Location`.
to_agg = {
    kind: pd.DataFrame(
        {
            "Sample": samples[kind]["run"].tolist(),
            "Location": samples[kind]["file"].tolist(),
        }
    )
    for kind in ["raw", "filtered"]
}

# Only the per-barcode annotations of the raw matrices are needed, so the
# aggregated object is dropped as soon as `obs` has been taken from it.
raw = pg.aggregate_matrices(to_agg["raw"], default_ref="GRCh38", min_umis=1)
df = raw.obs
del raw
df = df.sort_values(["Channel", "n_counts"], ascending=False)

# A raw barcode is flagged as empty when it does not appear in the filtered
# aggregate.
filtered = pg.aggregate_matrices(to_agg["filtered"], default_ref="GRCh38")
df["isEmpty"] = ~df.index.isin(filtered.obs.index)
del filtered

# Rank barcodes within each channel by descending count; ties are broken by
# row order.
df["rank"] = df.groupby("Channel")["n_counts"].rank(method="first", ascending=False)

In [None]:
# Barcode rank plot: counts against rank on log-log axes, one line per
# channel, coloured by the `isEmpty` flag.
rank_axes = {"x": "rank", "y": "n_counts", "log_x": True, "log_y": True}
figure_size = {"width": 800, "height": 600}
px.line(
    df,
    line_group="Channel",
    color="isEmpty",
    color_discrete_sequence=["purple", "gray"],
    title="Barcode Rank Plot",
    **rank_axes,
    **figure_size,
)