In [None]:
from IPython.display import Markdown as md
from pathlib import Path
import pandas as pd
import seaborn as sns
import pegasus as pg
import plotly.express as px
import logging, sys, os

# Make the workflow's `src` package importable. "../workflow" is resolved
# against the current working directory, and the path has to be registered
# before the project import on the next line.
sys.path.append((os.path.abspath("../workflow")))
from src.plot_utils import pretty_table

# Only let warnings and errors through from the pegasus / pegasusio loggers.
logging.getLogger("pegasus").setLevel(logging.WARNING)
logging.getLogger("pegasusio").setLevel(logging.WARNING)

In [None]:
# Render the report title as Markdown, parameterised by the soloFeatures wildcard.
# NOTE(review): `snakemake` is not defined in this notebook; presumably it is
# injected by Snakemake's notebook integration — confirm.
md(f"# STARsolo report for {snakemake.wildcards.soloFeatures}")

In [None]:
# Load the tab-separated runsheet from the path given in the workflow config;
# later cells iterate over the unique values of its "run_id" column.
runsheet = pd.read_csv(snakemake.config["runsheet"], sep="\t")

## Metrics

In [None]:
def read_metrics(fn: str) -> pd.DataFrame:
    """Read and clean a STARsolo metrics file, return a pandas dataframe.

    The file is parsed as a header-less CSV whose first column holds the metric
    name; the result is transposed so that every metric becomes a column.
    Feature-specific labels are normalised to "Genes" so that files produced
    for different features share one set of column names: "Gene", "GeneFull"
    and "GeneFull_<suffix>" are all rewritten, while labels that already read
    "Genes" are left untouched.

    Parameters
    ----------
    fn : str
        Path to the metrics CSV.

    Returns
    -------
    pandas.DataFrame
        The transposed metrics with the count-like columns cast to ``int``.

    Raises
    ------
    KeyError
        If one or more of the count columns is absent after renaming.
    """
    count_columns = [
        "Estimated Number of Cells",
        "Unique Reads in Cells Mapped to Genes",
        "Number of Reads",
        "UMIs in Cells",
        "Total Genes Detected",
        "Median UMI per Cell",
        "Mean UMI per Cell",
        "Median Genes per Cell",
        "Mean Genes per Cell",
        "Median Reads per Cell",
        "Mean Reads per Cell",
    ]

    metrics = pd.read_csv(fn, header=None).set_index(0).transpose()

    # Rename every column in one pass. The negative lookahead keeps an existing
    # "Genes" from turning into "Geness"; the optional "_<suffix>" covers the
    # GeneFull_* feature names (e.g. GeneFull_Ex50pAS).
    metrics.columns = metrics.columns.str.replace(
        r"Gene(?:Full(?:_\w+)?)?(?!s)", "Genes", regex=True
    )

    # Report all missing metrics at once, together with the offending file.
    missing = [c for c in count_columns if c not in metrics.columns]
    if missing:
        raise KeyError(f"{fn}: missing expected metrics {missing}")

    return metrics.astype({c: int for c in count_columns})

In [None]:
# Collect one metrics frame per (run, summary file) pair. A summary file is
# attributed to a run when its grandparent directory is named after the run.
summary_frames = []
for run_id in runsheet["run_id"].unique():
    for summary_fn in snakemake.input["summary"]:
        if Path(summary_fn).parent.parent.name != run_id:
            continue
        run_metrics = read_metrics(summary_fn)
        run_metrics["run_id"] = run_id
        # Append here, inside the file loop: a run without a summary file then
        # contributes nothing (instead of re-appending the previous run's
        # frame), and a run with several matching files keeps all of them.
        summary_frames.append(run_metrics)

if not summary_frames:
    raise ValueError(
        "no file in snakemake.input['summary'] matches a run_id from the runsheet"
    )

summ = pd.concat(summary_frames).set_index(["run_id"])

In [None]:
# Continuous "flare" colormap (as_cmap=True yields a colormap object rather
# than a list of colors), handed to the project's pretty_table helper.
# NOTE(review): presumably used to shade the table cells — confirm in src.plot_utils.
cm = sns.color_palette("flare", as_cmap=True)
pretty_table(summ, cm)

In [None]:
def _sample_sheet(kind):
    """Build the Sample/Location listing for the ``snakemake.input[kind]`` files."""
    sheet = {"Sample": [], "Location": []}
    for run_id in runsheet["run_id"].unique():
        for matrix_fn in snakemake.input[kind]:
            # an input belongs to a run when its great-grandparent directory
            # carries the run id
            if Path(matrix_fn).parent.parent.parent.name == run_id:
                sheet["Sample"].append(run_id)
                sheet["Location"].append(matrix_fn)
    return sheet


# aggregate the raw and the filtered matrices across runs
raw = pg.aggregate_matrices(_sample_sheet("raw"))
filtered = pg.aggregate_matrices(_sample_sheet("filtered"))

# keep only the per-barcode table of the raw data and drop the matrix itself
df = raw.obs
del raw

# discard barcodes without counts, then order by Channel and n_counts (both descending)
df = df.loc[df["n_counts"] > 0, :].sort_values(
    ["Channel", "n_counts"], ascending=False
)
# a barcode is flagged empty when it is absent from the filtered matrix
df["isEmpty"] = ~df.index.isin(filtered.obs.index)
# per-Channel rank by decreasing n_counts; ties keep their order of appearance
df["rank"] = df.groupby("Channel")["n_counts"].rank(method="first", ascending=False)

In [None]:
# Barcode rank plot: n_counts against rank on log-log axes, one line per
# Channel, coloured by whether the barcode is missing from the filtered matrix.
px.line(
    df,
    title="Barcode Rank Plot",
    x="rank",
    log_x=True,
    y="n_counts",
    log_y=True,
    color="isEmpty",
    color_discrete_sequence=["purple", "gray"],
    line_group="Channel",
    width=800,
    height=600,
)