In [None]:
from pathlib import Path
import numpy as np
import pandas as pd
import seaborn as sns
import scanpy as sc
import matplotlib.pyplot as plt

In [None]:
# Sort the Snakemake inputs into three lists according to the last component
# of each path. Inputs whose basename matches none of the three are ignored.
STARsolo_summary_data = []  # inputs named "Summary.csv"
rank_plot_data = []  # inputs named "matrix.mtx"
IRescue_directories = []  # inputs named "IRescue"

_inputs_by_basename = {
    "Summary.csv": STARsolo_summary_data,
    "matrix.mtx": rank_plot_data,
    "IRescue": IRescue_directories,
}
for input_path in snakemake.input:
    bucket = _inputs_by_basename.get(Path(input_path).name)
    if bucket is not None:
        bucket.append(input_path)

In [None]:
# For every "matrix.mtx" input: draw a log-log rank plot of summed counts and
# record which barcodes have no counts at all.
filtered_barcodes = {}  # plot name -> barcodes whose summed count is <= 0
for mtx_path in rank_plot_data:
    # load data (the directory containing matrix.mtx is read as a 10x folder)
    dir_10x = Path(mtx_path).parent
    adata = sc.read_10x_mtx(dir_10x, var_names="gene_symbols")
    name = dir_10x.parent.parent.stem + "_" + dir_10x.parent.stem + "_" + dir_10x.name
    print(f"plotting {name}...")

    # compute rankings
    # np.asarray(...).ravel() yields a flat vector whether the sum comes back
    # as a 2-D matrix or as a 1-D array.
    # NOTE(review): axis=0 sums down the rows of adata.X. The barcode totals
    # below use axis=1, so this vector has one entry per feature, not per
    # barcode -- confirm that ranking features is what is intended here.
    totals = np.asarray(adata.X.sum(axis=0)).ravel()
    plot_df = (
        pd.DataFrame({"total_counts": totals})
        # zero has no position on a log-scaled axis, so keep positive totals only
        .loc[lambda frame: frame["total_counts"] > 0]
        .sort_values(by="total_counts", ascending=False)
        .assign(rank=lambda frame: np.arange(1, len(frame) + 1))
    )

    # Sum the matrix directly instead of converting it to a DataFrame first;
    # only the per-barcode totals are needed to find the empty barcodes.
    barcode_totals = np.asarray(adata.X.sum(axis=1)).ravel()
    filtered_barcodes[name] = adata.obs_names[barcode_totals <= 0].tolist()

    # plot
    fig, ax = plt.subplots()
    # Every rank occurs exactly once, so there is nothing to aggregate:
    # estimator=None makes seaborn draw the observations as they are.
    sns.lineplot(data=plot_df, x="rank", y="total_counts", estimator=None, ax=ax)
    ax.set(
        xscale="log", yscale="log", title=name, xlabel="Rank", ylabel="Total counts"
    )
    plt.show()
    plt.close(fig)  # do not let figures pile up across loop iterations

In [None]:
# For every "Summary.csv" input: print three headline numbers and draw three
# bar-chart figures from fixed row ranges of the table.
# NOTE(review): all row positions below are hard-coded; confirm they match the
# Summary.csv layout written by the STAR version in use.
for summary_csv in STARsolo_summary_data:
    # Load data and set variables
    # The file is read without a header, so columns are labelled 0 and 1
    # (presumably metric name and value -- verify against the file).
    df = pd.read_csv(summary_csv, header=None)
    path = Path(summary_csv)
    name = path.parent.parent.stem + "_" + path.parent.stem + "_" + path.name
    reads = df.loc[0, 1]
    mappedReads = df.loc[10, 1]
    UMIs = df.loc[14, 1]
    print(f"plotting {name}...")
    print(f".....{reads} number of reads........")
    print(f".....{mappedReads} number of unique mapped reads.....")
    print(f".....{UMIs} number of UMIs in Cells.....")

    # plot info for all reads (rows 1-8)
    fig, ax = plt.subplots()
    sns.barplot(data=df.iloc[1:9], x=1, y=0, palette="Paired", ax=ax)
    ax.set_title(name + ": " + str(reads) + " Reads")
    ax.set_xlabel("Rate")
    ax.set_ylabel("")
    plt.show()
    plt.close(fig)

    # plot info for reads mapped to genes (two panels in one figure):
    # left panel shows rows 9, 12 and 13; right panel shows row 11 on its own.
    fig, axs = plt.subplots(ncols=2, figsize=(10, 5))
    sns.barplot(
        data=df.iloc[9:14].drop(index=[10, 11]),
        x=1,
        y=0,
        palette="Paired",
        ax=axs[0],
    )
    sns.barplot(data=df.iloc[11:12], x=0, y=1, palette="Paired", ax=axs[1])

    axs[0].set_title(
        name + ": " + str(mappedReads) + " Unique Reads in Cells Mapped to Gene"
    )
    for panel in axs:
        panel.set_xlabel("")
        panel.set_ylabel("")

    plt.show()
    plt.close(fig)

    # plot info for genes and UMI per cell (rows 15 onwards)
    fig, ax = plt.subplots()
    sns.barplot(data=df.iloc[15:], x=1, y=0, palette="Paired", ax=ax)
    ax.set_title(name + ": " + str(UMIs) + " UMIs in Cells")
    ax.set_xlabel("")
    ax.set_ylabel("")
    plt.show()
    plt.close(fig)

In [None]:
# For every "IRescue" input directory: draw a log-log rank plot of summed counts.
for irescue_dir in IRescue_directories:
    # load data (the input path itself is read as a 10x folder)
    dir_10x = Path(irescue_dir)
    adata = sc.read_10x_mtx(dir_10x, var_names="gene_symbols")
    name = dir_10x.parent.parent.stem + "_" + dir_10x.parent.stem + "_" + dir_10x.name
    print(f"plotting {name}...")

    # compute rankings
    # np.asarray(...).ravel() yields a flat vector whether the sum comes back
    # as a 2-D matrix or as a 1-D array.
    totals = np.asarray(adata.X.sum(axis=0)).ravel()
    # NOTE(review): unlike the STARsolo rank plot, zero totals are not removed
    # here before plotting on log axes -- confirm the difference is deliberate.
    plot_df = (
        pd.DataFrame({"total_counts": totals})
        .sort_values(by="total_counts", ascending=False)
        .assign(rank=lambda frame: np.arange(1, len(frame) + 1))
    )

    # plot
    fig, ax = plt.subplots()
    # Every rank occurs exactly once, so there is nothing to aggregate:
    # estimator=None makes seaborn draw the observations as they are.
    sns.lineplot(data=plot_df, x="rank", y="total_counts", estimator=None, ax=ax)
    ax.set(
        xscale="log", yscale="log", title=name, xlabel="Rank", ylabel="Total counts"
    )
    plt.show()
    plt.close(fig)  # do not let figures pile up across loop iterations