In [None]:
import anndata
import pandas as pd
from IPython.core.display import display

from rp2 import create_folder, get_data_path, get_output_path, get_scripts_path, working_directory

As a proof of concept, create UMI .csv files for 10 genes per condition and run them through the txburst scripts.

In [None]:
# Study selection used to build the input/output file names in the cells below.
study_species, study_treatment_set = "mouse", "lps"

In [None]:
# Load the processed E-MTAB-6754 AnnData file for the selected species.
umi_count_ad = anndata.read_h5ad(
    get_data_path("ArrayExpress", f"E-MTAB-6754.processed.2.{study_species}.h5ad")
)

In [None]:
# Read the per-gene "lr fit" table (presumably linear-regression fits — confirm)
# for this species / treatment set; the first CSV column becomes the index.
lr_fit_df = pd.read_csv(
    get_output_path(f"{study_species}_{study_treatment_set}_lr_fit_per_gene.csv"),
    index_col=0,
)

# Keep the first 10 entries after ordering by r2 from highest to lowest.
best_lr_fits = lr_fit_df["r2"].sort_values(ascending=False).iloc[:10]
display(best_lr_fits)

In [None]:
# Folder that holds the txburst inputs and outputs.
# NOTE(review): create_clean=True presumably empties an existing folder so that
# files from earlier runs are not picked up later — confirm in rp2.create_folder.
txburst_files_path = get_output_path("txburst")
create_folder(txburst_files_path, create_clean=True)

# Write one genes-by-cells .csv per (replicate, treatment, time_point) condition,
# restricted to the genes in best_lr_fits.
condition_keys = ["replicate", "treatment", "time_point"]

# observed=True: if the grouping columns are categorical, only combinations that
# actually occur in obs are grouped; stating it explicitly also avoids pandas'
# FutureWarning about the changing default. It has no effect on non-categorical
# columns.
for (replicate, treatment, time_point), condition_obs in umi_count_ad.obs.groupby(condition_keys, observed=True):
    csv_path = txburst_files_path.joinpath(f"{study_species}_umi_{replicate}_{treatment}_{time_point}.csv")
    subset_ad = umi_count_ad[condition_obs.index.values, best_lr_fits.index]
    # to_df() is observations x variables; transpose so that each row is a gene.
    subset_ad.to_df().T.to_csv(csv_path, index_label="gene")

In [None]:
txburst_script_path = get_scripts_path("txburst")

with working_directory(txburst_files_path):
    for full_csv_file_path in txburst_files_path.glob("*.csv"):
        csv_file_path = full_csv_file_path.name
        ml_file_path = full_csv_file_path.stem + "_ML.pkl"

        txburst_ml_script_path = txburst_script_path.joinpath("txburstML.py")
        txburst_pl_script_path = txburst_script_path.joinpath("txburstPL.py")

        ml_cmd = f"{txburst_ml_script_path} --njobs 4 {csv_file_path}"
        pl_cmd = f"{txburst_pl_script_path} --njobs 4 --file {csv_file_path} --MLFile {ml_file_path}"

        for cmd in [ml_cmd, pl_cmd]:
            print("Executing:", cmd)
            %run {cmd}

In [None]:
# Gather the per-condition txburst results into one long-format table with one
# row per gene and condition.
# Result file stems are expected to look like
# "<species>_umi_<replicate>_<treatment>_<time_point>_PL", mirroring the names
# given to the UMI .csv files.
pl_stem_prefix = f"{study_species}_umi_"
pl_stem_suffix = "_PL"

# Kept separate from txburst_df so that one name is not used for both a list and
# a DataFrame.
condition_dfs = []

# sorted(): directory listing order is arbitrary and it determines the row labels
# assigned by ignore_index=True below; sorting makes them reproducible.
for pl_path in sorted(txburst_files_path.glob(f"{study_species}_*_PL.pkl")):
    stem = pl_path.stem
    if not (stem.startswith(pl_stem_prefix) and stem.endswith(pl_stem_suffix)):
        raise ValueError(f"Unexpected txburst result file name: {pl_path.name}")

    condition_fields = stem[len(pl_stem_prefix):-len(pl_stem_suffix)].split("_")
    if len(condition_fields) != 3:
        # An underscore inside a replicate / treatment / time_point value would
        # otherwise silently shift the fields and mislabel the results.
        raise ValueError(
            f"Cannot parse replicate, treatment and time_point from file name: {pl_path.name}"
        )
    replicate, treatment, time_point = condition_fields

    # NOTE(review): unpickling executes code from the file; these files are
    # presumably the ones written by the txburst scripts — never point this at
    # untrusted pickles.
    pl_df = pd.read_pickle(pl_path)

    condition_df = pd.DataFrame(data={
        "gene": pl_df.index,
        "replicate": replicate,
        "treatment": treatment,
        "time_point": time_point,
    })
    # Each cell of the first three pl_df columns must hold a 3-element sequence,
    # which is expanded here into three scalar columns.
    # NOTE(review): the column meanings (kinetic parameters, then point/lower/upper
    # for burst frequency and burst size) are inferred from the names used here —
    # confirm against the txburstPL.py output format.
    condition_df[["k_on", "k_off", "k_syn"]] = pd.DataFrame(pl_df.iloc[:, 0].to_list())
    condition_df[["bf_point", "bf_lower", "bf_upper"]] = pd.DataFrame(pl_df.iloc[:, 1].to_list())
    condition_df[["bs_point", "bs_lower", "bs_upper"]] = pd.DataFrame(pl_df.iloc[:, 2].to_list())

    condition_dfs.append(condition_df)

if not condition_dfs:
    # pd.concat would also raise ValueError here, but without saying what is missing.
    raise ValueError(f"No txburst *_PL.pkl result files found in {txburst_files_path}")

# replicate / treatment / time_point are strings taken from the file names, so
# they are ordered lexicographically by this sort.
txburst_df = pd.concat(condition_dfs, ignore_index=True).sort_values(by=["gene", "replicate", "time_point", "treatment"])
display(txburst_df)

In [None]:
# Number of result rows per gene, smallest count first.
display(txburst_df["gene"].value_counts().sort_values())

In [None]:
# Save the combined burst-kinetics table (including its row labels) to the output folder.
txburst_df.to_csv(path_or_buf=get_output_path("burst_kinetics_fitting.csv"))