In [None]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns

import rp2.data
from rp2 import hagai_2018, create_gene_symbol_map
from rp2.paths import get_output_path, get_txburst_results_path

# Run the project's environment check before any analysis is attempted.
# NOTE(review): what is verified is defined inside rp2 — confirm there.
rp2.check_environment()

# Initial analysis

The "general analysis" below may not account for some potential issues in the txburst results. These problems are explored in this section. First, load the results intended for downstream analysis and determine which points appear to be "valid".

In [None]:
# Columns passed to the loader as the condition columns; reused further down
# to build the list of index columns.
condition_columns = ["replicate", "treatment", "time_point"]

txburst_df = rp2.data.load_txburst_results("mouse", condition_columns, "median")

# Points are "valid" when both point estimates are present; an interval is
# "valid" when both of its bounds are present.
txburst_df["valid_points"] = txburst_df[["bs_point", "bf_point"]].notna().all(axis=1)
txburst_df["valid_bs_interval"] = txburst_df[["bs_lower", "bs_upper"]].notna().all(axis=1)
txburst_df["valid_bf_interval"] = txburst_df[["bf_lower", "bf_upper"]].notna().all(axis=1)
txburst_df["valid_intervals"] = txburst_df["valid_bs_interval"] & txburst_df["valid_bf_interval"]

# Count the non-zero (i.e. True) entries of "keep" and of every validity flag.
flag_columns = ["keep", *(column for column in txburst_df.columns if column.startswith("valid_"))]
display(txburst_df[flag_columns].agg(np.count_nonzero))

The txburst [GitHub page](https://github.com/sandberg-lab/txburst) states that "burst frequency = k_on" and "burst size = k_syn/k_off". Is this true for all the points that are flagged as valid? Plotting the point estimates against their k-parameter-derived equivalents should produce lines of identity:

In [None]:
# Restrict the comparison to rows where both point estimates are present.
txburst_valid_subset = txburst_df.loc[txburst_df.valid_points]

# Point estimates (bf, bs) and their k-parameter-derived equivalents (bf2, bs2):
# burst frequency = k_on and burst size = k_syn / k_off.
bf = txburst_valid_subset.bf_point
bf2 = txburst_valid_subset.k_on
bs = txburst_valid_subset.bs_point
bs2 = txburst_valid_subset.k_syn / txburst_valid_subset.k_off

_, (bf_ax, bs_ax) = plt.subplots(ncols=2, figsize=(10, 5))

# Left panel: frequency point estimate against k_on, with a dotted identity line.
bf_ax.scatter(bf, bf2, c="orange")
bf_ax.plot((0, 15), (0, 15), "k:")
bf_ax.set_xlabel("Burst frequency point estimate")
# The y-axis shows k_on (bf2); it was previously mislabelled as k_syn/k_off.
bf_ax.set_ylabel("$k_{on}$")

# Right panel: size point estimate against k_syn / k_off, with a dotted identity line.
bs_ax.scatter(bs, bs2)
bs_ax.plot((0, 1100), (0, 1100), "k:")
bs_ax.set_xlabel("Burst size point estimate")
# The y-axis shows k_syn / k_off (bs2); it was previously mislabelled as k_on.
bs_ax.set_ylabel("$k_{syn}/k_{off}$")

plt.tight_layout()
plt.show()

A considerable number of points deviate from the expected identity trends. Furthermore, some burst size point estimates are below 1 which is unexpected given the $k_{syn}/k_{off}$ ratio and its implication that RNA molecules would not be synthesised.

In [None]:
# Range of the burst size point estimates versus the k_syn / k_off equivalents.
print(f"Size range: {bs.min():.4f} - {bs.max():.4f}")
print(f"Alt. size range: {bs2.min():.4f} - {bs2.max():.4f}")

# Scatter k_syn against k_off with a dashed identity line drawn on the same axes.
k_ax = txburst_valid_subset.plot.scatter(x="k_syn", y="k_off", c="orange")
k_ax.plot((0, 1000), (0, 1000), "k--")
plt.show()

These observations suggest it may be preferable to manually calculate burst size and frequency from the k-parameters rather than depending upon the txburst point estimates. Interestingly, code in at least [one notebook on the txburst repo](https://github.com/sandberg-lab/txburst/blob/master/Fig%201b.ipynb) manually calculates the burst parameters rather than reading them from the available point estimates:
```python
PL_CIs = pd.read_pickle('data/SS3_cast_UMIs_concat_PL.pkl')

bf = pd.Series([p[0] for p in PL_CIs[0]], index=PL_CIs.index)
bs = pd.Series([p[2]/p[1] for p in PL_CIs[0]], index=PL_CIs.index)
```

Plotting these values against the corresponding point estimates leads to observations similar to those made here.

If the burst parameters are to be calculated explicitly then it may be prudent to use an equation for frequency that accounts for 2 alleles.

In [None]:
# Alternative burst frequency, 2 * k_on * k_off / (k_on + k_off), which the
# accompanying text describes as accounting for 2 alleles.
k_on = txburst_valid_subset.k_on
k_off = txburst_valid_subset.k_off
bf3 = (2 * k_on * k_off) / (k_on + k_off)

# Compare against the k_on-based frequency (bf2); the dotted line is identity.
_, allele_ax = plt.subplots()
allele_ax.scatter(bf2, bf3, c="orange")
allele_ax.plot((0, 26), (0, 26), "k:")
allele_ax.set_xlabel("Burst frequency (gene)")
allele_ax.set_ylabel("Burst frequency (2 alleles)")
plt.show()

# General analysis

In [None]:
# Study selection used to build file names and filters in the cells below.
study_species = "mouse"
study_treatment_set = "lps"
study_lr_method = "ols"

# Candidate index columns: the gene identifier followed by the condition columns.
all_index_columns = ["gene", *condition_columns]

Load lists of parameters calculated by txburst (and disregard time point "6A")

In [None]:
txburst_path = get_txburst_results_path()

# Map each result file's stem to its parameter table, excluding rows whose
# time point is "6A".
txburst_results = {}
csv_pattern = f"*species={study_species}-*.csv"
for csv_path in txburst_path.glob(csv_pattern):
    print(f"Loading: {csv_path.name}")
    df = pd.read_csv(csv_path)
    df = df[df["time_point"].ne("6A")]
    txburst_results[csv_path.stem] = df

Summarise the numbers of genes and conditions with burst parameters for each set of results

In [None]:
def print_condition_count(df, title):
    """Print how many rows (conditions) and distinct genes a table contains.

    Parameters
    ----------
    df : pandas.DataFrame
        Table with one row per condition and a ``gene`` column.
    title : str
        Label printed in front of the counts.
    """
    print(f"  {title}: {len(df):,} conditions across {df.gene.nunique():,} genes")


for name, txburst_params_df in txburst_results.items():
    has_size = txburst_params_df.bs_point.notna()
    has_frequency = txburst_params_df.bf_point.notna()

    print(f"{name}:")
    print_condition_count(txburst_params_df, "Total")
    print_condition_count(txburst_params_df.loc[txburst_params_df.keep], "Kept")
    print_condition_count(txburst_params_df.loc[has_size], "With size")
    print_condition_count(txburst_params_df.loc[has_frequency], "With frequency")

    # Rows with both a burst size and a burst frequency point estimate.
    txburst_params_valid_subset = txburst_params_df.loc[has_size & has_frequency]

    n_genes = txburst_params_df.gene.nunique()
    n_genes_without_params = n_genes - txburst_params_valid_subset.gene.nunique()
    print(f"  {n_genes_without_params:,} of {n_genes:,} genes have no burst params")
    print("  Distribution of those that do:")

    # Number of genes having each count of conditions with burst parameters.
    sns.countplot(
        x="n_conditions",
        data=txburst_params_valid_subset.gene.value_counts().to_frame("n_conditions"),
    )
    plt.show()

    print("  Larsson et al. fig 1b plot:")

    sns.scatterplot(
        x="bs_point",
        y="bf_point",
        hue="time_point",
        data=txburst_params_valid_subset,
    )
    plt.xlabel("Burst size")
    # Base 10 is matplotlib's default for log scales. The "basex"/"basey"
    # keywords were removed in matplotlib 3.5, so they are omitted to keep
    # this cell working on both old and new releases.
    plt.xscale("log")
    plt.axvline(x=1, linestyle="--")
    plt.axvline(x=100, linestyle="--")
    plt.ylabel("Burst frequency")
    plt.yscale("log")
    plt.axhline(y=0.001, linestyle="--")
    plt.axhline(y=10, linestyle="--")
    plt.show()

    print("  Larsson et al. fig 1c plot:")

    plt.hist(
        np.log10(txburst_params_valid_subset.bf_point),
        bins=40,
    )
    plt.xlabel("log$_{10}$(burst frequency)")
    plt.axvline(x=-3, linestyle="--")
    plt.axvline(x=2, linestyle="--")
    plt.ylabel("Number of genes")
    plt.show()

    print("  Larsson et al. fig 1d plot:")

    plt.hist(
        np.log10(txburst_params_valid_subset.bs_point),
        bins=40,
    )
    plt.xlabel("log$_{10}$(burst size)")
    plt.axvline(x=0, linestyle="--")
    plt.axvline(x=2, linestyle="--")
    plt.ylabel("Number of genes")
    plt.show()

Load the per condition statistics calculated for QCed genes

In [None]:
condition_stats_path = get_output_path(f"{study_species}_{study_treatment_set}_stats_per_condition_per_gene.csv")
condition_stats_df = pd.read_csv(condition_stats_path)
# Convert time points to strings.
condition_stats_df["time_point"] = condition_stats_df["time_point"].astype(str)

n_qc_conditions = len(condition_stats_df)
n_qc_genes = condition_stats_df["gene"].nunique()
print(f'QCed "{study_treatment_set}" treatment has {n_qc_conditions:,} conditions across {n_qc_genes:,} genes')

Load the linear regression parameters calculated per gene

In [None]:
lr_fit_path = get_output_path(f"{study_species}_{study_treatment_set}_lr_fit_per_gene.csv")
lr_fit_df = pd.read_csv(lr_fit_path, index_col="gene")
# Keep only the fits produced by the selected regression method.
lr_fit_df = lr_fit_df[lr_fit_df["method"] == study_lr_method]

Create a gene ID-to-symbol map

In [None]:
# Build the gene symbol map for the study species; it is used below via
# symbol_map.lookup(gene_id) to title the per-gene plots.
symbol_map = create_gene_symbol_map(study_species)

Display plots combining per condition statistics and burst parameters for each set of txburst results

In [None]:
# Selection criteria for the example genes plotted per result set: genes need
# at least this many shared conditions, and only the top few by r2 are shown.
min_conditions_for_plot = 9
max_plot_genes = 5

for name, txburst_params_df in txburst_results.items():
    # Only conditions flagged "keep" are combined with the condition statistics.
    txburst_params_valid_subset = txburst_params_df.loc[txburst_params_df.keep]
    print(f"{name}:")

    # Join on whichever of the gene/condition columns this result set provides.
    index_columns = [column for column in txburst_params_valid_subset.columns if column in all_index_columns]
    combined_df = condition_stats_df.set_index(index_columns).join(txburst_params_valid_subset.set_index(index_columns), how="inner").reset_index()
    print(f"  Shares {len(combined_df):,} conditions across {combined_df.gene.nunique():,} genes")

    # Per-gene count of shared conditions alongside the regression fit columns.
    genes_df = combined_df.gene.value_counts().to_frame("n_conditions").join(lr_fit_df, how="inner")

    sns.countplot(
        x="n_conditions",
        data=genes_df,
    )
    plt.show()

    sns.boxplot(
        x="n_conditions",
        y="r2",
        data=genes_df,
    )
    plt.show()

    sorted_r2 = genes_df.loc[genes_df.n_conditions >= min_conditions_for_plot].r2.sort_values(ascending=False)

    plot_genes = sorted_r2[:max_plot_genes].index
    plot_column_names = ["k_on", "k_off", "k_syn", "bs_point", "bf_point"]

    n_plot_rows = len(plot_column_names)
    n_plot_columns = len(plot_genes)

    # plt.subplots() rejects a zero-column grid, so skip the per-gene panel
    # when no gene has enough conditions.
    if n_plot_columns == 0:
        print("  No genes with enough conditions to plot")
        continue

    # squeeze=False keeps `axes` two-dimensional even when only one gene
    # qualifies, so the [row, column] indexing below always works.
    _, axes = plt.subplots(
        n_plot_rows,
        n_plot_columns,
        sharex="col",
        squeeze=False,
        figsize=(4 * n_plot_columns, 4 * n_plot_rows),
    )
    # One column of panels per gene, one row per plotted parameter.
    for ci, gene_id in enumerate(plot_genes):
        for ri, column_name in enumerate(plot_column_names):
            ax = axes[ri, ci]
            sns.scatterplot(
                x="mean",
                y=column_name,
                style="outlier",
                data=combined_df.loc[combined_df.gene == gene_id],
                ax=ax,
            )
            ax.set_xlim(left=0)
            ax.set_ylim(bottom=0)
            if ri == 0:
                ax.set_title(symbol_map.lookup(gene_id))
    # Only the first column keeps its y-axis labels.
    for ax in axes[:, 1:].flat:
        ax.set_ylabel(None)

    plt.show()

List conditions in which "keep" is True but no burst size is determined

In [None]:
gene_info_df = hagai_2018.load_biomart_gene_symbols_df(study_species)

for name, txburst_params_df in txburst_results.items():
    print(f"{name}:")

    # Work on a copy so the tables held in txburst_results are left untouched,
    # then add each row's symbol as the second column.
    txburst_params_df = txburst_params_df.copy()
    gene_symbols = gene_info_df.symbol[txburst_params_df.gene].values
    txburst_params_df.insert(1, "symbol", gene_symbols)

    # Conditions flagged "keep" that nevertheless have no burst size estimate.
    kept_without_size = txburst_params_df.keep & txburst_params_df.bs_point.isna()
    no_size_subset = txburst_params_df.loc[kept_without_size]

    # Sort by whichever of the preferred columns this result set contains.
    sort_columns = [
        column
        for column in ("symbol", "replicate", "time_point", "treatment")
        if column in no_size_subset.columns
    ]
    print(f"  {len(no_size_subset):,} conditions")
    display(no_size_subset.sort_values(by=sort_columns))