In [None]:
import itertools

import matplotlib.pyplot as plt
import numpy as np

import rp2
from rp2 import hagai_2018, create_folder
from rp2.paths import get_output_path

# Run rp2's environment check up front, before any data is loaded
# (what exactly it verifies is defined inside rp2 and not visible here).
rp2.check_environment()

## Extract UMI counts from Hagai *et al.* (2018) dataset

Load full UMI counts in preparation to extract subsets.

In [None]:
# Species for which UMI count data is loaded in this notebook.
available_species = ["mouse"]

# Map each species name to its annotated UMI count object, loading each once.
umi_ad_map = dict(zip(
    available_species,
    map(hagai_2018.load_umi_counts_with_additional_annotation, available_species),
))

To extract UMI counts for specific genes and conditions:
1. Ensure all previous cells have been executed
2. Edit the values of extract_conditions (and show_histograms) accordingly in the cell below
3. Run the cell below

Output files will be created in *Output/Misc/UmiCounts*.

Note that it is possible to combine conditions (but not genes) by creating a list of lists. For example
```python
replicate = [["1", "2"], "3"]
```
will create results with replicates 1 and 2 combined and replicate 3 as a separate condition.

In [None]:
output_path = get_output_path("Misc", "UmiCounts")
create_folder(output_path)

# Conditions to extract. One output file is written per combination of the
# listed values; a nested list (e.g. [["unst", "lps"]]) pools its members
# into a single condition.
extract_conditions = dict(
    species=["mouse"],
    gene=["tnf"],
    replicate=["1", "2", "3"],
    treatment=[["unst", "lps"]],
    time_point=["0", "2", "4", "6"],
)

show_histograms = True

for values in itertools.product(*extract_conditions.values()):
    extract_dict = dict(zip(extract_conditions.keys(), values))

    # Species and gene select the dataset and column; everything left in
    # extract_dict afterwards is treated as an observation-level filter.
    species = extract_dict.pop("species")
    gene_symbol = extract_dict.pop("gene").lower()

    umi_ad = umi_ad_map[species]

    # Case-insensitive match on gene symbol; exactly one gene must remain.
    umi_ad = umi_ad[:, umi_ad.var.symbol.str.lower() == gene_symbol]
    if umi_ad.n_vars != 1:
        if umi_ad.n_vars == 0:
            print(f"  No genes with symbol '{gene_symbol}'")
        else:
            print(f"  Multiple genes with symbol '{gene_symbol}':")
            # List the identifiers of the ambiguous genes (the index of the
            # var table), not the data object itself.
            for name in umi_ad.var.index:
                print(f"  {name}")
        continue

    file_prefix = f"species={species}-gene={gene_symbol}"

    for k, v in extract_dict.items():
        # A bare value is a single condition; a list pools several values.
        if not isinstance(v, list):
            v = [v]
        umi_ad = umi_ad[umi_ad.obs[k].isin(v), :]

        file_prefix += f"-{k}={'+'.join(v)}"

    filename = file_prefix + ".txt"
    print(filename)
    print(f"  {len(umi_ad):,} samples")

    # Flatten the single-gene column to 1-D. reshape(-1) keeps a one-sample
    # result 1-D (squeeze() would collapse it to 0-D, which np.savetxt
    # rejects). Builtin int replaces the np.int alias removed in NumPy 1.24.
    counts = umi_ad.X.A.reshape(-1).astype(int)
    np.savetxt(output_path.joinpath(filename), counts, fmt="%d")

    if show_histograms:
        plt.hist(counts, log=True)
        plt.xlabel("UMI count")
        plt.ylabel("Frequency")
        plt.show()

## UMI count to transcript number

From Grün *et al.* (2014):

**Conversion of UMI count to transcript number.**

For each gene $i$, $k_{o,i}$ denotes the number of observed UMIs and $k_{n,i}$ the number of non-observed UMIs. The total number $K$ of UMIs is given by

$K=k_{o,i}+k_{n,i}$

and the number of sequenced transcripts $m_i$ is then

$m_i=\frac{\ln{\left(1-\frac{k_{o,i}}{K}\right)}}{\ln{\left(1-\frac{1}{K}\right)}}\cong-K\ln{\left(1-\frac{k_{o,i}}{K}\right)}$

Define a function implementing this formula:

In [None]:
def umi_to_transcript_count(ko, K, approx=False):
    """Convert observed UMI counts to estimated transcript numbers.

    Implements m = ln(1 - ko/K) / ln(1 - 1/K), or the approximation
    m ~= -K * ln(1 - ko/K) when ``approx`` is true.

    Parameters
    ----------
    ko : scalar or array_like
        Observed UMI count(s); converted to a float array.
    K : number
        Total number of distinct UMIs available.
    approx : bool, optional
        If true, return the approximate form instead of the exact ratio.

    Returns
    -------
    numpy.ndarray or numpy scalar
        Transcript number(s) as floats, with the same shape as ``ko``.
        Values are not rounded; ``ko >= K`` yields inf/nan via ``np.log``.
    """
    # Builtin float replaces the np.float alias, which was removed in NumPy 1.24.
    ko = np.asarray(ko, dtype=float)
    num = np.log(1 - (ko / K))
    if approx:
        return -K * num
    den = np.log(1 - (1 / K))
    return num / den

Plot the mapping from UMI count to transcript number:

In [None]:
tenx_umi_bases = 10
tenx_umi_max = 4**tenx_umi_bases  # 4 possibilities per base position
print(f"{tenx_umi_bases} bases in 10X experiments permits up to {tenx_umi_max:,} UMIs")

# Evaluate the transform for every UMI count below the maximum; the result is
# truncated to whole transcripts. Builtin int replaces the np.int alias that
# was removed in NumPy 1.24.
umi_counts = np.arange(tenx_umi_max)
transcript_numbers = umi_to_transcript_count(umi_counts, tenx_umi_max).astype(int)

plt.plot(umi_counts, transcript_numbers)
plt.xlabel("UMI count")
plt.ylabel("Transcript number")
plt.show()

In [None]:
# Largest index at which the UMI count and its transcript number coincide.
is_identity = umi_counts == transcript_numbers
identical_until = np.argwhere(is_identity).max()
print(f"The UMI-transcript transform is an identity mapping for UMI counts up to {identical_until:,}")

# Zoom in on the region around the point where the two curves part ways,
# marking that point on both axes.
zoom_limit = identical_until * 4

fig, ax = plt.subplots()
ax.plot(umi_counts, transcript_numbers)
ax.set_xlabel("UMI count")
ax.set_ylabel("Transcript number")
ax.axvline(x=identical_until, ls="--")
ax.axhline(y=identical_until, ls=":")
ax.set_xlim(0, zoom_limit)
ax.set_ylim(0, zoom_limit)
plt.show()

In [None]:
umi_count_ad = umi_ad_map["mouse"]
# Per-gene maximum UMI count (max over axis 0), stored as a column of the var
# table. Builtin int replaces the np.int alias removed in NumPy 1.24.
umi_count_ad.var["max"] = umi_count_ad.X.max(axis=0).A.squeeze().astype(int)

# Genes whose maximum count reaches the end of the identity range, largest first.
reaches_limit = umi_count_ad.var["max"] >= identical_until
high_count_var = umi_count_ad.var.loc[reaches_limit].sort_values(by="max", ascending=False)
print(f"{len(high_count_var):,} genes out of {umi_count_ad.n_vars:,} have UMI count >= {identical_until:,}")

for i, row in enumerate(high_count_var.itertuples(), start=1):
    # transcript_numbers is indexed by UMI count, so the max count looks up
    # its transcript number directly.
    print(f"  {i}. {row.symbol}: max UMI of {row.max:,} maps to {transcript_numbers[row.max]:,} transcripts")