In [None]:
import matplotlib.pyplot as plt
import numpy as np

from rp2 import hagai_2018
from rp2.environment import check_environment

# Run the project's environment check (imported from rp2.environment) before any analysis.
check_environment()

## UMI count to transcript number

From Grün *et al.* (2014):

**Conversion of UMI count to transcript number.**

For each gene $i$, $k_{o,i}$ denotes the number of observed UMIs and $k_{n,i}$ the number of non-observed UMIs. The total number $K$ of UMIs is given by

$K=k_{o,i}+k_{n,i}$

and the number of sequenced transcripts $m_i$ is estimated as

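$m_i=\frac{\ln{\left(1-\frac{k_{o,i}}{K}\right)}}{\ln{\left(1-\frac{1}{K}\right)}}\cong-K\ln{\left(1-\frac{k_{o,i}}{K}\right)}$

The approximation follows from $\ln{\left(1-\frac{1}{K}\right)}\approx-\frac{1}{K}$, which holds when $K$ is large.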

Define a function implementing this formula:

In [None]:
def umi_to_transcript_count(ko, K, approx=False):
    """Convert observed UMI counts into estimated transcript numbers.

    Evaluates ``m = ln(1 - ko/K) / ln(1 - 1/K)`` or, when ``approx`` is
    true, the large-``K`` approximation ``m ~= -K * ln(1 - ko/K)``.

    Parameters
    ----------
    ko : array_like
        Observed UMI count(s). Converted to a float array before use.
    K : int or float
        Total number of distinct UMIs.
    approx : bool, optional
        If true, return the approximation instead of the exact ratio.

    Returns
    -------
    numpy.ndarray or numpy.float64
        Estimated transcript number(s), with the same shape as ``ko``.
        ``ko == K`` yields ``inf`` and ``ko > K`` yields ``nan``, because the
        logarithm's argument becomes zero or negative.
    """
    # The builtin ``float`` replaces the ``np.float`` alias, which was
    # deprecated in NumPy 1.20 and removed in NumPy 1.24.
    ko = np.asarray(ko, dtype=float)
    # log1p(-x) equals log(1 - x) but keeps precision when x is tiny; the
    # plain form collapses to log(1.0) == 0 once x drops below float epsilon.
    num = np.log1p(-(ko / K))
    if approx:
        return -K * num
    den = np.log1p(-1 / K)
    return num / den

Plot the mapping from UMI count to transcript number:

In [None]:
tenx_umi_bases = 10
# Four possible values per base position gives 4**bases distinct UMIs.
tenx_umi_max = 4**tenx_umi_bases
print(f"{tenx_umi_bases} bases in 10X experiments permits up to {tenx_umi_max:,} UMIs")

# Evaluate the transform for every UMI count from 0 to tenx_umi_max - 1.
umi_counts = np.arange(tenx_umi_max)
# The builtin ``int`` replaces the ``np.int`` alias (removed in NumPy 1.24).
# The cast truncates the fractional part of each estimate.
transcript_numbers = umi_to_transcript_count(umi_counts, tenx_umi_max).astype(int)

plt.plot(umi_counts, transcript_numbers)
plt.xlabel("UMI count")
plt.ylabel("Transcript number")
plt.show()

In [None]:
# Largest UMI count whose truncated transcript number equals the count itself.
is_identity = umi_counts == transcript_numbers
identical_until = np.flatnonzero(is_identity).max()
print(f"The UMI-transcript transform is an identity mapping for UMI counts up to {identical_until:,}")

# Zoom both axes to four times the identity range so the divergence is visible.
axis_limit = identical_until * 4

plt.plot(umi_counts, transcript_numbers)
plt.xlabel("UMI count")
plt.ylabel("Transcript number")
# Mark the end of the identity range on both axes.
plt.axvline(identical_until, linestyle="--")
plt.axhline(identical_until, linestyle=":")
plt.xlim(0, axis_limit)
plt.ylim(0, axis_limit)
plt.show()

In [None]:
umi_count_ad = hagai_2018.load_umi_counts_with_additional_annotation("mouse")
# Per-gene maximum UMI count (maximum over axis 0 of X), stored as a "max" column on .var.
# The builtin ``int`` replaces the ``np.int`` alias (removed in NumPy 1.24).
# NOTE(review): ``.A`` assumes X.max(axis=0) returns a SciPy sparse matrix or np.matrix — confirm.
umi_count_ad.var["max"] = umi_count_ad.X.max(axis=0).A.squeeze().astype(int)

# Genes whose maximum count reaches the end of the identity range, largest first.
high_count_var = umi_count_ad.var.loc[umi_count_ad.var["max"] >= identical_until].sort_values(by="max", ascending=False)
print(f"{len(high_count_var):,} genes out of {umi_count_ad.n_vars:,} have UMI count >= {identical_until:,}")

# transcript_numbers is indexed by UMI count, so the lookup gives the mapped transcript number.
for i, row in enumerate(high_count_var.itertuples(), start=1):
    print(f"  {i}. {row.symbol}: max UMI of {row.max:,} maps to {transcript_numbers[row.max]:,} transcripts")