In [None]:
import re
import urllib
import urllib.parse
from zipfile import ZipFile

import anndata
import numpy as np
import scipy
import scipy.sparse

from rp2 import fetch_file, get_data_path, get_scripts_path

Download supplementary data for Hagai *et al.* (2018)

In [None]:
# Supplementary spreadsheet for the paper, hosted on Springer's static content server.
hagai_supplementary_url = "https://static-content.springer.com/esm/art%3A10.1038%2Fs41586-018-0657-2/MediaObjects/41586_2018_657_MOESM4_ESM.xlsx"

hagai_path = get_data_path("hagai_2018")

# Trailing semicolon keeps the call's return value out of the cell output.
fetch_file(hagai_supplementary_url, hagai_path);

Download Hagai *et al.* (2018) datasets from ArrayExpress

In [None]:
# Processed-data archive for ArrayExpress accession E-MTAB-6754.
ae_archive_url = "https://www.ebi.ac.uk/arrayexpress/files/E-MTAB-6754/E-MTAB-6754.processed.2.zip"

# ae_path is reused by later cells (zip extraction and UMI collation).
ae_path = get_data_path("ArrayExpress")

# Trailing semicolon keeps the call's return value out of the cell output.
fetch_file(ae_archive_url, ae_path);

Extract zipped ArrayExpress datasets

In [None]:
# Unpack each "*.zip" found directly in the ArrayExpress directory into a
# folder named after the archive (file name minus its ".zip" suffix).
for zip_path in ae_path.glob("*.zip"):
    extract_path = ae_path / zip_path.stem

    # An existing target folder is taken to mean the archive was already
    # extracted, so it is left untouched.
    if not extract_path.exists():
        print("Extracting:", zip_path)

        with ZipFile(zip_path, "r") as archive:
            archive.extractall(extract_path)

Download txburst scripts for Larsson *et al.* (2019) burst modelling

In [None]:
txburst_path = get_scripts_path("txburst")

# Scripts fetched from the master branch of the sandberg-lab/txburst repository.
txburst_filenames = ("txburstML.py", "txburstPL.py", "txburstTEST.py")

for filename in txburst_filenames:
    fetch_file(
        f"https://raw.githubusercontent.com/sandberg-lab/txburst/master/{filename}",
        txburst_path,
    )

Collate Hagai *et al.* (2018) mouse UMI counts into a single AnnData object (saved as an `.h5ad` file)

In [None]:
def extract_species_and_replicate(token):
    """Split a token such as ``"mouse1"`` into species and replicate.

    The last character of ``token`` is taken as the replicate and everything
    before it as the species.

    Returns:
        A ``(species, replicate)`` tuple.

    Raises:
        IndexError: if ``token`` is empty.
    """
    species = token[:-1]
    replicate = token[-1]
    return species, replicate


def extract_treatment_and_time_point(token):
    """Split a token such as ``"lps4"`` into treatment and time point.

    The leading run of lowercase letters is the treatment; whatever follows
    (optional digits, then optional non-digit characters) is the time point.
    A token with nothing after the letters gets the time point ``"0"``.

    Returns:
        A ``(treatment, time_point)`` tuple of strings.

    Raises:
        AttributeError: if ``token`` does not match the expected pattern
            (``re.match`` returns ``None``).
    """
    matches = re.match(r"^([a-z]+)(\d*[^\d]*)$", token)
    treatment, time_point = matches.groups()
    if not time_point:
        time_point = "0"
    return treatment, time_point


def load_umi_count_adata(file_path):
    """Load one UMI count file into an AnnData object annotated from its file name.

    The file name is expected to start with
    ``<species><replicate>_<treatment><time point>_`` (for example
    ``mouse1_lps4_...``); those two leading tokens are parsed into the
    ``species``, ``replicate``, ``treatment`` and ``time_point`` columns of
    ``obs``.

    Args:
        file_path: ``pathlib.Path``-like object pointing at a space-delimited
            count table readable by ``anndata.read_csv``.

    Returns:
        The transposed count table as an AnnData object whose ``X`` is a
        sparse CSR matrix of 64-bit integers, with the metadata columns above
        plus ``barcode`` (a copy of the original ``obs`` index).

    Raises:
        ValueError: if the file name has fewer than two ``_``-separated tokens.
    """
    species_and_replicate, treatment_and_time_point = file_path.name.split("_")[:2]
    species, replicate = extract_species_and_replicate(species_and_replicate)
    treatment, time_point = extract_treatment_and_time_point(treatment_and_time_point)

    # The table is transposed after reading; presumably the files are stored
    # genes x cells so that cells end up as observations -- confirm against the data.
    umi_ad = anndata.read_csv(file_path, delimiter=" ").T
    # np.int was removed in NumPy 1.24; np.int64 is the explicit equivalent on
    # 64-bit Linux/macOS and keeps the stored dtype the same on every platform.
    umi_ad.X = scipy.sparse.csr_matrix(umi_ad.X, dtype=np.int64)

    umi_ad.obs["species"] = species
    umi_ad.obs["replicate"] = replicate
    umi_ad.obs["treatment"] = treatment
    umi_ad.obs["time_point"] = time_point
    # Keep the original index values, since callers may replace the index.
    umi_ad.obs["barcode"] = umi_ad.obs.index

    return umi_ad


def collate_umi_counts(csv_file_paths):
    """Load several UMI count files and concatenate them into one AnnData object.

    Each file is loaded with ``load_umi_count_adata`` and its observations are
    re-indexed with consecutive integers (as strings) that continue across
    files, so the combined index is unique without any suffixing.

    Args:
        csv_file_paths: iterable of paths to count files (a generator such as
            the result of ``Path.glob`` is accepted). Observations are
            numbered in iteration order.

    Returns:
        The outer-joined concatenation of all loaded files, without the
        ``batch`` column that ``AnnData.concatenate`` adds.

    Raises:
        ValueError: if ``csv_file_paths`` yields no files.
    """
    total_obs = 0
    all_adata = []

    for file_path in csv_file_paths:
        print("Loading", file_path.name)

        csv_adata = load_umi_count_adata(file_path)
        # Running counter keeps observation names unique across files.
        csv_adata.obs.index = [str(i) for i in range(total_obs, total_obs + csv_adata.n_obs)]

        all_adata.append(csv_adata)

        total_obs += csv_adata.n_obs

    # Fail with an explicit message instead of an IndexError on all_adata[0],
    # e.g. when a glob pattern matched nothing.
    if not all_adata:
        raise ValueError("No UMI count files were provided to collate")

    print("Collating")

    # index_unique=None leaves the integer index assigned above untouched.
    adata = all_adata[0].concatenate(all_adata[1:], join="outer", index_unique=None)
    adata.obs.drop(columns=["batch"], inplace=True)
    return adata

In [None]:
species_of_interest = ["mouse"]

# Directory created by extracting the ArrayExpress archive.
umi_files_path = ae_path.joinpath("E-MTAB-6754.processed.2")

for species in species_of_interest:
    # The collated file is written next to the extracted directory, named
    # "<directory name>.<species>.h5ad".
    umi_file_path = umi_files_path.parent.joinpath(umi_files_path.name + f".{species}.h5ad")

    # An existing output file acts as a cache: collation is skipped on re-runs.
    if not umi_file_path.exists():
        # Path.glob yields files in a filesystem-dependent order, and
        # collate_umi_counts numbers observations in the order it receives
        # them; sorting makes the collated file reproducible across machines.
        csv_paths = sorted(umi_files_path.glob(f"{species}*.txt.gz"))
        adata = collate_umi_counts(csv_paths)
        adata.write_h5ad(umi_file_path)

Download a list of mouse genes (Ensembl ID, symbol and description) from BioMart

In [None]:
biomart_path = get_data_path("BioMart")
mouse_genes_tsv_path = biomart_path.joinpath("mouse_genes.tsv")

# Only query BioMart when the gene list has not been downloaded yet.
if not mouse_genes_tsv_path.exists():
    # BioMart XML query: header-less TSV with Ensembl gene ID, gene symbol and
    # description for the mouse gene dataset. Adjacent string literals are
    # joined by the parser, so the query is a single line with no whitespace
    # between elements.
    biomart_query = (
        '<?xml version="1.0" encoding="UTF-8"?>'
        '<!DOCTYPE Query>'
        '<Query virtualSchemaName="default" formatter="TSV" header="0" uniqueRows="1" count="" datasetConfigVersion="0.6">'
        '<Dataset name="mmusculus_gene_ensembl" interface="default">'
        '<Attribute name="ensembl_gene_id" />'
        '<Attribute name="external_gene_name" />'
        '<Attribute name="description" />'
        '</Dataset>'
        '</Query>'
    )
    # The request goes to the March 2016 Ensembl archive host rather than the
    # current release. urllib.parse must be imported explicitly; a bare
    # "import urllib" does not load the submodule.
    biomart_url = "http://mar2016.archive.ensembl.org/biomart/martservice?query=" + urllib.parse.quote(biomart_query)
    fetch_file(biomart_url, mouse_genes_tsv_path.parent, rename_to=mouse_genes_tsv_path.name)