In [1]:
# Load (or reload) IPython's autoreload extension and use mode 2, which
# re-imports modules before each cell execution so edits to imported .py
# files are picked up without restarting the kernel.
%reload_ext autoreload
%autoreload 2

In [None]:
import os
import sys

import numpy as np
import pandas as pd


def get_logger(level="INFO"):
    """Return the loguru logger configured with a single colorized stdout sink.

    Every call removes all currently registered loguru handlers (including
    the default one with id 0) and installs one handler writing to
    ``sys.stdout`` at ``level``. Because the handlers are always replaced,
    re-running the cell never duplicates output, and a later call with a
    different ``level`` actually takes effect.

    Parameters
    ----------
    level : str or int, default "INFO"
        Minimum severity passed to ``logger.add``.

    Returns
    -------
    The shared ``loguru.logger`` object.
    """
    from loguru import logger

    # remove() without an id drops every handler and does not raise when none
    # is registered, so the add below always runs.
    logger.remove()
    logger.add(sink=sys.stdout, level=level, colorize=True)

    return logger


def set_notebook_options():
    """Apply notebook-wide display defaults for matplotlib and pandas.

    Sets the matplotlib ``figure.dpi`` rcParam to 210, selects the "retina"
    inline figure format, and raises pandas' ``display.max_columns`` to 100.
    Returns nothing; only global library options are changed.
    """
    import matplotlib.pyplot as plt
    import pandas as pd
    from matplotlib_inline.backend_inline import set_matplotlib_formats

    # Widen pandas' column display limit.
    max_columns = 100
    pd.set_option("display.max_columns", max_columns)

    # Figure resolution first, then the inline backend's output format.
    figure_dpi = 210
    plt.rcParams["figure.dpi"] = figure_dpi
    set_matplotlib_formats("retina")


# Configure logging (default level "INFO") and the plotting/pandas display
# options once, up front, for the rest of the notebook.
logger = get_logger()
set_notebook_options()

In [None]:
# Download the SHARPR-MPRA archives (raw counts/sequences and the per-cell-line
# score files) into data/raw/.
# wget flags: -N enables timestamping (skip the download unless the remote
# file is newer than the local copy), -P sets the target directory.
!wget -N -P data/raw/ https://ernstlab.biolchem.ucla.edu/SHARPR/Scaleup_counts_sequences.zip

!wget -N -P data/raw/ https://ernstlab.biolchem.ucla.edu/SHARPR/HEPG2_SHARPR-MPRA_scores.zip
!wget -N -P data/raw/ https://ernstlab.biolchem.ucla.edu/SHARPR/K562_SHARPR-MPRA_scores.zip


# Extract the archives next to the downloads.
# unzip flags: -n never overwrites files that already exist, -d sets the
# extraction directory, so re-running this cell is harmless.
!unzip -n -d data/raw/ data/raw/Scaleup_counts_sequences.zip

!unzip -n -d data/raw/ data/raw/HEPG2_SHARPR-MPRA_scores.zip
!unzip -n -d data/raw/ data/raw/K562_SHARPR-MPRA_scores.zip

In [None]:
def _read_counts(counts_fn):
    """Read a tab-separated (tile_id, count) file into a Series indexed by tile_id.

    The first line of the file is skipped and the two columns are named
    explicitly.
    """
    return pd.read_csv(
        counts_fn,
        sep="\t",
        names=["tile_id", "count"],
        skiprows=1,
    ).set_index("tile_id")["count"]


def get_logratios(dna_fn, rna_fn, min_dna_count=20, pseudocount=1):
    """Compute per-tile log2(RNA/DNA) ratios and pivot them to region x tile.

    Both count files are read with ``_read_counts``. DNA counts below
    ``min_dna_count`` are replaced by NaN, so those tiles contribute neither
    to the DNA normalisation sum nor to the output. Each library is then
    converted to fractions, ``(count + pseudocount) / sum(count + pseudocount)``,
    and the ratio of RNA to DNA fractions is log2-transformed.

    Tile identifiers are split on "_": field 3 is parsed as the integer tile
    number, and fields 0, 1, 2, 4 and 5 are re-joined with "_" to form the
    region identifier.

    Parameters
    ----------
    dna_fn, rna_fn : path-like
        Tab-separated count files with one header line followed by
        ``tile_id<TAB>count`` rows.
    min_dna_count : int, default 20
        DNA counts strictly below this value are masked to NaN.
    pseudocount : int or float, default 1
        Added to every count before normalisation.

    Returns
    -------
    pandas.DataFrame
        Index ``region_id``, columns ``tile_id`` (int), values ``logratio``.

    Raises
    ------
    ValueError
        From ``DataFrame.pivot`` if a (region_id, tile_id) pair occurs twice.
    """
    dna = _read_counts(dna_fn)
    rna = _read_counts(rna_fn)

    # Series.where returns a new float Series with NaN where the condition
    # fails; assigning NaN into the integer Series in place relies on silent
    # upcasting, which pandas has deprecated.
    dna = dna.where(dna >= min_dna_count)

    rna_norm = (rna + pseudocount) / (rna + pseudocount).sum()
    dna_norm = (dna + pseudocount) / (dna + pseudocount).sum()
    ratios = np.log2(rna_norm / dna_norm).rename("logratio")

    # Split every identifier once and reuse the parts for both derived keys.
    id_parts = ratios.index.to_series().str.split("_")
    tile_id = id_parts.str[3].map(int).rename("tile_id")
    region_id = id_parts.map(
        lambda p: f"{p[0]}_{p[1]}_{p[2]}_{p[4]}_{p[5]}"
    ).rename("region_id")

    dff = pd.concat([region_id, tile_id, ratios], axis=1)
    return dff.pivot(index="region_id", columns="tile_id", values="logratio")

# HepG2 compute logratios


In [None]:
# Compute log-ratios: HepG2, minP, replicate 1.
# Both array designs are processed the same way and stacked row-wise.
raw_dir = "data/raw/Scaleup_counts_sequences"
out_dir = "data/processed/sharpr_20"

logratio_dfs = []
coord_dfs = []

for design in ("Design1", "Design2"):
    coord_fn = f"{raw_dir}/coords_ScaleUp{design}_hg19.txt"
    dna_fn = f"{raw_dir}/DNACOUNTS/ScaleUp{design}_minP_Plasmid.counts"
    rna_fn = f"{raw_dir}/HEPG2/HepG2_ScaleUp{design}_minP_mRNA_Rep1.counts"

    # The coordinate file has no header row; tag each row with its design.
    coord_df = pd.read_csv(
        coord_fn, sep="\t", names=["region_id", "chrom", "start", "end"]
    )
    coord_df["design"] = design
    coord_dfs.append(coord_df)

    logratio_dfs.append(get_logratios(dna_fn, rna_fn))

coord_df = pd.concat(coord_dfs).set_index("region_id")
logratio_df = pd.concat(logratio_dfs)

# DataFrame.to_csv does not create missing parent directories.
os.makedirs(out_dir, exist_ok=True)
coord_df.to_csv(f"{out_dir}/coords.tsv", sep="\t")
logratio_df.to_csv(f"{out_dir}/logratios.HepG2.minP.Rep1.tsv", sep="\t")

In [None]:
# Compute log-ratios: HepG2, minP, replicate 2.
# Both array designs are processed the same way and stacked row-wise.
raw_dir = "data/raw/Scaleup_counts_sequences"
out_dir = "data/processed/sharpr_20"

logratio_dfs = []
coord_dfs = []

for design in ("Design1", "Design2"):
    coord_fn = f"{raw_dir}/coords_ScaleUp{design}_hg19.txt"
    dna_fn = f"{raw_dir}/DNACOUNTS/ScaleUp{design}_minP_Plasmid.counts"
    rna_fn = f"{raw_dir}/HEPG2/HepG2_ScaleUp{design}_minP_mRNA_Rep2.counts"

    # The coordinate file has no header row; tag each row with its design.
    coord_df = pd.read_csv(
        coord_fn, sep="\t", names=["region_id", "chrom", "start", "end"]
    )
    coord_df["design"] = design
    coord_dfs.append(coord_df)

    logratio_dfs.append(get_logratios(dna_fn, rna_fn))

coord_df = pd.concat(coord_dfs).set_index("region_id")
logratio_df = pd.concat(logratio_dfs)

# DataFrame.to_csv does not create missing parent directories.
os.makedirs(out_dir, exist_ok=True)
coord_df.to_csv(f"{out_dir}/coords.tsv", sep="\t")
logratio_df.to_csv(f"{out_dir}/logratios.HepG2.minP.Rep2.tsv", sep="\t")

In [None]:
# Compute log-ratios: HepG2, SV40P, replicate 1.
# Both array designs are processed the same way and stacked row-wise.
raw_dir = "data/raw/Scaleup_counts_sequences"
out_dir = "data/processed/sharpr_20"

logratio_dfs = []
coord_dfs = []

for design in ("Design1", "Design2"):
    coord_fn = f"{raw_dir}/coords_ScaleUp{design}_hg19.txt"
    dna_fn = f"{raw_dir}/DNACOUNTS/ScaleUp{design}_SV40P_Plasmid.counts"
    rna_fn = f"{raw_dir}/HEPG2/HepG2_ScaleUp{design}_SV40P_mRNA_Rep1.counts"

    # The coordinate file has no header row; tag each row with its design.
    coord_df = pd.read_csv(
        coord_fn, sep="\t", names=["region_id", "chrom", "start", "end"]
    )
    coord_df["design"] = design
    coord_dfs.append(coord_df)

    logratio_dfs.append(get_logratios(dna_fn, rna_fn))

coord_df = pd.concat(coord_dfs).set_index("region_id")
logratio_df = pd.concat(logratio_dfs)

# DataFrame.to_csv does not create missing parent directories.
os.makedirs(out_dir, exist_ok=True)
coord_df.to_csv(f"{out_dir}/coords.tsv", sep="\t")
logratio_df.to_csv(f"{out_dir}/logratios.HepG2.SV40P.Rep1.tsv", sep="\t")

In [None]:
# Compute log-ratios: HepG2, SV40P, replicate 2.
# Both array designs are processed the same way and stacked row-wise.
raw_dir = "data/raw/Scaleup_counts_sequences"
out_dir = "data/processed/sharpr_20"

logratio_dfs = []
coord_dfs = []

for design in ("Design1", "Design2"):
    coord_fn = f"{raw_dir}/coords_ScaleUp{design}_hg19.txt"
    dna_fn = f"{raw_dir}/DNACOUNTS/ScaleUp{design}_SV40P_Plasmid.counts"
    rna_fn = f"{raw_dir}/HEPG2/HepG2_ScaleUp{design}_SV40P_mRNA_Rep2.counts"

    # The coordinate file has no header row; tag each row with its design.
    coord_df = pd.read_csv(
        coord_fn, sep="\t", names=["region_id", "chrom", "start", "end"]
    )
    coord_df["design"] = design
    coord_dfs.append(coord_df)

    logratio_dfs.append(get_logratios(dna_fn, rna_fn))

coord_df = pd.concat(coord_dfs).set_index("region_id")
logratio_df = pd.concat(logratio_dfs)

# DataFrame.to_csv does not create missing parent directories.
os.makedirs(out_dir, exist_ok=True)
coord_df.to_csv(f"{out_dir}/coords.tsv", sep="\t")
logratio_df.to_csv(f"{out_dir}/logratios.HepG2.SV40P.Rep2.tsv", sep="\t")

# K562 compute logratios


In [None]:
# Compute log-ratios: K562, minP, replicate 1.
# Both array designs are processed the same way and stacked row-wise.
raw_dir = "data/raw/Scaleup_counts_sequences"
out_dir = "data/processed/sharpr_20"

logratio_dfs = []
coord_dfs = []

for design in ("Design1", "Design2"):
    coord_fn = f"{raw_dir}/coords_ScaleUp{design}_hg19.txt"
    dna_fn = f"{raw_dir}/DNACOUNTS/ScaleUp{design}_minP_Plasmid.counts"
    rna_fn = f"{raw_dir}/K562/K562_ScaleUp{design}_minP_mRNA_Rep1.counts"

    # The coordinate file has no header row; tag each row with its design.
    coord_df = pd.read_csv(
        coord_fn, sep="\t", names=["region_id", "chrom", "start", "end"]
    )
    coord_df["design"] = design
    coord_dfs.append(coord_df)

    logratio_dfs.append(get_logratios(dna_fn, rna_fn))

coord_df = pd.concat(coord_dfs).set_index("region_id")
logratio_df = pd.concat(logratio_dfs)

# DataFrame.to_csv does not create missing parent directories.
os.makedirs(out_dir, exist_ok=True)
coord_df.to_csv(f"{out_dir}/coords.tsv", sep="\t")
logratio_df.to_csv(f"{out_dir}/logratios.K562.minP.Rep1.tsv", sep="\t")

In [None]:
# Compute log-ratios: K562, minP, replicate 2.
# Both array designs are processed the same way and stacked row-wise.
raw_dir = "data/raw/Scaleup_counts_sequences"
out_dir = "data/processed/sharpr_20"

logratio_dfs = []
coord_dfs = []

for design in ("Design1", "Design2"):
    coord_fn = f"{raw_dir}/coords_ScaleUp{design}_hg19.txt"
    dna_fn = f"{raw_dir}/DNACOUNTS/ScaleUp{design}_minP_Plasmid.counts"
    rna_fn = f"{raw_dir}/K562/K562_ScaleUp{design}_minP_mRNA_Rep2.counts"

    # The coordinate file has no header row; tag each row with its design.
    coord_df = pd.read_csv(
        coord_fn, sep="\t", names=["region_id", "chrom", "start", "end"]
    )
    coord_df["design"] = design
    coord_dfs.append(coord_df)

    logratio_dfs.append(get_logratios(dna_fn, rna_fn))

coord_df = pd.concat(coord_dfs).set_index("region_id")
logratio_df = pd.concat(logratio_dfs)

# DataFrame.to_csv does not create missing parent directories.
os.makedirs(out_dir, exist_ok=True)
coord_df.to_csv(f"{out_dir}/coords.tsv", sep="\t")
logratio_df.to_csv(f"{out_dir}/logratios.K562.minP.Rep2.tsv", sep="\t")

In [None]:
# Compute log-ratios: K562, SV40P, replicate 1.
# Both array designs are processed the same way and stacked row-wise.
raw_dir = "data/raw/Scaleup_counts_sequences"
out_dir = "data/processed/sharpr_20"

logratio_dfs = []
coord_dfs = []

for design in ("Design1", "Design2"):
    coord_fn = f"{raw_dir}/coords_ScaleUp{design}_hg19.txt"
    dna_fn = f"{raw_dir}/DNACOUNTS/ScaleUp{design}_SV40P_Plasmid.counts"
    rna_fn = f"{raw_dir}/K562/K562_ScaleUp{design}_SV40P_mRNA_Rep1.counts"

    # The coordinate file has no header row; tag each row with its design.
    coord_df = pd.read_csv(
        coord_fn, sep="\t", names=["region_id", "chrom", "start", "end"]
    )
    coord_df["design"] = design
    coord_dfs.append(coord_df)

    logratio_dfs.append(get_logratios(dna_fn, rna_fn))

coord_df = pd.concat(coord_dfs).set_index("region_id")
logratio_df = pd.concat(logratio_dfs)

# DataFrame.to_csv does not create missing parent directories.
os.makedirs(out_dir, exist_ok=True)
coord_df.to_csv(f"{out_dir}/coords.tsv", sep="\t")
logratio_df.to_csv(f"{out_dir}/logratios.K562.SV40P.Rep1.tsv", sep="\t")

In [None]:
# Compute log-ratios: K562, SV40P, replicate 2.
# Both array designs are processed the same way and stacked row-wise.
raw_dir = "data/raw/Scaleup_counts_sequences"
out_dir = "data/processed/sharpr_20"

logratio_dfs = []
coord_dfs = []

for design in ("Design1", "Design2"):
    coord_fn = f"{raw_dir}/coords_ScaleUp{design}_hg19.txt"
    dna_fn = f"{raw_dir}/DNACOUNTS/ScaleUp{design}_SV40P_Plasmid.counts"
    rna_fn = f"{raw_dir}/K562/K562_ScaleUp{design}_SV40P_mRNA_Rep2.counts"

    # The coordinate file has no header row; tag each row with its design.
    coord_df = pd.read_csv(
        coord_fn, sep="\t", names=["region_id", "chrom", "start", "end"]
    )
    coord_df["design"] = design
    coord_dfs.append(coord_df)

    logratio_dfs.append(get_logratios(dna_fn, rna_fn))

coord_df = pd.concat(coord_dfs).set_index("region_id")
logratio_df = pd.concat(logratio_dfs)

# DataFrame.to_csv does not create missing parent directories.
os.makedirs(out_dir, exist_ok=True)
coord_df.to_csv(f"{out_dir}/coords.tsv", sep="\t")
logratio_df.to_csv(f"{out_dir}/logratios.K562.SV40P.Rep2.tsv", sep="\t")