In [1]:
import os
import pybedtools
import pandas as pd
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from scipy.stats import f_oneway, fisher_exact
from scipy.stats.contingency import odds_ratio



In [2]:
def get_roi_depth(filtered_bam, roi_sorted_bed, bed_out):
    """Run ``bedtools coverage`` for a set of regions against a BAM file.

    The regions in ``roi_sorted_bed`` are sorted with pybedtools, coverage of
    ``filtered_bam`` over them is computed, and the resulting file is moved
    to ``bed_out``.

    Parameters
    ----------
    filtered_bam : str
        Path to the alignment file passed to ``BedTool.coverage``.
    roi_sorted_bed : str
        Path to the BED file with the regions of interest.
    bed_out : str
        Destination path for the coverage output. Its parent directory is
        created if it does not exist.

    Returns
    -------
    None
        The result is only written to ``bed_out``.
    """
    # Function-scope import keeps this cell self-contained.
    import warnings

    pybedtools.helpers.set_tempdir("/data7/deepro/tmp/")
    try:
        roi = pybedtools.BedTool(roi_sorted_bed).sort()
        try:
            # Sorted mode first; it fails if bedtools rejects the inputs
            # (e.g. the two files disagree on chromosome order).
            c = roi.coverage(filtered_bam, sorted=True)
        except Exception as e:
            # Deliberate best-effort fallback to the unsorted algorithm, but
            # surface the reason instead of discarding it.
            warnings.warn(
                f"sorted coverage failed for {filtered_bam} ({e!r}); "
                "retrying without sorted=True"
            )
            c = roi.coverage(filtered_bam)
        # dirname is "" for a bare filename, and os.makedirs("") raises.
        out_dir = os.path.dirname(bed_out)
        if out_dir:
            os.makedirs(out_dir, exist_ok=True)
        c.moveto(bed_out)
    finally:
        # Always clean up, even when sorting/coverage/move fails.
        # NOTE(review): remove_all=True asks pybedtools to delete every
        # pybedtools temp file in the temp dir, not only this session's --
        # confirm no other job shares /data7/deepro/tmp/.
        pybedtools.helpers.cleanup(verbose=False, remove_all=True)
    return


def convert_pred_to_bed(filename, savefile):
    """Load a headerless prediction CSV and export its regions as a BED file.

    Parameters
    ----------
    filename : str
        Path to a comma-separated file without a header row; its columns are
        read as ``ypred, ytarget, chrm, start, end``.
    savefile : str
        Path of the tab-separated file to write. It contains only the
        ``chrm``, ``start`` and ``end`` columns, with no header and no index.

    Returns
    -------
    pandas.DataFrame
        The full prediction table (all five columns).
    """
    column_names = ["ypred", "ytarget", "chrm", "start", "end"]
    predictions = pd.read_csv(filename, header=None, names=column_names)
    regions = predictions[["chrm", "start", "end"]]
    regions.to_csv(savefile, sep="\t", header=False, index=False)
    return predictions

In [3]:
root_dir = "/data7/deepro/starrseq/4_ml_classification_fragment_category/data"
libs = ["CC", "ATF2", "CTCF", "FOXA1", "LEF1", "SCRT1", "TCF7L2", "16P12_1"]

# Analysis settings.
PRED_COLUMNS = ["ypred", "ytarget", "chrm", "start", "end"]  # layout of the headerless prediction CSVs
REGION_KEYS = ["chrm", "start", "end"]
N_COV_BINS = 10        # coverage is split into this many quantile bins
CONF_THRESHOLD = 0.9   # ypred above this value is counted as high confidence


def _attach_coverage_decile(cov_df, pred_df):
    """Join coverage onto predictions by region and add a ``cov_decile`` column."""
    merged = cov_df.merge(pred_df, on=REGION_KEYS)
    # labels=False yields integer bin indices 0 .. N_COV_BINS - 1.
    # pd.qcut raises ValueError when ties in coverage make bin edges non-unique.
    merged["cov_decile"] = pd.qcut(merged.coverage, q=N_COV_BINS, labels=False)
    return merged


def _count_top_decile_by_confidence(pred_df):
    """Return (n_high_conf, n_low_conf) among regions in the top coverage bin."""
    top = pred_df.loc[pred_df.cov_decile == N_COV_BINS - 1]
    n_high = int((top.ypred > CONF_THRESHOLD).sum())
    # "<=" so that ypred == CONF_THRESHOLD lands in the low-confidence cell
    # rather than in neither cell of the contingency table.
    n_low = int((top.ypred <= CONF_THRESHOLD).sum())
    return n_high, n_low


data_dict = {
    "lib": [], "hcv_hcf_resnet": [], "hcv_lcf_resnet":[], "hcv_hcf_linear":[], "hcv_lcf_linear": [], 
    "fe_test": [], "or": [], "ci_low": [], "ci_high": []
    }
for lib in libs:
    resnet_file = os.path.join(root_dir, lib, "resnet_mlp.csv.gz")
    roi_file = os.path.join(root_dir, lib, "resnet_mlp.bed")
    bam_file = f"/data5/deepro/starrseq/papers/reproducibility/2_dedup_align_filter/data/in_house/filtered/{lib}/{lib}.bam"
    bed_out = os.path.join(root_dir, lib, "resnet_mlp_w_coverage.bed")
    linear_file = os.path.join(root_dir, lib, "linear", "homer_linear.csv.gz")
    # get coverage of ROI
    resnet_df = convert_pred_to_bed(resnet_file, roi_file)
    get_roi_depth(bam_file, roi_file, bed_out)
    cov_df = pd.read_csv(
        bed_out,
        header=None,
        names=REGION_KEYS + ["coverage"],
        usecols=[0,1,2,3],
        sep="\t"
    )
    # One coverage row per region: a region repeated in the ROI file would
    # otherwise multiply rows in the merges below.
    cov_df = cov_df.drop_duplicates(subset=REGION_KEYS)
    # add coverage to resnet predictions
    resnet_df = _attach_coverage_decile(cov_df, resnet_df)
    linear_df = pd.read_csv(linear_file, header=None, names=PRED_COLUMNS)
    # add coverage to linear predictions
    linear_df = _attach_coverage_decile(cov_df, linear_df)
    high_conf_high_cov_resnet, low_conf_high_cov_resnet = _count_top_decile_by_confidence(resnet_df)
    high_conf_high_cov_linear, low_conf_high_cov_linear = _count_top_decile_by_confidence(linear_df)
    # 2x2 table: rows are models (resnet, linear), columns are
    # high / low confidence counts within the top coverage bin.
    contingency_table = [
        [high_conf_high_cov_resnet, low_conf_high_cov_resnet],
        [high_conf_high_cov_linear, low_conf_high_cov_linear]
    ]
    res_fisher = fisher_exact(contingency_table)
    res_odds = odds_ratio(contingency_table)
    res_ci = res_odds.confidence_interval(confidence_level=0.95)
    data_dict["lib"].append(lib)
    data_dict["hcv_hcf_resnet"].append(high_conf_high_cov_resnet)
    data_dict["hcv_lcf_resnet"].append(low_conf_high_cov_resnet)
    data_dict["hcv_hcf_linear"].append(high_conf_high_cov_linear)
    data_dict["hcv_lcf_linear"].append(low_conf_high_cov_linear)
    data_dict["fe_test"].append(res_fisher.pvalue)
    data_dict["or"].append(res_odds.statistic)
    data_dict["ci_low"].append(res_ci.low)
    data_dict["ci_high"].append(res_ci.high)
    # all region save file
    save_file = os.path.join(root_dir, lib, "cov_conf_compare.csv.gz")
    all_df = resnet_df.merge(linear_df, on=["chrm", "start", "end", "coverage", "ytarget", "cov_decile"], suffixes=("_resnet", "_linear"))
    all_df.to_csv(save_file, index=False)

all_stat_save_file = os.path.join(root_dir, "tables", "cov_conf_compare.csv")
# Create the output directory first so the summary is not lost after the
# whole loop has already run.
os.makedirs(os.path.dirname(all_stat_save_file), exist_ok=True)
df = pd.DataFrame(data_dict)
df.to_csv(all_stat_save_file, index=False)