In [1]:
import os
import numpy as np
import pandas as pd

In [2]:
meta_activity_map_file = "/data5/deepro/starrseq/papers/results/1_categorize_fragments_on_activity/data/meta_activity_map.csv"
abc_enhancer_to_target_file = "/data5/deepro/starrseq/papers/results/5_link_da_enhancers_to_de_genes/data/predictions/activity_from_starr_with_hic/EnhancerPredictionsFull.txt"
nearest_gene_file = "/data5/deepro/starrseq/papers/results/5_link_da_enhancers_to_de_genes/data/predictions/nearest_genes/closest.bed"
gtf_file = "/data5/deepro/starrseq/papers/results/5_link_da_enhancers_to_de_genes/data/genome_annot/parsed_gtf.tsv"
gene_de_dir = "/data5/deepro/starrseq/papers/results/4_compare_expression_ko_vs_wt/data/results/de"
link_save_dir = "/data5/deepro/starrseq/papers/results/5_link_da_enhancers_to_de_genes/data/da_enhancers_to_de_genes_links"

libraries = ["CC", "ATF2", "CTCF", "FOXA1", "LEF1", "SCRT1", "TCF7L2", "16P12_1"]

In [3]:
meta_activity_map_df = pd.read_csv(meta_activity_map_file)
abc_enhancer_to_target_df = pd.read_csv(abc_enhancer_to_target_file, sep="\t")
nearest_gene_df = pd.read_csv(nearest_gene_file, sep="\t", header=None)
gtf_df = pd.read_csv(gtf_file, sep="\t", header=None)

In [4]:
abc_enhancer_to_target_df["chrom_coord"] = abc_enhancer_to_target_df.chr + "_" + abc_enhancer_to_target_df.start.astype(str) + "_" + abc_enhancer_to_target_df.end.astype(str)
target_gene_id_to_gene_dict = dict(zip(gtf_df[3], gtf_df[4]))
abc_enhancer_to_target_df["TargetGene"] = abc_enhancer_to_target_df["TargetGene"].map(target_gene_id_to_gene_dict)
abc_enhancer_to_target_grouped_df = abc_enhancer_to_target_df.groupby("chrom_coord").agg({"TargetGene": lambda x: list(x)}).reset_index()
abc_enhancer_to_target_dict = dict(zip(abc_enhancer_to_target_grouped_df["chrom_coord"], abc_enhancer_to_target_grouped_df["TargetGene"]))
meta_activity_map_df["target_gene_abc"] = meta_activity_map_df.chrom_coord.map(abc_enhancer_to_target_dict)

In [5]:
nearest_gene_df["chrom_coord"] = nearest_gene_df[0] + "_" + nearest_gene_df[1].astype(str) + "_" + nearest_gene_df[2].astype(str)
nearest_gene_grouped_df = nearest_gene_df.groupby("chrom_coord").agg({7: lambda x: list(x)}).reset_index()
nearest_gene_dict = dict(zip(nearest_gene_grouped_df["chrom_coord"], nearest_gene_grouped_df[7]))
meta_activity_map_df["target_gene_nearest"] = meta_activity_map_df.chrom_coord.map(nearest_gene_dict)

In [6]:
def get_genes_pval_dict(de_file):
    de_df = pd.read_csv(de_file)
    de_df["gene_name"] = de_df.index.map(target_gene_id_to_gene_dict)
    return dict(zip(de_df.gene_name, de_df.padj))

In [7]:
for ko in libraries[1:]:
    print(f"comparing {ko} to CC....")
    # get the genes to pval dict
    de_gene_file = os.path.join(gene_de_dir, f"{ko}vsCC", "de_results.csv")
    de_gene_pval_dict = get_genes_pval_dict(de_gene_file)
    # get all the gained fragments :: enhancer in CC, not an enhancer in KO
    gained_df = meta_activity_map_df.loc[(meta_activity_map_df[f"CC_peak"]==0) & (meta_activity_map_df[f"{ko}_peak"]==1) & (meta_activity_map_df[f"{ko}_padj"]<0.01)]
    gained_df["target_gene_abc_exp"] = gained_df["target_gene_abc"].fillna("").map(lambda x: [de_gene_pval_dict[i] for i in x] if x else [])
    gained_df["target_gene_nearest_exp"] = gained_df["target_gene_nearest"].map(lambda x: [de_gene_pval_dict[i] for i in x] if x else [])
    gained_abc_df = gained_df.loc[gained_df.target_gene_abc_exp.apply(lambda x: any([i<0.01 for i in x]))]
    gained_nearest_df = gained_df.loc[gained_df.target_gene_nearest_exp.apply(lambda x: any([i<0.01 for i in x]))]
    gained_abc_df = gained_abc_df.loc[:, ["chrom_coord", "target_gene_abc", "target_gene_abc_exp"]].explode(["target_gene_abc", "target_gene_abc_exp"])
    gained_nearest_df = gained_nearest_df.loc[:, ["chrom_coord", "target_gene_nearest", "target_gene_nearest_exp"]].explode(["target_gene_nearest", "target_gene_nearest_exp"])
    gained_abc_df = gained_abc_df.loc[gained_abc_df.target_gene_abc_exp<0.01]
    gained_nearest_df = gained_nearest_df.loc[gained_nearest_df.target_gene_nearest_exp<0.01]
    os.makedirs(os.path.join(link_save_dir, ko), exist_ok=True)
    gained_abc_save_file = os.path.join(link_save_dir, ko, "gained_frag_abc_target_sde.csv")
    gained_nearest_save_file = os.path.join(link_save_dir, ko, "gained_frag_nearest_target_sde.csv")
    gained_abc_df.to_csv(gained_abc_save_file, index=False)
    gained_nearest_df.to_csv(gained_nearest_save_file, index=False)
    # get all the lost fragments :: enhancer in CC, not an enhancer in KO
    lost_df = meta_activity_map_df.loc[(meta_activity_map_df[f"CC_peak"]==1) & (meta_activity_map_df[f"{ko}_peak"]==0) & (meta_activity_map_df[f"{ko}_padj"]<0.01)]
    lost_df["target_gene_abc_exp"] = lost_df["target_gene_abc"].fillna("").map(lambda x: [de_gene_pval_dict[i] for i in x] if x else [])
    lost_df["target_gene_nearest_exp"] = lost_df["target_gene_nearest"].map(lambda x: [de_gene_pval_dict[i] for i in x] if x else [])
    lost_abc_df = lost_df.loc[lost_df.target_gene_abc_exp.apply(lambda x: any([i<0.01 for i in x]))]
    lost_nearest_df = lost_df.loc[lost_df.target_gene_nearest_exp.apply(lambda x: any([i<0.01 for i in x]))]
    lost_abc_df = lost_abc_df.loc[:, ["chrom_coord", "target_gene_abc", "target_gene_abc_exp"]].explode(["target_gene_abc", "target_gene_abc_exp"])
    lost_nearest_df = lost_nearest_df.loc[:, ["chrom_coord", "target_gene_nearest", "target_gene_nearest_exp"]].explode(["target_gene_nearest", "target_gene_nearest_exp"])
    lost_abc_df = lost_abc_df.loc[lost_abc_df.target_gene_abc_exp<0.01]
    lost_nearest_df = lost_nearest_df.loc[lost_nearest_df.target_gene_nearest_exp<0.01]
    lost_abc_save_file = os.path.join(link_save_dir, ko, "lost_frag_abc_target_sde.csv")
    lost_nearest_save_file = os.path.join(link_save_dir, ko, "lost_frag_nearest_target_sde.csv")
    lost_abc_df.to_csv(lost_abc_save_file, index=False)
    lost_nearest_df.to_csv(lost_nearest_save_file, index=False)

comparing ATF2 to CC....
comparing CTCF to CC....


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  gained_df["target_gene_abc_exp"] = gained_df["target_gene_abc"].fillna("").map(lambda x: [de_gene_pval_dict[i] for i in x] if x else [])
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  gained_df["target_gene_nearest_exp"] = gained_df["target_gene_nearest"].map(lambda x: [de_gene_pval_dict[i] for i in x] if x else [])
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs

comparing FOXA1 to CC....
comparing LEF1 to CC....
comparing SCRT1 to CC....
comparing TCF7L2 to CC....
comparing 16P12_1 to CC....
