# Annotating
<img src="web_euplotid/lab_meeting_slides_Annotating.png" style="width: 1000px;">

# 1 Modified nucleosome localization data is used to color nodes
After assembling the Insulated Neighborhood skeleton, it is key to be able to visualize the impact of histone modifications on the local regulatory structure. We therefore color the nodes of the genomic graph by the histone modifications present in the cell state of interest.

In [None]:
#overlap bed file w/ graph and return list of overlapping nodes
def overlap_graph_bed(G, bed):
    """Return the names of graph nodes that overlap any interval in ``bed``.

    Runs ``sort-bed`` on ``bed`` and pipes it into ``bedops -e -1`` against
    ``all_nodes_sorted.bed`` (expected in the working directory). The result
    is written to the scratch file ``overlapped_nodes.bed``, which is parsed
    and then deleted.

    G is accepted for call-site symmetry but is not used here.
    Returns a set of "chrom:start-end" strings.
    """
    scratch_bed = "overlapped_nodes.bed"
    #shell pipeline: sort the query BED, keep node intervals with >=1bp overlap
    pipeline = "".join(["sort-bed ", bed, " | bedops -e -1 ", "all_nodes_sorted.bed - > ", scratch_bed])
    os.system(pipeline)
    with open(scratch_bed, "r") as handle:
        hits = set()
        for record in handle:
            fields = record.strip().split()
            hits.add(":".join([str(fields[0]), "-".join([str(fields[1]), str(fields[2])])]))
    os.remove(scratch_bed)
    return hits

#classify graph nodes by which ChIP-seq peak set they overlap
#(order of results matches the order of the peak files below)
h3k4me3_nodes, enh_nodes, ctcf_nodes = (
    overlap_graph_bed(dna_int_graph, peak_bed)
    for peak_bed in (prom_peaks, enh_peaks, ctcf_peaks)
)

# 2 Transcription Factor (TF) binding data is used to train neural networks capable of identifying TFs at chromatin accessibility sites
Begin by training Convolutional Neural Networks (CNNs) on all ChIP-seq and SELEX data for all TFs surveyed to date. The initial implementation of Euplotid uses pre-trained CNNs from [Deepbind](http://www.nature.com/nbt/journal/v33/n8/full/nbt.3300.html). These CNNs are able to identify the TFs which fall under each chromatin accessibility peak, but in order to understand the peak as a whole, Euplotid takes advantage of [Basset](http://genome.cshlp.org/content/early/2016/06/10/gr.200535.115.abstract) to train neural networks which are capable of predicting changes in chromatin accessibility. Basset is trained on all available chromatin accessibility data in ENCODE (DNase-seq of 180 different cell lines). Basset is therefore able to perform in-silico simulations to gauge the impact of a given mutation on the complex as a whole; by combining this with the CNNs from Deepbind, we are able to predict which factor is causing the change.

# 3 Take all chromatin accessibility peaks within 50kb of nodes

In [None]:
def add_openRegions_predict(G, homo_gen, chain_file, chain_file2, TF_RBP_ids,open_peaks, target_node_name, in_level, openpeak2bassetpeak):
    """Add open-chromatin peaks to the graph and annotate them with TF predictions.

    Each line of ``open_peaks`` is read as three "|"-separated fields:
    the peak (whitespace-separated chrom, start, end, size), its nearest
    node (chrom, start, end) and the distance between them. A peak is added
    as a node, linked to its anchor, when the anchor is already in ``G`` and
    the absolute distance is at most ``open_region_slop`` (100000 bp).

    Parameters
    ----------
    G : graph whose ``graph["species"]`` attribute is set.
    homo_gen : genome handle forwarded to ``deepbind_predict_tf_range``.
    chain_file, chain_file2 : forwarded to ``get_NGSmethDMC`` (only used
        when ``G.graph["species"] == "Human2"``).
    TF_RBP_ids : DeepBind model-id file forwarded to DeepBind.
    open_peaks : path of the peak/anchor/distance file described above.
    target_node_name : used to name scratch files.
    in_level : DeepBind is only run when this is negative.
    openpeak2bassetpeak : currently unused (Basset call is commented out).

    Also reads the module-level name ``encode_bed`` for the bedmap overlap.

    Returns
    -------
    The same graph ``G``, mutated in place: peak nodes/edges are added,
    peak nodes may get a ``tfbs`` attribute, original edges may get
    ``overlapped_tf`` / ``overlapped_tf_enc``, and their end nodes may get
    ``filt_tf_peak`` / ``tfbs_enc_all``. ``top_tf_names`` and
    ``top_tf_probs`` are blanked before returning.
    """
    #how far can the chromatin accessibility be from the node to be considered?
    open_region_slop = 100000
    #number of top-ranked TFs kept per chromatin accessibility peak
    max_tfs = 15
    #snapshot the edges now: peak edges are added below and G.edges() may be a live view
    orig_edges = list(G.edges())
    with open(open_peaks,"r") as all_peaks:
        for peak in all_peaks:
            arr = peak.strip().split("|")
            #need all three fields (peak | anchor | distance) before indexing arr[2]
            if len(arr) < 3 or arr[1] == "NA":
                continue
            dist2anchor = abs(int(arr[2]))
            if dist2anchor > open_region_slop:
                continue
            arr_atac = arr[0].strip().split()
            node_arr = arr[1].strip().split()
            atac_node = arr_atac[0] + ":" + str(int(arr_atac[1])) + "-" + str(int(arr_atac[2]))
            open_size = float(arr_atac[3])
            anchor_node = node_arr[0] + ":" + str(int(node_arr[1])) + "-" + str(int(node_arr[2]))
            if anchor_node not in G:
                continue
            if in_level < 0:
                top_tf_names, top_tf_probs, top_tf_locs = deepbind_predict_tf_range(
                    arr_atac[0],int(arr_atac[1]),int(arr_atac[2]),homo_gen, TF_RBP_ids, target_node_name)
                #bass_open_pred_out = basset_predict_tf_range(arr_atac, homo_gen, target_node_name, openpeak2bassetpeak)
            else:
                top_tf_names, top_tf_probs, top_tf_locs = [], [], []
            top_tf_probs = [str(round(prob,2)) for prob in top_tf_probs]
            # Show top 15 TFs predicted under each chromatin accessibility peak
            n_shown = min(len(top_tf_names), max_tfs)
            predict_tf_deepbind = "".join(
                str(top_tf_names[tf]) + " [" + str(int(top_tf_locs[tf])) + "] :" + top_tf_probs[tf] + ", "
                for tf in range(n_shown))
            atac_mid = (int(arr_atac[1]) + int(arr_atac[2]))/2.0
            #add Differentially Methylated Cytosine (DMC) data from NGSMethDB if human
            if G.graph["species"] == "Human2":
                DMC_node = get_NGSmethDMC(atac_node,chain_file,chain_file2)
            else:
                DMC_node = ""
            #take the top TFs from the list *before* joining; slicing the joined
            #string would keep the first 15 characters instead of 15 names
            filt_tf_open = "|".join(top_tf_names[0:max_tfs])
            top_tf_names = "|".join(top_tf_names)
            top_tf_probs = "|".join(top_tf_probs)
            G.add_node(atac_node, mid=atac_mid,name=atac_node, dist2anchor=dist2anchor,
                       deepbind_tf=predict_tf_deepbind,top_tf_names=top_tf_names, filt_tf_open = filt_tf_open,
                       top_tf_probs=top_tf_probs, DMC_node=DMC_node, open_size=open_size)
            G.add_edge(anchor_node,atac_node,label="",weight=1.0)
    #overlap peaks with the BED file named by the global encode_bed and keep the mapped ids
    tfbs_bed = target_node_name + "_tfbs.bed"
    os.system("bedmap --echo --echo-map-id-uniq --multidelim , --bp-ovr 1 " + open_peaks + " " + encode_bed + " > " + tfbs_bed)
    with open(tfbs_bed,"r") as open_tfbs_all:
        for node_tf in open_tfbs_all:
            arr = node_tf.strip().split("|")
            node_arr = arr[0].split()
            node_name = node_arr[0] + ":" + node_arr[1] + "-" + node_arr[2]
            if len(arr)>1 and node_name in G:
                G.node[node_name]["tfbs"] = arr[1].strip()
    #cleanup
    os.remove(tfbs_bed)

    #add predicted co-occurences
    for edge in orig_edges:
        #overlap top 15 TFs of each chromatin accessibility site as ordered by deepbind score
        left_tf, left_tf_enc = _neighbor_tf_sets(G, edge[0], max_tfs)
        right_tf, right_tf_enc = _neighbor_tf_sets(G, edge[1], max_tfs)
        overlapped_tf = left_tf.intersection(right_tf)
        overlapped_tf_enc = left_tf_enc.intersection(right_tf_enc)
        if len(overlapped_tf) > 0:
            G[edge[0]][edge[1]]["overlapped_tf"] = "".join(tf + ", " for tf in overlapped_tf)

        if len(overlapped_tf_enc) > 0:
            G[edge[0]][edge[1]]["overlapped_tf_enc"] = "".join(tf + ", " for tf in overlapped_tf_enc)

        if len(left_tf)>0:
            G.node[edge[0]]["filt_tf_peak"] = "|".join(left_tf)
        if len(right_tf)>0:
            G.node[edge[1]]["filt_tf_peak"] = "|".join(right_tf)

        if len(left_tf_enc)>0:
            G.node[edge[0]]["tfbs_enc_all"] = "|".join(left_tf_enc)
        if len(right_tf_enc)>0:
            G.node[edge[1]]["tfbs_enc_all"] = "|".join(right_tf_enc)

    #remove long prediction of annotations
    for node in G.nodes():
        if "top_tf_names" in G.node[node]:
            G.node[node]["top_tf_names"] = ""
            G.node[node]["top_tf_probs"] = ""
    return G


def _neighbor_tf_sets(G, node, max_tfs):
    """Collect the TF names annotated on the peak neighbours of ``node``.

    Only neighbours carrying ``top_tf_names`` (i.e. peak nodes) contribute.
    Returns ``(predicted, encode)``: the union of each neighbour's first
    ``max_tfs`` DeepBind names, and the union of each neighbour's ``tfbs``
    ids. Empty names are dropped so that peaks without predictions do not
    produce a spurious overlap on the empty string.
    """
    predicted = set()
    encode = set()
    for neigh in G[node]:
        attrs = G.node[neigh]
        if "top_tf_names" not in attrs:
            continue
        predicted |= set(name for name in attrs["top_tf_names"].split("|")[0:max_tfs] if name)
        if "tfbs" in attrs:
            #read the ids from the neighbour that was just checked, not from the anchor
            encode |= set(name for name in attrs["tfbs"].strip().split(",") if name)
    return predicted, encode


# 4 Learned neural network can identify constituents of all chromatin accessibility peaks

In [None]:
def deepbind_predict_tf_range(chrom, start, end, homo_gen, TF_RBP_ids, target_node_name):
    """Score a genomic range with DeepBind and rank the TFs predicted to bind it.

    The range ``[start, end)`` is tiled with 30 bp windows every 5 bp, the
    windows are written to a scratch FASTA, and the DeepBind executable at
    ``/root/deepbind/deepbind`` is run on them with the models listed in
    ``TF_RBP_ids``. Scores are smoothed with a rolling maximum over 4
    consecutive windows, and every (window, TF) score >= 2.0 is reported.

    Parameters
    ----------
    chrom, start, end : region to scan.
    homo_gen : object with ``fetch(chrom, start, end)`` returning sequence.
    TF_RBP_ids : path of a file with lines of the form ``<model id> # <name>``.
    target_node_name : used (with ":" and "-" replaced) to name scratch files.

    Returns
    -------
    ``[tf_names, tf_probs, tf_loc]`` - three parallel lists ordered by
    decreasing smoothed score.
    """
    size_window = 30
    stagger_window = 5
    #number of consecutive windows the rolling max is taken over
    smooth_span = 4
    deep_node_name = target_node_name.replace(":","_").replace("-","_")
    fasta_path = deep_node_name + "_open_anchor.fa"
    scores_path = deep_node_name + "_deep_open.txt"
    #filt TFs out w/ scores below this threshold
    tf_filt = 2.0
    open_seqs = list()
    for left_bound in range(int(start),int(end),stagger_window):
        open_seq = Seq.Seq(homo_gen.fetch(chrom,left_bound,left_bound+size_window))
        ref_req = SeqRecord(open_seq, chrom+":"+str(left_bound)+"-"+str(left_bound+size_window),"","")
        open_seqs.append(ref_req)
    #get deepbind id to TF name mapping
    db_id2name = dict()
    with open(TF_RBP_ids) as db_dp:
        for db_id in db_dp:
            id_fields = db_id.split("#")
            db_id2name[id_fields[0].strip()] = id_fields[1].strip()
    try:
        with open(fasta_path, "w+") as open_seqs_fa:
            SeqIO.write(open_seqs, open_seqs_fa, "fasta")
        #call deepbind and dump output
        os.system("/root/deepbind/deepbind " + TF_RBP_ids + " " + fasta_path + " > " + scores_path)
        with open(scores_path,"r") as deep_open_out:
            header = deep_open_out.readline().strip().split()
            #rows = windows, columns = DeepBind models (same order as header)
            deep_open_prob = np.loadtxt(deep_open_out,  ndmin = 2)
    finally:
        #scratch files are removed even if DeepBind output could not be parsed
        for scratch in (fasta_path, scores_path):
            if os.path.exists(scratch):
                os.remove(scratch)
    if deep_open_prob.size == 0:
        return [[],[],[]]
    n_windows, n_tfs = np.shape(deep_open_prob)
    n_smooth = max(n_windows - (smooth_span - 1), 0)
    if n_smooth == 0:
        #fewer windows than the smoothing span: nothing to report
        return [[],[],[]]
    #rolling max over overlapping windows; row w covers windows w .. w+smooth_span-1
    smoothed = np.maximum.reduce([deep_open_prob[k:k + n_smooth, :] for k in range(smooth_span)])
    #flatten row-major so that flat index = window * n_tfs + tf, which is the
    #layout the modulo / integer-division decoding below relies on
    flat_scores = smoothed.ravel()
    ranked = [ix for ix in flat_scores.argsort()[::-1] if flat_scores[ix] >= tf_filt]
    tf_names = [db_id2name[header[ix % n_tfs]] for ix in ranked]
    tf_probs = [flat_scores[ix] for ix in ranked]
    #NOTE(review): location is start + window index, as before; windows are
    #stagger_window bp apart, so confirm whether this should be scaled
    tf_loc = [float(ix // n_tfs + int(start)) for ix in ranked]
    return [tf_names,tf_probs,tf_loc]

def basset_predict_tf_range(open_region, homo_gen, target_node_name, openpeak2bassetpeak):
    """Run Basset motif analysis on windows tiled across ``open_region``.

    ``open_region`` is a "chrom:start-end" string. The region is tiled with
    600 bp windows every 10 bp, written to ``basset_open_anchor.fa``,
    converted to HDF5 with Basset's ``seq_hdf5.py`` and passed to
    ``basset_motifs.py`` with the pre-trained model. Output goes to the
    files/directories named in the commands; nothing is returned.

    ``target_node_name`` and ``openpeak2bassetpeak`` are not used.
    """
    #trained DNAse peak model
    model_file = "/input_dir/pretrained_model.th"
    #NOTE(review): the homo_gen argument is ignored and hg19 is always opened
    #here - confirm this is intended before relying on the parameter
    homo_gen = pysam.FastaFile("/input_dir/hg19.fa")
    arr = re.split(r"[-:]",open_region)
    chrom = arr[0]
    start = arr[1]
    end = arr[2]
    stagger_window = 10
    #600 bp is the default size the model was trained on
    size_window = 600
    open_seqs = list()
    for left_bound in range(int(start),int(end),stagger_window):
        open_seq = Seq.Seq(homo_gen.fetch(chrom,left_bound,left_bound+size_window))
        ref_req = SeqRecord(open_seq, chrom+":"+str(left_bound)+"-"+str(left_bound+size_window),"","")
        open_seqs.append(ref_req)
    with open("basset_open_anchor.fa", "w+") as open_seqs_fa:
        SeqIO.write(open_seqs, open_seqs_fa, "fasta")
    n_seqs = str(len(open_seqs))
    cmd = ("/root/Basset/src/seq_hdf5.py -r -c -v " + n_seqs + " -t " + n_seqs
           + " basset_open_anchor.fa /input_dir/encode_roadmap_act.txt open_region.h5")
    subprocess.call(cmd, shell=True)
    #a stray "+" used to sit between the output dir and the model file and was
    #handed to basset_motifs.py as an extra positional argument
    cmd = ("/root/Basset/src/basset_motifs.py -s " + n_seqs + " -t -o motifs_out "
           + model_file + " open_region.h5")
    subprocess.call(cmd, shell=True)


# 5 Take all SNPs/CNVs within footprint from dbSNP and annotate for:
* eQTL in 32 tissues (FastQTL+GTEx)
* Ease of CRISPR editing (GT-Scan2)
* Results of over 2,000 GWAS studies (GRASP)
* 667 Methylomes (NGSMethDB)
* Estimate of deleteriousness (CADD Score)

In [None]:
def get_NGSmethDMC(open_region,chain_file,chain_file2):
    """Summarise differentially methylated cytosines (DMCs) inside a region.

    ``open_region`` ("chrom:start-end") is lifted over with ``chain_file``,
    the NGSmethAPI hg38 endpoint is queried for it, and for every returned
    position with a ``diffmeth_cg`` entry the sample pairs whose
    ``methylKit`` or ``MOABS_sim`` value is below 0.01 are collected. Each
    qualifying position is lifted back with ``chain_file2`` and reported.

    Parameters
    ----------
    open_region : "chrom:start-end" string.
    chain_file : chain passed to ``liftover_chain`` for the forward
        conversion of ``open_region``.
    chain_file2 : chain passed to ``liftover_chain`` for converting each
        reported hg38 position back.

    Returns
    -------
    A string with one line per reported position
    ("DMC:(<pos>) number_tissues:<n> min_pval:<p>"), or "" when the
    liftover fails, the request is not OK, or nothing qualifies.
    """
    #significance cutoff applied to both differential-methylation callers
    pval_cutoff = .01
    open_region_hg38 =  liftover_chain(open_region,chain_file)
    if open_region_hg38 is None:
        return ""
    ngs_methdb = "http://bioinfo2.ugr.es:8888/NGSmethAPI/hg38/" + open_region_hg38
    meth_db_http = requests.get(ngs_methdb)
    if not meth_db_http.ok:
        return ""
    chrom = re.split(r"[-:]",open_region)[0]
    dmc_lines = list()
    for loc in meth_db_http.json():
        if "diffmeth_cg" not in loc:
            continue
        tissues_meth = set()
        pval_meth = list()
        for samp_a in loc["diffmeth_cg"]:
            for samp_b in loc["diffmeth_cg"][samp_a]:
                pair_calls = loc["diffmeth_cg"][samp_a][samp_b]
                for caller in ("methylKit", "MOABS_sim"):
                    if caller in pair_calls and (float(pair_calls[caller]) < pval_cutoff):
                        pval_meth.append(float(pair_calls[caller]))
                        tissues_meth |= set(samp_a.split("#"))
                        tissues_meth |= set(samp_b.split("#"))
        if len(tissues_meth) == 0:
            continue
        hg38_pos = chrom + ":" + str(loc["pos"]) + "-" + str(loc["pos"]+1)
        #convert back with the second chain; the first one maps in the other direction
        hg19_pos = liftover_chain(hg38_pos,chain_file2)
        if hg19_pos is None:
            #position cannot be mapped back, so it cannot be reported
            continue
        dmc_lines.append("DMC:(" + hg19_pos + ") number_tissues:" + str(len(tissues_meth))
                         + " min_pval:" + str(round(min(pval_meth),9)) + "\n")
    return "".join(dmc_lines)
def add_variants_predict(G, homo_gen, chain_file, TF_RBP_ids, tissue_type, target_node_name,out_dir):
    """Add known variants under open-chromatin peaks and predict their effect.

    For every node carrying ``dist2anchor`` (a peak node), MyVariant.info is
    queried with the node name; each hit with a ``dbsnp`` entry is added as
    a node linked to the peak and written to ``open_variants.vcf`` inside a
    scratch directory under ``out_dir``. Variant nodes are annotated with
    CADD raw score, rsid, an eQTL p-value from the Ensembl REST API and
    GRASP phenotype/PMID strings when available. If any variant was added,
    ``deepbind_predict_SNPs`` and ``add_basset_sad_sat`` are run.

    Parameters
    ----------
    G : graph; ``G.node[target_node_name]["in_name"]`` must hold a
        comma-separated gene list and ``G.graph["species"]`` must be set.
    homo_gen, TF_RBP_ids : forwarded to the DeepBind/Basset predictors.
    chain_file : forwarded to ``get_NGSmethDMC``.
    tissue_type : tissue name sent in the eQTL query.
    target_node_name : node whose genes are the eQTL target; also names the
        scratch directory.
    out_dir : directory the scratch directory is created in; this is the
        same location ``add_basset_sad_sat`` reads the VCF from.

    Returns
    -------
    The same graph ``G``, mutated in place.
    """
    in_genes = G.node[target_node_name]["in_name"].split(",")
    #get target gene for eQTL analysis
    mg = mygene.MyGeneInfo()
    target_gene_out = mg.getgenes(in_genes, fields="ensembl", species="human", fetch_all=True)
    ensembl_target = ""
    for gene in target_gene_out:
        if "ensembl" in gene:
            ensembl_target = gene["ensembl"]["gene"]
    # flag for if any SNPs where added to the Graph
    add_any_SNPs = False
    #Save vcf output where add_basset_sad_sat looks for it (out_dir, not the cwd)
    basset_name_out = out_dir + "/" + target_node_name.replace(":","_").replace("-","_") + "_basset"
    if not os.path.exists(basset_name_out):
        os.makedirs(basset_name_out)
    #find all variants that fall within these open peaks using MyVariant.info
    mv = myvariant.MyVariantInfo()
    with open(basset_name_out + "/open_variants.vcf","w+") as open_vcf:
        #iterate over a snapshot: variant nodes are added to G inside the loop
        for peak_name, peak_attrs in list(G.nodes(data=True)):
            if "dist2anchor" not in peak_attrs:
                continue
            #Pull all variants within chromatin accessibility peak
            for snp in mv.query(peak_name, fetch_all=True):
                if "dbsnp" not in snp:
                    continue
                add_any_SNPs = True
                snp_id = snp["_id"]
                #Add CNVs/SNVs to graph
                G.add_node(snp_id,gen_start=snp["hg19"]["start"], all_ref = str(snp["dbsnp"]["ref"]),
                           all_alt = str(snp["dbsnp"]["alt"]),gen_end=snp["hg19"]["end"], gen_chrom="chr"+str(snp["chrom"]),
                           name=snp_id)
                G.add_edge(snp_id, peak_name,label="",weight=1.0)
                #print to VCF
                open_vcf.write("chr"+str(snp["chrom"]) + "\t" + str(snp["hg19"]["start"]) + "\t" + snp_id +
                               "\t" + str(snp["dbsnp"]["ref"]) + "\t" + str(snp["dbsnp"]["alt"]) + "\n")

                snp_node = G.node[snp_id]
                snp_loc = str(snp_node["gen_chrom"]) + ":" + str(snp_node["gen_start"]) + "-" + str(snp_node["gen_end"])
                #add Differentially Methylated Cytosine (DMC) data from NGSMethDB
                if G.graph["species"] == "Human2":
                    #NOTE(review): chain_file2 is not a parameter of this function;
                    #this only works if a module-level chain_file2 exists - confirm
                    DMC_var = get_NGSmethDMC(snp_loc,chain_file,chain_file2)
                    snp_node["DMC_node"] = DMC_var
                if ("cadd" in snp):
                    snp_node["cadd"] = snp["cadd"]["rawscore"]
                if "rsid" in snp["dbsnp"]:
                    #add GTEX eQTL pval only when the target gene has an Ensembl ID;
                    #ensembl_target starts as "" so it must be tested for emptiness
                    if ensembl_target:
                        serv = "http://rest.ensembl.org/eqtl/variant_name/homo_sapiens/"
                        ext = snp["dbsnp"]["rsid"].strip() + "?content-type=application/json;statistic=p-value;stable_id=" + ensembl_target + ";tissue=" + tissue_type
                        r = requests.get(serv + ext, headers={ "Content-Type" : "application/json"})
                        if r.ok:
                            ret_val = r.json()
                            if isinstance(ret_val,list) and (len(ret_val)>0):
                                eqtl_pval = ret_val[0]["value"]
                                snp_node["gtex_eqtl_pval"] = round(eqtl_pval,6)
                    #the rsid itself is recorded whether or not an eQTL lookup was possible
                    snp_node["rsid"] = snp["dbsnp"]["rsid"]
                #add GRASP phenotype and PMID information for SNP
                if ("grasp" in snp) and ("publication" in snp["grasp"]):
                    pheno_gwas, pmid_gwas = _grasp_pheno_pmid(snp["grasp"]["publication"])
                    snp_node["grasp_pheno"] = pheno_gwas
                    snp_node["grasp_pmid"] = pmid_gwas
    #predict effect of all variation added to graph using DeepBind and Basset
    if add_any_SNPs:
        print("Deepbind SNP prediction of: " + target_node_name)
        G = deepbind_predict_SNPs(G, homo_gen, TF_RBP_ids,target_node_name)
        print("Basset SNP prediction of: " + target_node_name)
        G = add_basset_sad_sat(G, homo_gen, target_node_name,out_dir)
    #removes the VCF together with everything else left in the scratch directory
    shutil.rmtree(basset_name_out)
    return G


def _grasp_pheno_pmid(publication):
    """Format a GRASP ``publication`` entry as (phenotypes, pmids) strings.

    ``publication`` is either one record or a list of records; each record's
    ``paper_phenotype_description`` and ``pmid`` may themselves be a single
    value or a list. Separators are kept exactly as the graph consumers have
    always received them; spaces are replaced by underscores at the end.
    """
    pheno_gwas = ""
    pmid_gwas = ""
    #single or multiple publications w/ SNP?
    if isinstance(publication, list):
        for pub in publication:
            if isinstance(pub["paper_phenotype_description"],list):
                pheno_gwas += ",".join([str(pheno) + ", " for pheno in pub["paper_phenotype_description"]]) + ", "
            else:
                pheno_gwas += pub["paper_phenotype_description"] + ","
            if isinstance(pub["pmid"],list):
                pmid_gwas += ",".join([str(pmid) for pmid in pub["pmid"]]) + ", "
            else:
                pmid_gwas += str(pub["pmid"]) + ","
    else:
        pub = publication
        if isinstance(pub["paper_phenotype_description"],list):
            pheno_gwas += "".join([str(pheno) + ", " for pheno in pub["paper_phenotype_description"]]) + ", "
        else:
            pheno_gwas += pub["paper_phenotype_description"]
        if isinstance(pub["pmid"],list):
            pmid_gwas += ",".join([str(pmid) for pmid in pub["pmid"]])
        else:
            pmid_gwas += str(pub["pmid"])
    return pheno_gwas.replace(" ","_"), pmid_gwas.replace(" ","_")

# 6 Neural network is able to predict the effect of mutations within CREs by performing in-silico mutational screen 

In [None]:
def deepbind_predict_SNPs(G, homo_gen, TF_RBP_ids,target_node_name):
    """Score every variant node in ``G`` with DeepBind (reference vs. alternate).

    For each node carrying ``all_ref``, five 31 bp windows are fetched around
    ``gen_start`` (shifted by -10, -5, 0, +5 and +10 bp). Each window is
    written once unchanged and once with ``all_alt`` substituted at the
    variant position; DeepBind is run on both FASTA files and the scores are
    averaged over the five windows. TFs whose
    ``(ref - mut) * max(0, ref, mut)`` has absolute value >= 3 are reported
    (at most 5) in the node attribute ``deep_score``.

    Parameters
    ----------
    G : graph whose variant nodes have ``all_ref``, ``all_alt``,
        ``gen_chrom`` and ``gen_start``.
    homo_gen : object with ``fetch(chrom, start, end)`` returning sequence.
    TF_RBP_ids : path of a file with lines of the form ``<model id> # <name>``.
    target_node_name : prefix for the scratch files.

    Returns
    -------
    The same graph ``G``, mutated in place.
    """
    #predict effect of CNVs/SNPs which were added to Insulated Neighborhood
    flank_snp = 15
    #windows are tiled around the variant in steps of tile_step bp
    tile_step = 5
    tile_offsets = list(range(-2,3))
    n_tiles = len(tile_offsets)
    #at most this many TFs are reported per variant, each with |score| >= score_cut
    max_reported = 5
    score_cut = 3
    mut_fa = target_node_name + "_mut_snps.fa"
    ref_fa = target_node_name + "_ref_snps.fa"
    mut_txt = target_node_name + "_deep_mut.txt"
    ref_txt = target_node_name + "_deep_ref.txt"
    mut_seqs = list()
    ref_seqs = list()
    snp_names = list()
    #get deepbind id to TF name mapping
    db_id2name = dict()
    with open(TF_RBP_ids) as db_dp:
        for db_id in db_dp:
            id_fields = db_id.split("#")
            db_id2name[id_fields[0].strip()] = id_fields[1].strip()
    for node in G.nodes(data=True):
        if "all_ref" in node[1]:
            chrom = node[1]["gen_chrom"]
            snp_loc = node[1]["gen_start"]
            alt_seq = node[1]["all_alt"]
            snp_names.append(node[0])
            #get 5 tiled sequences and average the probability across them
            for offset in tile_offsets:
                shift = offset * tile_step
                seq_start = int((snp_loc-flank_snp)+shift)
                seq_end = int((snp_loc+flank_snp+1)+shift)
                snp_seq = Seq.Seq(homo_gen.fetch(chrom,seq_start,seq_end))
                ref_seqs.append(SeqRecord(snp_seq.tomutable(), node[0],"",""))
                mut_seq = snp_seq.tomutable()
                #the window moved by `shift`, so the variant sits `shift` bases
                #earlier in it; a fixed index would mutate the wrong base in
                #every tile except the centred one
                snp_ix = flank_snp - 1 - shift
                if len(alt_seq) > 1:
                    max_ix = min(len(mut_seq),(snp_ix+len(alt_seq)))
                    mut_seq[snp_ix:max_ix] = alt_seq
                else:
                    mut_seq[snp_ix] = alt_seq
                mut_seqs.append(SeqRecord(mut_seq, node[0],"",""))
    try:
        #write sorrounding sequences of SNP to file and reopen again for deepbind
        with open(mut_fa, "w+") as mut_snps:
            SeqIO.write(mut_seqs, mut_snps, "fasta")
        with open(ref_fa, "w+") as ref_snps:
            SeqIO.write(ref_seqs, ref_snps, "fasta")
        #call deepbind and dump output
        os.system("/root/deepbind/deepbind " + TF_RBP_ids + " " +  mut_fa + " > " + mut_txt)
        os.system("/root/deepbind/deepbind " + TF_RBP_ids + " " +  ref_fa + " > " + ref_txt)
        #open deepbind output for ref and mut, peel header off, load scores
        with open(mut_txt,"r") as deep_mut_out:
            deep_mut_out.readline()
            deep_mut_prob = np.loadtxt(deep_mut_out,  ndmin = 2)
        with open(ref_txt,"r") as deep_ref_out:
            header = deep_ref_out.readline().strip().split()
            deep_ref_prob = np.loadtxt(deep_ref_out,  ndmin = 2)
    finally:
        #scratch files are removed even if DeepBind output could not be parsed
        for scratch in (mut_txt, ref_txt, mut_fa, ref_fa):
            if os.path.exists(scratch):
                os.remove(scratch)
    for snp_num, snp_name in enumerate(snp_names):
        #rows of this variant: its n_tiles consecutive windows
        rows = slice(snp_num*n_tiles, (snp_num+1)*n_tiles)
        #compare scores of TF binding according to Deepbind paper
        deep_ref_prob_cut = np.mean(deep_ref_prob[rows,:],axis=0)
        deep_mut_prob_cut = np.mean(deep_mut_prob[rows,:],axis=0)
        max_diffs = np.maximum.reduce([np.zeros(len(deep_ref_prob_cut)),deep_ref_prob_cut,deep_mut_prob_cut])
        deep_cut_prob = (deep_ref_prob_cut - deep_mut_prob_cut) * max_diffs
        max_shown = min(max_reported, sum(abs(deep_cut_prob)>=score_cut))
        if max_shown > 0:
            temp_sort = np.array(abs(deep_cut_prob)).argsort()[::-1]
            deep_probs_ix_sort = [ix for ix in temp_sort if np.isfinite(deep_cut_prob[ix])][0:max_shown]
            G.node[snp_name]["deep_score"] = "".join(
                str(db_id2name[header[ix]]) + ":" + str(round(deep_cut_prob[ix],2)) + ", "
                for ix in deep_probs_ix_sort)
    return G

def add_basset_sad_sat(G, homo_gen, target_node_name,out_dir):
    """Score the variants in ``open_variants.vcf`` with Basset and store the results.

    Runs ``basset_sad.py`` on ``<out_dir>/<node>_basset/open_variants.vcf``,
    reads the resulting ``sad_table.txt`` (columns ``rsid``, ``target`` and
    ``pred`` are used) and stores per-variant scores on the graph. The
    variant with the largest summed |pred| is flagged, written to
    ``top_variant.vcf`` and passed to ``basset_sat_vcf.py``; any
    ``*heat.png`` files produced for it are moved into ``out_dir``.

    Parameters
    ----------
    G : graph containing the variant nodes named in the VCF and the node
        ``target_node_name`` with an ``in_name`` attribute.
    homo_gen : genome handle; ``homo_gen.filename`` (bytes) is passed to Basset.
    target_node_name : node name, also used to derive the scratch directory.
    out_dir : directory holding the scratch directory and receiving the PNGs.

    Returns
    -------
    The same graph ``G``, mutated in place. ``G.graph["sad_table"]`` holds
    the pickled rsid x target table of ``pred`` values.
    """
    basset_out_name = out_dir + "/" + target_node_name.replace(":","_").replace("-","_") + "_basset"
    #read from the graph that was passed in rather than a module-level graph
    target_in_name = G.node[target_node_name]["in_name"]
    #trained DNAse peak model
    model_file = "/input_dir/pretrained_model.th"
    #table of DNAse target BED files used to train model
    targets_file = "/root/Basset/tutorials/sad_eg/sample_beds.txt"
    genome_fasta = (homo_gen.filename).decode("utf-8")
    #map each target name (first column) to its row number in the targets file
    targetID_celltype = dict()
    with open(targets_file) as id2type:
        for count, c_type in enumerate(id2type):
            targetID_celltype[c_type.strip().split()[0]] = count
    cmd = ("/root/Basset/src/basset_sad.py -f " + genome_fasta + " -l 600 -o " + basset_out_name + " -t " + targets_file + " " +  model_file + " " + basset_out_name + "/open_variants.vcf")
    print(cmd)
    os.system(cmd)
    #Pick SNP maximizing for sum of Delta SAD across all cell types in trained model
    sad_table = pd.read_csv(basset_out_name + "/sad_table.txt", sep=r"\s+", header = 0)
    sad_table_dense = sad_table.pivot_table(index='rsid', columns='target', values='pred')
    serial_sad_table = pickle.dumps(sad_table_dense, protocol=0)
    G.graph["sad_table"] = serial_sad_table
    sad_table["sad_abs"] = abs(sad_table["pred"])
    #sum only the score column; other columns are not meaningful to add up
    sad_rsid_table = sad_table.groupby("rsid")[["sad_abs"]].sum()
    sad_rsid_table = sad_rsid_table.sort_values(by="sad_abs",ascending=False)
    top_snp_names = sad_rsid_table.index
    if len(top_snp_names) == 0:
        #Basset scored nothing, so there is no top variant to analyse
        return G
    #Store SAD in each SNP node
    for snp_index, snp_row in sad_rsid_table.iterrows():
        G.node[snp_index]["sad_abs_sum"] = snp_row["sad_abs"]
        G.node[snp_index]["sad_pred"] = ""
    for rsid, pred in zip(sad_table["rsid"], sad_table["pred"]):
        G.node[rsid]["sad_pred"] += str(round(pred,4)) + ","
    top_snp = top_snp_names[0]
    target_sad_name = (top_snp.replace(":","_")).replace(">","_")
    #exact match: variant ids contain regex metacharacters such as "."
    top_snp_sad_profile = sad_table[sad_table["rsid"] == top_snp]
    #idxmax gives the row label, which .loc then looks up
    top_snp_ix = top_snp_sad_profile["sad_abs"].idxmax()
    top_snp_sad_table = top_snp_sad_profile.loc[top_snp_ix]
    top_snp_ctype_target = targetID_celltype[top_snp_sad_table["target"]]
    top_snp_node = G.node[top_snp]
    with open(basset_out_name + "/top_variant.vcf","w") as filt_out:
        filt_out.write(str(top_snp_node["gen_chrom"]) + "\t" + str(top_snp_node["gen_start"]) + "\t" + top_snp +
                        "\t" + top_snp_node["all_ref"] + "\t" + top_snp_node["all_alt"] + "\n")
    #flag top SNP in open peak
    top_snp_node["top_open_snp"] = True
    #Perform in-silico mutagenesis on top SNP to generate output PDFs
    cmd = ("/root/Basset/src/basset_sat_vcf.py -f " + genome_fasta + " -t " + str(top_snp_ctype_target) + " -o " + basset_out_name +
           " " + model_file + " " + basset_out_name + "/top_variant.vcf")
    os.system(cmd)
    #Save output png & cleanup
    #tag in JSON file too
    top_snp_node["sad_pdf"] = target_in_name + "_sad_heat.png"
    out_pdfs = [pdf for pdf in glob.glob(basset_out_name+"/*heat.png") if target_sad_name in pdf]
    label_snp = ["ref","alt"]
    #counting starts at 1, so the first file found is labelled "alt"
    for count, pdf in enumerate(out_pdfs, 1):
        pdf_file = pdf.split("/")[-1]
        top_snp_node["sad_mut_"+label_snp[count%2]] = str(pdf_file)
        if os.path.isfile(pdf):
            os.rename(pdf, out_dir+"/"+pdf_file)
    return G