# Assembling
<img src="../docs/web_euplotid/lab_meeting_slides_Assembling.png" style="width: 1000px;">
Determining the right insulated neighborhoods (INs) can be easy by eye, but defining and performing this task computationally is very difficult.

# 1 We can define a genomic graph with DNA interactions as Edges and anchor sites as Nodes

In [4]:
import networkx as nx
import plotly
from plotly.offline import download_plotlyjs, init_notebook_mode, plot, iplot
from IPython.display import IFrame
import json
from networkx.readwrite import json_graph
import re
init_notebook_mode(connected=True)
import plotly.graph_objs as go
# Whitespace-delimited interaction file. Columns are read positionally:
# 1-3 = first anchor (chrom, start, end), 4-6 = second anchor,
# 7 = value stored as the edge weight.
dna_ints = "/Users/dborgesr/Documents/chia_origami_ints/primed_.7_origami.bedpe"
# Only interactions with BOTH anchors on this chromosome are loaded.
chrom_filt = "chr16"
#initiate genome graph
dna_int_graph = nx.Graph(style="filled")
#Set genomewide attributes
dna_int_graph.graph["species"] = "Homo Sapiens"
dna_int_graph.graph["genome_version"] = "hg19"
dna_int_graph.graph["tissue_type"] = "hESC"
#Load DNA interactions
with open(dna_ints) as dna_ints_iter:
    for dna_int in dna_ints_iter:
        arr = dna_int.split()
        # Skip blank or truncated records instead of failing on a missing column.
        if len(arr) < 7:
            continue
        # Filter before doing any per-record work; records on other
        # chromosomes are never parsed further.
        if arr[0] != chrom_filt or arr[3] != chrom_filt:
            continue
        # Node ids have the form "chrom:start-end"; later cells split them
        # again on ":" and "-".
        x = arr[0] + ":" + arr[1] + "-" + arr[2]
        y = arr[3] + ":" + arr[4] + "-" + arr[5]
        dna_int_graph.add_edge(x, y, label=1, capacity=1, weight=float(arr[6]))
        dna_int_graph.node[x]["color"] = "rgb(174,183,180)"
        dna_int_graph.node[y]["color"] = "rgb(174,183,180)"
# Axis settings shared by every axis of the figure: no background, axis
# line, zero line, grid, tick labels or title.
axis = {
    "showbackground": False,
    "showline": False,
    "zeroline": False,
    "showgrid": False,
    "showticklabels": False,
    "title": "",
}
# Zero-width margins on all four sides.
plot_margin = go.Margin(l=0, r=0, b=0, t=0)
# The same axis settings applied to the x/y/z axes of the scene.
plot_scene = go.Scene(
    xaxis=go.XAxis(axis),
    yaxis=go.YAxis(axis),
    zaxis=go.ZAxis(axis),
)
# 600x600 figure without legend; hover picks the closest point.
layout = go.Layout(
    width=600,
    height=600,
    xaxis=go.XAxis(axis),
    yaxis=go.YAxis(axis),
    showlegend=False,
    scene=plot_scene,
    margin=plot_margin,
    hovermode="closest",
)

#plot a specific range
dna_range = "chr16:0-90338345"
range_split = re.split(r"[-:]",dna_range)
# Parse the bounds once as integers. Comparing the raw strings is
# lexicographic (e.g. "100000000" < "90338345" is True) and would select
# the wrong nodes.
range_chrom = range_split[0]
range_start = int(range_split[1])
range_end = int(range_split[2])
#make edge trace and annotations
#get subgraph of nodes within range
sub_graph = nx.Graph(style="filled")
for node in dna_int_graph.nodes():
    node_split = re.split(r"[-:]",node)
    # Keep nodes on the requested chromosome that lie strictly inside the range.
    if (node_split[0] == range_chrom
            and int(node_split[1]) > range_start
            and int(node_split[2]) < range_end):
        sub_graph.add_node(node, color="rgb(174,183,180)")
        # Edges are requested without their data, so weights are not copied.
        # NOTE(review): a neighbour outside the range is still added as an
        # edge endpoint (without a color attribute) - confirm this is intended.
        sub_graph.add_edges_from(dna_int_graph.edges(node))
# 2-D spring layout: node id -> position.
all_position = nx.spring_layout(sub_graph,dim=2)
#make node trace
# All coordinates, hover texts and marker settings are collected in plain
# lists first and passed to the trace constructors in one go. This avoids
# mutating trace properties in place, which only works on plotly releases
# where those properties are mutable lists.
node_ids = [node_entry[0] for node_entry in sub_graph.nodes(data=True)]
node_total = len(node_ids)
traceN = go.Scatter(
    x=[all_position[node_id][0] for node_id in node_ids],
    y=[all_position[node_id][1] for node_id in node_ids],
    mode="markers",
    # Hover text is the node id; newlines become HTML line breaks.
    text=[node_id.replace("\n","<br>") for node_id in node_ids],
    marker=go.Marker(color=["grey"] * node_total,
                     size=[20] * node_total,
                     opacity=[1] * node_total),
    name="",
    hoverinfo="text")

#make edge trace
# Each edge contributes (start, end, None); the None entry separates one
# edge's segment from the next in the coordinate lists.
edge_x = []
edge_y = []
for edge in sub_graph.edges(data=True):
    start_pos = all_position[edge[0]]
    end_pos = all_position[edge[1]]
    edge_x += [start_pos[0], end_pos[0], None]
    edge_y += [start_pos[1], end_pos[1], None]
traceE = go.Scatter(x=edge_x, y=edge_y, mode="lines", hoverinfo="none",
                    name="", line=dict(width=1))

# Edges first, nodes second in the trace list.
data = go.Data([traceE, traceN])
fig = go.Figure(data=data, layout=layout)
plotly.offline.iplot(fig)

# 2 Starting “seed” promoters are picked by RNA-Seq

In [None]:
  #Sort nodes by FPKM for louvian
    node_fpkm = dict()
    start_comm = dict()
    for node in dna_int_graph.nodes(data=True):
        if ("name" in node[1]):
            node_fpkm[node[0]] = float(node[1]["fpkm"])
        else:                             
            node_fpkm[node[0]] = 0.0
    sorted_nodes = sorted(node_fpkm.keys(), key=lambda k: node_fpkm[k], reverse=True)
    node_count = 0
    for node in sorted_nodes:
        start_comm[node] = node_count
        node_count += 1

# 3 Crawling out from the seed nodes, add a node to an Insulated Neighborhood (IN) if doing so increases modularity and the node is contained within a CTCF-CTCF loop

# 4 Reach modularity equilibrium, where no more swapping of nodes between INs increases the modularity

In [None]:
#Set up CTCF-CTCF loop bed
# Writes one BED record per edge incident to a node in ctcf_nodes (defined
# elsewhere): chrom, smallest anchor coordinate, largest anchor coordinate + 1,
# and an id of the form "<node1>;<node2>".
# The context manager guarantees the file is closed even if a record fails
# to parse part-way through.
with open("ctcf_loops.bed","w") as ctcf_loops:
    for edge in dna_int_graph.edges(ctcf_nodes):
        l_node = re.split(r"[-:]",edge[0])
        r_node = re.split(r"[-:]",edge[1])
        # Only edges whose two anchors share a chromosome are written.
        if l_node[0] != r_node[0]:
            continue
        bounds = [int(l_node[1]),int(l_node[2]),int(r_node[1]),int(r_node[2])]
        ctcf_loops.write(l_node[0] + "\t" + str(min(bounds)) + "\t" + str(max(bounds)+1) +
                         "\t" + edge[0] + ";" + edge[1] + "\n")
# Sort the BED file in place via a temporary file (external sort-bed tool).
os.system("sort-bed ctcf_loops.bed > t.bed; mv t.bed ctcf_loops.bed")

#Checking resolution parameter
print("Checking resolution parameter against CTCF-CTCF loop")

# Number of resolution values tried by the sweep.
num_res_steps = 20

# Per-resolution series; the sweep appends one entry per resolution.
res_run = []    # resolution value used
size_sum = []   # number of distinct communities found
size_avg = []   # mean number of nodes per community
perc_CC = []    # percentage computed from the CTCF-contained target count

# Lookup tables filled by the sweep, keyed by target node id.
target2partition = {}   # target -> dendrogram at which it was contained
target2maxres = {}      # target -> resolution at which it was contained
ctcf2bounds = {}        # target -> node pair of its strongest bounding loop

# Initialised here; not filled in by the code visible in this cell.
min_comm_nodes = {}
max_comm_nodes = {}

#dict of lists w/ sizes of community of target gene at all resolutions
comm_target_lengths = {}
# Sweep the resolution parameter over num_res_steps log-spaced values in
# [1e-5, 1e-1]. For each value: partition the graph, write the genomic span
# of every community that holds a gene and a target node, and count how many
# of those spans the external BEDOPS step reports against a CTCF-CTCF loop.
for i in np.logspace(-5,-1,num_res_steps):
    comm_out_name = out_dir + "min_max_communities_res_genes" + str(i) + ".bed"
    res_run.append(i)
    # Dendrogram at this resolution, seeded with start_comm as the initial
    # partition; level 0 of the dendrogram is used as the node -> community map.
    dendogram_com = community.generate_dendrogram(dna_int_graph, resolution=i, part_init=start_comm)
    dna_int_comm = community.partition_at_level(dendogram_com, 0)
    size2comm = Counter(dna_int_comm.values())
    size_comms = list(size2comm.values())
    size_sum.append(len(size2comm))
    size_avg.append(sum(size_comms)/(1.0*len(size_comms)))
    #assess quality of communities generated
    # Invert node -> community into community -> [nodes].
    inv_map = {}
    for k, v in dna_int_comm.items():
        inv_map.setdefault(v, []).append(k)
    with open(comm_out_name, "w") as cur_comm_limits:
        for comm, nodes in inv_map.items():
            # Set copy for O(1) membership tests; the list is kept because
            # nodes[0] defines the reference chromosome below.
            node_set = set(nodes)
            #only print communities w/ genes in them
            gene_nodes = [node for node in nodes if "gene" in dna_int_graph.node[node]]
            target_overlap = [node for node in target_nodes if node in node_set]
            if not gene_nodes or not target_overlap:
                continue
            comm_target_lengths.setdefault(target_overlap[0], []).append(str(len(nodes)))
            #get max and min of neighborhoods and write to disk
            # Only nodes on the same chromosome as the first node contribute
            # to the span.
            ref_node_arr = re.split(r"[-:]",nodes[0])
            coords = []
            for node in nodes:
                node_arr = re.split(r"[-:]",node)
                if node_arr[0] == ref_node_arr[0]:
                    coords.append(int(node_arr[1]))
                    coords.append(int(node_arr[2]))
            min_comm = min(coords)
            max_comm = max(coords)
            in_name_all_genes = ",".join([dna_int_graph.node[target_gene]["name"] for target_gene in target_overlap])
            for target_gene in target_overlap:
                dna_int_graph.node[target_gene]["in_name"] = in_name_all_genes
                dna_int_graph.node[target_gene]["in_min"] = min_comm
                dna_int_graph.node[target_gene]["in_max"] = max_comm
            cur_comm_limits.write(ref_node_arr[0] + "\t" + str(min_comm) + "\t" + str(max_comm+1) + "\t" + target_overlap[0] + "\n")
    os.system("sort-bed " + comm_out_name + " > t.bed; mv t.bed " + comm_out_name)
    #count how many of target communities fell within CTCF - CTCF loop
    os.system("bedops -e -1 ctcf_loops.bed " + comm_out_name + " | bedmap --echo --multidelim \"_\" --echo-map-id-uniq  --fraction-ref .8 " + comm_out_name + " - > target_comm_inCC.bed")
    contained = 0.0
    # Each output line is "<community record>|<loop ids joined by '_'>".
    # The file is closed before it is removed below.
    with open("target_comm_inCC.bed") as target_comm_file:
        for line in target_comm_file:
            line_arr = line.strip().split("|")
            part_arr = line_arr[0].strip().split()
            if len(line_arr)>1 and line_arr[1] != "":
                loop_arr = line_arr[1].strip().split("_")
                # Each loop id is "<node1>;<node2>", i.e. an edge of the graph.
                loop_pairs = [loop.split(";") for loop in loop_arr]
                weights_cc = np.argmax([dna_int_graph.get_edge_data(*pair)["weight"] for pair in loop_pairs])
                #Only output strongest CTCF bounding loop
                strongest_loop = loop_pairs[weights_cc]
                ctcf2bounds[part_arr[3]] = (strongest_loop[0], strongest_loop[1])
                contained += 1.0
                # Overwritten on every resolution at which the target is
                # contained, so the last (largest) such resolution wins.
                target2partition[part_arr[3]] = dendogram_com
                target2maxres[part_arr[3]] = i
    # NOTE(review): the denominator counts targets recorded at ANY resolution
    # so far (comm_target_lengths is never reset inside the sweep) - confirm
    # this is intended.
    if comm_target_lengths:
        perc_CC.append((contained/len(comm_target_lengths))*100.0)
    else:
        # No target community recorded yet: report 0% rather than dividing by zero.
        perc_CC.append(0.0)
    os.remove("target_comm_inCC.bed")
    os.remove(comm_out_name)


# 5 ~7k Insulated Neighborhoods simulated as beads on a string by genomic position to recover rough X, Y, Z nuclear coordinates

In [None]:
def add_xyz_loc(G, xyz_bed):
    """Annotate the nodes of G with the x/y/z of their closest feature in xyz_bed.

    Node ids are expected to look like "chrom:start-end". They are written to
    a temporary BED file, sorted with ``sort-bed`` and matched against
    ``xyz_bed`` with ``closest-features --closest`` (external BEDOPS tools).
    The fourth column of the matched feature is split on "," and its first
    three values are stored on the node as attributes "x", "y" and "z".

    Args:
        G: graph exposing ``nodes()`` and the ``node`` attribute mapping.
        xyz_bed: path to a BED file whose fourth column is "x,y,z".

    Returns:
        The same graph object G, modified in place. Coordinates are stored
        as the strings read from the file (no numeric conversion). Nodes
        for which no closest feature is reported are left without x/y/z.
    """
    sub_nodes = str(uuid.uuid4()) + ".txt"
    xyz_nodes = str(uuid.uuid4()) + ".txt"
    try:
        with open(sub_nodes, "w+") as G_nodes:
            for node in G.nodes():
                arr = re.split(r"[-:]", node)
                G_nodes.write(arr[0] + "\t" + str(arr[1]) + "\t" + str(arr[2]) + "\n")
        os.system("sort-bed " + sub_nodes + " | closest-features --closest - "+ xyz_bed + " > " + xyz_nodes)
        with open(xyz_nodes, "r") as xyz_nodes_file:
            for global_coord_node in xyz_nodes_file:
                # Output lines have the form "<node record>|<closest feature>".
                arr = global_coord_node.strip().split("|")
                if len(arr) < 2:
                    continue
                graph_node = arr[0].split("\t")
                xyz_node = arr[1].split("\t")
                # No usable closest feature (e.g. a placeholder instead of a
                # full record): skip the node rather than fail on the
                # missing fourth column.
                if len(xyz_node) < 4:
                    continue
                id_node = graph_node[0] + ":" + graph_node[1] + "-" + graph_node[2]
                global_pos = xyz_node[3].split(",")
                G.node[id_node]["x"] = global_pos[0]
                G.node[id_node]["y"] = global_pos[1]
                G.node[id_node]["z"] = global_pos[2]
    finally:
        # Always remove the temporary files, including when parsing or
        # annotation fails part-way through.
        for tmp_path in (sub_nodes, xyz_nodes):
            if os.path.exists(tmp_path):
                os.remove(tmp_path)
    return G