# Plot coverage/length summary of graph components

Abstracted into its own notebook so that it can be run from multiple notebooks (namely, both of the graph notebooks -- one of which I used while developing this on my laptop, and the other of which I use when rerunning things on the cluster).

Depends on a variety of variables already being defined: some by `Header.ipynb`, and others by a graph notebook (`sorted_components`, `node2len`, etc.).

In [None]:
# Draws a scatterplot with one point per connected component of the assembly graph:
#   x = total length of the component's edges
#   y = length-weighted average coverage of the component's edges
# and saves it to figs/cc_len_cov_summary.png.
#
# Relies on names defined outside of this notebook: pyplot, SCATTERPLOT_PT_COLOR, seq2name,
# sorted_components, node2len, and node2cov.

# Lengths / aggregate coverages / colors of the "ordinary" components (i.e. everything that is neither the
# first component nor a component containing one of the highlighted edges). These are accumulated in the loop
# below and drawn with a single ax.scatter() call afterwards.
cc_lengths = []
cc_agg_covs = []
cc_colors = []

# Edges whose components should be highlighted, mapped to the color used for that component's point.
edge2color = {"6104": "#00cc00", "1671": "#ff0000", "2358": "#880088"}

# For a nice table showing how to interpret these, see https://matplotlib.org/stable/api/markers_api.html
edge2marker = {"6104": "s", "1671": "P", "2358": "^"}

fig, ax = pyplot.subplots(1)
for i, cmp in enumerate(sorted_components):
    color = SCATTERPLOT_PT_COLOR
    total_edge_len = 0
    cov_times_len_for_all_edges = 0

    # Will be set to the ID of a highlighted edge if this component contains one.
    found_node_in_cc = None

    for node in cmp:
        total_edge_len += node2len[node]
        cov_times_len_for_all_edges += (node2len[node] * node2cov[node])
        # Color this cmp specially if it contains one of the edges we care about.
        # NOTE that the behavior of this is undefined if this contains more than one of these "special" edges
        # (this shouldn't be the case, since each edge we care about should be in a diff component).
        if node in edge2color:
            color = edge2color[node]
            found_node_in_cc = node
    # Length-weighted mean coverage across all edges in this component.
    agg_cov = cov_times_len_for_all_edges / total_edge_len

    # There isn't a way to assign different markers to different points in the same ax.scatter() command.
    # However, calling ax.scatter() once for each point in this plot takes forever on my laptop, since there are
    # a ton of points here! So we can get the best of both worlds (*and* still use a label for these N ccs,
    # like in the Within-Gene plots notebook) only calling ax.scatter() N+1 times: once for the N ccs we want to
    # highlight and label uniquely, and then once for all the other ccs.
    # See https://stackoverflow.com/a/43622421.
    if found_node_in_cc is not None:
        cc_for = seq2name[f"edge_{found_node_in_cc}"]
        ax.scatter(
            total_edge_len, agg_cov,
            c=color, marker=edge2marker[found_node_in_cc], s=100, label=cc_for
        )
    elif i == 0:
        # Highlight the hairball component (the first entry of sorted_components).
        # Note that, of course, if the hairball component CONTAINS one of the special edges we highlighted just
        # now, then that styling will trump this... But really, we shouldn't be using genomes from within the
        # hairball component! For this analysis, at least :)
        #
        # The \ast thing, which shows an * for the hairball component, is doable using a special case in
        # matplotlib's marker handling: see https://matplotlib.org/stable/api/markers_api.html and
        # https://matplotlib.org/stable/tutorials/text/mathtext.html#symbols.
        ax.scatter(
            total_edge_len, agg_cov,
            c="#000000", marker="$\\ast$", s=100, label='"Hairball" component'
        )
    else:
        cc_lengths.append(total_edge_len)
        cc_agg_covs.append(agg_cov)
        cc_colors.append(color)

    # Report every component (highlighted or not) that clears both cutoffs drawn on the plot below.
    if total_edge_len >= 10**6 and agg_cov >= 1000:
        # Join the edge IDs directly, rather than slicing the container's repr: this doesn't depend on
        # the repr being wrapped in exactly one bracket character on each side.
        edge_str = ", ".join(str(node) for node in cmp)
        print(f"Component #{i} passes length and coverage cutoffs! Has {len(cmp)} edges: {edge_str}")

# min() raises a ValueError on an empty list, so only print these summaries if there is at least one
# "ordinary" component. (cc_lengths and cc_agg_covs are always appended to together.)
if cc_lengths:
    print(f"Minimum length of all non-hairball / non-special components: {min(cc_lengths)}")
    print(f"Minimum agg cov of all non-hairball / non-special components: {min(cc_agg_covs)}")
else:
    print("There are no non-hairball / non-special components.")
print(f"Number of 0-agg-cov non-hairball / non-special components: {cc_agg_covs.count(0)}")

ax.scatter(cc_lengths, cc_agg_covs, c=cc_colors, label="All other components")
ax.set_title("Total edge length vs. \"average\" edge coverage\nfor all connected components of the assembly graph")
# These labels are raw strings so that the mathtext backslashes (\sum, \in, \mathrm, ...) are passed to
# matplotlib verbatim, instead of being parsed by Python as (invalid) string escape sequences.
ax.set_xlabel(r"$\sum_{e \in \mathrm{Edges}} \mathrm{Length}(e)$")
ax.set_ylabel(
    r"$\dfrac{\sum_{e \in \mathrm{Edges}} \mathrm{Coverage}(e) \times \mathrm{Length}(e)}{\sum_{e \in \mathrm{Edges}} \mathrm{Length}(e)}$",
    rotation=0,
    labelpad=100,
)
# symlog (rather than log) scales are used so that the axes can start at 0.
ax.set_xscale("symlog")
ax.set_yscale("symlog")
ax.set_xlim(0)
ax.set_ylim(0)
ax.axvline(x=(10**6), color="#000000", linestyle="-.", label="Length cutoff: 1 Mbp")
ax.axhline(y=(10**3), color="#000000", linestyle="--", label="Coverage cutoff: 1,000x")
ax.legend()
fig.set_size_inches(10, 10)
fig.savefig("figs/cc_len_cov_summary.png", bbox_inches="tight")