In [None]:
import os

from natsort import natsorted

import networkx as nx

import pandas as pd

import plotly.express as px

In [None]:
# Relative path to the directory that holds the input data files; the loading
# cell below joins it with the dataset sub-folder and file name.
DATA_DIR = "../data/"

## KOMODO

In [None]:
# Read the KOMODO table from the tab-separated file <DATA_DIR>/komodo/komodo.tsv
komodo_path = os.path.join(DATA_DIR, "komodo", "komodo.tsv")
komodo_df = pd.read_csv(komodo_path, sep="\t")

# Cast the taxon ID column to string for plotting.
# NOTE(review): astype(str) may turn missing IDs into the literal string "nan",
# which is then counted like any other ID downstream -- confirm this is intended.
komodo_df["Taxon ID"] = komodo_df["Taxon ID"].astype(str)

# Genus column: the first space-separated token of the organism name
komodo_df["Genus"] = komodo_df["Organism Name"].str.split(" ").str.get(0)

komodo_df

In [None]:
# (number of distinct taxon IDs, number of distinct media)
tuple(komodo_df[column].nunique() for column in ("Taxon ID", "Media"))

In [None]:
# Frequency tables (one row per distinct value, most frequent first) with a
# label column and a "Count" column.
#
# rename_axis pins the label column's name explicitly: pandas < 2.0 leaves the
# value_counts index unnamed, so reset_index would emit a column called "index"
# and the plotting cells that look up x="Taxon ID" / "Genus" / "Media" would fail.
# On pandas >= 2.0 the index already carries this name, so the result is unchanged.
taxon_counts = (
    komodo_df["Taxon ID"]
    .value_counts()
    .rename_axis("Taxon ID")
    .reset_index(name="Count")
)

genus_counts = (
    komodo_df["Genus"]
    .value_counts()
    .rename_axis("Genus")
    .reset_index(name="Count")
)

media_counts = (
    komodo_df["Media"]
    .value_counts()
    .rename_axis("Media")
    .reset_index(name="Count")
)

In [None]:
# How many of the first rows of taxon_counts to show
top_taxons = 20

# Bar chart of the first `top_taxons` taxon IDs; bar color encodes the same
# count that the bar height shows.
fig = px.bar(
    taxon_counts.head(top_taxons),
    x="Taxon ID",
    y="Count",
    color="Count",
    template="plotly_white",
    title=f"Distribution of taxon IDs in KOMODO (top {top_taxons})",
)
fig.show()

In [None]:
# How many of the first rows of genus_counts to show
top_genera = 100

# Bar chart of the first `top_genera` genera; bar color encodes the same count
# that the bar height shows.
fig = px.bar(
    genus_counts.head(top_genera),
    x="Genus",
    y="Count",
    color="Count",
    template="plotly_white",
    title=f"Distribution of genera in KOMODO (top {top_genera})",
)
fig.show()

In [None]:
# Violin plot of the per-genus record counts (one observation per genus).
# Title and template are set so the figure stands on its own and matches the
# styling of the bar charts in this notebook.
fig = px.violin(
    data_frame=genus_counts,
    y="Count",
    hover_data=["Genus"],
    title="Distribution of record counts per genus in KOMODO",
    template="plotly_white"
)
fig.show()

In [None]:
# How many of the first rows of media_counts to show
top_media = 25

# Bar chart of the first `top_media` media; the figure is made taller
# (height=900) than the plotly default.
fig = px.bar(
    media_counts.head(top_media),
    x="Media",
    y="Count",
    color="Count",
    height=900,
    template="plotly_white",
    title=f"Distribution of media in KOMODO (top {top_media})",
)
fig.show()

In [None]:
# Violin plot of the per-medium record counts (one observation per medium).
# Title and template are set so the figure stands on its own and matches the
# styling of the bar charts in this notebook.
fig = px.violin(
    data_frame=media_counts,
    y="Count",
    hover_data=["Media"],
    title="Distribution of record counts per medium in KOMODO",
    template="plotly_white"
)
fig.show()

### Graph visualizations

In [None]:
# Assign IDs to media long names
media_mapping = pd.DataFrame({
    "Media": komodo_df["Media"].unique(),
    "Media ID": [f"M-{x}" for x in range(komodo_df["Media"].nunique() + 1)]
})

komodo_df = pd.merge(
    left=komodo_df,
    right=media_mapping,
    on="Media",
    how="left"
)
komodo_df[["Media", "Media ID"]]

In [None]:
# Genus x medium contingency table: each cell is the number of rows in which
# that genus appears together with that medium ID. Rows missing either value
# are excluded first.
genus_media_pairs = komodo_df[["Genus", "Media ID"]].dropna()
komodo_adj = pd.crosstab(
    genus_media_pairs["Genus"],
    genus_media_pairs["Media ID"]
)

# Natural sort of the columns, so that "M-2" comes before "M-10"
natural_column_order = natsorted(komodo_adj.columns)
komodo_adj = komodo_adj[natural_column_order]

komodo_adj

In [None]:
# fig = px.imshow(
#     komodo_adj,
#     width=1200,
#     height=900
# )
# fig.show()

In [None]:
# Build a graph whose nodes are genera and medium IDs, with an edge for every
# (genus, medium ID) pair found in the table; rows missing either value are skipped.
G = nx.from_pandas_edgelist(
    df=komodo_df[["Genus", "Media ID"]].dropna(),
    source="Genus",
    target="Media ID",
    edge_attr=None
)

# Add node attributes to graph.
# rename_axis names the label column "node_id" directly. The previous
# rename(columns={"Genus": "node_id"}) relied on pandas >= 2.0 naming the
# value_counts index; on older versions the column is called "index", the
# rename is a silent no-op and the "node_id" lookups below raise a KeyError.
genus_df = (
    komodo_df["Genus"]
    .value_counts()
    .rename_axis("node_id")
    .reset_index(name="count")
)
genus_df["group"] = "Genus"

media_df = (
    komodo_df["Media ID"]
    .value_counts()
    .rename_axis("node_id")
    .reset_index(name="count")
)
media_df["group"] = "Media"

node_attr = pd.concat(
    [genus_df, media_df],
    axis=0,
    ignore_index=True
)
# Human-readable summary of each node (presumably shown as hover text by the
# pyvis rendering further down -- confirm).
node_attr["title"] = \
    "Node: " + node_attr["node_id"] + "\r" + \
    "Type: " + node_attr["group"] + "\r" + \
    "Count: " + node_attr["count"].astype(str)
# {node_id: {"count": ..., "group": ..., "title": ...}}, the mapping shape
# passed to set_node_attributes below
node_attr = node_attr.set_index("node_id").to_dict(orient="index")

nx.set_node_attributes(G=G, values=node_attr)

# Print every node with its attribute dict as a sanity check
for n, d in G.nodes(data=True):
    print(n, d)

In [None]:
# import nxviz as nv

# nv.hive(
#     G,
#     group_by="group",
#     sort_by="count",
#     node_color_by="count",
#     # edge_alpha_by="edge_value"
# )

In [None]:
from pyvis.network import Network

net = Network(
    height="1000px",
    width="100%",
    bgcolor="#222222",
    font_color="white",
    select_menu=True,
    filter_menu=True,
    notebook=True,
    cdn_resources="in_line"
)
net.from_nx(G)
net.toggle_physics(False)
# net.show_buttons(filter_=['physics'])
net.show("media-graph-no-phys.html")

In [None]:
from pyvis.network import Network

net = Network(
    height="1000px",
    width="100%",
    bgcolor="#222222",
    font_color="white",
    select_menu=True,
    filter_menu=True,
    notebook=True,
    cdn_resources="in_line"
)
net.from_nx(G)
net.toggle_physics(True)
# net.show_buttons(filter_=['physics'])
net.show("media-graph.html")

### Some insights

* The KOMODO database seems to be biased towards Streptomyces: both the top medium and the top genus are related to it.

* This bias is highly exacerbated at the genus level.

* Approximately the top 7 media and genera seem to dominate the dataset.

* Null taxon IDs are the most abundant IDs by far.