In [None]:
import os
import re
import glob
from datetime import datetime
from typing import List
from pprint import pprint
import pandas as pd
from matplotlib import pyplot as plt

from biLouvian_helper import *

In [None]:
# Parse every biLouvain result that accompanies a checkpoint edge list and
# collect one CommunityResultTime per edge list in `results`.
matching_files = sorted(glob.glob("checkpoints/LS_bipartite_*.csv"))

# File name: "...bipartite_<country>_from<YYYY-MM-DD>...to<YYYY-MM-DD>".
file_regex = r"bipartite_(?P<country>\w+)_from(?P<date_from>\d{4}-\d\d-\d\d).*to(?P<date_to>\d{4}-\d\d-\d\d)"
# Community line: "Community <id>[<vertex type>]: <vertexes>".
comm_regex = r"^Community (?P<community_id>\d+)\[(?P<vertex_type>V\d+)\]: (?P<vertexes>.*)$"
# Co-cluster line: "CoCluster <id>:<vertex type>(<a_id>)-<b_id>".
clus_regex = r"^CoCluster (?P<cocluster_id>\d+):(?P<vertex_type>V\d+)\((?P<a_id>\d+)\)-(?P<b_id>\d+)$"

results: List[CommunityResultTime] = []
for edgelist in matching_files:
    edgelist = os.path.splitext(edgelist)[0]
    print(edgelist)

    # Edge lists without a community result file are skipped.
    community_file = edgelist + "_ResultsCommunities.txt"
    if not os.path.exists(community_file):
        continue

    # Country and date window come from the file name. Skip names that do not
    # follow the pattern instead of reusing the metadata of the previous file.
    file_match = re.search(file_regex, edgelist)
    if file_match is None:
        print(f"skipping {edgelist}: cannot parse country/dates from the file name")
        continue
    country = file_match.group("country")
    date_from = file_match.group("date_from")
    date_to = file_match.group("date_to")

    with open(community_file, "r") as f:
        text = f.read()

    clusters: List[ClusterItem] = []
    # community id -> first cluster parsed with that id, for the lookups below.
    clusters_by_id = {}
    for match in re.finditer(comm_regex, text, re.MULTILINE):
        community_id = int(match.group("community_id"))
        # Keep only the text before the first ", " and split that on ",".
        vertexes = match.group("vertexes").split(", ")[0].split(",")
        cluster = ClusterItem(
            community_id=community_id,
            member=set(vertexes),
            type=VertexType(match.group("vertex_type")),
        )
        clusters.append(cluster)
        clusters_by_id.setdefault(community_id, cluster)

    with open(f"{edgelist}_ResultsCoClusterCommunities.txt", "r") as f:
        text = f.read()

    # NOTE: Be careful, the same co-cluster can be listed twice, like this:
    # CoCluster 1:V1(1)-2
    # CoCluster 2:V2(2)-1
    coclusters: List[CoClusterItem] = []
    seen_pairs = set()  # (first id, second id) of the co-clusters already kept
    for match in re.finditer(clus_regex, text, re.MULTILINE):
        a_id = int(match.group("a_id"))
        b_id = int(match.group("b_id"))

        if a_id not in clusters_by_id:
            raise ValueError(f"cannot find cluster with id {a_id}")
        if b_id not in clusters_by_id:
            raise ValueError(f"cannot find cluster with id {b_id}")
        a = clusters_by_id[a_id]
        b = clusters_by_id[b_id]

        # Lines not tagged V1 are swapped, so both spellings of a pair end up
        # with the same (first, second) order.
        # NOTE(review): this comparison relies on VertexType comparing equal to
        # a plain str (e.g. a str-based Enum) -- confirm in biLouvian_helper.
        if VertexType(match.group("vertex_type")) != "V1":
            a, b = b, a
            a_id, b_id = b_id, a_id

        # Keep only the first occurrence of each (first, second) pair.
        if (a_id, b_id) in seen_pairs:
            continue
        seen_pairs.add((a_id, b_id))
        coclusters.append(
            CoClusterItem(
                cocluster_id=int(match.group("cocluster_id")),
                first=a,
                second=b,
            )
        )

    comm_result = CommunityResult(
        clusters=set(clusters),
        coclusters=set(coclusters),
    )

    comm_time_result = CommunityResultTime(
        community=comm_result,
        country=country,
        time_from=datetime.strptime(date_from, "%Y-%m-%d"),
        time_to=datetime.strptime(date_to, "%Y-%m-%d"),
    )
    results.append(comm_time_result)

In [None]:
# Keep only the results whose window starts on or after 2013-01-01.
earliest_start = datetime(2013, 1, 1)
results = list(filter(lambda item: item.time_from >= earliest_start, results))

In [None]:
# Pretty-print the remaining results for a visual check.
pprint(results)

# Calculate similarity

In [None]:
import networkx as nx
import numpy as np

# Directed graph linking the co-clusters of each result to those of the next one.
G = nx.DiGraph()

# Drawing coordinates per node: (index of the result, index inside the result).
positions = {}

results = sorted(results, key=lambda x: x.time_from)
for t in range(len(results) - 1):
    earlier, later = results[t], results[t + 1]
    for row_a, cocluster_a in enumerate(earlier.community.coclusters):
        G.add_node(cocluster_a, time_from=earlier.time_from, label=cocluster_a.first.member)
        positions[cocluster_a] = np.array([t, row_a])
        for row_b, cocluster_b in enumerate(later.community.coclusters):
            G.add_node(cocluster_b, time_from=later.time_from, label=cocluster_b.first.member)
            positions[cocluster_b] = np.array([t + 1, row_b])

            # The edge weight is the smaller of the two similarities.
            sim_tag = cocluster_a.similarity_first(cocluster_b)
            sim_lender = cocluster_a.similarity_second(cocluster_b)
            sim = min(sim_tag, sim_lender)
            if not sim > 0.0:
                continue

            edge_attrs = dict(
                weight=sim,
                sim_tag=sim_tag,
                sim_lender=sim_lender,
                label=str((round(sim_tag, 2), round(sim_lender, 2))),
            )
            G.add_edge(cocluster_a, cocluster_b, **edge_attrs)

In [None]:
# Forget any highlight left on the edges by a previous run of this cell.
for _, _, attrs in G.edges(data=True):
    attrs.pop("color", None)

# For every node, paint its heaviest incoming edge red.
for target in G.nodes:
    strongest = max(G.in_edges(target, data=True), key=lambda e: e[2]["weight"], default=None)
    # Only an edge with a strictly positive weight gets highlighted.
    if strongest is not None and strongest[2]["weight"] > 0:
        G[strongest[0]][strongest[1]]["color"] = "red"

# Per-edge colour (gray unless highlighted) and width (the edge weight).
edge_colors = []
edge_weights = []
for _, _, attrs in G.edges(data=True):
    edge_colors.append(attrs.get("color", "gray"))
    edge_weights.append(attrs["weight"])

plt.figure(figsize=(10, 6))

# Spread the nodes out by scaling their grid coordinates.
zoom = 2
pos = {node: xy * zoom for node, xy in positions.items()}

draw_options = dict(
    with_labels=False,
    node_size=100,
    node_color="lightblue",
    edge_color=edge_colors,
    width=edge_weights,
    arrowsize=5,
)
nx.draw(G, pos, **draw_options)

# Label every edge with its rounded (sim_tag, sim_lender) pair.
edge_labels = {}
for src, dst, attrs in G.edges(data=True):
    edge_labels[(src, dst)] = (round(attrs.get("sim_tag", 0), 2), round(attrs.get("sim_lender", 0), 2))

nx.draw_networkx_edge_labels(G, pos, edge_labels=edge_labels, font_size=8, label_pos=0.38, alpha=0.7)

# Write each result's start date underneath its column.
for column, result in enumerate(results):
    date_text = result.time_from.strftime("%Y-%m-%d")
    plt.text(column * zoom, -1, date_text, ha="center", va="bottom")

# Legend box, placed at fixed data coordinates (14, 10).
legend_box = dict(facecolor="white", edgecolor="black", boxstyle="round")
plt.text(
    14,
    10,
    "Label: (sim_tag, sim_lender)\nsimilarity between coclusters",
    fontsize=10,
    bbox=legend_box,
)

plt.show()

In [None]:
# GEXF cannot serialize datetime, so store time_from as a "YYYY-MM-DD" string.
# The isinstance guard makes the cell safe to re-run: values that were already
# converted by a previous run are left untouched instead of raising.
for node, data in G.nodes(data=True):
    if isinstance(data["time_from"], datetime):
        data["time_from"] = data["time_from"].strftime("%Y-%m-%d")

nx.write_gexf(G, "checkpoints/LS_bipartite_cocluster_similarity.gexf")

Try to lay out the graph again, using a DFS from the last time step back to the first.

In [None]:
# Keep only the edges whose weight is strictly above the threshold.
threshold = 0.2
filtered_edges = []
for src, dst, attrs in G.edges(data=True):
    if attrs["weight"] > threshold:
        filtered_edges.append((src, dst))

# Subgraph induced by the kept edges; the cell displays its node and edge counts.
filtered_graph = G.edge_subgraph(filtered_edges)
(filtered_graph.number_of_nodes(), filtered_graph.number_of_edges())

In [None]:
# Draw the thresholded graph with the same layout and styling as the full graph.
plt.figure(figsize=(10, 6))

# Per-edge colours and widths must be built from the edges of the graph that is
# actually drawn: lists computed for the full graph G are longer and in a
# different order, which would mis-assign (or reject) the styles.
filtered_edge_colors = []
filtered_edge_widths = []
for _, _, data in filtered_graph.edges(data=True):
    filtered_edge_colors.append(data.get("color", "gray"))
    filtered_edge_widths.append(data["weight"])

# Draw the graph with highlighted edges
nx.draw(
    filtered_graph,
    pos,
    with_labels=False,
    node_size=100,
    node_color="lightblue",
    edge_color=filtered_edge_colors,
    width=filtered_edge_widths,
    arrowsize=5,
)

# Label every remaining edge with its rounded (sim_tag, sim_lender) pair.
edge_labels = {
    (u, v): (round(data.get("sim_tag", 0), 2), round(data.get("sim_lender", 0), 2))
    for u, v, data in filtered_graph.edges(data=True)
}

nx.draw_networkx_edge_labels(filtered_graph, pos, edge_labels=edge_labels, font_size=8, label_pos=0.38, alpha=0.7)

# Add time_from at bottom of each column
for t, result in enumerate(results):
    plt.text(t * zoom, -1, result.time_from.strftime("%Y-%m-%d"), ha="center", va="bottom")

# Legend box, placed at fixed data coordinates (14, 10).
plt.text(
    14,
    10,
    "Label: (sim_tag, sim_lender)\nsimilarity between coclusters",
    fontsize=10,
    bbox=dict(facecolor="white", edgecolor="black", boxstyle="round"),
)

# Show the plot
plt.show()