In [None]:
import os
import re
import glob
from datetime import datetime
from typing import List
from pprint import pprint
import pandas as pd
from matplotlib import pyplot as plt

from biLouvian_helper import *

In [None]:
# Collect every per-period edge-list checkpoint, in file-name order.
matching_files = sorted(glob.glob("checkpoints/LCountry_bipartite_active*.csv"))

# The covered time window is encoded in the file name:
#   ...bipartite_active_from<YYYY-MM-DD>...to<YYYY-MM-DD>...
file_regex = r"bipartite_active_from(?P<date_from>\d{4}-\d\d-\d\d).*to(?P<date_to>\d{4}-\d\d-\d\d)"

results: List[CommunityResultTime] = []
for edgelist in matching_files:
    # The result helpers below are called with the path minus its extension.
    edgelist = os.path.splitext(edgelist)[0]
    print(edgelist)

    # Files matched by the glob but lacking the from/to dates must not be
    # loaded: without this guard the dates of the previous file would be
    # silently reused (or a NameError raised for the very first file).
    match = re.search(file_regex, edgelist)
    if match is None:
        print("  -> skipped: file name has no from/to dates")
        continue
    date_from = match.group("date_from")
    date_to = match.group("date_to")

    # NOTE(review): `mutaraplus` is not used anywhere in this cell; the call is
    # kept in case it has side effects or a later cell reads it — confirm.
    mutaraplus = result_mutaraplus(edgelist)
    comm_result = result_community(edgelist)

    comm_time_result = CommunityResultTime(
        community=comm_result,
        country="all",
        time_from=datetime.strptime(date_from, "%Y-%m-%d"),
        time_to=datetime.strptime(date_to, "%Y-%m-%d"),
    )
    results.append(comm_time_result)

# Calculate similarity

In [None]:
import networkx as nx
import numpy as np

# Directed graph whose nodes are coclusters and whose edges connect coclusters
# of consecutive snapshots that have a positive similarity.
G = nx.DiGraph()

# Layout position of each cocluster node: (snapshot index, index within snapshot).
positions = {}

# Consecutive pairs below only make sense in chronological order.
results = sorted(results, key=lambda x: x.time_from)

# Register every cocluster of every snapshot exactly once. Doing this in its
# own pass guarantees that the last snapshot's nodes exist even when the
# preceding snapshot has no coclusters (or when there is only one snapshot),
# and avoids re-adding the same nodes once per cocluster of the previous one.
for t, snapshot in enumerate(results):
    for i, cocluster in enumerate(snapshot.community.coclusters):
        G.add_node(cocluster, time_from=snapshot.time_from, label=cocluster.first.member)
        positions[cocluster] = np.array([t, i])

# Compare each cocluster of a snapshot with each cocluster of the next one.
for a, b in zip(results, results[1:]):
    for cocluster_a in a.community.coclusters:
        for cocluster_b in b.community.coclusters:
            sim_tag = cocluster_a.similarity_first(cocluster_b)
            sim_lender = cocluster_a.similarity_second(cocluster_b)
            # The edge weight is the weaker of the two side-wise similarities.
            sim = min(sim_tag, sim_lender)
            if sim > 0.0:
                G.add_edge(
                    cocluster_a,
                    cocluster_b,
                    weight=sim,
                    sim_tag=sim_tag,
                    sim_lender=sim_lender,
                    label=str((round(sim_tag, 2), round(sim_lender, 2))),
                )

In [None]:
# Drop highlights left by a previous run so re-executing the cell starts clean.
for _, _, attrs in G.edges(data=True):
    attrs.pop("color", None)

# For every node, highlight its heaviest incoming edge in red.
for node in G.nodes:
    strongest = max(
        G.in_edges(node, data=True),
        key=lambda edge: edge[2]["weight"],
        default=None,  # node has no incoming edges
    )
    # Only strictly positive weights qualify; on ties the first edge wins.
    if strongest is not None and strongest[2]["weight"] > 0:
        G[strongest[0]][strongest[1]]["color"] = "red"

# Per-edge drawing attributes, in G.edges order.
edge_colors = [G[u][v].get("color", "gray") for u, v in G.edges]
edge_weights = [data["weight"] for _, _, data in G.edges(data=True)]

# Set the figure size
plt.figure(figsize=(10, 6))

# Spread the nodes out: scale the (snapshot index, row) grid by `zoom`.
zoom = 2
pos = {k: v * zoom for k, v in positions.items()}

# Draw the graph with highlighted edges
nx.draw(
    G,
    pos,
    with_labels=False,
    node_size=100,
    node_color="lightblue",
    edge_color=edge_colors,
    width=edge_weights,
    arrowsize=5,
)

edge_labels = {
    (u, v): (round(data.get("sim_tag", 0), 2), round(data.get("sim_lender", 0), 2)) for u, v, data in G.edges(data=True)
}

nx.draw_networkx_edge_labels(G, pos, edge_labels=edge_labels, font_size=8, label_pos=0.38, alpha=0.7)

# Add time_from at the bottom of each column (one column per snapshot).
for t, result in enumerate(results):
    plt.text(t * zoom, -1, result.time_from.strftime("%Y-%m-%d"), ha="center", va="bottom")

# Legend box above the top-right corner of the node grid. The anchor is derived
# from the layout extents instead of fixed data coordinates, so it stays in
# place whatever the number of snapshots and coclusters.
legend_x = max((xy[0] for xy in pos.values()), default=0)
legend_y = max((xy[1] for xy in pos.values()), default=0) + 1
plt.text(
    legend_x,
    legend_y,
    "Label: (sim_tag, sim_lender)\nsimilarity between coclusters",
    fontsize=10,
    ha="right",
    va="bottom",
    bbox=dict(facecolor="white", edgecolor="black", boxstyle="round"),
)


# Show the plot
plt.show()

In [None]:
# Convert datetime to string because gexf cannot serialize datetime.
# The isinstance guard keeps the cell re-runnable: after a first run the
# attribute is already a string and calling .strftime on it would raise.
for node, data in G.nodes(data=True):
    if isinstance(data["time_from"], datetime):
        data["time_from"] = data["time_from"].strftime("%Y-%m-%d")

nx.write_gexf(G, "checkpoints/LCountry_bipartite_active_cocluster_similarity.gexf")

In [None]:
def _lender_members(result):
    """Return the `second.member` of every cocluster in `result`, in order.

    Each cocluster's second side is asserted to be of type V2 (lender).
    """
    members = []
    for cc in result.community.coclusters:
        assert cc.second.type == VertexType.V2  # lender
        members.append(cc.second.member)
    return members


# One row per pair of consecutive snapshots.
rows = []
for earlier, later in zip(results, results[1:]):
    # The earlier snapshot is passed to quality_measure as the first ("real")
    # clustering and the later one as the second ("computed") clustering.
    reference = _lender_members(earlier)
    candidate = _lender_members(later)
    rows.append(
        {
            "from": earlier.time_from.strftime("%Y-%m-%d"),
            "to": later.time_from.strftime("%Y-%m-%d"),
            "quality": quality_measure(reference, candidate),
        }
    )

similarity = pd.DataFrame(rows)
print(similarity.to_markdown(index=False))