Analysis of the graph of tags

![common tag](./images/graph_schema-TAG_TAG_COMMON_LENDER.png)

Note that the production above is by no means the weight we should consider.

## Import data and define the weight

In [None]:
from os import path as osp
import numpy as np
import pandas as pd
import networkx as nx
from networkx.algorithms import bipartite
import matplotlib.pyplot as plt

In [None]:
# Read the project -> tag relationship table, give the ID columns readable
# names and discard the relationship-type column.
project_tag = (
    pd.read_csv("../data/gen/project_tags.csv")
    .rename(columns={":START_ID(Loan-ID)": "project_id", ":END_ID": "tag"})
    .drop(columns=[":TYPE"])
)
# Store "tag" as an ordered categorical; the category order is the order in
# which each tag first appears in the file.
tag_order = project_tag["tag"].unique()
project_tag["tag"] = pd.Categorical(project_tag["tag"], categories=tag_order, ordered=True)
project_tag.head()

In [None]:
# Read the lender -> project relationship table. Only the two IDs and the
# loan amount are kept; the relationship type and the loan date are discarded.
lender_project_columns = {
    ":END_ID(Project-ID)": "project_id",
    ":START_ID(Lender-ID)": "lender_id",
    "loan_shareAmount": "loan_amount",
}
lender_project = (
    pd.read_csv("../data/gen/lender_project.csv")
    .rename(columns=lender_project_columns)
    .drop(columns=[":TYPE", "loan_date"])
)
lender_project.head()

In [None]:
# Attach to every loan the tags of the project it funded. The inner join on
# "project_id" yields one row per (loan, tag) pair.
lender_tag = pd.merge(
    left=lender_project,
    right=project_tag,
    how="inner",
    on="project_id",
)
lender_tag.head()

In [None]:
# Sanity check: while "project_id" is still present, no row may be repeated.
n_duplicated_rows = lender_tag.duplicated().sum()
assert n_duplicated_rows == 0
# The project was only needed as the join key; from here on the table is a
# lender -> tag edge list.
lender_tag = lender_tag.drop(columns=["project_id"])
lender_tag.head()

Now we have an *edge list* of a bipartite graph with two types of nodes:

- `lender_id`
- `tag`

The edge attributes are:

- `loan_amount`
- `loan_date` (dropped from the edge list above, so not available here)

Create a networkx bipartite graph from the edge list

In [None]:
# Build the lender-tag bipartite graph: every row of the edge list links a
# lender to a tag. A simple undirected graph is used, so a lender/tag pair
# that occurs in several rows becomes a single edge. No `edge_attr` is
# requested, so the edges carry no attributes.
B = nx.from_pandas_edgelist(
    lender_tag,
    source="lender_id",
    target="tag",
    create_using=nx.Graph,
)
# Sanity check that the graph really is two-colourable.
print(nx.is_bipartite(B))

In [None]:
# List the connected components of B and, for each one, the tags it contains.
components = list(nx.connected_components(B))
print(f"There are {len(components)} connected components in B.")
for number, component in enumerate(components, start=1):
    print(f"Tags for component {number}:")
    # Keep only string nodes as tags (this relies on lender IDs not being
    # strings). isinstance() is the idiomatic type test, and the comprehension
    # variable no longer reuses the name of the loop counter.
    tags = [node for node in component if isinstance(node, str)]
    print(tags)

In [None]:
# Split the nodes of B into its two sides: lenders (source) and tags (target).
# Without `top_nodes`, bipartite.sets() raises AmbiguousSolution as soon as B
# has more than one connected component, and which side is returned first
# depends on the two-colouring. Passing the lender IDs explicitly makes the
# split well defined: source_nodes are the lenders, target_nodes is every
# other node of B (the tags).
lender_ids = set(lender_tag["lender_id"])
source_nodes, target_nodes = bipartite.sets(B, top_nodes=lender_ids)
len(source_nodes), len(target_nodes)

In [None]:
# Record on every node which side of the bipartite graph it belongs to, so
# the exported files keep that information.
for side_nodes, side_label in ((source_nodes, "Lender"), (target_nodes, "Tag")):
    for node in side_nodes:
        B.add_node(node, type=side_label)

# Export the labelled bipartite graph in GEXF and Pajek formats
# (written to the current working directory).
GEXFFILE = "lender_tag.gexf"
PAJEKFILE = "lender_tag.net"
nx.write_gexf(B, GEXFFILE)
nx.write_pajek(B, PAJEKFILE)

## Jaccard

In [None]:
# Project B onto the tag nodes. With jaccard=True (the library default) each
# edge weight is the Jaccard index of the two tags' neighbourhoods in B,
# i.e. of the sets of lenders connected to each tag.
Gjaccard = bipartite.overlap_weighted_projected_graph(B, target_nodes, jaccard=True)
# Size of the projection: (number of tags, number of tag-tag edges).
Gjaccard.number_of_nodes(), Gjaccard.number_of_edges()

In [None]:
# Naive community detection on the Jaccard projection with the Louvain algorithm.
resolution = 1.6
communities = nx.community.louvain_communities(
    Gjaccard,
    weight="weight",
    resolution=resolution,
    seed=123,  # fixed seed so the partition is reproducible
)
# The quality of the partition is reported as modularity at resolution 1.0,
# not at the resolution used for the Louvain search above.
modularity = nx.community.modularity(Gjaccard, communities, weight="weight", resolution=1.0)
print(f"The modularity of the community is {modularity:.3f}")

# Map every tag to the index of the community it was assigned to.
community_index = {}
for label, members in enumerate(communities):
    for member in members:
        community_index[member] = label

# One row per tag: columns "tag" and "louvain_community".
partition = (
    pd.DataFrame.from_dict(community_index, orient="index", columns=["louvain_community"])
    .reset_index()
    .rename(columns={"index": "tag"})
)

# Display the tags of each community without truncating the lists.
pd.set_option("max_colwidth", None)
partition.groupby("louvain_community").agg(list)

In [None]:
# Store the community index of every tag as a node attribute of the Jaccard projection.
for tag_name, community_label in zip(partition["tag"], partition["louvain_community"]):
    Gjaccard.nodes[tag_name]["louvain_community"] = community_label

In [None]:
# Export the Jaccard projection in GEXF and Pajek formats.
jaccard_gexf_path = "../data/gen/tag_tag_common_lender_jaccard.gexf"
jaccard_pajek_path = "../data/gen/tag_tag_common_lender_jaccard.net"
nx.write_gexf(Gjaccard, jaccard_gexf_path)
nx.write_pajek(Gjaccard, jaccard_pajek_path)

In [None]:
# Force-directed layout of the Jaccard projection (unseeded, so it differs from run to run).
pos = nx.spring_layout(Gjaccard, k=0.1, iterations=1000)
edge_weight = nx.get_edge_attributes(Gjaccard, "weight")

# Community index stored on each node, and the distinct community indices.
louvain_communities = nx.get_node_attributes(Gjaccard, "louvain_community")
community_ids = list(set(louvain_communities.values()))

# One tab20 colour per community, looked up through a community -> colour table.
colors = plt.cm.tab20(np.linspace(0, 1, len(community_ids)))
color_of_community = dict(zip(community_ids, colors))
node_colors = [color_of_community[louvain_communities[node]] for node in Gjaccard.nodes()]

# Draw the labelled nodes coloured by community, then draw the edges again
# on top, shaded by their weight.
nx.draw(Gjaccard, pos=pos, with_labels=True, node_color=node_colors)
nx.draw_networkx_edges(
    Gjaccard,
    pos,
    edgelist=edge_weight.keys(),
    edge_color=edge_weight.values(),
    edge_cmap=plt.cm.Reds,
)
plt.show()

## Hyperbolic weight

In [None]:
# Project B onto the tag nodes using NetworkX's collaboration-weighted
# projection (Newman's collaboration model), which this notebook calls the
# "hyperbolic" weight.
Ghyperbolic = bipartite.collaboration_weighted_projected_graph(B, nodes=target_nodes)
# Size of the projection: (number of tags, number of tag-tag edges).
Ghyperbolic.number_of_nodes(), Ghyperbolic.number_of_edges()

In [None]:
# Naive community detection on the hyperbolic projection with the Louvain algorithm.
resolution = 1.5
communities = nx.community.louvain_communities(
    Ghyperbolic,
    weight="weight",  # same as the library default, spelled out for clarity
    resolution=resolution,
    seed=123,  # fixed seed so the partition is reproducible
)
# Evaluate the partition on the graph it was computed for. The original cell
# passed Gjaccard here (copy-paste slip), which scored the hyperbolic
# partition against the Jaccard edge weights.
# NOTE(review): the Jaccard section reports modularity at resolution=1.0,
# this one at the Louvain resolution -- confirm which is intended before
# comparing the two numbers.
modularity = nx.community.modularity(Ghyperbolic, communities, weight="weight", resolution=resolution)
print(f"The modularity of the community is {modularity:.3f}")

# Map every tag to the index of the community it was assigned to.
community_index = {node: i for i, community in enumerate(communities) for node in community}

# One row per tag: columns "tag" and "louvain_community".
partition = pd.DataFrame.from_dict(community_index, orient="index", columns=["louvain_community"])
partition.reset_index(inplace=True)
partition.rename(columns={"index": "tag"}, inplace=True)

# Display the tags of each community.
partition.groupby("louvain_community").agg(list)

In [None]:
# Store the community index of every tag as a node attribute of the hyperbolic projection.
for tag_name, community_label in zip(partition["tag"], partition["louvain_community"]):
    Ghyperbolic.nodes[tag_name]["louvain_community"] = community_label

In [None]:
# Export the hyperbolic projection in GEXF and Pajek formats.
hyperbolic_gexf_path = "../data/gen/tag_tag_common_lender_hyperbolic.gexf"
hyperbolic_pajek_path = "../data/gen/tag_tag_common_lender_hyperbolic.net"
nx.write_gexf(Ghyperbolic, hyperbolic_gexf_path)
nx.write_pajek(Ghyperbolic, hyperbolic_pajek_path)

In [None]:
# Draw the hyperbolic projection. The original cell computed the layout and
# edge weights from Ghyperbolic but took the communities, the node order and
# the drawn graph from Gjaccard (copy-paste slip), so the figure showed the
# Jaccard graph and its communities. Everything now refers to Ghyperbolic.
pos = nx.spring_layout(Ghyperbolic, k=0.1, iterations=1000)
edge_weight = nx.get_edge_attributes(Ghyperbolic, "weight")

# Get the louvain community of each node
louvain_communities = nx.get_node_attributes(Ghyperbolic, "louvain_community")

# Get a list of unique community IDs
community_ids = list(set(louvain_communities.values()))

# Generate a list of colors for each community
colors = plt.cm.tab20(np.linspace(0, 1, len(community_ids)))

# Map each node to its corresponding color
node_colors = [colors[community_ids.index(louvain_communities[node])] for node in Ghyperbolic.nodes()]

# Draw the graph with nodes colored by community, then draw the edges again
# on top, shaded by their weight
nx.draw(Ghyperbolic, pos=pos, with_labels=True, node_color=node_colors)
nx.draw_networkx_edges(
    Ghyperbolic, pos, edgelist=edge_weight.keys(), edge_color=edge_weight.values(), edge_cmap=plt.cm.Reds
)
plt.show()

# test

In [None]:
# Experiment: the same Jaccard-weighted projection, but onto the lender side
# of B, so two lenders are linked when they share at least one tag.
Gtest = bipartite.overlap_weighted_projected_graph(B, source_nodes, jaccard=True)
# Size of the projection: (number of lenders, number of lender-lender edges).
Gtest.number_of_nodes(), Gtest.number_of_edges()