Analysis of the tag graph

![common tag](./images/graph_schema-TAG_TAG_COMMON_PROJECT.png)

Note that the projection shown above is by no means the weight we should use.

# Import bipartite data and define the weight

In [None]:
import pandas as pd
import networkx as nx
from networkx.algorithms import bipartite
from matplotlib import pyplot as plt
import forceatlas2

from tqdm import tqdm

# Register tqdm's progress-bar variants (`progress_apply`, ...) on pandas objects.
tqdm.pandas()

In [None]:
# Load the project-tag edge list and give its columns short names.
project_tag = (
    pd.read_csv("../data/gen/project_tags.csv")
    .rename(columns={":START_ID(Loan-ID)": "project_id", ":END_ID": "tag"})
    .drop(columns=[":TYPE"])
)
# An ordered categorical allows two tag columns to be compared with `>` later on.
project_tag["tag"] = project_tag["tag"].astype("category").cat.as_ordered()
project_tag.head()

In [None]:
# Build the undirected bipartite project-tag graph from the edge list.
B = nx.from_pandas_edgelist(project_tag, source="project_id", target="tag", create_using=nx.Graph())
# Name the project side explicitly: without `top_nodes`, `bipartite.sets` has to
# 2-colour the graph and raises AmbiguousSolution as soon as the graph has more
# than one connected component. With `top_nodes`, the second set is simply
# every remaining node, i.e. the tags.
source_nodes, target_nodes = bipartite.sets(B, top_nodes=set(project_tag["project_id"]))
len(source_nodes), len(target_nodes)

In [None]:
# Label every node with the side of the bipartite graph it belongs to,
# then export the annotated graph as GEXF.
B.add_nodes_from(source_nodes, type="Project")
B.add_nodes_from(target_nodes, type="Tag")
nx.write_gexf(B, "../data/gen/project_tags.gexf")

Note that the table above is the edge list of a bipartite graph whose two node types are `project_id` and `tag`.
We study this graph under the hypothesis that there may be *communities* of tags that contribute the same type of impact.

Now project the *bipartite* graph onto the *tag* nodes. We will try two popular weights:

- *Jaccard index* or Intersection over Union

  $$w_{u,v} = \frac {\mid N(u) \cap N(v) \mid} {\mid N(u) \cup N(v) \mid}$$

  Borgatti, S.P. and Halgin, D. In press. Analyzing Affiliation Networks. In Carrington, P. and Scott, J. (eds) The Sage Handbook of Social Network Analysis. Sage Publications.


- *Hyperbolic weight*

  $$w_{u, v} = \sum_k \frac{\delta^k_u\delta^k_v}{d_k - 1}$$

  Scientific collaboration networks: II. Shortest paths, weighted networks, and centrality, M. E. J. Newman, Phys. Rev. E 64, 016132 (2001).

  where:
  - $k$ runs over the nodes of the other set (here, the projects)
  - $d_k$ is the degree of node $k$
  - $\delta_{u}^{k}$ is $1$ if node $u$ is linked to node $k$, and $0$ otherwise



## Jaccard weight

In [None]:
# Project the bipartite graph onto the tag side; with `jaccard=True` (the
# library default) each edge weight is the Jaccard index of the two tags'
# neighbour sets.
Gjaccard = bipartite.overlap_weighted_projected_graph(B, target_nodes, jaccard=True)
len(Gjaccard.nodes), len(Gjaccard.edges)

In [None]:
# Compute a ForceAtlas2 layout for the tag graph and collect its edge weights.
pos = forceatlas2.forceatlas2_networkx_layout(Gjaccard, niter=1000, scalingRatio=20.0)
edge_weight = nx.get_edge_attributes(Gjaccard, "weight")
weighted_edges = list(edge_weight)
weights = list(edge_weight.values())

# First pass: nodes, labels and plain edges.
nx.draw(Gjaccard, pos=pos, with_labels=True, node_color="red")
# Second pass over the edges only: colour each edge by its weight on the Reds colormap.
nx.draw_networkx_edges(
    Gjaccard,
    pos,
    edgelist=weighted_edges,
    edge_color=weights,
    edge_cmap=plt.cm.Reds,
)
plt.show()

In [None]:
# Naive community detection on the tag graph with the Louvain algorithm
# (fixed seed so the partition is reproducible).
community = nx.community.louvain_communities(Gjaccard, resolution=1.1, seed=123)

# Map every tag to the position of the community that contains it.
community_index = {}
for community_id, members in enumerate(community):
    for node in members:
        community_index[node] = community_id

# One row per tag: columns "tag" and "louvain_community".
partition = pd.DataFrame(list(community_index.items()), columns=["tag", "louvain_community"])
partition.groupby("louvain_community").agg(list)

In [None]:
# Store each tag's Louvain community id as a node attribute of the graph.
for tag, community_id in zip(partition["tag"], partition["louvain_community"]):
    Gjaccard.nodes[tag]["louvain_community"] = community_id

In [None]:
# Save the Jaccard-weighted tag projection, including its node attributes, as a GEXF file.
nx.write_gexf(Gjaccard, "../data/gen/tag_tag_common_loan_jaccard.gexf")

## Hyperbolic weight

In [None]:
# Project the bipartite graph onto the tag side using Newman's collaboration
# (hyperbolic) weighting.
Ghyperbolic = bipartite.collaboration_weighted_projected_graph(B, target_nodes)
len(Ghyperbolic.nodes), len(Ghyperbolic.edges)

In [None]:
# Compute a ForceAtlas2 layout for the tag graph and collect its edge weights.
pos = forceatlas2.forceatlas2_networkx_layout(Ghyperbolic, niter=1000, scalingRatio=20.0)
edge_weight = nx.get_edge_attributes(Ghyperbolic, "weight")
weighted_edges = list(edge_weight)
weights = list(edge_weight.values())

# First pass: nodes, labels and plain edges.
nx.draw(Ghyperbolic, pos=pos, with_labels=True, node_color="red")
# Second pass over the edges only: colour each edge by its weight on the Reds colormap.
nx.draw_networkx_edges(
    Ghyperbolic,
    pos,
    edgelist=weighted_edges,
    edge_color=weights,
    edge_cmap=plt.cm.Reds,
)
plt.show()

In [None]:
# Naive community detection on the tag graph with the Louvain algorithm
# (fixed seed so the partition is reproducible).
community = nx.community.louvain_communities(Ghyperbolic, resolution=1.1, seed=123)

# Map every tag to the position of the community that contains it.
community_index = {}
for community_id, members in enumerate(community):
    for node in members:
        community_index[node] = community_id

# One row per tag: columns "tag" and "louvain_community".
partition = pd.DataFrame(list(community_index.items()), columns=["tag", "louvain_community"])
partition.groupby("louvain_community").agg(list)

In [None]:
# Store each tag's Louvain community id as a node attribute of the graph.
for tag, community_id in zip(partition["tag"], partition["louvain_community"]):
    Ghyperbolic.nodes[tag]["louvain_community"] = community_id

In [None]:
# Save the hyperbolic-weighted tag projection, including its node attributes, as a GEXF file.
nx.write_gexf(Ghyperbolic, "../data/gen/tag_tag_common_loan_hyperbolic.gexf")

# Community finding

# OLD - Manually create Jaccard weight

project the bipartite graph onto the *tag* nodes

In [None]:
# Self-join on project: every row pairs two tags (`tag_x`, `tag_y`) that
# appear on the same project.
merged = pd.merge(project_tag, project_tag, how="inner", on="project_id", suffixes=("_x", "_y"))
merged.head(3)

In the table above, each row holds a `project_id` and two tags that both belong to that project.   
Note that the self-merge also produces rows where `tag_x` and `tag_y` are the same; we filter those out.
Because the weight is symmetric, we also keep only half of the table: the rows where `tag_x` > `tag_y`.

In [None]:
# Keep each unordered tag pair once and drop self-pairs: only rows whose
# first tag sorts strictly after the second survive.
is_upper_half = merged["tag_x"] > merged["tag_y"]
filtered = merged.loc[is_upper_half]
filtered.head()

In [None]:
# Count, for every tag pair, the distinct projects that carry both tags.
# `observed=True` restricts the result to pairs that actually co-occur: the tag
# columns are categorical, so the legacy `observed=False` default emits one row
# for every combination of categories (mostly zeros) and raises a FutureWarning
# on recent pandas versions.
# NOTE: despite its name, the "union" column holds the size of the
# *intersection* |N(u) ∩ N(v)|; the name is kept because later cells read it.
inter = (
    filtered.groupby(["tag_x", "tag_y"], observed=True)
    .nunique()
    .rename(columns={"project_id": "union"})
    .reset_index()
)
inter.head()

Note that

$${|T_1 \cup T_2|} = |T_1| + |T_2| - |T_1 \cap T_2|$$

In [None]:
# Number of distinct projects per tag, i.e. |N(tag)|.
# `observed=True` is spelled out because "tag" is categorical and relying on the
# legacy default raises a FutureWarning on recent pandas versions; the result is
# unchanged here since the categories were derived from this very column.
pro_by_tag = (
    project_tag.groupby("tag", observed=True)
    .nunique()
    .rename(columns={"project_id": "nunique"})
    .reset_index()
)
pro_by_tag.head()

In [None]:
# Attach the project count of the first tag of each pair as "nunique_x".
tag_x_counts = pro_by_tag.rename(columns={"nunique": "nunique_x"})
pair = pd.merge(inter, tag_x_counts, left_on="tag_x", right_on="tag").drop(columns=["tag"])
pair.head()

In [None]:
# Attach the project count of the second tag of each pair as "nunique_y".
tag_y_counts = pro_by_tag.rename(columns={"nunique": "nunique_y"})
pair = pd.merge(pair, tag_y_counts, left_on="tag_y", right_on="tag").drop(columns=["tag"])
pair

In [None]:
# Inclusion-exclusion: |A ∪ B| = |A| + |B| - |A ∩ B|.
# NOTE: the column names are swapped relative to their content: "union" holds
# the intersection size and "overlap" receives the union size.
pair["overlap"] = pair["nunique_x"].add(pair["nunique_y"]).sub(pair["union"])
pair

In [None]:
# Drop tag pairs that never appear together on any project.
shares_a_project = pair["union"] > 0
pair = pair.loc[shares_a_project]

In [None]:
# Jaccard index = |intersection| / |union|. In this table the intersection size
# lives in "union" and the union size in "overlap" (the names are swapped).
# `assign` builds a new frame instead of writing a column into the
# boolean-filtered slice, which avoids pandas' SettingWithCopyWarning.
pair = pair.assign(weight=pair["union"] / pair["overlap"])
pair

`pair` and `Gjaccard` should represent the same graph

In [None]:
# Every remaining tag pair must correspond to exactly one edge of the projection.
n_pairs, n_edges = len(pair), Gjaccard.number_of_edges()
assert n_pairs == n_edges

In [None]:
import math

# Check every manually computed pair (not just a sample) against the
# networkx projection.
for row in pair.itertuples():
    edge = Gjaccard.get_edge_data(row.tag_x, row.tag_y)
    # `get_edge_data` returns None for a missing edge; fail with a readable message
    # instead of a TypeError on the subscript below.
    assert edge is not None, f"missing edge ({row.tag_x}, {row.tag_y}) in Gjaccard"
    # Compare floats with a tolerance rather than bit-for-bit equality.
    assert math.isclose(row.weight, edge["weight"]), (row.tag_x, row.tag_y, row.weight, edge["weight"])