In [None]:
import re

import networkx as nx
import pandas as pd
from sklearn.preprocessing import LabelEncoder
from tqdm import tqdm

# Register tqdm's progress-aware `progress_apply` helpers on pandas objects.
# NOTE(review): none of the cells visible in this notebook call them — confirm this is still needed.
tqdm.pandas()

In [None]:
# Load the project -> tag edge list, rename the ":END_ID" column to "tag",
# drop the ":TYPE" column, and fix the dtypes of the two remaining key columns.
PT = (
    pd.read_csv("../data/gen/project_tags.csv")
    .rename(columns={":END_ID": "tag"})
    .astype({"project_id": str, "tag": "category"})
    .drop(columns=[":TYPE"])
)
PT.head()

In [None]:
# Print a summary of PT: column dtypes, non-null counts and memory usage.
PT.info()

In [None]:
# Tags form the V1 partition: LabelEncoder gives every distinct tag an integer
# id starting at 0, stored alongside the original label in column "V1".
le = LabelEncoder()
tag_ids = le.fit_transform(PT["tag"])
PT["V1"] = tag_ids
PT

In [None]:
# Projects form the V2 partition. Their encoded ids are shifted past the
# largest tag id so the V1 and V2 id ranges cannot overlap.
v2_offset = PT["V1"].max() + 1
le = LabelEncoder()
PT["V2"] = le.fit_transform(PT["project_id"]) + v2_offset
PT

In [None]:
# Total number of vertices = distinct projects + distinct tags.
# The asserts confirm each encoding produced exactly one id per distinct label.
n_projects = PT["project_id"].nunique()
n_tags = PT["tag"].nunique()
vertex_count = n_projects + n_tags
assert PT["V1"].nunique() == n_tags
assert PT["V2"].nunique() == n_projects
print(vertex_count)

In [None]:
# Give every project-tag edge a constant weight of 1, then write the edge list
# as tab-separated "V1, V2, weight" rows without header or index.
PT["weight"] = 1
PT.to_csv(
    "checkpoints/project_tag_bipartite.csv",
    columns=["V1", "V2", "weight"],
    sep="\t",
    header=False,
    index=False,
)

In [None]:
# id -> name lookup for the tag side: one row per distinct (V1, tag) pair.
dictionary = (
    PT[["V1", "tag"]]
    .drop_duplicates()
    .rename(columns={"V1": "id", "tag": "name"})
)
dictionary.head()

In [None]:
# id -> name lookup for the project side: one row per distinct (V2, project_id) pair.
dictionary2 = (
    PT[["V2", "project_id"]]
    .drop_duplicates()
    .rename(columns={"V2": "id", "project_id": "name"})
)
dictionary2.head()

In [None]:
# Stack the tag and project lookups into a single id -> name table with string names.
# `dictionary` is rebound to the combined table here, so re-running this cell
# would append the project rows a second time and trip the size check below.
# Dropping repeated ids keeps the cell idempotent; on a first run nothing is dropped.
dictionary = (
    pd.concat([dictionary, dictionary2])
    .astype({"name": str})
    .drop_duplicates(subset="id")
)
print(len(dictionary))
# Every vertex (tag or project) must appear exactly once.
assert len(dictionary) == vertex_count, f"expected {vertex_count} vertices, got {len(dictionary)}"
dictionary.tail()

In [None]:
# Order the lookup table by vertex id.
dictionary = dictionary.sort_values(by="id")

In [None]:
# Write the lookup table to checkpoints/project_tag_bipartite_Dictionary.txt as
# tab-separated "id, name" rows (id first), without header or index.
dictionary.to_csv(
    "checkpoints/project_tag_bipartite_Dictionary.txt",
    columns=["id", "name"],
    sep="\t",
    header=False,
    index=False,
)

In [None]:
# Display the combined id/name lookup table for a visual check.
dictionary

# Run `biLouvain`

```bash
./biLouvain -d "\t" -i ../src/checkpoints/project_tag_bipartite.csv -order 2
```

Reference: https://github.com/paolapesantez/biLouvain.git

# Export to Gephi

Parse the result file. It looks like this:

```text
Community 0[V1]: 0
Community 1[V1]: 1
Community 2[V2]: 2,3
Community 3[V2]: 4

Singletons Partition V1: 2
Singletons Partition V2: 1
0,1,2,2,3
```

In [None]:
# Read the whole biLouvain communities report into memory; the "Community ..."
# lines are extracted from this text afterwards.
with open("checkpoints/project_tag_bipartite_ResultsCommunities.txt") as results_file:
    result_text = results_file.read()

In [None]:
# Matches one "Community <id>[V<partition>]: <members>" line. Going by the sample
# report shown above, the "Singletons Partition ..." lines and the final
# comma-separated vector do not start with "Community" and are therefore skipped.
regex = r"^Community (?P<community_id>\d+)\[V(?P<vertex_type>\d+)\]: (?P<vertexes>.*)$"


def parse_communities(text):
    """Turn a biLouvain communities report into one row per (community, vertex).

    Parameters
    ----------
    text : str
        Full contents of the results file.

    Returns
    -------
    pandas.DataFrame
        Columns ``community_id``, ``vertex_type`` and ``vertex_name``. All three
        hold the strings captured by the regex; nothing is converted to int.
    """
    rows = []
    for match in re.finditer(regex, text, re.MULTILINE):
        # Keep only the text before the first ", ", then split the members on ",".
        # NOTE(review): anything after a ", " on the line is discarded — confirm
        # that is the intended handling of the report format.
        members = match.group("vertexes").split(", ")[0].split(",")
        rows.extend(
            [match.group("community_id"), match.group("vertex_type"), member]
            for member in members
        )
    return pd.DataFrame(rows, columns=["community_id", "vertex_type", "vertex_name"])


community_result = parse_communities(result_text)
community_result.head()

In [None]:
# Attach a community to every vertex by matching the dictionary's "name" column
# against the parsed "vertex_name" column (inner join, so unmatched rows drop out).
# NOTE(review): the join uses the name only and ignores vertex_type, so a tag and
# a project that share the same string would cross-match — confirm names are
# unique across the two partitions.
nodes = pd.merge(
    dictionary,
    community_result,
    how="inner",
    left_on="name",
    right_on="vertex_name",
)
len(nodes)

# Create the NetworkX graph and export it for Gephi

In [None]:
# Build the graph with one edge per (project_id, tag) row of PT.
G = nx.from_pandas_edgelist(PT, source="project_id", target="tag")

# Label each side of the graph; adding a node that already exists only updates
# its attributes.
for column, node_type in (("project_id", "Project"), ("tag", "Tag")):
    G.add_nodes_from(PT[column].drop_duplicates(), type=node_type)

# Refine node attributes with the community id found for each vertex name.
for node_name, community in zip(nodes["name"], nodes["community_id"]):
    G.nodes[node_name]["community_id"] = community

print(G.number_of_nodes(), G.number_of_edges())
nx.write_gexf(G, "checkpoints/project_tag_bipartite_community.gexf")