Find communities in the Lender-Tag graph directly

![Lender-Tag](./images/graph_schema-INTEREST.png)

In [None]:
import pandas as pd
import networkx as nx
from sklearn.preprocessing import LabelEncoder

In [None]:
# Load the project -> tag edge list, rename its id columns and drop the relationship type.
project_tag = (
    pd.read_csv("../data/gen/project_tags.csv")
    .rename(columns={":START_ID(Loan-ID)": "project_id", ":END_ID": "tag"})
    .drop(columns=[":TYPE"])
)
# Turn `tag` into an ordered categorical whose categories follow first appearance in the file.
tag_order = project_tag["tag"].unique()
project_tag["tag"] = pd.Categorical(project_tag["tag"], categories=tag_order, ordered=True)
project_tag.head()

In [None]:
# Load the lender -> project loan edges, rename the columns and drop the
# relationship type and the loan date.
lender_project_columns = {
    ":START_ID(Lender-ID)": "lender_id",
    ":END_ID(Project-ID)": "project_id",
    "loan_shareAmount": "loan_amount",
}
lender_project = (
    pd.read_csv("../data/gen/lender_project.csv")
    .rename(columns=lender_project_columns)
    .drop(columns=[":TYPE", "loan_date"])
)
lender_project.head()

In [None]:
# Join every loan row with the tags of its project (default inner join on `project_id`).
LT = lender_project.merge(project_tag, on="project_id")
LT.head()

In [None]:
# Sanity check: the joined table must not contain any fully duplicated row.
assert not LT.duplicated().any()
# From here on LT no longer carries `project_id`; only the remaining columns are kept.
LT = LT.drop(columns=["project_id"])
LT.head()

In [None]:
# Encode every tag as an integer vertex id (column V1).
le1 = LabelEncoder()
tag_vertex_ids = le1.fit_transform(LT["tag"])
LT["V1"] = tag_vertex_ids
LT

In [None]:
# Encode every lender as an integer vertex id (column V2). The ids are shifted
# past the largest tag id so that V1 and V2 never share a value.
le2 = LabelEncoder()
first_lender_vertex = LT["V1"].max() + 1
LT["V2"] = le2.fit_transform(LT["lender_id"]) + first_lender_vertex
LT

In [None]:
# Total number of vertices in the bipartite graph: distinct lenders plus distinct tags.
lender_total = LT["lender_id"].nunique()
tag_total = LT["tag"].nunique()
vertex_count = lender_total + tag_total
# Each encoded column must have exactly as many distinct values as the column it encodes.
assert LT["V1"].nunique() == tag_total
assert LT["V2"].nunique() == lender_total
print(vertex_count)

In [None]:
# Export the weighted edge list (V1, V2, weight) as a tab-separated file
# without header or index; the weight is the loan amount cast to int.
# NOTE(review): rows sharing the same (V1, V2) pair are written as separate
# lines - confirm that the consumer of this file accepts repeated edges.
LT["weight"] = LT["loan_amount"].astype(int)
edge_columns = ["V1", "V2", "weight"]
LT.to_csv(
    "checkpoints/lender_tag_bipartite.csv",
    columns=edge_columns,
    sep="\t",
    header=False,
    index=False,
)

In [None]:
# Lookup table for the tag vertices: encoded id (V1) -> tag name.
dictionary = (
    LT[["V1", "tag"]]
    .drop_duplicates()
    .rename(columns={"V1": "id", "tag": "name"})
)
dictionary.head()

In [None]:
# Lookup table for the lender vertices: encoded id (V2) -> lender name.
# No earlier cell creates a `lender_publicId` column: the lender column is
# renamed to `lender_id`, and V2 is encoded from it. Use `lender_publicId`
# when the input file happens to provide it, otherwise fall back to
# `lender_id` instead of failing with a KeyError.
lender_name_column = "lender_publicId" if "lender_publicId" in LT.columns else "lender_id"
dictionary2 = LT[["V2", lender_name_column]].drop_duplicates()
dictionary2.rename(columns={lender_name_column: "name", "V2": "id"}, inplace=True)
dictionary2.head()

In [None]:
# Stack the tag and lender lookup tables into one id -> name dictionary,
# with every name stored as a string.
dictionary = pd.concat((dictionary, dictionary2))
dictionary = dictionary.astype({"name": str})
dictionary_size = len(dictionary)
print(dictionary_size)
# Every vertex must appear exactly once in the combined dictionary.
assert dictionary_size == vertex_count
dictionary.tail()

In [None]:
# Order the dictionary by vertex id.
dictionary = dictionary.sort_values(by="id")

In [None]:
# Write the dictionary as a tab-separated file without header or index;
# the first column is the vertex id, the second its name.
dictionary.to_csv(
    "checkpoints/lender_tag_bipartite_Dictionary.txt",
    columns=["id", "name"],
    sep="\t",
    header=False,
    index=False,
)

In [None]:
# Show the combined id -> name dictionary for inspection.
dictionary

# Run `biLouvain`

```bash
./biLouvain -d "\t" -i ../src/checkpoints/lender_tag_bipartite.csv -order 2
```

Reference: https://github.com/paolapesantez/biLouvain.git

# export to Gephi

Parse the result file, which looks like this:

```text
Community 0[V1]: 0
Community 1[V1]: 1
Community 2[V2]: 2,3
Community 3[V2]: 4

Singletons Partition V1: 2
Singletons Partition V2: 1
0,1,2,2,3
```

In [None]:
# Read the whole result file into memory (not just its last line); it is parsed below.
results_path = "checkpoints/lender_tag_bipartite_ResultsCommunities.txt"
with open(results_path) as results_file:
    result_text = results_file.read()

In [None]:
import re

# Matches one "Community <id>[V<type>]: <members>" line of the result file.
community_line = re.compile(
    r"^Community (?P<community_id>\d+)\[V(?P<vertex_type>\d+)\]: (?P<vertexes>.*)$",
    re.MULTILINE,
)

# Collect one [community_id, vertex_type, vertex_name] record per community member.
community_result = []
for found in community_line.finditer(result_text):
    community_id, vertex_type, members = found.group("community_id", "vertex_type", "vertexes")
    # Keep only the text before the first ", ", then split the members on ",".
    vertexes = members.split(", ")[0].split(",")
    print(community_id, vertex_type, vertexes)
    community_result.extend([community_id, vertex_type, vertex] for vertex in vertexes)

community_result = pd.DataFrame(community_result, columns=["community_id", "vertex_type", "vertex_name"])
community_result.head()

In [None]:
# Attach a community to each dictionary entry by matching the dictionary
# name against the vertex name parsed from the result file.
# NOTE(review): the sample output shown earlier lists numeric vertices -
# confirm that the real result file lists names rather than ids.
nodes = pd.merge(dictionary, community_result, left_on="name", right_on="vertex_name")
len(nodes)

# Create networkx graph and show the result

In [None]:
# Build the Lender-Tag graph.
# `project_id` was dropped from LT earlier, so keying the edge list on it
# raises a KeyError; the graph analysed in this notebook is Lender-Tag anyway.
# The lender column is the same one the id -> name dictionary is built from.
lender_name_column = "lender_publicId" if "lender_publicId" in LT.columns else "lender_id"

# Dictionary names are stored as strings, so cast both endpoints to str to
# make the graph's node keys match them.
edges = pd.DataFrame(
    {
        "lender": LT[lender_name_column].astype(str),
        "tag": LT["tag"].astype(str),
    }
)
G = nx.from_pandas_edgelist(edges, "lender", "tag")
G.add_nodes_from(edges["lender"].unique(), type="Lender")
G.add_nodes_from(edges["tag"].unique(), type="Tag")

# refine node attributes with community id
for row in nodes.itertuples():
    G.nodes[row.name]["community_id"] = row.community_id

print(G.number_of_nodes(), G.number_of_edges())

In [None]:
# TODO: show the co-cluster result
# TODO: only draw a subset of the lenders, since there are too many of them