In [None]:
import pandas as pd
import networkx as nx
from tqdm import tqdm

tqdm.pandas()

In [None]:
# Load the lender -> project loan table, give the two ID columns short names,
# and discard the columns that are not used further down in this notebook.
LP = (
    pd.read_csv("../data/gen/lender_project.csv")
    .rename(columns={":END_ID(Project-ID)": "project_id", ":START_ID(Lender-ID)": "lender_id"})
    .drop(columns=[":TYPE", "loan_date"])
)
LP.head()

In [None]:
# Load the project -> tag table. The tag column is stored as a pandas
# categorical; the relationship-type column is not needed here.
PT = (
    pd.read_csv("../data/gen/project_tags.csv")
    .rename(columns={":END_ID": "tag"})
    .astype({"tag": "category"})
    .drop(columns=[":TYPE"])
)
PT.head()

In [None]:
# Inner join (the pandas default) of loans with project tags on project_id:
# one row per (loan, tag of the funded project).
merged = LP.merge(PT, on="project_id")
merged.info()

In [None]:
# Sanity check: the join must not contain fully identical rows.
assert not merged.duplicated().any()
# project_id was only needed as the join key; remove it before aggregating.
merged = merged.drop(columns=["project_id"])
merged

In [None]:
# Aggregate the loans into one weighted edge per (lender, tag) pair:
# keep the lender's public id and sum the loaned amounts.
#
# `tag` is a categorical column, so observed=True is required to group only
# over the (lender, tag) pairs that actually occur. Without it, pandas < 3
# materialises every lender x tag combination, which is extremely slow and
# memory-hungry and only produces zero-amount rows that get filtered out below.
LT = (
    merged.groupby(["lender_id", "tag"], observed=True)
    .agg({"lender_publicId": "first", "loan_shareAmount": "sum"})
    .reset_index()
    .rename(columns={"loan_shareAmount": "loan_amount"})
)
# Keep only edges with a positive total amount. The explicit copy makes LT an
# independent frame, so the cells below can add columns to it without
# triggering SettingWithCopyWarning.
LT = LT[LT["loan_amount"] > 0].copy()
LT

In [None]:
from sklearn.preprocessing import LabelEncoder

# Assign each distinct lender_id a dense integer vertex id (partition V1).
le = LabelEncoder()
le.fit(LT["lender_id"])
LT["V1"] = le.transform(LT["lender_id"])
LT

In [None]:
from sklearn.preprocessing import LabelEncoder

# Assign each distinct tag an integer vertex id (partition V2), shifted past
# the largest V1 id so the two id ranges never overlap.
le = LabelEncoder()
tag_codes = le.fit_transform(LT["tag"])
LT["V2"] = tag_codes + (LT["V1"].max() + 1)
LT

In [None]:
# Total number of vertices in the bipartite graph: distinct lenders plus
# distinct tags. The encoded id columns must yield the same count.
vertex_count = sum(LT[column].nunique() for column in ("lender_id", "tag"))
assert LT["V1"].nunique() + LT["V2"].nunique() == vertex_count
print(vertex_count)

In [None]:
from pathlib import Path

# Write the weighted edge list (V1, V2, loan_amount) as a tab-separated file
# without header or index; this is the input file passed to biLouvain below.
# to_csv does not create missing parent directories, so make sure the output
# directory exists first (e.g. on a fresh checkout).
Path("checkpoints").mkdir(parents=True, exist_ok=True)
LT[["V1", "V2", "loan_amount"]].to_csv("checkpoints/lender_tag_bipartite.csv", sep="\t", header=False, index=False)

In [None]:
# Lookup table for the lender partition: vertex id (V1) -> lender public id.
dictionary = (
    LT[["V1", "lender_publicId"]]
    .drop_duplicates()
    .rename(columns={"V1": "id", "lender_publicId": "name"})
)
dictionary.head()

In [None]:
# Lookup table for the tag partition: vertex id (V2) -> tag.
dictionary2 = (
    LT[["V2", "tag"]]
    .drop_duplicates()
    .rename(columns={"V2": "id", "tag": "name"})
)
dictionary2.head()

In [None]:
# Stack the lender and tag lookup tables into a single id -> name dictionary.
# This cell rebinds `dictionary` to a value derived from itself, so running it
# a second time would append the tag rows again. Dropping exact duplicate rows
# makes the cell safe to re-run; it is a no-op on the first run because both
# inputs are already de-duplicated and their id ranges are disjoint. Rows that
# share an id but differ in name are kept, so the assertion below still
# catches real inconsistencies.
dictionary = pd.concat([dictionary, dictionary2]).drop_duplicates()
print(len(dictionary))
assert len(dictionary) == vertex_count
dictionary.tail()

In [None]:
# Save the vertex dictionary as tab-separated rows, id first and name second,
# with neither a header line nor the dataframe index.
dictionary.loc[:, ["id", "name"]].to_csv(
    "checkpoints/lender_tag_bipartite_Dictionary.txt",
    sep="\t",
    header=False,
    index=False,
)

# Run `biLouvain`

```bash
./biLouvain -d "\t" -i ../src/checkpoints/lender_tag_bipartite.csv  
```

Reference: https://github.com/paolapesantez/biLouvain.git

# Export to Gephi

Parse the result file produced by biLouvain. The result file looks like this:

```text
Community 0[V1]: 0
Community 1[V1]: 1
Community 2[V2]: 2,3
Community 3[V2]: 4

Singletons Partition V1: 2
Singletons Partition V2: 1
0,1,2,2,3
```

In [None]:
# Read the whole biLouvain result file into memory; the "Community ..." lines
# are extracted from this text in the next cell.
with open("checkpoints/lender_tag_bipartite_ResultsCommunities.txt", "r", encoding="utf-8") as f:
    result_text = f.read()

In [None]:
import re

# Matches one community line of the result file, e.g. "Community 2[V2]: 2,3":
# the community number, the partition number after "V", and the raw member list.
regex = r"^Community (?P<community_id>\d+)\[V(?P<vertex_type>\d+)\]: (?P<vertexes>.*)$"

# One [community_id, vertex_type, vertex] record per member vertex.
community_rows = []
for match in re.finditer(regex, result_text, re.MULTILINE):
    community_id = match.group("community_id")
    vertex_type = match.group("vertex_type")
    # Only the text before the first ", " is used as the member list.
    # NOTE(review): the sample output shows plain "2,3" lists; confirm what
    # biLouvain can emit after a ", " and whether dropping it is intended.
    vertexes = match.group("vertexes").split(", ")[0]
    for vertex in vertexes.split(","):
        vertex = vertex.strip()
        # Skip empty tokens (empty member list, trailing comma); they would
        # otherwise make the integer conversion below fail.
        if vertex:
            community_rows.append([community_id, vertex_type, vertex])

# community_id and vertex_type stay as the matched strings; only the vertex id
# is converted to int so it can be joined against the dictionary ids.
community_result = pd.DataFrame(community_rows, columns=["community_id", "vertex_type", "vertex"])
community_result["vertex"] = community_result["vertex"].astype(int)
community_result

In [None]:
# Attach the community assignment to every dictionary entry by matching the
# dictionary id against the parsed vertex id (inner join), then drop the
# now-redundant id column.
dictionary = pd.merge(dictionary, community_result, left_on="id", right_on="vertex")
dictionary = dictionary.drop(columns="id")
dictionary.head()

# Create networkx graph and show the result

In [None]:
# Build the lender-tag graph: one edge per (lender, tag) pair carrying the
# summed loan_amount as an edge attribute.
G = nx.from_pandas_edgelist(LT, "lender_publicId", "tag", "loan_amount")
# Label every node with the side of the bipartite graph it belongs to.
G.add_nodes_from(LT["lender_publicId"].drop_duplicates(), type="Lender")
G.add_nodes_from(LT["tag"].drop_duplicates(), type="Tag")

# refine node attributes with community id
for row in dictionary.itertuples():
    G.nodes[row.name]["community_id"] = row.community_id

# Export for Gephi.
nx.write_gexf(G, "checkpoints/lender_tag_bipartite_community.gexf")

# Kept as the last expression so the notebook actually displays the
# (node count, edge count) summary; in the middle of the cell it was discarded.
G.number_of_nodes(), G.number_of_edges()