In [None]:
# parameter cell
# FROM/TO bound the `fundraisingDate` filter applied below: FROM is inclusive, TO is exclusive.
FROM = "2022-01-01"
TO = "2023-01-01"
# Only rows whose `geocode_country_name` equals this value are kept.
COUNTRY = "United States"
ORDER = 1  # passed to the `-order` option of the biLouvain command line

In [None]:
import os
import re
import pandas as pd
from tqdm import tqdm
import networkx as nx
from sklearn.preprocessing import LabelEncoder

# Register tqdm's pandas integration (adds the `progress_apply` family of methods).
tqdm.pandas()

# Import preprocessed data (active lenders only) and filter by the parameters

In [None]:
# Load the preprocessed active-lender snapshot, store three text columns as
# categoricals, and drop the `tags` column.
ds = (
    pd.read_parquet("../data/gen/preprocessed_2023-08-28T11-09-39_from_2019-01-01_activelender.parquet")
    .astype({"sector_name": "category", "geocode_country_name": "category", "activity_name": "category"})
    .drop(columns=["tags"])
)

In [None]:
# How many rows have no sector name?
ds["sector_name"].isna().sum()

In [None]:
# Discard rows in which every column is missing, then show what is left.
ds = ds.dropna(axis=0, how="all")
print(len(ds))
ds.tail(2)

In [None]:
# Keep loans whose fundraising date lies in the half-open window [FROM, TO).
on_or_after_start = ds["fundraisingDate"] >= FROM
before_end = ds["fundraisingDate"] < TO
ds = ds[on_or_after_start & before_end]
"the number of Loans under investigation is", len(ds)

In [None]:
# Restrict the data to the configured country.
ds = ds.loc[ds["geocode_country_name"] == COUNTRY]

In [None]:
# Remove fully identical rows. `ds` was produced by boolean filtering, so it is
# rebound here instead of mutated in place (in-place edits on a filtered frame
# can raise SettingWithCopyWarning); the cell is also safe to re-run.
ds = ds.drop_duplicates()

In [None]:
# Count rows that repeat an earlier (project, lender, share amount, date) combination.
loan_key = ["project_id", "lender_id", "loan_shareAmount", "loan_date"]
ds.duplicated(subset=loan_key).sum()

# Stats

In [None]:
# Distinct projects in the filtered data.
"number of Projects", ds["project_id"].nunique()

In [None]:
# Distinct lenders in the filtered data.
"number of Lenders", ds["lender_id"].nunique()

In [None]:
# Distinct loans in the filtered data.
"number of Loans", ds["loan_id"].nunique()

In [None]:
# Distinct sectors in the filtered data.
"number of Sectors", ds["sector_name"].nunique()

# Construct `Lender-Sector` bipartite graph

## Create edge list of the graph

In [None]:
# One row per (lender, sector) pair, i.e. one weighted edge of the bipartite graph:
#   loan_amount = sum of the lender's share amounts in that sector
#   loan_count  = number of loan rows behind that sum
LS = (
    ds.groupby(["lender_id", "sector_id"], observed=True)
    .agg({"sector_name": "first", "loan_shareAmount": "sum", "loan_date": "count", "lender_publicId": "first"})
    .reset_index()
    .rename(columns={"loan_shareAmount": "loan_amount", "loan_date": "loan_count"})
)
# Keep only edges with a positive amount. `.copy()` makes LS an independent frame,
# so the columns added in later cells do not trigger SettingWithCopyWarning.
LS = LS[LS["loan_amount"] > 0].copy()
LS

## Convert to the format that `biLouvain` can understand

In [None]:
# sector_name is cast from category to object before encoding: as noted by the
# original author, encoding the categorical directly would yield the category
# index, whereas the ids here must form the compact range [0, n-1].
LS["sector_name"] = LS["sector_name"].astype("object")

# Sector vertices (V1) take the ids 0 .. n_sectors-1.
le1 = LabelEncoder().fit(LS["sector_name"])
LS["V1"] = le1.transform(LS["sector_name"])

# Lender vertices (V2) continue right after the largest sector id.
lender_offset = LS["V1"].max() + 1
le2 = LabelEncoder().fit(LS["lender_id"])
LS["V2"] = le2.transform(LS["lender_id"]) + lender_offset
LS.tail(3)

In [None]:
# Total number of vertices = distinct lenders + distinct sectors;
# the encoded id columns must contain exactly as many distinct values.
n_lenders = LS["lender_id"].nunique()
n_sectors = LS["sector_name"].nunique()
vertex_count = n_lenders + n_sectors
assert vertex_count == LS["V1"].nunique() + LS["V2"].nunique()
print(vertex_count)

In [None]:
# id -> name lookup for the sector (V1) vertices.
dictionary1 = (
    LS[["V1", "sector_name"]]
    .drop_duplicates()
    .rename(columns={"sector_name": "name", "V1": "id"})
)
dictionary1["name"] = dictionary1["name"].astype(str)
dictionary1.sort_values(by=["id"]).tail(3)

In [None]:
# id -> name lookup for the lender (V2) vertices, named by their public id.
dictionary2 = (
    LS[["V2", "lender_publicId"]]
    .drop_duplicates()
    .rename(columns={"lender_publicId": "name", "V2": "id"})
)
dictionary2.head(1)

In [None]:
# Stack the sector and lender lookups into a single vertex dictionary.
dictionary = pd.concat([dictionary1, dictionary2])
print(len(dictionary))

# Sanity checks: one entry per vertex, ids spanning 0 .. vertex_count-1, no repeated rows.
assert len(dictionary) == vertex_count
vertex_ids = dictionary["id"]
assert vertex_ids.min() == 0
assert vertex_ids.max() == vertex_count - 1
assert dictionary.duplicated().sum() == 0
dictionary.tail()

In [None]:
# Create the output directory if needed; to_csv does not create missing directories.
os.makedirs("checkpoints", exist_ok=True)

# Derive the country tag from COUNTRY so that runs for different countries do not
# overwrite (or mislabel) each other's files. The default "United States" keeps
# the historical "US" tag, so existing file names are unchanged.
country_tag = "US" if COUNTRY == "United States" else re.sub(r"\W+", "", COUNTRY)

prefix = f"checkpoints/LS_{country_tag}_active_from{FROM}_to{TO}_order{ORDER}_bipartite"  # the text "bipartite" is mandatory
filename = f"{prefix}.csv"
dictfile = f"{prefix}_Dictionary.txt"  # "_Dictionary.txt" is mandatory

# Edge list: sector id, lender id, weight. No header, tab-separated.
LS[["V1", "V2", "loan_amount"]].to_csv(filename, sep="\t", header=False, index=False)  # "\t" is mandatory
# Vertex dictionary: id, name. Same format.
dictionary[["id", "name"]].to_csv(dictfile, sep="\t", header=False, index=False)

# Run `biLouvain`

Note: the `biLouvain` binary must be built first; see `doc/BUILD_BILOUVIAN.md`.

In [None]:
# Show the exact shell command executed by the next cell. The binary name must
# match that cell (`biLouvain`), otherwise the printed command cannot be copy-pasted.
print(f"../extra/biLouvain -i {filename} -order {ORDER}")

In [None]:
# Run the biLouvain binary on the edge-list file; IPython substitutes $filename and $ORDER.
!../extra/biLouvain -i $filename -order $ORDER

Read the result

In [None]:
# NOTE(review): this wildcard import is presumably where `result_mutaraplus`,
# `result_community` and `VertexType` (used in the following cells) come from;
# confirm and consider importing them explicitly.
from biLouvian_helper import *

# presumably reads the biLouvain output that shares `prefix` — TODO confirm against the helper
result_mutaraplus(prefix)

In [None]:
# Echo the checkpoint prefix that the input and result file names are built from.
prefix

In [None]:
# Load the community result for this run; the cells below read `result.clusters`
# and `result.coclusters`. (Exact file layout is defined by the helper — TODO confirm.)
result = result_community(prefix)

In [None]:
# Number of clusters and number of co-clusters in the result.
len(result.clusters), len(result.coclusters)

In [None]:
import numpy as np

# Collect the members of every V1-type cluster (one list per cluster), then
# flatten them so that members of the same cluster sit next to each other.
# Reminder from the original author: result.clusters is a set.
v1 = [list(cluster.member) for cluster in result.clusters if cluster.type == VertexType.V1]
sorted_v1 = np.concatenate(v1)
sorted_v1

In [None]:
# Same as for V1: members of every V2-type cluster, flattened cluster by cluster.
v2 = [list(cluster.member) for cluster in result.clusters if cluster.type == VertexType.V2]
sorted_v2 = np.concatenate(v2)
len(sorted_v2)

In [None]:
# Read the tab-separated, header-less edge list back from disk.
filename = f"{prefix}.csv"
edge_columns = ["V1", "V2", "weight"]
edge_list = pd.read_csv(filename, sep="\t", header=None, names=edge_columns)
edge_list.head(1)

In [None]:
# Read the tab-separated, header-less vertex dictionary (id, name) back from disk.
dictfile = f"{prefix}_Dictionary.txt"
dictionary_columns = ["id", "name"]
dictionary = pd.read_csv(dictfile, sep="\t", header=None, names=dictionary_columns)
dictionary.head(1)

In [None]:
# Attach the dictionary name of each endpoint: first the V1 vertex, then the V2 vertex.
edge_list = (
    edge_list.merge(dictionary, left_on="V1", right_on="id")
    .rename(columns={"name": "V1_name"})
    .merge(dictionary, left_on="V2", right_on="id")
    .rename(columns={"name": "V2_name"})
)
edge_list.head(1)

In [None]:
import matplotlib.pyplot as plt

# Distribution of edge weights, with counts on a log scale.
weights = edge_list["weight"]

fig, ax = plt.subplots()
ax.hist(weights, bins=10, log=True)
ax.set_xlabel("Weight")
ax.set_ylabel("Frequency (log scale)")
ax.set_title("Histogram of Edge Weights")
plt.show()

In [None]:
# Build the bipartite graph: V1 vertices form side 0, V2 vertices form side 1.
# Nodes are inserted in cluster order (the original author notes the order matters).
B = nx.Graph()
B.add_nodes_from(sorted_v1, bipartite=0)
B.add_nodes_from(sorted_v2, bipartite=1)

# Edges are added unweighted, keyed by vertex name.
# (Weighted alternative: add_weighted_edges_from on V1_name, V2_name, weight.)
name_pairs = edge_list[["V1_name", "V2_name"]].to_numpy().tolist()
B.add_edges_from(name_pairs)

# Adding edges must not have created nodes or collapsed duplicate edges.
expected_node_count = len(sorted_v1) + len(sorted_v2)
assert B.number_of_nodes() == expected_node_count
assert B.number_of_edges() == len(edge_list)
B.number_of_nodes(), B.number_of_edges()

In [None]:
# Number of repeated (V1_name, V2_name) pairs in the edge list.
edge_list.duplicated(subset=["V1_name", "V2_name"]).sum()

In [None]:
import numpy as np
from networkx.algorithms import bipartite

# Get the biadjacency matrix of graph B: rows follow sorted_v2, columns follow sorted_v1
biadjacency_matrix = bipartite.biadjacency_matrix(B, row_order=sorted_v2, column_order=sorted_v1)

# Convert the matrix to a numpy array
biadjacency_array = biadjacency_matrix.toarray()

# Plot the biadjacency matrix as an image
plt.imshow(biadjacency_array, origin="lower")
# imshow puts matrix columns on the x axis and rows on the y axis,
# so x corresponds to V1 (column_order) and y to V2 (row_order).
plt.xlabel("V1")
plt.ylabel("V2")
plt.title("Biadjacency Matrix")
plt.gca().set_aspect(aspect=0.001)
# plt.colorbar()

# Disable ticks in all axes
plt.tick_params(axis="both", which="both", bottom=False, top=False, left=False, right=False)

# Also disable values in axis
plt.xticks([])
plt.yticks([])

# enable grid
# NOTE(review): grid lines are drawn at tick positions, so with the ticks removed
# above this call likely has no visible effect — confirm.
plt.grid(True, which="both", axis="both", linestyle="-", color="w", linewidth=1)

plt.show()