In [None]:
# parameter cell
# Value assigned to CUDA_VISIBLE_DEVICES before the GPU libraries are imported.
DEVICES = "0"
# Loans are kept when FROM <= fundraisingDate < TO (lower bound inclusive,
# upper bound exclusive); both values also appear in the output file names.
FROM = "2023-01-01"
TO = "2024-01-01"
ORDER = 1  # passed to the biLouvian command line as the `-order` argument

In [None]:
import os

# Restrict which GPUs are visible. This is set before the GPU libraries below
# are imported so that they only see the selected device(s).
os.environ["CUDA_VISIBLE_DEVICES"] = DEVICES

import re
import cupy as cp
import pandas as pd
from tqdm import tqdm
import cudf
import networkx as nx
from cuml.preprocessing import LabelEncoder

# Register tqdm's progress-bar helpers (e.g. `progress_apply`) on pandas objects.
tqdm.pandas()

# Import raw data and filter by the parameters

In [None]:
# Read the preprocessed loan records into a cuDF DataFrame.
ds = cudf.read_parquet("../data/gen/preprocessed_2023-08-28T11-09-39_from_2019-01-01_activelender.parquet")

# Store the descriptive label columns as categoricals.
for categorical_column in ("sector_name", "geocode_country_name", "activity_name"):
    ds[categorical_column] = ds[categorical_column].astype("category")

In [None]:
# Rows whose tag is the literal "empty" placeholder or a blank string are
# discarded here; ideally the preprocessing step would already do this.
not_placeholder = ds["tags"] != "empty"
not_blank = ds["tags"] != ""
ds = ds[not_placeholder & not_blank]

# With the unwanted values gone, store the remaining tags as a categorical.
ds["tags"] = ds["tags"].astype("category")

In [None]:
# Discard rows in which every column is missing, then report what is left.
ds = ds.dropna(axis=0, how="all")
print(len(ds))
ds.tail(2)

Limit the time range for now, because I am not yet comfortable working with large data.

In [None]:
# Keep only the loans whose fundraising date lies in the half-open
# window [FROM, TO).
fundraising_date = ds["fundraisingDate"]
in_window = (fundraising_date >= FROM) & (fundraising_date < TO)
ds = ds[in_window]
"the number of Loans (might duplicated) under investigation is", len(ds)

In [None]:
# Every (project, tag, lender, loan) combination must appear at most once.
row_key = ["project_id", "tags", "lender_id", "loan_id"]
duplicate_rows = ds.duplicated(subset=row_key).sum()
assert duplicate_rows == 0

# Stats

In [None]:
# Distinct project ids in the filtered data (count() skips missing entries).
distinct_projects = ds["project_id"].unique()
"number of Loans", distinct_projects.count()

In [None]:
# Distinct lender ids in the filtered data (count() skips missing entries).
distinct_lenders = ds["lender_id"].unique()
"number of Lenders", distinct_lenders.count()

# Construct `Lender-Tag` bipartite graph

## Create edge list of the graph

In [None]:
# One row per (lender, tag) pair: keep the first public lender id seen,
# total the share amounts and count the non-missing loan_date entries.
pair_keys = ["lender_id", "tags"]
pair_aggregations = {"lender_publicId": "first", "loan_shareAmount": "sum", "loan_date": "count"}
LT = (
    ds.groupby(pair_keys)
    .agg(pair_aggregations)
    .reset_index()
    .rename(columns={"loan_shareAmount": "loan_amount", "loan_date": "loan_count"})
)

# Pairs without a positive total amount do not become edges.
LT = LT[LT["loan_amount"] > 0]
LT

## Convert to the format that `biLouvian` can understand

In [None]:
# Encode both vertex sets as integer ids in one shared id space: tags receive
# the encoder's labels (V1) and lenders are shifted to start right after the
# largest tag id (V2).
LT["tags"] = LT["tags"].astype("object")
le1 = LabelEncoder()
# Widen the encoder output to int64. The encoder may return a narrow integer
# dtype (e.g. uint8/uint16 -- TODO confirm for the installed cuML version), in
# which case adding the offset below could wrap around and make lender ids
# collide with tag ids without tripping the later uniqueness check.
LT["V1"] = le1.fit_transform(LT["tags"]).astype("int64")
le2 = LabelEncoder()
LT["V2"] = le2.fit_transform(LT["lender_id"]).astype("int64") + LT["V1"].max() + 1
LT.tail(3)

In [None]:
# Total number of vertices: distinct lenders plus distinct tags.
lender_total = LT["lender_id"].nunique()
tag_total = LT["tags"].nunique()
vertex_count = lender_total + tag_total

# The encoded ids must be exactly as numerous as the original labels.
encoded_total = LT["V1"].nunique() + LT["V2"].nunique()
assert vertex_count == encoded_total
print(vertex_count)

In [None]:
# Lookup table from tag vertex id ("id") to the tag text ("name").
dictionary1 = (
    LT[["V1", "tags"]]
    .drop_duplicates()
    .rename(columns={"tags": "name", "V1": "id"})
)
dictionary1["name"] = dictionary1["name"].astype(str)
dictionary1.sort_values(by=["id"]).tail(3)

In [None]:
# Lookup table from lender vertex id ("id") to the lender's public id ("name").
dictionary2 = (
    LT[["V2", "lender_publicId"]]
    .drop_duplicates()
    .rename(columns={"lender_publicId": "name", "V2": "id"})
)
dictionary2.head(1)

In [None]:
# Stack the tag and lender lookup tables into a single vertex dictionary.
lookup_tables = [dictionary1, dictionary2]
dictionary = cudf.concat(lookup_tables)

# There must be exactly one dictionary entry per vertex.
entry_total = len(dictionary)
print(entry_total)
assert entry_total == vertex_count
dictionary.tail()

In [None]:
# Write the two tab-separated, header-less input files for biLouvian: the
# weighted edge list (V1, V2, loan_amount) and the vertex id -> name dictionary.
prefix = f"checkpoints/LT_bipartite_active_from{FROM}_to{TO}"  # the text "bipartite" is mandatory
filename = f"{prefix}.csv"
dictfile = f"{prefix}_Dictionary.txt"  # "_Dictionary.txt" is mandatory

# Create the output directory if it is missing so that a run on a fresh
# checkout does not fail when the files are written.
os.makedirs(os.path.dirname(prefix), exist_ok=True)

LT[["V1", "V2", "loan_amount"]].to_csv(filename, sep="\t", header=False, index=False)  # "\t" is mandatory
dictionary[["id", "name"]].to_csv(dictfile, sep="\t", header=False, index=False)

# Run `biLouvian`

Note: The `biLouvian` binary must be built first. Refer to `doc/BUILD_BILOUVIAN.md` for instructions.

In [None]:
!../extra/biLouvian -i $filename -order $ORDER