In [None]:
# parameter cell
DEVICES = "1,2"  # exported as CUDA_VISIBLE_DEVICES in the import cell
FROM = "2019-01-01"  # inclusive lower bound applied to fundraisingDate
TO = "2020-01-01"  # exclusive upper bound applied to fundraisingDate
COUNTRY = "Vietnam"  # geocode_country_name to keep; "all" (any case) disables the country filter
ORDER = 1  # passed as the `-order` argument of the biLouvian call

In [None]:
import os

# Set before the CUDA-backed imports below (cupy / cudf / cuml);
# presumably so that they only see the GPUs listed in DEVICES.
os.environ["CUDA_VISIBLE_DEVICES"] = DEVICES

import re
import cupy as cp
import pandas as pd
from tqdm import tqdm
import cudf
import networkx as nx
from cuml.preprocessing import LabelEncoder

# Enables the `progress_apply` methods used later on pandas objects.
tqdm.pandas()

# Import raw data and filter by the Parameters

In [None]:
# Read the raw activity snapshot into a cuDF frame.
ds = cudf.read_parquet("../fulldata/kiva_activity_2023-08-28T11-09-39.parquet")
# Convert the text label columns to the categorical dtype.
for categorical_col in ("sector_name", "geocode_country_name", "activity_name"):
    ds[categorical_col] = ds[categorical_col].astype("category")

In [None]:
# Discard rows in which every column is missing, then show what is left.
ds = ds.dropna(how="all")
print(len(ds))
ds.tail(2)

In [None]:
# COUNTRY == "all" (case-insensitive) means: keep every country.
keep_every_country = COUNTRY.lower() == "all"
if not keep_every_country:
    ds = ds[ds["geocode_country_name"] == COUNTRY]
len(ds)

Limit the time window, because I am not yet comfortable working with large data

In [None]:
# Keep loans whose fundraising date lies in the half-open window [FROM, TO).
in_window = (ds["fundraisingDate"] >= FROM) & (ds["fundraisingDate"] < TO)
ds = ds[in_window]
"the number of Loans (might duplicated) under investigation is", len(ds)

# Basic process

In [None]:
# Give the generic `id` / `name` columns project-specific names.
ds = ds.rename(columns={"id": "project_id", "name": "project_name"})

# Preprocessing

## Remove duplicated Projects

Some Projects share the same `project_id` but have different `fundedAmount` values.
This is probably because they were queried at different times.
Here, only the record with the highest `fundedAmount` is kept for each Project.

In [None]:
# For every project, find the row holding the largest funded amount.
best_rows = ds.groupby("project_id", group_keys=False)[["loanFundraisingInfo_fundedAmount"]].idxmax()
# NOTE: the values are consumed as row positions (iloc), not as index labels (loc).
# NOTE(review): this relies on what the installed cuDF returns from groupby idxmax — confirm on upgrade.
best_positions = best_rows["loanFundraisingInfo_fundedAmount"].values
ds = ds.iloc[best_positions]
del best_positions
del best_rows

In [None]:
# Every project_id must now occur exactly once.
assert ds.duplicated(subset=["project_id"], keep=False).sum() == 0

In [None]:
"the number of Projects (no duplicated) under investigation is", len(ds)

## create `Lender-Project-Tag` df

In [None]:
# One row per element of each project's `lendingActions_values` list.
ads = ds.explode("lendingActions_values")
ads.shape[0]

In [None]:
# Loans without any lender have a missing lending action after the explode; drop them.
ads = ads.dropna(subset=["lendingActions_values"])

In [None]:
# Dict processing cannot be done in cuDF, so move the frame to pandas for the
# nested-field extraction in the next cell.
adf = ads.to_pandas()

In [None]:
# Pull the nested lender / share fields out of each lending-action record.
# Applying over the single column (instead of DataFrame.apply(axis=1)) avoids
# building a Series for every row, and still yields a Series when `adf` is empty.
actions = adf["lendingActions_values"]
adf["lender_id"] = actions.progress_apply(lambda action: action["lender"]["id"]).astype(int)
adf["lender_name"] = actions.progress_apply(lambda action: action["lender"]["name"])
adf["lender_publicId"] = actions.progress_apply(lambda action: action["lender"]["publicId"])
adf["loan_shareAmount"] = actions.progress_apply(lambda action: action["shareAmount"]).astype(float)
adf["loan_date"] = pd.to_datetime(actions.progress_apply(lambda action: action["latestSharePurchaseDate"]))
# Do not keep an extra reference to the raw column alive.
del actions

In [None]:
# cuDF does not work with timezones yet, so drop the timezone information
# before the frame is converted back to cuDF.
adf["loan_date"] = adf["loan_date"].dt.tz_localize(None)

In [None]:
# The raw records are no longer needed once their fields have been extracted.
adf = adf.drop(columns=["lendingActions_values"])

In [None]:
# Convert the pandas frame back to cuDF and drop the pandas copy.
ads = cudf.from_pandas(adf)
del adf

In [None]:
# A loan is identified by (project, lender, amount, date); no two rows may share that key.
loan_key = ["project_id", "lender_id", "loan_shareAmount", "loan_date"]
assert not ads.duplicated(subset=loan_key).any()

In [None]:
# Sequential 1-based identifier for every loan row.
ads["loan_id"] = cp.arange(1, len(ads) + 1)

explode `tags`

In [None]:
# One row per element of each loan's `tags` list.
ads = ads.explode("tags")

## Tag preprocessing

In [None]:
# there are many Loans that do not have tags
untagged_projects = ads[ads["tags"].isna()].project_id.unique().count()
total_projects = ads.project_id.unique().count()
untagged_projects, "~", untagged_projects / total_projects * 100, "percent"

In [None]:
# Label loans without any tag with the placeholder tag `empty`.
# The result is assigned back to the column: `ads[["tags"]]` is a new frame, so
# filling it in place would leave `ads` itself unchanged.
ads["tags"] = ads["tags"].fillna("empty")

Remove some tags 
The following tags should be removed, because they are not visible to Users:  
- `user_favorite`
- `user_like`
- `volunteer_like`
- `volunteer_pick`

If a project has **only** those tags, change all of them to `empty`, then remove duplicates again.  
If a project also has other tags, just drop those tags.

In [None]:
# Number of rows carrying each of the tags that are not visible to Users.
tuple(
    (ads["tags"] == hidden_tag).sum()
    for hidden_tag in ("user_favorite", "user_like", "volunteer_like", "volunteer_pick")
)

In [None]:
# Number of fully duplicated rows before the hidden tags are collapsed.
ads.duplicated().sum()  # NOTE: only work with small dataset

In [None]:
# Collapse every hidden tag into the single marker value `removetag`.
hidden_tags = ["user_favorite", "user_like", "volunteer_like", "volunteer_pick"]
ads["tags"] = ads["tags"].replace(hidden_tags, ["removetag"] * len(hidden_tags))

In [None]:
# Collapsing several tags into one marker can create identical rows; keep one of each.
ads = ads.drop_duplicates()  # NOTE: only work with small dataset

In [None]:
# Projects that carry exactly one distinct tag value.
# NOTE: this method only support small size data, consider using `transform` instead
# NOTE: also note that, `transform('nunique')` might not work with cudf yet
has_single_tag = ads.groupby("project_id").tags.nunique(dropna=False) == 1
# Keep only the True entries, so the index lists exactly the single-tag projects.
one_tag_loans = has_single_tag[has_single_tag]

In [None]:
# Rows whose project has a single tag value and that value is the hidden-tag marker.
in_single_tag_loan = ads["project_id"].isin(one_tag_loans.index)
is_hidden_tag = ads["tags"] == "removetag"
should_change_tag = in_single_tag_loan & is_hidden_tag
ads[should_change_tag]

In [None]:
# Projects that only had hidden tags become `empty`.
# `.loc` writes into `ads` itself; assigning through `ads[mask].tags` would only
# modify a temporary copy. Missing mask entries are treated as "do not change".
ads.loc[should_change_tag.fillna(False), "tags"] = "empty"

In [None]:
# Remove the remaining rows that still carry the hidden-tag marker.
ads = ads[ads["tags"] != "removetag"]

In [None]:
# Store the tags as an ordered categorical column.
tags_as_category = ads["tags"].astype("category")
ads["tags"] = tags_as_category.cat.as_ordered()
ads["tags"].dtype

In [None]:
# what is the portion of Loans that have no tags?
empty_projects = ads[ads["tags"] == "empty"].project_id.nunique()
all_projects = ads.project_id.nunique()
empty_projects, all_projects, empty_projects / all_projects

Simply remove the Loans that have no tags

In [None]:
# Drop the rows whose tag is the `empty` placeholder.
ads = ads[ads["tags"] != "empty"]

## Remove anonymous Lenders

In [None]:
print("anynomous Lenders")
# Anonymous lenders are recognised by a publicId starting with "anon".
anons = ads["lender_publicId"].str.startswith("anon")
lender_columns = ["lender_id", "lender_name", "lender_publicId"]
ads[anons][lender_columns].drop_duplicates()

In [None]:
# Drop the rows flagged as anonymous by the `anons` mask computed in the previous cell.
ads = ads[~anons]

In [None]:
# Remove any fully identical rows left after the tag and lender filtering.
ads = ads.drop_duplicates()

In [None]:
# Replace the index with a fresh one and discard the old labels.
ads = ads.reset_index(drop=True)

# Stats

In [None]:
"number of Loans", ads.project_id.unique().count()

In [None]:
"number of Lenders", ads.lender_id.unique().count()

# Construct `Lender-Tag` bipartite graph

## Create edge list of the graph

In [None]:
# One edge per (lender, tag) pair: the summed share amount and the number of loan rows.
edge_aggregations = {"lender_publicId": "first", "loan_shareAmount": "sum", "loan_date": "count"}
LT = (
    ads.groupby(["lender_id", "tags"])
    .agg(edge_aggregations)
    .reset_index()
    .rename(columns={"loan_shareAmount": "loan_amount", "loan_date": "loan_count"})
)
# Keep only the edges with a positive total amount.
LT = LT[LT["loan_amount"] > 0]
LT

## Convert to the format that `biLouvian` can understand

In [None]:
# Tags are encoded as V1; lenders are encoded as V2, shifted past the largest tag code
# so that the two vertex id ranges do not overlap.
le1, le2 = LabelEncoder(), LabelEncoder()
LT["V1"] = le1.fit_transform(LT["tags"])
LT["V2"] = le2.fit_transform(LT["lender_id"]) + LT["V1"].max() + 1
LT.tail(3)

In [None]:
# total number of vertex
lender_vertices = LT["lender_id"].nunique()
tag_vertices = LT["tags"].nunique()
vertex_count = lender_vertices + tag_vertices
# The encoded ids must describe the same number of vertices.
assert vertex_count == LT["V1"].nunique() + LT["V2"].nunique()
print(vertex_count)

In [None]:
# id -> name lookup for the tag vertices.
dictionary1 = LT[["V1", "tags"]].drop_duplicates().rename(columns={"tags": "name", "V1": "id"})
dictionary1["name"] = dictionary1["name"].astype(str)
dictionary1.sort_values(by=["id"]).tail(3)

In [None]:
# id -> name lookup for the lender vertices (name is the lender's publicId).
dictionary2 = LT[["V2", "lender_publicId"]].drop_duplicates().rename(columns={"lender_publicId": "name", "V2": "id"})
dictionary2.head(1)

In [None]:
# Stack the tag and lender lookups into a single vertex dictionary.
dictionary = cudf.concat([dictionary1, dictionary2])
entry_count = len(dictionary)
print(entry_count)
# Exactly one dictionary entry is expected per vertex.
assert entry_count == vertex_count
dictionary.tail()

In [None]:
# Write the weighted edge list and the vertex dictionary.
prefix = f"checkpoints/LT_bipartite_{COUNTRY}_from{FROM}_to{TO}"  # the text "bipartite" is mandatory
# Create the output directory up front; `to_csv` does not create missing directories.
os.makedirs(os.path.dirname(prefix), exist_ok=True)
filename = f"{prefix}.csv"
dictfile = f"{prefix}_Dictionary.txt"  # "_Dictionary.txt" is mandatory
# Tab-separated, no header, no index: columns are V1, V2, loan_amount.
LT[["V1", "V2", "loan_amount"]].to_csv(filename, sep="\t", header=False, index=False)  # "\t" is mandatory
dictionary[["id", "name"]].to_csv(dictfile, sep="\t", header=False, index=False)

# Run `biLouvian`

Note: The `biLouvian` binary has to be built first. Refer to `doc/BUILD_BILOUVIAN.md`

In [None]:
!../extra/biLouvian -i $filename -order $ORDER