In [None]:
import os

# GPU ids exposed to this process; this cell runs before `cudf` is imported further down.
DEVICES = "1,2"
os.environ.update({"CUDA_VISIBLE_DEVICES": DEVICES})

In [None]:
import numpy as np
import pandas as pd
from tqdm import tqdm
import cudf

tqdm.pandas()

In [None]:
import dask_cudf
import dask.dataframe as dd

from dask_cuda import LocalCUDACluster
from dask.distributed import Client, LocalCluster

# GPU-backed Dask cluster limited to the devices named in DEVICES.
# The commented-out LocalCluster line below is the alternative when dask_cudf is not used.
cluster = LocalCUDACluster(
    n_workers=None,
    CUDA_VISIBLE_DEVICES=DEVICES,
    memory_limit="48GiB",
    device_memory_limit="auto",
)
# cluster = LocalCluster(n_workers=8, memory_limit="48GiB")


# Attach this notebook to the cluster; the bare name renders the client summary.
client = Client(cluster)
client

# Import raw data
First, read data in `.jsonl` file format as a pandas data frame
Then store the dataframe in `.parquet` format for easy access later

In [None]:
%%script false --no-raise-error

# Every JSONL line carries one record under the "loan" key; flatten its nested
# fields into top-level columns whose names are joined with "_".
raw_records = pd.read_json("../fulldata/kiva_activity_2023-08-28T11-09-39.jsonl", lines=True)
df = pd.json_normalize(raw_records["loan"], sep="_")

In [None]:
%%script false --no-raise-error

# Target dtype for each raw column; applied column by column so `df` is updated in place.
column_dtypes = {
    "loanAmount": float,
    "loanFundraisingInfo_fundedAmount": float,
    "geocode_country_name": "category",
    "sector_id": int,
    "sector_name": "category",
    "activity_id": int,
    "activity_name": "category",
}
for column, dtype in column_dtypes.items():
    df[column] = df[column].astype(dtype)

# Timestamps arrive as text and are parsed separately.
for column in ("raisedDate", "fundraisingDate"):
    df[column] = pd.to_datetime(df[column])

In [None]:
%%script false --no-raise-error
df.to_parquet("../fulldata/kiva_activity_2023-08-28T11-09-39.parquet")

In [None]:
ds = cudf.read_parquet("../fulldata/kiva_activity_2023-08-28T11-09-39.parquet")

In [None]:
# Rename the generic `id` / `name` columns to `project_id` / `project_name`.
# Done in place, so `ds` no longer has an `id` column once this cell has run.
ds.rename(columns={"id": "project_id", "name": "project_name"}, inplace=True)

In [None]:
# Drop rows in which every column is missing (how="all"), then peek at the last rows.
ds.dropna(axis=0, how="all", inplace=True)
ds.tail()

Filter data for Vietnam only
Filtering, only take `Vietnam` into account
Why? Because there are a lot of rows and we try to localize the task
ds["geocode_country_name"].value_counts()["Vietnam"]
ds = ds[ds["geocode_country_name"] == "Vietnam"]
Try to limit the time range, because I am not yet comfortable working with large data
ds = ds[ds.fundraisingDate > "2020-01-01"]
"the number of Loans (might duplicated) under investigation is", len(ds)
ds

# Remove duplicated `loan`

There are loans that share the same `id` but have different `fundedAmount` values.
This is probably because the records were queried at different times.
Here, we keep only the record with the highest `fundedAmount` for each `id`.

In [None]:
ds.loc[[9628, 1366545]]

In [None]:
# Keep, for every project, only the snapshot with the largest funded amount.
# The rename cell above turned `id` into `project_id`, so group on `project_id`
# (grouping on "id" fails once that cell has run).
temp = ds.groupby("project_id", group_keys=False)[["loanFundraisingInfo_fundedAmount"]].idxmax()
iloc = temp["loanFundraisingInfo_fundedAmount"].values  # NOTE: just iloc, not loc
ds = ds.iloc[iloc]
# Release the helpers; they are not needed once the winning rows are selected.
del iloc
del temp
ds.loc[[9628, 1366545]]  # see, only keep the one with higher fundedAmount

In [None]:
# Should display an empty frame: no project id may appear more than once.
# The key column is `project_id` here because `id` was renamed earlier in the notebook.
ds[ds.duplicated(subset=["project_id"], keep=False)].sort_values(by=["project_id"])  # no duplicated

In [None]:
"the number of Loans (no duplicated) under investigation is", len(ds)

# change data type

In [None]:
# Text columns with few distinct values are stored as categories.
for column in ("sector_name", "geocode_country_name", "activity_name"):
    ds[column] = ds[column].astype("category")

# NOTE(review): uint8 assumes every sector/activity id is below 256 — confirm against the data.
ds["sector_id"] = ds["sector_id"].astype("uint8")
ds["activity_id"] = ds["activity_id"].astype("uint8")

# uint8 tops out at 255 and would wrap silently for any loan with more lending
# actions than that, so the count uses a wider unsigned type.
ds["lendingActions_totalCount"] = ds["lendingActions_totalCount"].astype("uint32")

In [None]:
assert 0 == ds.index.duplicated().sum()

In [None]:
ds.head()

In [None]:
ds.reset_index(inplace=True, drop=True)

# create `lender-loan-tag` df


## Explode the `lendingActions_values`, then do dict processing

Since the explosion requires more memory, the natural choice would be to move to dask_cudf.  
However, the `dict` values must still be processed afterwards.  
Hence, plain pandas is the easiest option here.

In [None]:
df = ds.to_pandas()  # slow, about 2 minutes

In [None]:
df = df.explode("lendingActions_values")

In [None]:
# Loans without any lending action explode to a missing value; discard those rows
# and renumber the remaining ones from zero.
df = df.dropna(subset=["lendingActions_values"]).reset_index(drop=True)

In [None]:
# Each cell of `lendingActions_values` is a dict holding a nested "lender" dict plus
# the share amount and purchase date; pull those out into flat, typed columns.
actions = df["lendingActions_values"]

df["lender_id"] = actions.progress_apply(lambda action: action["lender"]["id"]).astype(int)
df["lender_name"] = actions.progress_apply(lambda action: action["lender"]["name"])
df["lender_publicId"] = actions.progress_apply(lambda action: action["lender"]["publicId"])
df["loan_shareAmount"] = actions.progress_apply(lambda action: action["shareAmount"]).astype(float)

purchase_dates = actions.progress_apply(lambda action: action["latestSharePurchaseDate"])
df["loan_date"] = pd.to_datetime(purchase_dates)

In [None]:
df.drop(columns=["lendingActions_values"], inplace=True)

Let's create a column called `loan_id`.

In [None]:
# A (project, lender, amount, date) combination must identify exactly one row
# before a surrogate loan id is assigned.
loan_key = ["project_id", "lender_id", "loan_shareAmount", "loan_date"]
assert not df.duplicated(subset=loan_key).any()

In [None]:
# Surrogate key: number the rows 1..N in their current order.
df["loan_id"] = np.arange(1, len(df) + 1)

In [None]:
# cuDF do not work with timezone yet
df["loan_date"] = df["loan_date"].dt.tz_localize(None)

## Now, explode the `tags`

In [None]:
df = df.explode("tags")

In [None]:
df.reset_index(inplace=True, drop=True)

In [None]:
df.to_parquet("checkpoints/explodeddata.parquet")

In [None]:
df.iloc[0:1000].to_parquet("checkpoints/explodeddata_sample_1000.parquet")

In [None]:
df = pd.read_parquet("checkpoints/explodeddata_sample_1000.parquet")

In [None]:
df.info()

# Basic stats

In [None]:
df = pd.read_parquet("checkpoints/explodeddata.parquet")

In [None]:
# number of Projects
df.project_id.nunique()

In [None]:
# number of Lenders
df.lender_id.nunique()

How many projects do not have tags?

In [None]:
# One row per distinct (project, fundraising date, tag) combination.
project_tags_df = df.loc[:, ["project_id", "fundraisingDate", "tags"]].drop_duplicates()
print(project_tags_df.shape[0])

In [None]:
# Bucket projects by fundraising date. The bins are fixed 30-day windows
# (freq="30d"), not calendar months.
monthly_grouper = pd.Grouper(key="fundraisingDate", freq="30d")
perMonth = project_tags_df.groupby(monthly_grouper).agg(
    project_count=("project_id", "nunique"),  # distinct projects in the window
    tag_count=("tags", "count"),  # non-null tag rows in the window
)

In [None]:
perMonth.head()

In [None]:
perMonth.plot.line()

In [None]:
# `count` ignores missing values, so a project whose tag count is zero has no tags at all.
tags_per_project = project_tags_df.groupby("project_id")["tags"].count()
project_empty_tags = tags_per_project.eq(0).rename("is_empty_tags")

In [None]:
# Reduce to one row per (project, fundraising date) and attach the per-project flag,
# which is indexed by project id.
projects_with_dates = project_tags_df.drop(columns=["tags"]).drop_duplicates()
project_empty_tags = projects_with_dates.merge(
    project_empty_tags, left_on="project_id", right_index=True
)

In [None]:
# Per time window: how many projects there are, and how many of them carry no tag.
empty_per_month = project_empty_tags.groupby(monthly_grouper).agg(
    count_project=("project_id", "nunique"),
    count_project_no_tag=("is_empty_tags", "sum"),
)

In [None]:
# Share of untagged projects in each window, expressed in percent.
no_tag_share = empty_per_month["count_project_no_tag"].div(empty_per_month["count_project"])
empty_per_month["percentage_no_tag"] = no_tag_share * 100
empty_per_month

In [None]:
empty_per_month[["percentage_no_tag"]].plot.line()

We can say that projects had no tags before 2013; tags first appear in 2013.

# Tag preprocessing without dask

Here, we just use pandas, because `cudf` cannot load the big parquet file.  
Be careful: the deduplication takes about 3 hours.

In [None]:
ds = pd.read_parquet("checkpoints/explodeddata.parquet")
# ds = cudf.read_parquet("checkpoints/explodeddata.parquet")
# ds = cudf.read_parquet("checkpoints/explodeddata_sample_1000.parquet")

In [None]:
# create a tag call `empty`
ds["tags"] = ds["tags"].fillna("empty")

In [None]:
# Collapse the four user/volunteer tags into one placeholder so they can be dropped later.
hidden_tags = ["user_favorite", "user_like", "volunteer_like", "volunteer_pick"]
ds["tags"] = ds["tags"].replace(hidden_tags, "removetag")

In [None]:
def remove_duplicates(partition):
    """Return ``partition`` without repeated loan/tag rows.

    Rows count as duplicates when they agree on project, tag, lender, share
    amount and loan date. The first occurrence is kept (the ``drop_duplicates``
    default) and the input frame is left unmodified.
    """
    loan_tag_key = ["project_id", "tags", "lender_id", "loan_shareAmount", "loan_date"]
    return partition.drop_duplicates(subset=loan_tag_key)


# Deduplicate one project at a time: the frame is split by `project_id` and
# `remove_duplicates` runs on each group separately.
# NOTE(review): the markdown above reports roughly 3 hours for this step — confirm before re-running.
ds = ds.groupby("project_id").apply(remove_duplicates)

In [None]:
ds.reset_index(drop=True, inplace=True)

In [None]:
# Row-aligned flag: True where the row's project has exactly one distinct tag.
distinct_tags_per_project = ds.groupby("project_id")["tags"].transform("nunique")
one_tag_loans = distinct_tags_per_project.eq(1).rename("is_single_tag")

In [None]:
ds = ds.merge(one_tag_loans.to_frame(), left_index=True, right_index=True)

In [None]:
# Projects whose only tag is the hidden placeholder are treated as untagged.
# The assignment must go through `.loc`: `ds[mask]["tags"] = ...` writes to a
# temporary copy and leaves `ds` unchanged.
only_hidden_tag = (ds["is_single_tag"] == True) & (ds["tags"] == "removetag")
ds.loc[only_hidden_tag, "tags"] = "empty"

In [None]:
ds = ds[ds.tags != "removetag"]

In [None]:
# Store tags as an ordered categorical, then show the resulting dtype.
tags_as_category = ds["tags"].astype("category")
ds["tags"] = tags_as_category.cat.as_ordered()
ds["tags"].dtype

In [None]:
# A lender is flagged when the lower-cased name starts with "anonymous" or the
# public id starts with "anon" (a missing public id is treated as an empty string).
is_anon_1 = ds["lender_name"].str.lower().str.startswith("anonymous")
is_anon_2 = ds["lender_publicId"].fillna("").str.startswith("anon")
anons = is_anon_1 | is_anon_2
print("anynomous Lenders")
# Preview two of the flagged lenders
ds.loc[anons, ["lender_id", "lender_name", "lender_publicId"]].head(n=2)

In [None]:
# how many anons out there?
anons.sum()

In [None]:
# drop those anon
ds = ds[~anons]

In [None]:
ds.to_parquet("checkpoints/preprocessed_2023-08-28T11-09-39.parquet")

In [None]:
pd.read_parquet("checkpoints/preprocessed_2023-08-28T11-09-39.parquet").head()

# Tag preprocessing with dask

Why? Because without dask, `drop_duplicates` could not run on the whole dataset, 
even when using `subset=['project_id', 'lender_id', 'tags', 'shareAmount', 'date']`.

In [None]:
# Load the exploded data lazily, index it by project (keeping the column as well,
# which makes deduplication easier), and use four partitions.
ddf = (
    dd.read_parquet("checkpoints/explodeddata.parquet")
    # dd.read_parquet("checkpoints/explodeddata_sample_1000.parquet")
    .set_index("project_id", drop=False)
    .repartition(npartitions=4)
)
print(ddf.npartitions)
# ddf = dask_cudf.from_dask_dataframe(ddf)
# print(ddf.npartitions)

Fill every project that has no tag with the tag `empty`.

In [None]:
# create a tag call `empty`
ddf["tags"] = ddf["tags"].fillna("empty")

Remove some tags 
The following tags should be removed, because they are not visible to users:  
- `user_favorite`
- `user_like`
- `volunteer_like`
- `volunteer_pick`

If a project has **only** those tags, change all of them to `empty`, then remove duplicates again.  
If a project also has other tags, just drop those tags.

In [None]:
# (dds["tags"] == "user_favorite").sum().compute(),\
# (dds["tags"] == "user_like").sum().compute(),\
# (dds["tags"] == "volunteer_like").sum().compute(),\
# (dds["tags"] == "volunteer_pick").sum().compute()

In [None]:
# Collapse the four user/volunteer tags into a single placeholder value.
hidden_tags = ["user_favorite", "user_like", "volunteer_like", "volunteer_pick"]
ddf["tags"] = ddf["tags"].replace(hidden_tags, "removetag")

After turning those tags into `removetag`, there will be duplicates, so remove them here.
It turns out that the removal requires a lot of memory. Hence, dask.

In [None]:
def remove_duplicates(partition):
    """Drop repeated loan/tag rows from a single partition.

    A row is a duplicate when an earlier row has the same project, tag, lender,
    share amount and loan date. Only rows inside ``partition`` are compared, and
    a new frame is returned.
    """
    dedup_columns = ["project_id", "tags", "lender_id", "loan_shareAmount", "loan_date"]
    return partition.drop_duplicates(subset=dedup_columns)


# Run the deduplication independently on every Dask partition.
# NOTE(review): this only removes duplicates that sit in the same partition; it presumably
# relies on the earlier `set_index("project_id")` keeping each project in one partition — confirm.
df_1 = ddf.map_partitions(remove_duplicates)

In [None]:
df_1 = df_1.reset_index(drop=True)

In [None]:
# this is why we need to use dask
# df_1 = df.drop_duplicates(subset=['project_id', 'tags', 'lender_id', 'shareAmount', 'date'], split_out=4)
# df_1 = ddf.drop_duplicates(subset=["project_id", "tags", "lender_id", "loan_shareAmount", "loan_date"])
# df_1 = client.persist(df_1)
# df_1 = df_1.repartition(npartitions=4)

In [None]:
# Flag rows whose project has exactly one distinct tag.
distinct_tags_per_project = df_1.groupby("project_id").tags.transform("nunique", meta=("tags", "int"))
one_tag_loans = (distinct_tags_per_project == 1).rename("is_single_tag")

In [None]:
# Attach the "project has exactly one distinct tag" flag by joining on the project key.
# Joining on the row index is not safe here: after Dask's `reset_index(drop=True)` the
# index restarts in every partition, so index values repeat and an index-on-index merge
# would pair unrelated rows.
single_tag_flags = (
    (df_1.groupby("project_id").tags.nunique() == 1)
    .rename("is_single_tag")
    .reset_index()  # columns: project_id, is_single_tag
)
df_2 = df_1.merge(single_tag_flags, on="project_id")

In [None]:
# Projects whose only tag is the hidden placeholder are relabelled as untagged.
# `df_2[mask]["tags"] = ...` assigns into a temporary frame and never changes `df_2`,
# so the column is rebuilt with `mask`, which substitutes "empty" where the condition holds.
only_hidden_tag = (df_2["is_single_tag"] == True) & (df_2["tags"] == "removetag")
df_2["tags"] = df_2["tags"].mask(only_hidden_tag, "empty")

In [None]:
%%script false --no-raise-error

# count tags by projects (except tag nan, if exists)
# in cudf, use nunique(drop=False)
# dask do not support that `drop` yet
# fortunately, the colummn tags contain no null value
# one can confirm with `ds_2.tags.isna().sum().compute()`

one_tag_loans = df_2.groupby("project_id").tags.nunique() == 1  # count tags, except
one_tag_loans = one_tag_loans[one_tag_loans]

In [None]:
%%script false --no-raise-error

one_tag_loans = one_tag_loans.index.compute()

In [None]:
%%script false --no-raise-error

should_change_tag = (df_2["project_id"].isin(one_tag_loans)) & (df_2["tags"] == "removetag")

In [None]:
%%script false --no-raise-error

# Assigning through `df_2[should_change_tag].tags = ...` only touches a temporary
# selection; rebuild the column so the flagged rows actually become "empty".
df_2["tags"] = df_2["tags"].mask(should_change_tag, "empty")

In [None]:
df_3 = df_2[df_2.tags != "removetag"]

remove no-tag Projects

In [None]:
df_4 = df_3[df_3.tags != "empty"]

convert to `category`

In [None]:
# Convert the tags of the frame that continues down the pipeline. `df_4` was derived
# from `df_3` in the previous cell, so converting `df_3` here would never reach the
# data that gets saved.
df_4["tags"] = df_4["tags"].astype("category").cat.as_ordered()
df_4["tags"].dtype

## Remove anonymous Lenders

A lender is called *anonymous* when:
- `lender_name` (lower form) starts with "anonymous"
- Or, `lender_publicId` starts with "anon"

In [None]:
# Flag a lender when the lower-cased name starts with "anonymous" or the public id
# starts with "anon" (a missing public id is treated as an empty string).
is_anon_1 = df_4["lender_name"].str.lower().str.startswith("anonymous")
is_anon_2 = df_4["lender_publicId"].fillna("").str.startswith("anon")
anons = is_anon_1 | is_anon_2
print("anynomous Lenders")
# To preview a few flagged lenders:
# df_4[anons][["lender_id", "lender_name", "lender_publicId"]].head(n=2)

In [None]:
# drop those anon
df_5 = df_4[~anons]

In [None]:
# no need to deduplicate one more time
# df_6 = df_5.drop_duplicates(subset=["project_id", "tags", "lender_id", "loan_shareAmount", "loan_date"])

In [None]:
# df_7 = df_6.reset_index(drop=True)
df_7 = df_5

save the data

In [None]:
# Write the Dask frame as a parquet dataset without its index.
# Dask's option for this is `write_index`; `index=False` is the pandas keyword and is
# forwarded to the parquet backend instead of being applied by Dask.
df_7.to_parquet("checkpoints/preprocessed_2023-08-28T11-09-39_parquet", write_index=False)

In [None]:
df_7.dask

In [None]:
df_7.visualize()

In [None]:
# ads = cudf.read_parquet("checkpoints/vn_since_20200101.parquet")
# ads["sector_name"] = ads["sector_name"].astype("category")
# ads["geocode_country_name"] = ads["geocode_country_name"].astype("category")
# ads["activity_name"] = ads["activity_name"].astype("category")
# ads["tags"] = ads["tags"].astype("category")

In [None]:
# Spot check of the two row labels that were inspected in the loan-deduplication section.
# NOTE(review): by this point `ds` has been re-read, filtered and re-indexed by the pandas
# tag-preprocessing cells, so these labels may be missing or point at different rows —
# confirm this cell is still needed.
ds.loc[[9628, 1366545]]

# Filter data for Vietnam only
Filtering, only take `Vietnam` into account
Why? Because there are a lot of rows and we try to localize the task

In [None]:
# Keep only the rows whose country is Vietnam.
vn = df_7[df_7["geocode_country_name"] == "Vietnam"]
# The next cell narrows the time range further to keep the data small.

In [None]:
vn_since_2018 = vn[vn.fundraisingDate >= "2018-01-01"]

In [None]:
vn_since_2018.compute().to_parquet("checkpoints/vn_since_20180101.parquet")

In [None]:
# ds["geocode_country_name"].value_counts()["Vietnam"]
# ds = ds[ds["geocode_country_name"] == "Vietnam"]
# # Try to limit the timeline, because I am not yet comfortable to work with large data
# ds = ds[ds.fundraisingDate > "2020-01-01"]
# "the number of Loans (might duplicated) under investigation is", len(ds)
# ds

In [None]:
cudf.read_parquet("checkpoints/vn_since_20180101.parquet")