In [None]:
import os

# Restrict CUDA to device indices 1 and 2. This runs before cudf is imported
# in the next cell.
DEVICES = "1,2"
os.environ["CUDA_VISIBLE_DEVICES"] = DEVICES

In [None]:
import numpy as np
import pandas as pd
from tqdm import tqdm
import cudf

# Register tqdm's pandas integration; `progress_apply` is used later when
# the lending-action dicts are parsed.
tqdm.pandas()

# Import raw data
First, read the data from the `.jsonl` file into a pandas data frame.
Then store the data frame in `.parquet` format for easy access later (the cell below reads that `.parquet` file directly).

In [None]:
# Load the raw parquet export as a cuDF DataFrame.
ds = cudf.read_parquet("../fulldata/kiva_activity_2023-08-28T11-09-39.parquet")

In [None]:
# Give the generic `id` / `name` columns project-specific names.
ds = ds.rename(columns={"id": "project_id", "name": "project_name"})

In [None]:
# Discard rows in which every column is null, then peek at the last rows.
ds = ds.dropna(axis=0, how="all")
ds.tail()

Filter data for Vietnam only
Only take `Vietnam` into account.
Why? Because the full data set has a lot of rows, and we want to localize the task.
ds["geocode_country_name"].value_counts()["Vietnam"]
ds = ds[ds["geocode_country_name"] == "Vietnam"]
Try to limit the timeline, because I am not yet comfortable working with large data.
ds = ds[ds.fundraisingDate > "2020-01-01"]
"the number of Loans (might duplicated) under investigation is", len(ds)
ds

# Remove duplicated `loan`

There are loans that have the same `id` but a different `fundedAmount`.
This is probably because they were queried at different times.
Here, keep only the record with the highest `fundedAmount`.

In [None]:
# Inspect two example rows by index label before de-duplication
# (presumably two records of the same loan — verify against the data).
ds.loc[[9628, 1366545]]

In [None]:
# Keep, for every loan, only the record with the highest funded amount.
# The `id` column was renamed to `project_id` earlier in the notebook, so the
# grouping key must use the new name (grouping on "id" raises KeyError).
best_rows = ds.groupby("project_id", group_keys=False)[["loanFundraisingInfo_fundedAmount"]].idxmax()
# NOTE: per the original author, these values are used positionally
# (`iloc`, not `loc`) — TODO confirm against the cuDF version in use.
row_positions = best_rows["loanFundraisingInfo_fundedAmount"].values
ds = ds.iloc[row_positions]
del row_positions
del best_rows
ds.loc[[9628, 1366545]]  # see, only keep the one with higher fundedAmount

In [None]:
# Sanity check: list any rows that still share a loan id (expected to be empty).
# Uses `project_id` because the `id` column was renamed earlier in the notebook.
ds[ds.duplicated(subset=["project_id"], keep=False)].sort_values(by=["project_id"])  # no duplicated

In [None]:
# Display the number of rows left in `ds` at this point.
"the number of Loans (no duplicated) under investigation is", len(ds)

# change data type

In [None]:
# Text columns with repeated values are stored as categoricals.
for column in ("sector_name", "geocode_country_name", "activity_name"):
    ds[column] = ds[column].astype("category")

# Integer columns are downcast to the smallest dtype that can hold their
# largest value. A hard-coded uint8 silently wraps any value above 255
# (e.g. a loan with more than 255 lending actions); when every value fits,
# the result is still uint8, exactly as before.
for column in ("sector_id", "activity_id", "lendingActions_totalCount"):
    largest = ds[column].max()
    if pd.isna(largest):
        # Empty or all-null column: nothing to measure, keep the original dtype.
        compact_dtype = np.dtype("uint8")
    else:
        compact_dtype = np.min_scalar_type(int(largest))
    ds[column] = ds[column].astype(compact_dtype)

In [None]:
# Index labels must be unique before the index is rebuilt below.
duplicate_label_count = ds.index.duplicated().sum()
assert duplicate_label_count == 0

In [None]:
# Preview the first rows after the dtype changes.
ds.head()

In [None]:
# Replace the index with a fresh 0..n-1 range, discarding the old labels.
ds = ds.reset_index(drop=True)

# create `lender-loan-tag` df


## Explode the `lendingActions_values`, then do dict processing

Since the explosion requires more memory, the natural choice would be to move to dask_cudf.  
However, we must process the `dict` data type afterwards.  
Hence, just using pandas is the easiest way.

In [None]:
# Convert the cuDF frame to pandas for the dict processing that follows.
df = ds.to_pandas()  # slow, about 2 minutes

In [None]:
# Turn each element of the list-like `lendingActions_values` entries into its own row.
df = df.explode("lendingActions_values")

In [None]:
# Rows without a lending action carry nothing to parse; drop them and
# rebuild a clean 0..n-1 index.
df = df.dropna(subset=["lendingActions_values"]).reset_index(drop=True)

In [None]:
# Flatten each lending-action dict into plain columns (one progress bar per column).
actions = df.lendingActions_values
df["lender_id"] = actions.progress_apply(lambda action: action["lender"]["id"]).astype(int)
df["lender_name"] = actions.progress_apply(lambda action: action["lender"]["name"])
df["lender_publicId"] = actions.progress_apply(lambda action: action["lender"]["publicId"])
df["loan_shareAmount"] = actions.progress_apply(lambda action: action["shareAmount"]).astype(float)
df["loan_date"] = pd.to_datetime(actions.progress_apply(lambda action: action["latestSharePurchaseDate"]))
# Drop the alias so the notebook namespace does not keep an extra reference.
del actions

In [None]:
# The raw dict column has been flattened into plain columns; remove it.
df = df.drop(columns=["lendingActions_values"])

Let's create a column called `loan_id`.

In [None]:
# No two rows may share the same (project, lender, amount, date) combination.
loan_key = ["project_id", "lender_id", "loan_shareAmount", "loan_date"]
assert df.duplicated(subset=loan_key).sum() == 0

In [None]:
# Sequential, 1-based identifier for every row of `df`.
df["loan_id"] = np.arange(1, len(df) + 1)

In [None]:
# cuDF does not work with timezones yet, so strip the timezone information
# from `loan_date`.
df["loan_date"] = df["loan_date"].dt.tz_localize(None)

## Now, explode the `tags`

In [None]:
# Turn each element of the list-like `tags` entries into its own row.
df = df.explode("tags")

In [None]:
# Replace the index with a fresh 0..n-1 range after the explode.
df = df.reset_index(drop=True)

In [None]:
# Checkpoint the exploded frame; later sections reload it from this file.
df.to_parquet("checkpoints/explodeddata.parquet")

# Basic stats

In [None]:
# Reload the exploded checkpoint written above.
df = pd.read_parquet("checkpoints/explodeddata.parquet")

In [None]:
# Number of distinct projects.
df["project_id"].nunique()

In [None]:
# Number of distinct lenders.
df["lender_id"].nunique()

How many projects don't have tags?

In [None]:
# One row per distinct (project, fundraising date, tag) combination.
tag_columns = ["project_id", "fundraisingDate", "tags"]
project_tags_df = df[tag_columns].drop_duplicates()
print(len(project_tags_df))

In [None]:
# NOTE: despite its name, the grouper uses fixed 30-day bins, not calendar months.
monthly_grouper = pd.Grouper(key="fundraisingDate", freq="30d")
# Per bin: number of distinct projects and number of non-null tag rows.
perMonth = (
    project_tags_df.groupby(monthly_grouper)
    .agg({"project_id": "nunique", "tags": "count"})
    .rename(columns={"project_id": "project_count", "tags": "tag_count"})
)

In [None]:
# Preview the first bins of the per-period counts.
perMonth.head()

In [None]:
# Plot project and tag counts per 30-day bin.
perMonth.plot.line()

In [None]:
# A project has "empty tags" when it has no non-null tag at all.
tags_per_project = project_tags_df.groupby("project_id").tags.count()
project_empty_tags = (tags_per_project == 0).rename("is_empty_tags")

In [None]:
# Attach the per-project flag to one row per (project, fundraising date).
projects_only = project_tags_df.drop(columns=["tags"]).drop_duplicates()
project_empty_tags = projects_only.merge(project_empty_tags, left_on="project_id", right_index=True)

In [None]:
# Per 30-day bin: number of distinct projects and how many of them have no tag.
empty_per_month = (
    project_empty_tags.groupby(monthly_grouper)
    .agg({"project_id": "nunique", "is_empty_tags": "sum"})
    .rename(columns={"project_id": "count_project", "is_empty_tags": "count_project_no_tag"})
)

In [None]:
# Share of projects without any tag, expressed as a percentage.
untagged_share = empty_per_month["count_project_no_tag"] / empty_per_month["count_project"]
empty_per_month["percentage_no_tag"] = untagged_share * 100
empty_per_month

In [None]:
# Plot the percentage of untagged projects per 30-day bin.
empty_per_month[["percentage_no_tag"]].plot.line()

We can say that before 2013, projects did not have tags; tags have appeared since 2013.

# Tag preprocessing without dask

Here we just use pandas, because `cudf` cannot load the big parquet file.  
Be careful: the deduplication needs about 3 hours.

In [None]:
# Load the exploded checkpoint with pandas; the cuDF alternatives (full file
# and a 1000-row sample) are kept below for reference.
ds = pd.read_parquet("checkpoints/explodeddata.parquet")
# ds = cudf.read_parquet("checkpoints/explodeddata.parquet")
# ds = cudf.read_parquet("checkpoints/explodeddata_sample_1000.parquet")

In [None]:
# Rows without a tag get the explicit placeholder tag `empty`.
ds.loc[ds["tags"].isna(), "tags"] = "empty"

In [None]:
# Collapse the four user/volunteer tags into one `removetag` marker.
unwanted_tags = ["user_favorite", "user_like", "volunteer_like", "volunteer_pick"]
ds["tags"] = ds.tags.replace({tag: "removetag" for tag in unwanted_tags})

In [None]:
def remove_duplicates(partition):
    """Return `partition` without repeated (project, tag, lender, amount, date) rows.

    The first occurrence of each combination is kept; `partition` itself is
    not modified.
    """
    key_columns = ["project_id", "tags", "lender_id", "loan_shareAmount", "loan_date"]
    return partition.drop_duplicates(subset=key_columns)


# The de-duplication key already contains `project_id`, so a single pass over
# the whole frame finds exactly the duplicates that a per-project
# `groupby(...).apply(remove_duplicates)` would, without one Python call per
# project. The stable sort reproduces the row order groupby gave (projects in
# ascending id order, original order within a project). Any row with a missing
# `project_id` is now kept instead of being dropped by groupby.
ds = remove_duplicates(ds).sort_values("project_id", kind="stable")

In [None]:
# Replace the index with a fresh 0..n-1 range after de-duplication.
ds = ds.reset_index(drop=True)

In [None]:
# Row-aligned flag: True when the row's project has exactly one distinct tag.
one_tag_loans = (
    ds.groupby("project_id")["tags"]
    .transform("nunique")
    .eq(1)
    .rename("is_single_tag")
)

In [None]:
# Attach the `is_single_tag` flag to `ds` by matching index labels.
single_tag_frame = one_tag_loans.to_frame()
ds = pd.merge(ds, single_tag_frame, left_index=True, right_index=True)
del single_tag_frame

In [None]:
# Projects whose only tag is the `removetag` marker are relabelled `empty`,
# so they are not discarded when the `removetag` rows are filtered out.
# `.loc` writes into `ds` itself; the previous chained form
# `ds[mask]["tags"] = ...` assigned to a temporary copy and left `ds` unchanged.
only_removetag = (ds["is_single_tag"] == True) & (ds["tags"] == "removetag")
ds.loc[only_removetag, "tags"] = "empty"

In [None]:
# Discard every row that still carries the `removetag` marker.
keep_rows = ds["tags"] != "removetag"
ds = ds[keep_rows]
del keep_rows

In [None]:
# Store tags as an ordered categorical and show the resulting dtype.
ds["tags"] = (
    ds["tags"]
    .astype("category")
    .cat.as_ordered()
)
ds["tags"].dtype

In [None]:
# Flag lenders whose display name starts with "anonymous" (case-insensitive)
# or whose public id starts with "anon".
is_anon_1 = ds["lender_name"].str.lower().str.startswith("anonymous")
is_anon_2 = ds["lender_publicId"].fillna("").str.startswith("anon")
anons = is_anon_1 | is_anon_2
print("anynomous Lenders")
# Show a couple of the flagged lenders.
ds.loc[anons, ["lender_id", "lender_name", "lender_publicId"]].head(n=2)

In [None]:
# How many rows belong to anonymous lenders?
anons.sum()

In [None]:
# Keep only the rows of non-anonymous lenders.
not_anonymous = ~anons
ds = ds[not_anonymous]
del not_anonymous

In [None]:
# Persist the preprocessed frame; the next cell reloads it from this file.
ds.to_parquet("../data/gen/preprocessed_2023-08-28T11-09-39.parquet")

In [None]:
# Reload the preprocessed data, report its row count and preview it.
df = pd.read_parquet("../data/gen/preprocessed_2023-08-28T11-09-39.parquet")
print(len(df))
df.head()

In [None]:
# Restrict to rows whose country is Vietnam.
in_vietnam = df["geocode_country_name"] == "Vietnam"
vn = df[in_vietnam]
del in_vietnam

In [None]:
# Keep only rows with a fundraising date after 2020-01-01.
vn_since_2020 = vn[vn["fundraisingDate"] > "2020-01-01"]

In [None]:
# Drop rows that carry the placeholder tag `empty`.
vn_since_2020 = vn_since_2020[vn_since_2020["tags"] != "empty"]

In [None]:
# Number of distinct projects in the filtered subset.
vn_since_2020["project_id"].nunique()

In [None]:
# Number of distinct tags in the filtered subset.
vn_since_2020["tags"].nunique()

In [None]:
# Number of distinct lenders in the filtered subset.
vn_since_2020["lender_id"].nunique()