Export Project-Tag graph with optional date

This notebook can be run with `papermill`

```bash
papermill 21_build_graph_PT.ipynb checkpoints/21_build_graph_PT_20200101.ipynb -p SINCE "2020-01-01"
```

In [None]:
# Run parameters; SINCE can be overridden through papermill (`-p SINCE "yyyy-mm-dd"`).
SINCE = "2013-01-01"  # only keep the data with fundraisingDate >= SINCE. Format: yyyy-mm-dd
DEVICES = "0,1"  # GPU devices to use. Format: "0,1,2,3"

In [None]:
# Preview SINCE without dashes (yyyymmdd); this form is used as a suffix in output file names.
SINCE.replace("-", "")

In [None]:
import os

# Select the visible GPUs; this assignment is placed before the cupy/cudf imports below.
os.environ["CUDA_VISIBLE_DEVICES"] = DEVICES

import cupy as cp
import pandas as pd
from tqdm import tqdm
import cudf
import networkx as nx


# Register tqdm's progress helpers on pandas objects.
tqdm.pandas()

# Import raw data

In [None]:
# Load the raw parquet export with cudf.
ds = cudf.read_parquet("../fulldata/kiva_activity_2023-08-28T11-09-39.parquet")

In [None]:
# Discard rows in which every column is missing, then peek at the end of the frame.
ds = ds.dropna(axis=0, how="all")
ds.tail()

# Filter data from $SINCE$

Limit the time range for now, because I am not yet comfortable working with the full dataset.

In [None]:
# Keep only the rows whose fundraisingDate is on or after SINCE, then report the row count.
recent_rows = ds["fundraisingDate"] >= SINCE
ds = ds[recent_rows]
"the number of Projects (might duplicated) under investigation is", len(ds)

In [None]:
# Inspect the last rows of the filtered frame.
ds.tail()

# Basic process

In [None]:
# Give the generic `id` / `name` columns project-specific names.
project_column_names = {"id": "project_id", "name": "project_name"}
ds = ds.rename(columns=project_column_names)

In [None]:
# Column overview, leaving out the `tags` and `lendingActions_values` columns.
ds.drop(columns=["tags", "lendingActions_values"]).info()

In [None]:
# Store the sector, country and activity name columns as categoricals.
for name_column in ("sector_name", "geocode_country_name", "activity_name"):
    ds[name_column] = ds[name_column].astype("category")

# Preprocessing

## Remove duplicated Projects

Some Projects share the same `project_id` but have a different `fundedAmount`.  
This is probably because they were queried at different times.  
Here, keep only the record with the highest `fundedAmount`.

In [None]:
# Show two example rows by index label, before de-duplication.
# NOTE(review): the labels are hard-coded; they may not exist for other SINCE values — confirm.
ds.loc[[9628, 1366545]]

In [None]:
# For every project_id, locate the row with the largest loanFundraisingInfo_fundedAmount.
temp = ds.groupby("project_id", group_keys=False)[["loanFundraisingInfo_fundedAmount"]].idxmax()
# NOTE(review): the values are used as row positions below, not as index labels —
# confirm that idxmax returns positions in the cudf version in use.
iloc = temp["loanFundraisingInfo_fundedAmount"].values  # NOTE: just iloc, not loc
ds = ds.iloc[iloc]
# The helper objects are no longer needed.
del iloc
del temp
ds.loc[[9628, 1366545]]  # see, only keep the one with higher fundedAmount

In [None]:
# Sanity check: every project_id must occur only once after de-duplication.
repeated_rows = ds[ds.duplicated(subset=["project_id"], keep=False)]
assert len(repeated_rows) == 0  # no duplicated

In [None]:
# Row count after de-duplication.
"the number of Projects (no duplicated) under investigation is", len(ds)

## create `Project-Tag` df

In [None]:
# The lending-action columns are dropped before building the Project-Tag frame.
ds = ds.drop(columns=["lendingActions_totalCount", "lendingActions_values"])

In [None]:
# Explode `tags` so that each tag of a project gets its own row, then show the row count.
ads = ds.explode("tags")
len(ads)

## Tag preprocessing

In [None]:
# there are many Loans that do not have tags:
# show their number and their share (in percent) of all projects
n_untagged = ads[ads["tags"].isna()].project_id.unique().count()
n_projects = ads.project_id.unique().count()
n_untagged, "~", n_untagged / n_projects * 100, "percent"

In [None]:
# Number of rows whose tag is blank once surrounding whitespace is stripped.
(ads["tags"].str.strip() == "").sum()

In [None]:
# Replace blank tags with the placeholder tag `empty`.
ads.loc[ads["tags"].str.strip() == "", "tags"] = "empty"

In [None]:
# Give rows without a tag the placeholder tag `empty` (rather than dropping them).
# The filled column is assigned back: calling `fillna(..., inplace=True)` on
# `ads[["tags"]]` only fills a temporary copy and leaves `ads` unchanged.
ads["tags"] = ads["tags"].fillna("empty")

Remove some tags  
The following tags should be removed because they are not visible to users:  
- `user_favorite`
- `user_like`
- `volunteer_like`
- `volunteer_pick`

If a project has **only** those tags, change them all to `empty`, then remove the duplicates again.  
If a project also has other tags, just drop those tags.

In [None]:
# Occurrences of each tag that is hidden from users, in the order listed.
tuple(
    (ads["tags"] == hidden_tag).sum()
    for hidden_tag in ("user_favorite", "user_like", "volunteer_like", "volunteer_pick")
)

In [None]:
# Number of fully duplicated rows before the hidden tags are merged.
ads.duplicated().sum()  # NOTE: only work with small dataset

In [None]:
# Collapse the tags that are hidden from users into a single marker value, `removetag`.
hidden_tags = ["user_favorite", "user_like", "volunteer_like", "volunteer_pick"]
ads["tags"] = ads["tags"].replace(hidden_tags, ["removetag"] * len(hidden_tags))

In [None]:
# Merging the hidden tags can produce identical rows; keep one of each.
# NOTE: only work with small dataset
ads = ads.drop_duplicates()

In [None]:
# count tags by loans
# NOTE: this method only support small size data, consider using `transform` instead
# NOTE: also note that, `transform('nunique')` might not work with cudf yet
# Boolean per project_id: True when the project has exactly one distinct tag value.
one_tag_loans = ads.groupby("project_id").tags.nunique(dropna=False) == 1
# Keep only the True entries, so that the index lists the single-tag projects.
one_tag_loans = one_tag_loans[one_tag_loans]

In [None]:
# Rows tagged `removetag` that belong to a project with a single distinct tag.
should_change_tag = (ads["project_id"].isin(one_tag_loans.index)) & (ads["tags"] == "removetag")
ads[should_change_tag]

In [None]:
# Projects whose only tag is a hidden one get the placeholder tag `empty` instead.
# `.loc` writes into `ads` itself; assigning through `ads[should_change_tag].tags`
# only changes a temporary copy, so the frame would stay as it was.
ads.loc[should_change_tag, "tags"] = "empty"

In [None]:
# Drop the remaining rows that carry the marker tag `removetag`.
ads = ads[ads.tags != "removetag"]

In [None]:
# Store the tags as an ordered categorical and show the resulting dtype.
ads["tags"] = ads["tags"].astype("category").cat.as_ordered()
ads["tags"].dtype

In [None]:
# what is the portion of Loans that have no tags?
# shown as: projects tagged `empty`, all projects, and their ratio
n_empty_projects = ads[ads["tags"] == "empty"].project_id.nunique()
n_all_projects = ads.project_id.nunique()
n_empty_projects, n_all_projects, n_empty_projects / n_all_projects

No-tag Loans make up $\sim 27\%$ of all Loans; just remove them.

In [None]:
# Drop the rows that carry the placeholder tag `empty`.
ads = ads[ads.tags != "empty"]

In [None]:
# Peek at the first rows of the cleaned Project-Tag frame.
ads.head()

# Construct a Graph

## create `Tag` nodes

In [None]:
# Collect the distinct, non-missing tags as `Tag` nodes.
ds_tags = ads[["tags"]].drop_duplicates().dropna()
ds_tags[":LABEL"] = "Tag"
# Save in neo4j style. The file name carries SINCE without dashes, like the other exports,
# so runs with a different SINCE no longer overwrite the same file; for the default
# SINCE ("2013-01-01") the name is still `tags_20130101.csv`.
ds_tags.rename(columns={"tags": "name:ID"}).to_csv(f"../data/gen/tags_{SINCE.replace('-', '')}.csv", index=False)
print(len(ds_tags))
ds_tags.head()

## create `Project` nodes

In [None]:
# Peek at the frame from which the Project nodes are derived.
ads.head()

In [None]:
# Remove the tag column and collapse the repeated rows, then show the row count.
ds_loan = ads.drop(columns=["tags"]).drop_duplicates()
len(ds_loan)

In [None]:
# Mark every row as a `Project` node, then export with the id column renamed to the
# `id:ID(Project-ID)` header; the file name carries SINCE with the dashes removed.
ds_loan[":LABEL"] = "Project"
ds_loan.rename(columns={"project_id": "id:ID(Project-ID)"}).to_csv(
    f"../data/gen/projects_{SINCE.replace('-', '')}.csv", index=False
)
ds_loan.head(5)

## Create `TAGGED_WITH` relationships between `Project` and `Tag`

In [None]:
# Project-tag pairs without missing values; the second line shows the null count per column.
ds_loan_tags = ads[["project_id", "tags"]].dropna()
ds_loan_tags.isna().sum()

In [None]:
# Keep each project-tag pair once, then confirm that no duplicate is left.
ds_loan_tags = ds_loan_tags.drop_duplicates()
ds_loan_tags.duplicated().sum()

In [None]:
# Number of distinct projects that appear in the project-tag pairs.
"the number of projects is ", len(ds_loan_tags["project_id"].drop_duplicates())

In [None]:
# Number of rows, i.e. project-tag pairs, to be exported.
"the number of project-tag relationships is", len(ds_loan_tags)

In [None]:
# How many projects carry each tag.
ds_loan_tags["tags"].value_counts()

In [None]:
# Mark every pair as a `TAGGED_WITH` relationship and export it in neo4j style.
ds_loan_tags[":TYPE"] = "TAGGED_WITH"
# The project column is called `project_id` in this frame (there is no `id` column),
# so that is the name that must be mapped to the `:START_ID(Project-ID)` header;
# a rename key that matches no column is ignored and the header would be missing.
ds_loan_tags.rename(columns={"project_id": ":START_ID(Project-ID)", "tags": ":END_ID"}).to_csv(
    f"../data/gen/project_tags_{SINCE.replace('-', '')}.csv", index=False
)
print(len(ds_loan_tags))
ds_loan_tags.head()