In [None]:
import os

# Physical GPU ids this notebook is allowed to use. Exported before the GPU
# libraries are imported in the following cells.
visible_gpus = "1,3"
os.environ["CUDA_VISIBLE_DEVICES"] = visible_gpus

In [None]:
from dask_cuda import LocalCUDACluster
from dask.distributed import Client

# Start a local Dask CUDA cluster limited to GPUs 1 and 3 and attach a client to it.
# NOTE(review): every later cell in this notebook calls cudf directly, so the
# cluster is not visibly used here — confirm it is still needed.
cluster = LocalCUDACluster(CUDA_VISIBLE_DEVICES="1,3")
client = Client(cluster)
# Bare expression so the notebook renders the client summary.
client

In [None]:
import pandas as pd
import cudf
import dask_cudf

# Import raw data

In [None]:
# Load the raw activity snapshot into a cuDF DataFrame (path is relative to the notebook).
ds = cudf.read_parquet("../fulldata/kiva_activity_2023-08-28T11-09-39.parquet")

In [None]:
# Ad-hoc inspection of the row whose index label is 2547762.
# NOTE(review): hard-coded label tied to this particular snapshot; the lookup
# fails if the label is not present in the loaded file.
ds.loc[2547762]

In [None]:
# Remove, in place, rows in which every column is missing; rows with at least one value stay.
ds.dropna(axis=0, how="all", inplace=True)

In [None]:
# List the available column names before explicit dtypes are assigned.
ds.columns

In [None]:
# Explicit target dtype for each column that is cast. Identifier/label columns
# become categoricals, the two timestamps are stored at second resolution and
# the two monetary amounts as 32-bit floats.
column_dtypes = {
    'id': 'uint32',
    'name': str,
    'geocode_country_name': 'category',
    'sector_id': 'category',
    'sector_name': 'category',
    'activity_id': 'category',
    'activity_name': 'category',
    'raisedDate': 'datetime64[s]',
    'fundraisingDate': 'datetime64[s]',
    'loanAmount': 'float32',
    'loanFundraisingInfo_fundedAmount': 'float32',
}

# Each cast is independent of the others; assigning back to an existing column
# keeps its position in the frame.
for column, dtype in column_dtypes.items():
    ds[column] = ds[column].astype(dtype)

# Preprocessing
## Keep only successful loans

In [None]:
# A loan is treated as successful when the funded amount equals the requested amount.
success = ds["loanAmount"] == ds["loanFundraisingInfo_fundedAmount"]
# Per-outcome tallies, kept for inspection.
counts = success.value_counts()

# Compute the success rate from sum()/count() instead of counts[True] and
# counts[False]: value_counts() leaves out an outcome that never occurs, so the
# label lookups raised KeyError whenever all loans (or none) were fully funded.
# count() only counts non-missing comparisons, matching what value_counts() tallies.
n_compared = int(success.count())
success_rate = int(success.sum()) / n_compared if n_compared else float("nan")

# Displayed as (share of successful loans, total number of rows).
success_rate, len(ds)

In [None]:
# Collect the index labels of loans that were not fully funded, then remove
# those rows from ds in place so only successful loans remain.
rmindex = ds[~success].index
ds.drop(index=rmindex, inplace=True)

## Drop rows with missing amounts or dates

In [None]:
# Count missing values per column to see which columns still contain gaps.
ds.isna().sum()

In [None]:
# Both amounts and both timestamps are needed for the duration and speed
# computed below, so rows missing any of them are removed in place.
required_columns = [
    "loanAmount",
    "loanFundraisingInfo_fundedAmount",
    "fundraisingDate",
    "raisedDate",
]
ds.dropna(subset=required_columns, inplace=True)

## Collection Speed

In [None]:
seconds_per_day = 24 * 60 * 60

# Elapsed time between the start of fundraising and the loan being fully raised.
ds["funding_duration"] = ds["raisedDate"] - ds["fundraisingDate"]

# Both timestamps were cast to datetime64[s] earlier, so the integer view of the
# difference is read as a number of seconds; dividing converts it to (fractional) days.
ds["funding_duration_days"] = ds["funding_duration"].astype('int64') / seconds_per_day

# Funded amount per day of fundraising.
# NOTE(review): a zero-length duration makes this a division by zero; such rows
# are not removed by the later `collection_speed < 0` filter — confirm intended.
ds["collection_speed"] = ds["loanFundraisingInfo_fundedAmount"] / ds["funding_duration_days"]
ds.head()

In [None]:
# Some projects were already fully funded before they were published; they show
# up here with a negative collection speed.
ds[ds["collection_speed"] < 0]

In [None]:

# Remove, in place, every loan whose collection speed is negative.
negative_speed = ds["collection_speed"] < 0
rmindex = ds[negative_speed].index
ds.drop(index=rmindex, inplace=True)

## MultiLabelBinarizer Tags

In [None]:
# Copy only the tags column to a pandas DataFrame; the scikit-learn encoder
# used in the next cell is fed from this pandas copy.
tagdf = ds[['tags']].to_pandas()
tagdf.head()

In [None]:
from sklearn.preprocessing import MultiLabelBinarizer
# Encoder that turns each row's collection of tags into one indicator column per distinct tag.
# Assumes every entry of 'tags' is an iterable of tag strings — TODO confirm no missing entries.
lb = MultiLabelBinarizer()

# Learn the tag vocabulary and build the indicator matrix in one pass.
mlb = lb.fit_transform(tagdf['tags'])
# (number of rows, number of distinct tags)
mlb.shape

In [None]:
# Prefix each learned tag so the indicator columns are easy to recognise once
# they sit next to the other loan columns.
tags_columns = [f'tag_{label}' for label in lb.classes_]

# Build the GPU frame from the indicator matrix, reusing the index of the
# pandas tag frame so rows can be matched back to ds by label.
tag_ds = cudf.DataFrame(mlb, index=tagdf.index, columns=tags_columns, dtype='int8')

# The pandas copy is no longer needed.
del tagdf

# Number of rows carrying each tag.
tag_ds.sum()

In [None]:
# Indicator columns considered meaningless for the analysis (the empty tag and
# the two user_* tags); remove them in place.
meaningless_tags = ['tag_', 'tag_user_favorite', 'tag_user_like']
tag_ds.drop(columns=meaningless_tags, inplace=True)

In [None]:
# Attach the tag indicator columns to the loan frame (joined on the index),
# then discard the raw 'tags' column that the indicators replace.
joined = ds.join(tag_ds)
fullds = joined.drop(columns=['tags'])
del joined
fullds.head()

In [None]:
import gc

# fullds now carries the joined data, so drop the references to the two source
# frames and run a garbage-collection pass to release what they held.
del ds, tag_ds
gc.collect()

# Flatten

In [None]:
# Expand the list column 'lendingActions_values' so that each list element gets
# its own row; the values of the other columns are repeated on every such row.
fullds = fullds.explode('lendingActions_values')

In [None]:
# Promote the fields of each lending-action struct to top-level columns, then
# drop the struct column itself.
for action_field in ('latestSharePurchaseDate', 'lender', 'shareAmount'):
    fullds[action_field] = fullds['lendingActions_values'].struct.field(action_field)
fullds.drop(columns=['lendingActions_values'], inplace=True)

# Normalise the extracted values: purchase time as datetime at second
# resolution, share amount as 32-bit float.
purchase_time = cudf.to_datetime(fullds['latestSharePurchaseDate'])
fullds['latestSharePurchaseDate'] = purchase_time.astype('datetime64[s]')
del purchase_time
fullds['shareAmount'] = fullds['shareAmount'].astype('float32')

# The lender is itself a struct; flatten it into lender_* columns and drop it.
for lender_field in ('id', 'name', 'publicId'):
    fullds[f'lender_{lender_field}'] = fullds['lender'].struct.field(lender_field)
fullds.drop(columns=['lender'], inplace=True)

In [None]:
# Total memory footprint of the flattened frame in GiB (bytes divided by 2**30).
fullds.memory_usage().sum() / pow(2, 30)

In [None]:
# Persist the flattened frame in Feather format, relative to the current working directory.
fullds.to_feather('dfactions.feather')

In [None]:
# Also export the same frame as CSV, written 10,000 rows at a time.
fullds.to_csv('dfactions.csv', chunksize=10000)