In [None]:
# Comma-separated GPU ids exported through CUDA_VISIBLE_DEVICES below.
DEVICES = "2,3"

import os
# Set before `import cudf` so the variable is already in the environment when cudf is loaded.
os.environ["CUDA_VISIBLE_DEVICES"] = DEVICES

import numpy as np
import pandas as pd
from tqdm import tqdm
import cudf

# Register tqdm's `progress_apply` on pandas objects (used further down on `adf`).
tqdm.pandas()

# Import raw data
First, read the raw `.jsonl` file into a pandas DataFrame.
Then store the DataFrame in `.parquet` format for easy access later.

In [None]:
%%script false --no-raise-error

# Disabled cell: `%%script false` keeps the body from being run as Python.
# Read the line-delimited JSON export, then flatten the nested `loan` records into
# columns whose name parts are joined with '_'.
df = pd.read_json("../fulldata/kiva_activity_2023-08-28T11-09-39.jsonl", lines=True)
df = pd.json_normalize(df["loan"], sep='_')

In [None]:
%%script false --no-raise-error

# Disabled cell: cast the flattened loan columns to their working dtypes,
# grouped by target type.
for amount_col in ("loanAmount", "loanFundraisingInfo_fundedAmount"):
    df[amount_col] = df[amount_col].astype(float)

for date_col in ("raisedDate", "fundraisingDate"):
    df[date_col] = pd.to_datetime(df[date_col])

for id_col in ("sector_id", "activity_id"):
    df[id_col] = df[id_col].astype(int)

for label_col in ("geocode_country_name", "sector_name", "activity_name"):
    df[label_col] = df[label_col].astype("category")

In [None]:
%%script false --no-raise-error
# Disabled cell: write the typed DataFrame to parquet; the following cell reads this same path.
df.to_parquet("../fulldata/kiva_activity_2023-08-28T11-09-39.parquet")

In [None]:
# Load the parquet copy of the raw data as a cuDF DataFrame.
ds = cudf.read_parquet("../fulldata/kiva_activity_2023-08-28T11-09-39.parquet")

In [None]:
# Drop rows in which every column is null, then preview the last rows.
ds.dropna(axis=0, how="all", inplace=True)
ds.tail()

# Filter

Keep only the loans located in `Vietnam`.
Why? The full dataset has a lot of rows, so we localize the task to a single country.

In [None]:
# Number of rows whose country is Vietnam.
ds['geocode_country_name'].value_counts()['Vietnam']

In [None]:
# Keep only the Vietnam rows. The index is not reset, so the original row labels are retained.
ds = ds[ds['geocode_country_name'] == 'Vietnam']

In [None]:
ds.head()

# Construct a Graph

The idea is to construct a graph with the following node types:
- `Lender`
- `Loan`
- `Tag`

With following relationships
- `Lender`s can `LEND` to `Loan`s
- `Loan`s can be `TAGGED_WITH` `Tag`s

Lenders have properties
- `id`
- `name`
- `publicId`

Loan have properties
- `id`
- `name`
- `loanAmount`
- `fundedAmount`
- `postDate`
- `raisedDate`

`Tag` have properties:
- `name`

LEND's properties
- `shareAmount`
- `date`

TAGGED_WITH have no properties

## Remove duplicated `loan`

There are loans that share the same `id` but have different `fundedAmount` values.
This is probably because they were captured at different query times.
Here, keep only the record with the highest `fundedAmount` for each `id`.

In [None]:
# Look at two rows by index label (presumably an example pair sharing one loan `id` — confirm).
ds.loc[[9628, 1366545]]

In [None]:
# For every loan `id`, locate the row with the largest funded amount and keep only those rows.
temp = ds.groupby('id', group_keys=False)[['loanFundraisingInfo_fundedAmount']].idxmax()
# NOTE(review): the idxmax result is consumed positionally via `.iloc` (author's note: "just iloc, not loc").
# `ds` was boolean-filtered earlier without resetting its index, so labels and positions differ —
# confirm that cuDF's groupby idxmax really yields row positions here.
iloc = temp['loanFundraisingInfo_fundedAmount'].values
ds = ds.iloc[iloc]
del iloc
del temp
ds.loc[[9628, 1366545]] # re-check the same labels: only the row with the higher fundedAmount should remain

In [None]:
# Sanity check: list rows whose `id` still occurs more than once (expected to be empty).
ds[ds.duplicated(subset=['id'], keep=False)].sort_values(by=['id'])

## create `lender-loan-tag` df

In [None]:
# One row per (loan, tag, lending action): explode both list columns, then free `ds`.
ads = ds.explode('tags').explode('lendingActions_values')
del ds
len(ads)

In [None]:
ads['tags'] = ads['tags'].astype('category')

In [None]:
# Drop rows without a lending action, i.e. loans that have no lender.
ads.dropna(subset=['lendingActions_values'], inplace=True)

In [None]:
# The nested dict values cannot be processed in cuDF, so convert to pandas for the extraction step.
adf = ads.to_pandas()

In [None]:
# Pull the nested lender / share fields out of each `lendingActions_values` record.
# Only that one column is needed, so apply over the Series instead of materialising a full
# row object per record with `DataFrame.progress_apply(axis=1)`; the resulting values and
# index are the same.
lending_actions = adf['lendingActions_values']
adf['lender_id'] = lending_actions.progress_apply(lambda action: action['lender']['id']).astype(int)
adf['lender_name'] = lending_actions.progress_apply(lambda action: action['lender']['name'])
adf['lender_publicId'] = lending_actions.progress_apply(lambda action: action['lender']['publicId'])
adf['shareAmount'] = lending_actions.progress_apply(lambda action: action['shareAmount']).astype(float)
adf['date'] = pd.to_datetime(lending_actions.progress_apply(lambda action: action['latestSharePurchaseDate']))
# Do not leave the helper reference behind in the notebook namespace.
del lending_actions

In [None]:
# cuDF does not handle timezone-aware datetimes yet, so strip the timezone information.
adf['date'] = adf['date'].dt.tz_localize(None)

In [None]:
# Move the frame back to the GPU and free the pandas copy.
ads = cudf.from_pandas(adf)
del adf

In [None]:
# The nested column is no longer needed now that its fields are separate columns.
ads.drop(['lendingActions_values'], axis=1, inplace=True)

In [None]:
ads.drop_duplicates(inplace=True)

In [None]:
# Checkpoint: the next cell reloads this file.
ads.to_feather("ads.feather")

In [None]:
# Checkpoint cell: repeats the setup of the first cell and reloads the frame saved to
# "ads.feather" (presumably so the notebook can be resumed from here after a kernel restart).
DEVICES = "2,3"

import os
os.environ["CUDA_VISIBLE_DEVICES"] = DEVICES

import numpy as np
import pandas as pd
from tqdm import tqdm
import cudf

tqdm.pandas()

ads = cudf.read_feather("ads.feather")

## Remove some tags 
The following tags should be removed:  
- `tag_`
- `user_favorite`
- `user_like`
- `volunteer_like`
- `volunteer_pick`

In [None]:
# Make sure each tag that is filtered on below exists as a category of `tags`
# (presumably so the comparisons do not fail when a tag is absent from the data — confirm).
for tag_name in ('user_like', 'user_favorite', 'volunteer_like', 'volunteer_pick'):
    if tag_name not in ads['tags'].cat.categories:
        ads['tags'] = ads['tags'].cat.add_categories([tag_name])

In [None]:
# Row counts for each of the four tags that are about to be removed.
(ads['tags'] == 'user_favorite').sum(), (ads['tags'] == 'user_like').sum(), (ads['tags'] == 'volunteer_like').sum(), (ads['tags'] == 'volunteer_pick').sum()

In [None]:
# Keep only the rows whose tag is not one of those four.
ads = ads[~ads['tags'].isin(['user_favorite', 'user_like', 'volunteer_like', 'volunteer_pick'])]

## create `Tag` nodes

In [None]:
# Build the `Tag` node file: one row per distinct, non-null tag, with the tag name as node ID.
tag_nodes = (
    ads[['tags']]
    .drop_duplicates()
    .dropna()
    .rename(columns={'tags': 'name:ID'})
)
tag_nodes[':LABEL'] = 'Tag'
tag_nodes.to_csv('../data/neo4jtry/tags.csv',index=False)
del tag_nodes

## create `Loan` nodes

In [None]:
# One row per loan: remove the tag and lender/share columns, then de-duplicate what is left.
ds_loan = ads.drop(['tags', 'lendingActions_totalCount', 'lender_id', 'lender_name', 'lender_publicId', 'shareAmount', 'date'], axis=1).drop_duplicates()

In [None]:
# Add the node label, turn `id` into the ID column of the `Loan-ID` space, and write the node file.
ds_loan[':LABEL'] = 'Loan'
ds_loan.rename(columns={'id': 'id:ID(Loan-ID)'}, inplace=True)
ds_loan.to_csv('../data/neo4jtry/loans.csv',index=False)
del ds_loan

## create `Lender` nodes

In [None]:
# Distinct (id, name, publicId) combinations of lenders.
ds_lender = ads[['lender_id', 'lender_name', 'lender_publicId']].drop_duplicates()
ds_lender.tail(2)

In [None]:
# Lenders whose `lender_id` appears with more than one name/publicId combination.
ds_lender[ds_lender.duplicated(subset=['lender_id'], keep=False)].sort_values(by=['lender_id'])

In [None]:
# NOTE(review): these three statements are repeated verbatim at the top of the next cell.
duplicated_lender_id = ds_lender[ds_lender.duplicated(subset=['lender_id'])]['lender_id']
should_remove = ds_lender[(ds_lender['lender_id'].isin(duplicated_lender_id)) & (ds_lender['lender_publicId'].isna())]
ds_lender.drop(should_remove.index, axis=0, inplace=True)

In [None]:
# Among duplicated lender ids, drop the rows whose publicId is missing.
duplicated_lender_id = ds_lender[ds_lender.duplicated(subset=['lender_id'])]['lender_id']
should_remove = ds_lender[(ds_lender['lender_id'].isin(duplicated_lender_id)) & (ds_lender['lender_publicId'].isna())]
ds_lender.drop(should_remove.index, axis=0, inplace=True)
# Duplicates can remain, possibly because a user changed name and publicId; keep one row per lender_id.
ds_lender.drop_duplicates(subset='lender_id', inplace=True)
del duplicated_lender_id
del should_remove
# Show any remaining duplicated lender ids (expected to be empty).
ds_lender[ds_lender.duplicated(subset=['lender_id'], keep=False)]

In [None]:
# Write the `Lender` node file.
# `ds_lender` has the columns lender_id / lender_name / lender_publicId, so the ID header must be
# created from `lender_id`; renaming a non-existent `id` column would silently do nothing and leave
# the file without the `:ID(Lender-ID)` column that the `(Lender-ID)` relationship files refer to.
ds_lender.rename(columns={'lender_id': 'id:ID(Lender-ID)'}, inplace=True)
ds_lender[':LABEL'] = 'Lender'
ds_lender.to_csv('../data/neo4jtry/lenders.csv',index=False)
del ds_lender

## Create `TAGGED_WITH` relationship between `Loan` and `Tags`

In [None]:
# (loan id, tag) pairs; `dropna()` returns a new frame, so no in-place edit is made on a slice of `ads`.
ds_loan_tags = ads[['id', 'tags']].dropna()
ds_loan_tags.isna().sum()

In [None]:
'the number of loans is ', len(ads[['id']].drop_duplicates())

In [None]:
# The same (loan, tag) pair occurs once per lender after the explode step; keep one copy.
ds_loan_tags.drop_duplicates(inplace=True)
ds_loan_tags.duplicated().sum()

In [None]:
'the number of loan-tag relationships is', len(ds_loan_tags)

In [None]:
ds_loan_tags['tags'].value_counts()

In [None]:
# Relationship file: Loan (Loan-ID space) -> Tag (ID without an explicit ID space).
ds_loan_tags.rename(columns={'id': ':START_ID(Loan-ID)', 'tags': ':END_ID'}, inplace=True)
ds_loan_tags[':TYPE'] = 'TAGGED_WITH'
ds_loan_tags.to_csv('../data/neo4jtry/loan_tags.csv', index=False)
del ds_loan_tags

## create `LEND` relationship between `Lender` and `Loan`

In [None]:
# Lender -> loan rows with share amount and date; rows with any missing value are dropped.
ds_lender_loan = ads[['id', 'lender_id', 'shareAmount', 'date']]
ds_lender_loan.dropna(inplace=True)
ds_lender_loan.tail(5)

In [None]:
ds_lender_loan.drop_duplicates(inplace=True) # duplicates are caused by exploding the tags

In [None]:
# Relationship file: Lender (Lender-ID space) -> Loan (Loan-ID space), typed LEND.
ds_lender_loan[':TYPE'] = 'LEND'
ds_lender_loan.rename(columns={'lender_id': ':START_ID(Lender-ID)', 'id':':END_ID(Loan-ID)'}, inplace=True)
ds_lender_loan.to_csv('../data/neo4jtry/lender_loan.csv', index=False)
del ds_lender_loan

## Manually create the `SHARES_LOAN` relationship

In [None]:
# Rebuild the de-duplicated lender/loan rows, then keep only the (loan id, lender id) columns.
ds_lender_loan = ads[['id', 'lender_id', 'shareAmount', 'date']]
ds_lender_loan.dropna(inplace=True)
ds_lender_loan.drop_duplicates(inplace=True) # duplicates are caused by exploding the tags
ds_lender_loan.drop(['shareAmount', 'date'], axis=1, inplace=True)
ds_lender_loan.tail(2)

In [None]:
# Checkpoint to disk; the next cell reads the same file back.
ds_lender_loan.to_feather("ds_lender_loan.feather")

In [None]:
ds_lender_loan = cudf.read_feather("ds_lender_loan.feather")

In [None]:
ds_lender_loan.info()

In [None]:
# Number of distinct lenders.
ds_lender_loan.lender_id.nunique()

In [None]:
ds_lender_loan.id.unique()

In [None]:
# Ad-hoc look at the loan ids tagged '#Married'; the result is not assigned or used afterwards.
ads[ads['tags'] == '#Married'].id.unique()

In [None]:
# Number of distinct loans.
ds_lender_loan.id.nunique()

In [None]:
# Rough size estimate for the self-join: (distinct lenders / distinct loans)^2 * distinct loans.
"expected number of row of self_merge is", pow(ds_lender_loan.lender_id.nunique()/ds_lender_loan.id.nunique(), 2) * ds_lender_loan.id.nunique()

In [None]:
# Join the lender/loan table with itself on the loan id: every row pairs two lenders of one loan.
self_merged = ds_lender_loan.merge(ds_lender_loan, on='id')
del ds_lender_loan
self_merged.head(3)

In [None]:
# Keep each unordered lender pair once and discard a lender paired with itself.
self_merged = self_merged[self_merged['lender_id_x'] > self_merged['lender_id_y']]

In [None]:
# Count the distinct loan ids per lender pair.
shares_loan = self_merged.groupby(['lender_id_x', 'lender_id_y']).nunique().reset_index()
shares_loan.rename(columns={'id': "number_common_loans"}, inplace=True)
del self_merged
shares_loan.head()

In [None]:
len(shares_loan)

In [None]:
# Relationship file: Lender -> Lender (both in the Lender-ID space), typed SHARES_LOAN.
# The count column was already renamed from `id` to `number_common_loans` when `shares_loan`
# was built, so an additional `'id': 'weight'` mapping here could never match and is left out;
# the property is written under the name `number_common_loans`.
shares_loan.rename(
    columns={
        'lender_id_x': ':START_ID(Lender-ID)',
        'lender_id_y': ':END_ID(Lender-ID)',
        }, inplace=True)
shares_loan[':TYPE'] = 'SHARES_LOAN'
shares_loan.to_csv('../data/neo4jtry/lender_lender_share_loan.csv', index=False)
del shares_loan

## Manually create the `INTEREST` relationship

In [None]:
# Columns needed to relate lenders to the tags of the loans they funded.
lender_tag_ds = ads[['id', 'tags', 'lender_id', 'shareAmount', 'date']]
lender_tag_ds.tail(3)

In [None]:
lender_tag_ds.duplicated().sum()

In [None]:
# Per (lender, tag) pair, count the non-null values of each remaining column.
lender_tag_ds = lender_tag_ds.groupby(['lender_id', 'tags']).count()
lender_tag_ds.reset_index(inplace=True)
lender_tag_ds

In [None]:
# Keep only the count taken from the `id` column and expose it as the edge weight.
lender_tag_ds.drop(['shareAmount', 'date'], axis=1, inplace=True)
lender_tag_ds.rename(columns={'id': 'weight'}, inplace=True)
lender_tag_ds.sort_values(by=['weight'], ascending=False).head()

In [None]:
# Relationship file: Lender (Lender-ID space) -> Tag, typed INTEREST.
# The Tag node file declares its ID column as plain `name:ID` (no ID space) and the TAGGED_WITH
# file targets it with plain `:END_ID`, so the tag end of this relationship must use the same
# un-grouped `:END_ID` header; an `(Tag-ID)` space is not defined by any node file.
# `:TYPE` is added to `lender_tag_ds` itself; the renamed copy is only used for the export.
lender_tag_ds[':TYPE'] = 'INTEREST'
lender_tag_ds.rename(columns={'lender_id': ':START_ID(Lender-ID)', 'tags':':END_ID'})\
    .to_csv('../data/neo4jtry/lender_tag.csv', index=False)

## Manually create the `SHARE_TAGS` relationship

In [None]:
# Reduce the frame to the bare (lender_id, tags) pairs.
lender_tag_ds.drop(['weight', ':TYPE'], axis=1, inplace=True)
lender_tag_ds.head()

In [None]:
lender_tag_ds.info()

In [None]:
import gc
gc.collect()

In [None]:
# Size of `ads`, expressed in units of 2**30 bytes.
ads.memory_usage().sum() / pow(2, 30)

In [None]:
# Size of `lender_tag_ds`, expressed in units of 2**30 bytes.
lender_tag_ds.memory_usage().sum() / pow(2, 30)

In [None]:
# Checkpoint: the "Re-load data" cell reads this file back.
lender_tag_ds.to_csv('temp.csv', index=False)

### Re-load data

In [None]:
# Checkpoint cell: set up the environment again and reload the (lender_id, tags) pairs from temp.csv.
# NOTE(review): DEVICES is "1,2" here while the earlier setup cells use "2,3" — confirm this is intended.
DEVICES = "1,2"
n_devices = len(DEVICES.split(','))

import os
os.environ["CUDA_VISIBLE_DEVICES"] = DEVICES

import numpy as np
import pandas as pd
from tqdm import tqdm
import cudf
import dask_cudf

tqdm.pandas()

lender_tag_ds : cudf.DataFrame = cudf.read_csv('temp.csv')
# Re-apply the working dtypes after the CSV round trip.
lender_tag_ds['lender_id'] = lender_tag_ds['lender_id'].astype('uint32')
lender_tag_ds['tags'] = lender_tag_ds['tags'].astype('category')
lender_tag_ds.info()

In [None]:
# Label-encode the tags for easy integration with `parquet` and later programming.
# Remember to store the label encoder (`le`) if the original tag names are needed again.
# Import through the documented public path `cuml.preprocessing` rather than the internal
# `cuml.preprocessing.LabelEncoder` module path.
from cuml.preprocessing import LabelEncoder
le = LabelEncoder()
lender_tag_ds['tags'] = le.fit_transform(lender_tag_ds['tags'])

In [None]:
# Order the rows by encoded tag.
lender_tag_ds = lender_tag_ds.sort_values(by=['tags'])

In [None]:
lender_tag_ds.info()

In [None]:
# Number of distinct lenders.
lender_tag_ds.lender_id.nunique()

In [None]:
# Number of distinct tags.
lender_tag_ds.tags.nunique()

In [None]:
# Rough size estimate for the self-join on tags: (distinct lenders / distinct tags)^2 * distinct tags.
exp_row = pow(lender_tag_ds.lender_id.nunique()/lender_tag_ds.tags.nunique(), 2) * lender_tag_ds.tags.nunique()
"expected number of row of self_merged is", exp_row

### Using SQL

In [None]:
# Engine for the database that will hold the `lender_tag` table.
# The connection URL is read from the LENDER_TAG_DB_URL environment variable so credentials do
# not have to live in the notebook; the fallback is the local PostgreSQL instance this notebook
# was originally pointed at. For a file-based alternative set
# LENDER_TAG_DB_URL='sqlite:///lender_tag.sqlite'.
# (Previously a SQLite engine was created and immediately overwritten, so it was never used.)
from sqlalchemy import create_engine
engine = create_engine(
    os.environ.get('LENDER_TAG_DB_URL', 'postgresql+psycopg2://postgres:postgres@localhost:32772/datdb'),
    echo=False,
)

In [None]:
# Copy the frame to pandas and write it to the `lender_tag` table through `engine`.
# NOTE(review): no `if_exists` is passed, so re-running this cell presumably fails once the table exists — confirm.
lender_tag_ds.to_pandas().to_sql(name='lender_tag', con=engine, index=False)

### Using `Dask-cuDF`

In [None]:
from dask_cuda import LocalCUDACluster
from dask.distributed import Client
# Start a local Dask CUDA cluster on the GPUs listed in DEVICES and connect a client to it.
cluster = LocalCUDACluster(CUDA_VISIBLE_DEVICES=DEVICES, memory_limit="auto", device_memory_limit="auto", n_workers=None)
client = Client(cluster)
client

Define the computation graph and start executing it in the background.

In [None]:
# lds = lender_tag_ds.set_index('tags')
lds = lender_tag_ds
# Split the frame into 1024 partitions.
# NOTE(review): `tags` is not the index here (the set_index line above is commented out).
ddf : dask_cudf.DataFrame = dask_cudf.from_cudf(lds, npartitions=1024)
# ddf : dask_cudf.DataFrame = dask_cudf.from_cudf(lds, chunksize=1024)
print("number of divisior", len(ddf.divisions))

# Self-join on the tag: every row pairs two lenders that have the same tag.
merged : dask_cudf.DataFrame = ddf.merge(ddf, on='tags', npartitions=8192)
print("number of divisior", len(merged.divisions))
# merged = merged.repartition(npartitions=8192)
# print("number of divisior", len(merged.divisions))

# Keep each unordered lender pair once and discard a lender paired with itself.
filtered : dask_cudf.DataFrame = merged[merged['lender_id_x'] > merged['lender_id_y']]
# filtered = filtered.repartition(npartitions=1024)
# filtered = filtered.reset_index() # should avoid this
print("number of divisior", len(filtered.divisions))
# Hand the filtered collection to the cluster via `client.persist`.
filtered = client.persist(filtered)


def nunique(series):
    """Return the number of distinct values in `series` (delegates to its `.nunique()` method)."""
    distinct_count = series.nunique()
    return distinct_count

# Per lender pair, count the distinct tags they have in common (`nunique` applied to each group).
share_tags_ds_dask_collection : dask_cudf.DataFrame = filtered.groupby(['lender_id_x', 'lender_id_y'], sort=False).tags.apply(nunique, meta=('tags', 'int64'))
print("number of divisior", len(share_tags_ds_dask_collection.divisions))

# Hand the grouped result to the cluster via `client.persist`, then show its task graph.
share_tags_ds_dask_collection_persist = client.persist(share_tags_ds_dask_collection)
share_tags_ds_dask_collection_persist.dask

In [None]:
%%script false --no-raise-error

# Disabled cell: alternative version of the pipeline above (sorted input, repartition after the merge).
# NOTE(review): it redefines `nunique` and `share_tags_ds_dask_collection`; enabling it would shadow
# the definitions from the previous cell.
lender_tag_ds = lender_tag_ds.sort_values(by=['tags', 'lender_id'])
lender_tag_ds.reset_index(drop=True, inplace=True) # make sure the input index is monotonically-increasing
ddf : dask_cudf.DataFrame = dask_cudf.from_cudf(lender_tag_ds, sort=False, chunksize=1024, npartitions=1024) # npartitions < number of unique tags
# ddf : dask_cudf.DataFrame = dask_cudf.from_cudf(lender_tag_ds, npartitions=32) # nparition < number of unique tags
merged : dask_cudf.DataFrame = ddf.merge(ddf, on='tags')
merged = merged.repartition(npartitions=8192)
# merged_meta = cudf.DataFrame(columns=['lender_id_x', 'lender_id_y', 'tags'], dtype={'lender_id_x': 'uint32', 'lender_id_y': 'uint32', 'tags': 'category'})
filtered : dask_cudf.DataFrame = merged[merged['lender_id_x'] > merged['lender_id_y']]
# filtered : dask_cudf.DataFrame = filtered.repartition(npartitions=8192)


def nunique(series):
    return series.nunique()

share_tags_ds_dask_collection : dask_cudf.DataFrame = filtered.groupby(['lender_id_x', 'lender_id_y'], sort=False).tags.apply(nunique, meta=('tags', 'int64'))


Execute the computation in the background.

In [None]:
# Dump the persisted collection's task graph as a plain dict for inspection.
dict(share_tags_ds_dask_collection_persist.dask)

In [None]:
# Look up one entry of the graph by its key and fetch its result.
# NOTE(review): the key (name with a hash suffix, partition 999) is hard-coded; it presumably
# changes between runs, so this lookup is likely to need updating — confirm.
fut = share_tags_ds_dask_collection_persist.dask[('nunique-af4acf0676342f0939f34bea6059690e', 999)]
fut.result()

In [None]:
# filtered_gathered = client.gather(filtered_persist)

In [None]:
share_tags_ds_gathered = client.gather(share_tags_ds_dask_collection_persist)

In [None]:
# Materialise the result of the distributed computation.
share_tags_ds = share_tags_ds_gathered.compute()

In [None]:
share_tags_ds = share_tags_ds.rename('common_tags_count')

In [None]:
# Turn the named series into a one-column frame so it can be written to parquet.
share_tags_ds = share_tags_ds.to_frame()

In [None]:
share_tags_ds.to_parquet("share_tags_ds_1.parquet")

In [None]:
# Read the saved result back (the variable name `lazada` is unrelated to its content).
lazada = cudf.read_parquet("share_tags_ds_1.parquet")

In [None]:
# Summary of the result ordered by descending number of common tags.
lazada.sort_values('common_tags_count', ascending=False).reset_index().info()

# `cuGRAPH`

In [None]:
import cugraph as cnx

# Create an empty cuGraph graph; nothing is added to it in the visible part of the notebook.
G = cnx.Graph()