In [1]:
import numpy as np
import pandas as pd
from tqdm import tqdm

tqdm.pandas()

# Import raw data
First, read the data from the `.jsonl` file into a pandas DataFrame.
Then store the DataFrame in `.parquet` format for easy access later.

In [2]:
%%script false --no-raise-error

# Read the sample extract. The full dump is considerably larger and lives at
# "../fulldata/kiva_activity_2023-08-28T11-09-39.jsonl"; swap the path to use it.
# Only the nested "loan" field of each record is kept, flattened into
# top-level columns whose names are joined with "_".
df = pd.read_json(
    "../data/kiva_activity_2023-08-28T11-04-30.jsonl", lines=True
).pipe(lambda raw: pd.json_normalize(raw["loan"], sep='_'))

In [3]:
%%script false --no-raise-error

# Cast the flattened columns to their working dtypes in a single pass:
# amounts become floats, ids become ints, names become categoricals.
df = df.astype({
    "loanAmount": float,
    "loanFundraisingInfo_fundedAmount": float,
    "geocode_country_name": "category",
    "sector_id": int,
    "sector_name": "category",
    "activity_id": int,
    "activity_name": "category",
})
# Parse both timestamp columns into datetimes.
df = df.assign(
    raisedDate=lambda frame: pd.to_datetime(frame["raisedDate"]),
    fundraisingDate=lambda frame: pd.to_datetime(frame["fundraisingDate"]),
)

In [4]:
%%script false --no-raise-error
# Persist the typed frame as parquet; the next cell reloads it from this same path.
# NOTE(review): the load cell above reads the sample file under ../data, while
# this writes under ../fulldata -- confirm the intended source before re-enabling.
df.to_parquet("../fulldata/kiva_activity_2023-08-28T11-09-39.parquet")

In [84]:
# Reload the cached parquet copy instead of re-parsing the raw JSONL.
df = pd.read_parquet("../fulldata/kiva_activity_2023-08-28T11-09-39.parquet")

In [85]:
# Discard rows in which every column is missing, then preview the end of the frame.
df = df.dropna(axis=0, how="all")
df.tail()

Unnamed: 0,id,name,fundraisingDate,raisedDate,loanAmount,tags,loanFundraisingInfo_fundedAmount,geocode_country_name,sector_id,sector_name,activity_id,activity_name,lendingActions_totalCount,lendingActions_values
2547758,3800,Anonymous,2007-01-16 23:10:03+00:00,2007-01-17 10:26:47+00:00,600.0,[],600.0,Kenya,1,Agriculture,61,Dairy,18,[{'latestSharePurchaseDate': '2007-01-17T03:14...
2547759,3799,Anonymous,2007-01-16 23:04:56+00:00,2007-01-17 18:59:15+00:00,125.0,[],125.0,Kenya,1,Agriculture,31,Farming,5,[{'latestSharePurchaseDate': '2007-01-17T02:01...
2547760,3797,Anonymous,2007-01-16 23:00:56+00:00,2007-01-16 23:06:31+00:00,150.0,[],150.0,Kenya,1,Agriculture,31,Farming,1,[{'latestSharePurchaseDate': '2007-01-16T23:06...
2547761,3796,Anonymous,2007-01-16 23:00:32+00:00,2007-01-17 00:47:14+00:00,300.0,[],300.0,Kenya,1,Agriculture,31,Farming,6,[{'latestSharePurchaseDate': '2007-01-16T23:09...
2547762,3795,Anonymous,2007-01-16 23:00:16+00:00,2007-01-17 18:41:08+00:00,750.0,[],750.0,Kenya,14,Construction,97,Cement,6,[{'latestSharePurchaseDate': '2007-01-16T23:25...


# Construct a Graph

The idea is to construct a graph with the following node types:
- `Lender`
- `Loan`
- `Tag`

With the following relationships:
- `Lender`s can `LEND` to `Loan`s
- `Loan`s can be `TAGGED_WITH` `Tag`s

Lenders have properties
- `id`
- `name`
- `publicId`

`Loan`s have properties
- `id`
- `name`
- `loanAmount`
- `fundedAmount`
- `postDate`
- `raisedDate`

`Tag` have properties:
- `name`

LEND's properties
- `shareAmount`
- `date`

TAGGED_WITH have no properties

## Construct a simple graph using `Cypher`

In [None]:
import os
from getpass import getpass

from neo4j import GraphDatabase

# Connection settings are taken from the environment so that no secret is
# stored in the notebook; the password falls back to an interactive prompt.
# NOTE(review): the password that used to be hardcoded in this cell should be
# rotated, since it remains visible in the notebook's history.
NEO4J_URI = os.environ.get("NEO4J_URI", "neo4j+s://6e2659a3.databases.neo4j.io")
NEO4J_USERNAME = os.environ.get("NEO4J_USERNAME", "neo4j")
NEO4J_PASSWORD = os.environ.get("NEO4J_PASSWORD") or getpass("Neo4j password: ")
NEO4J_DBNAME = os.environ.get("NEO4J_DBNAME", "neo4j")

driver = GraphDatabase.driver(NEO4J_URI, auth=(NEO4J_USERNAME, NEO4J_PASSWORD))
# Fail fast here rather than in a later cell if no connection can be established.
driver.verify_connectivity()

In [None]:
# Cypher: for every name in the `$alltags` list parameter, MERGE a `Tag` node
# with that name (MERGE reuses a matching node or creates one).
Q_CREATE_TAGS = """
WITH $alltags AS batch
UNWIND batch AS name
MERGE (tag:Tag {name: name});
"""

In [None]:
# Distinct tag names across all loans, with missing values excluded.
alltags = (
    df['tags']
    .explode()
    .dropna()
    .drop_duplicates()
    .tolist()
)
len(alltags)

In [None]:
# Send the whole tag list as the single `$alltags` parameter of the tag query.
with driver.session(database=NEO4J_DBNAME) as session:
    session.run(Q_CREATE_TAGS, alltags=alltags)

In [None]:
# Take the row labelled 8598 as a plain dict; the cells below unpack it (`**row`)
# as the parameters of each Cypher query.
row=df.loc[8598].to_dict()

In [None]:
# Cypher: upsert one `Loan` node keyed on its id, then set its properties.
# The id must be part of the MERGE pattern: a bare `MERGE (loan:Loan)` matches
# every existing Loan node, so each call would overwrite the nodes already
# present instead of creating one node per loan.
Q_CREATE_LOAN = """
MERGE (loan:Loan {id: $id})
SET loan.name = $name,
    loan.fundraisingDate = $fundraisingDate,
    loan.raisedDate = $raisedDate,
    loan.loanAmount = $loanAmount,
    loan.loanFundraisingInfo_fundedAmount = $loanFundraisingInfo_fundedAmount,
    loan.geocode_country_name = $geocode_country_name,
    loan.sector_id = $sector_id,
    loan.sector_name = $sector_name,
    loan.activity_id = $activity_id,
    loan.activity_name = $activity_name;
"""

In [None]:
# Create/update the sample loan; every key of `row` is passed as a query parameter.
with driver.session(database=NEO4J_DBNAME) as session:
    session.run(Q_CREATE_LOAN, **row)

In [None]:
# Cypher: for each name in `$tags`, match the existing `Tag` with that name and
# the `Loan` with id `$id`, then MERGE a TAGGED_WITH relationship between them.
# Both sides are MATCHed, so names without an existing Tag node produce no link.
Q_TAGGED_WITH = """
WITH $tags AS batch
UNWIND batch AS name
MATCH (tag:Tag {name: name})
MATCH (loan:Loan {id: $id})
WITH loan,tag
MERGE (loan)-[:TAGGED_WITH]->(tag);
"""

In [None]:
# Link the sample loan to its tags (uses the `id` and `tags` entries of `row`).
with driver.session(database=NEO4J_DBNAME) as session:
    session.run(Q_TAGGED_WITH, **row)

In [None]:
# Cypher: for the `Loan` with id `$id`, walk the `$lendingActions_values` list;
# MERGE a `Lender` per action (keyed on the lender id, refreshing name and
# publicId), then MERGE a LEND relationship carrying the share amount and the
# latest share purchase date.
Q_CREATE_LENDER = """
MATCH (loan:Loan {id: $id})
WITH loan, $lendingActions_values AS batch
UNWIND batch AS action
MERGE (lender:Lender {id: action.lender.id})
SET lender.name = action.lender.name,
    lender.publicId = action.lender.publicId
WITH loan, lender, action
MERGE (lender)-[r:LEND {shareAmount: action.shareAmount, date: action.latestSharePurchaseDate}]->(loan)
"""

In [None]:
# Create the lenders of the sample loan and their LEND relationships.
with driver.session(database=NEO4J_DBNAME) as session:
    session.run(Q_CREATE_LENDER, **row)

### Create indexes

**The above procedure works, but it is too slow.**

## Construct full graph using `neo4j-admin database import`


> The most efficient way of performing a first import of large amounts of data into a new database is the neo4j-admin database import command.
[batch_data_creation](https://neo4j.com/docs/python-manual/current/performance/#_batch_data_creation)

We now create 5 files like this:

`tags.csv`

```csv
name:ID,:LABEL
women,Tag
user_favorite,Tag
```

`lenders.csv`

```csv
id:ID,name,publicId,:LABEL
123,"dat","datnt527",Lender
```

`loans.csv`

```csv
id:ID,name,fundraisingDate:date,raisedDate:date,loanAmount:float,loanFundraisingInfo_fundedAmount:float,geocode_country_name,sector_id,sector_name,activity_id,activity_name,:LABEL
2622552,'Elsa','2023-08-18T04:40:27Z','2023-08-21T16:46:54Z','550.00','550.00','Philippines',14,'Construction',24, 'Construction Supplies',Loan
```

Relationship between `Lender` and `Loan`:

`lender_loan.csv`

```csv
:START_ID,:END_ID,shareAmount,date,:TYPE
123,2622552,25.0,2023-04-10 00:00:00,LEND
```

`loan_tags.csv`

```csv
:START_ID,:END_ID,:TYPE
2622552,women,TAGGED_WITH
```

In [7]:
# Build tags.csv: one `Tag` node row per distinct tag name.
# The empty-string tag is excluded here as well, consistent with the filter
# applied when loan_tags.csv is built, so no nameless Tag row is exported.
df_tags = (
    df[['tags']]
    .explode('tags')
    .dropna()
    .drop_duplicates()
    .loc[lambda tags: tags['tags'] != '']
    .rename(columns={'tags': 'name:ID'})
    .assign(**{':LABEL': 'Tag'})
)
df_tags.to_csv('../data/neo4jtry/tags.csv',index=False)

Duplicated loans: some rows share the same `id` but differ in `loanFundraisingInfo_fundedAmount`, possibly because they were fetched at different query times.

In [110]:
# Keep only loan-level columns (the tag and lending-action columns are left
# out), then remove rows that are exact repeats of one another.
df_loan = (
    df
    .drop(columns=['tags', 'lendingActions_totalCount', 'lendingActions_values'])
    .drop_duplicates()
)

In [111]:
# Split the loans on whether their id occurs more than once (same id, differing
# funded amount -- maybe because of the query time).
repeat_mask = df_loan.duplicated(subset=['id'])
duplicated_loan_id = df_loan.loc[repeat_mask, 'id']
shares_repeated_id = df_loan['id'].isin(duplicated_loan_id)
# every row of a repeated id, first occurrence included
duplicated_loan = df_loan[shares_repeated_id]
# rows whose id occurs exactly once
df_loan = df_loan[~shares_repeated_id]

`df_loan` now contains no duplicated ids. Next, fix `duplicated_loan`:
for rows that share an `id` but have different `loanFundraisingInfo_fundedAmount`, keep the one with the maximum value.

In [112]:
# For each duplicated id keep the single row with the largest funded amount.
# Selecting rows by the per-group idxmax label keeps each column's dtype and
# avoids running a Python function once per group.
# NOTE(review): relies on the index labels of `duplicated_loan` being unique.
best_row_labels = duplicated_loan.groupby('id')['loanFundraisingInfo_fundedAmount'].idxmax()
temp = duplicated_loan.loc[best_row_labels]
df_loan = pd.concat([df_loan, temp])

In [113]:
# Attach the `Loan` label column, rename the key column to the import header
# for the Loan-ID id space, and export without the index.
df_loan = (
    df_loan
    .assign(**{':LABEL': 'Loan'})
    .rename(columns={'id': 'id:ID(Loan-ID)'})
)
df_loan.to_csv('../data/neo4jtry/loans.csv',index=False)

In [131]:
# One (loan, tag) pair per row for loan_tags.csv.
df_loan_tags = (
    df[['id', 'tags']]
    .explode('tags')
    .dropna()
    .drop_duplicates()
    # skip the empty-string tag
    .loc[lambda pairs: pairs['tags'] != '']
    .rename(columns={'id': ':START_ID(Loan-ID)', 'tags': ':END_ID'})
    .assign(**{':TYPE': 'TAGGED_WITH'})
)
df_loan_tags.to_csv('../data/neo4jtry/loan_tags.csv', index=False)

In [57]:
# Pull the `lender` dict out of every lending action, flatten the dicts into
# columns, and drop rows that are exact repeats.
# The extraction runs over the exploded Series directly; the previous
# DataFrame.apply(axis=1) built a row object for each of the ~52M actions.
# Labelling and export happen in a later cell, after duplicate ids are resolved.
df_lender = pd.json_normalize(
    df['lendingActions_values']
    .explode()
    .dropna()
    .progress_apply(lambda action: action['lender'])
).drop_duplicates()

100%|██████████| 52028372/52028372 [02:54<00:00, 298515.97it/s]


In [71]:
# Resolve lenders whose id appears on more than one row.
duplicated_lender = df_lender.loc[df_lender.duplicated(subset=['id']), 'id']
# Ids that have at least one row carrying a publicId.
ids_with_public_id = df_lender.loc[df_lender['publicId'].notna(), 'id']
# Among duplicated ids, discard the rows that lack a publicId -- but only when
# some row for that id does have one. Without that guard, a lender whose
# duplicate rows all lack a publicId would lose every row, and the LEND
# relationships built from the lending actions would reference a missing node.
should_remove = df_lender[
    df_lender['id'].isin(duplicated_lender)
    & df_lender['publicId'].isna()
    & df_lender['id'].isin(ids_with_public_id)
]
df_lender = df_lender.drop(index=should_remove.index)
# Ids can still repeat (might be because a user changed name and publicId);
# keep the first row seen for each id.
df_lender = df_lender.drop_duplicates(subset='id')

In [72]:
# Rename the key column to the import header for the Lender-ID id space,
# attach the `Lender` label column, and export without the index.
df_lender = (
    df_lender
    .rename(columns={'id': 'id:ID(Lender-ID)'})
    .assign(**{':LABEL': 'Lender'})
)
df_lender.to_csv('../data/neo4jtry/lenders.csv',index=False)

In [15]:
# Build the LEND relationship rows: one row per (loan, lending action).
lending = df[['id', 'lendingActions_values']].explode('lendingActions_values').dropna()
actions = lending['lendingActions_values']
df_lender_loan = (
    lending[['id']]
    .assign(
        lender_id=actions.progress_apply(lambda action: action['lender']['id']),
        shareAmount=actions.progress_apply(lambda action: action['shareAmount']),
        date=actions.progress_apply(lambda action: action['latestSharePurchaseDate']),
    )
    # De-duplicate THIS frame. The earlier code called drop_duplicates on
    # df_loan_tags here, which left repeated relationship rows in place.
    .drop_duplicates()
    .assign(**{':TYPE': 'LEND'})
    .rename(columns={'lender_id': ':START_ID(Lender-ID)', 'id': ':END_ID(Loan-ID)'})
)
# Release the large exploded intermediates.
del lending, actions
# NOTE(review): the export is still disabled, as before -- re-enable once the
# output size is acceptable.
# df_lender_loan.to_csv('../data/neo4jtry/lender_loan.csv', index=False)

100%|██████████| 52028372/52028372 [03:02<00:00, 284700.55it/s]
100%|██████████| 52028372/52028372 [02:55<00:00, 297087.34it/s]
100%|██████████| 52028372/52028372 [03:00<00:00, 288700.72it/s]
