In [None]:
import numpy as np
import pandas as pd
import dtale  # for visualize
from pathlib import Path
import plotly.express as px
import matplotlib.pyplot as plt
from tqdm import tqdm
# sklearn
from sklearn.preprocessing import OneHotEncoder
import networkx as nx

# Register tqdm's `progress_*` methods (e.g. `progress_apply`) on pandas objects.
tqdm.pandas()

# Import raw data
First, read the data from the `.jsonl` file into a pandas DataFrame.
Then store the DataFrame in `.parquet` format for faster access later.

In [None]:
# Read the crawled activity dump (one JSON document per line) and flatten the
# nested "loan" records into columns; nested keys are joined with "_".
df = pd.json_normalize(
    pd.read_json(
        "../data/kiva_activity_2023-08-28T11-04-30.jsonl",
        lines=True,
    )["loan"],
    sep='_',
)
# Optional: persist the flattened frame for faster reloads.
# df.to_parquet("../fulldata/kiva_activity_2023-08-28T10-16-29.parquet")


In [None]:
# df = pd.read_parquet("../fulldata/kiva_2023-08-10T17-57-12.parquet")
# Drop rows in which every column is NaN (24 such rows were observed; the
# cause is unknown). Rebinding the name instead of using `inplace=True`
# avoids mutating a frame that another name might still reference.
df = df.dropna(axis=0, how="all")

In [None]:
# Preview the last rows of the flattened frame.
df.tail()

Store the names of the interesting columns for easy access.

## Keep only the successful loans

In [None]:
# A loan counts as successful when the funded amount equals the requested amount.
success = df['loanAmount'] == df['loanFundraisingInfo_fundedAmount']
counts = success.value_counts()
# Share of successful loans. The mean of a boolean Series is the fraction of
# True values, so this still works when only one outcome is present, whereas
# indexing `counts[True]` / `counts[False]` would raise a KeyError.
success.mean()

In [None]:
# Restrict the frame to the rows flagged as successful (boolean-mask selection).
df = df.loc[success]

## Drop some NaN

In [None]:
# Count the missing values in each column.
df.isna().sum()

The NaN count result is
| column name | NaN count |
|-------------|-----------| 
|loanAmount                       | 0 |
|loanFundraisingInfo_fundedAmount | 0 |
|raisedDate                       | 2 |
|fundraisingDate                  | 0 |
|tags                             | 0 |
|disbursalDate                    | 4 |

It would be OK to remove these minor NaNs (only a handful of rows are affected).

## Construct a Graph

## Let's construct a simple graph

In [None]:
from enum import StrEnum
from pydantic import BaseModel, ValidationError, Field
from typing import Optional, Any, List, Annotated
from datetime import datetime

def _to_hashable(value):
    """Return a hashable stand-in for ``value``.

    Lists and tuples become tuples, sets become frozensets and dicts become
    frozensets of ``(key, value)`` pairs, all recursively. Any other value is
    returned unchanged.
    """
    if isinstance(value, dict):
        return frozenset((key, _to_hashable(item)) for key, item in value.items())
    if isinstance(value, (list, tuple)):
        return tuple(_to_hashable(item) for item in value)
    if isinstance(value, (set, frozenset)):
        return frozenset(_to_hashable(item) for item in value)
    return value


class BaseHashedModel(BaseModel):
    """Pydantic model that defines ``__hash__`` so instances can be graph nodes.

    The hash combines the concrete class with the instance's field values.
    Container values (list / dict / set) are frozen first; hashing them
    directly would raise ``TypeError: unhashable type``.
    """

    def __hash__(self):
        frozen_values = tuple(_to_hashable(value) for value in self.__dict__.values())
        # Including the type keeps instances of different models with equal
        # field values from hashing alike.
        return hash((type(self),) + frozen_values)

# Alias for tag values: `str` annotated with a free-form note.
# The notebook calls `TagType(...)` to build tag nodes; presumably the call is
# forwarded to `str`, so tag nodes are plain strings — TODO confirm.
TagType = Annotated[str, "just tags"]

class NodeType(StrEnum):
    """String enum naming the three node kinds: loan, lender and tag.

    NOTE(review): not referenced by any other visible cell.
    """
    LOAN = "loan"
    LENDER = "lender"
    TAG = "tag"

class Lender(BaseHashedModel):
    """A lender, used as a node in the graph."""

    id: int
    name: Optional[str]
    publicId: Optional[str] = Field(description="can be None if anonymous")

    def __str__(self) -> str:
        """Return a printable label for this lender.

        ``publicId`` may be ``None`` (anonymous lender). Returning ``None``
        from ``__str__`` raises ``TypeError``, so fall back to ``name`` and
        finally to a placeholder built from ``id``.
        """
        if self.publicId is not None:
            return self.publicId
        if self.name is not None:
            return self.name
        return f"lender-{self.id}"


class Loan(BaseHashedModel):
    """A loan, used as a node in the graph."""

    id: int
    name: str
    loanAmount: float
    fundedAmount: float

    def __str__(self) -> str:
        # The loan is labelled by its numeric id.
        return f"{self.id}"

class LendAction(BaseHashedModel):
    """One lending action: a lender plus share amount, teams and purchase date.

    NOTE(review): not instantiated by any visible cell; the graph-building
    loop stores `shareAmount` and `latestSharePurchaseDate` as edge attributes
    instead.
    """
    lender: Lender  # nested lender model
    shareAmount: float
    teams: List[str]
    latestSharePurchaseDate: datetime

In [None]:
# Toy graph: one loan, two lenders and one tag.
G = nx.Graph()
loan_nodes, lender_nodes, tag_nodes = [], [], []

# First lender -> loan edge, carrying the amount and the purchase date.
lender = Lender(id=2, name="fakelender", publicId="firstpublicid")
loan = Loan(id=1, name='fakeloan', loanAmount=1000, fundedAmount=25)
G.add_edge(lender, loan, amount=25, latestSharePurchaseDate=datetime.now())
loan_nodes.append(loan)
lender_nodes.append(lender)

# A second lender funds the same loan.
lender = Lender(id=3, name="fakelender", publicId="secondpublicid")
G.add_edge(lender, loan, amount=25, latestSharePurchaseDate=datetime.now())
lender_nodes.append(lender)

# Tag node attached to the loan; this edge has no attributes.
tag = TagType("this is a tag")
G.add_edge(tag, loan)
tag_nodes.append(tag)

# Seeded spring layout: positions for all nodes.
pos = nx.spring_layout(G, seed=3113794652)
options = dict(edgecolors="tab:gray", node_size=800, alpha=0.9)

# One colour per node kind: tags green, loans red, lenders blue.
for group, colour in (
    (tag_nodes, "tab:green"),
    (loan_nodes, "tab:red"),
    (lender_nodes, "tab:blue"),
):
    nx.draw_networkx_nodes(G, pos, nodelist=group, node_color=colour, **options)
nx.draw_networkx_edges(G, pos, width=1.0, alpha=0.5)

# Each node is labelled with itself.
node_labels = {vertex: vertex for vertex in G.nodes}
nx.draw_networkx_labels(G, pos, labels=node_labels, font_weight='bold')

# Draw edge labels
def edge_label(u, v, data):
    """Return the text drawn on an edge.

    Args:
        u, v: the edge's end points (not used for the text).
        data: the edge's attribute dict.

    Returns:
        ``"Amount: ...\\nDate: ..."`` when both ``amount`` and
        ``latestSharePurchaseDate`` are present, otherwise an empty string.
    """
    if "amount" not in data or "latestSharePurchaseDate" not in data:
        return ""
    return f"Amount: {data['amount']}\nDate: {data['latestSharePurchaseDate']}"
# Build one label per edge, draw the labels and show the figure.
edge_labels = {}
for u, v, data in G.edges(data=True):
    edge_labels[(u, v)] = edge_label(u, v, data)

nx.draw_networkx_edge_labels(G, pos, edge_labels=edge_labels)
plt.show()

## Now construct a graph from crawled data

Loop through each loan and add the following:
- the loan itself as a node;
- the loan's contributors (lenders) as nodes — `networkx` handles duplicate nodes by default;
- the loan's tags as nodes as well, again checking for duplicates.

In [None]:
import networkx as nx
# Start from an empty graph and empty per-kind node lists for the crawled data.
G = nx.Graph()
loan_nodes, lender_nodes, tag_nodes = [], [], []

In [None]:
# Peek at the tags of the first few loans. This reads from `df` because `row`
# is only bound by the graph-building loop that runs after this cell, so
# `row.tags` fails with a NameError on a fresh kernel.
df["tags"].head()

In [None]:
# Build the graph: one node per loan, linked to its tags and to its lenders.
# `total=` gives tqdm the row count so it can show a real progress bar.
for row in tqdm(df.itertuples(), total=len(df)):
    loan = Loan(
        id=row.id,
        name=row.name,
        loanAmount=float(row.loanAmount),
        fundedAmount=float(row.loanFundraisingInfo_fundedAmount),
    )
    # Record a node in its per-kind list only the first time it is seen, so
    # the lists stay free of duplicates when tags / lenders recur across
    # loans or when this cell is re-run on the same graph.
    if loan not in G:
        loan_nodes.append(loan)
    G.add_node(loan)

    # Tags: one node per distinct tag, connected to the loan.
    for raw_tag in row.tags:
        tag = TagType(raw_tag)
        if tag not in G:
            tag_nodes.append(tag)
        G.add_edge(tag, loan)

    # Lenders: one node per distinct lender; the edge carries the share details.
    for action in row.lendingActions_values:
        try:
            lender = Lender(**action['lender'])
        except ValidationError as e:
            # Show the offending record, then re-raise with the original traceback.
            print(row)
            print(action)
            print(e)
            raise
        if lender not in G:
            lender_nodes.append(lender)
        G.add_edge(
            lender,
            loan,
            amount=action['shareAmount'],
            latestSharePurchaseDate=action['latestSharePurchaseDate'],
        )

In [None]:
import plotly.graph_objects as go

# Node coordinates for the Plotly traces. No visible cell stores a 'pos'
# attribute on the nodes of `G`, so `G.nodes[n]['pos']` would raise KeyError;
# compute a seeded layout and read the coordinates from it instead.
plotly_pos = nx.random_layout(G, seed=3113794652)

# Edge trace input: the two end points of every edge, followed by None.
edge_x = []
edge_y = []
for src, dst in G.edges():
    x0, y0 = (float(c) for c in plotly_pos[src])
    x1, y1 = (float(c) for c in plotly_pos[dst])
    # The None after each segment separates it from the next edge.
    edge_x.extend([x0, x1, None])
    edge_y.extend([y0, y1, None])

edge_trace = go.Scatter(
    x=edge_x, y=edge_y,
    line=dict(width=0.5, color='#888'),
    hoverinfo='none',
    mode='lines')

# Node trace input: one (x, y) point per node, in `G.nodes()` order.
node_x = []
node_y = []
for node in G.nodes():
    x, y = (float(c) for c in plotly_pos[node])
    node_x.append(x)
    node_y.append(y)

node_trace = go.Scatter(
    x=node_x, y=node_y,
    mode='markers',
    hoverinfo='text',
    marker=dict(
        showscale=True,
        # colorscale options
        #'Greys' | 'YlGnBu' | 'Greens' | 'YlOrRd' | 'Bluered' | 'RdBu' |
        #'Reds' | 'Blues' | 'Picnic' | 'Rainbow' | 'Portland' | 'Jet' |
        #'Hot' | 'Blackbody' | 'Earth' | 'Electric' | 'Viridis' |
        colorscale='YlGnBu',
        reversescale=True,
        # Left empty here; presumably filled in by a later step — TODO confirm.
        color=[],
        size=10,
        colorbar=dict(
            thickness=15,
            # Nested title dict replaces the `titleside` shorthand, which
            # recent Plotly releases no longer accept.
            title=dict(text='Node Connections', side='right'),
            xanchor='left',
        ),
        line_width=2))

In [None]:
# Draw the graph built from the crawled data.
# Seeded random layout: positions for all nodes.
# (alternative: pos = nx.spring_layout(G, seed=3113794652))
pos = nx.random_layout(G, seed=3113794652)
options = dict(edgecolors="tab:gray", node_size=800, alpha=0.9)

# One colour per node kind: tags green, loans red, lenders blue.
for group, colour in (
    (tag_nodes, "tab:green"),
    (loan_nodes, "tab:red"),
    (lender_nodes, "tab:blue"),
):
    nx.draw_networkx_nodes(G, pos, nodelist=group, node_color=colour, **options)
nx.draw_networkx_edges(G, pos, width=1.0, alpha=0.5)

# Each node is labelled with itself.
node_labels = {vertex: vertex for vertex in G.nodes}
nx.draw_networkx_labels(G, pos, labels=node_labels, font_weight='bold')

# Draw edge labels
def edge_label(u, v, data):
    """Return the text drawn on an edge.

    Args:
        u, v: the edge's end points (not used for the text).
        data: the edge's attribute dict.

    Returns:
        ``"Amount: ...\\nDate: ..."`` when both ``amount`` and
        ``latestSharePurchaseDate`` are present, otherwise an empty string.
    """
    if "amount" not in data or "latestSharePurchaseDate" not in data:
        return ""
    return f"Amount: {data['amount']}\nDate: {data['latestSharePurchaseDate']}"
# Build one label per edge, draw the labels and show the figure.
edge_labels = {}
for u, v, data in G.edges(data=True):
    edge_labels[(u, v)] = edge_label(u, v, data)

nx.draw_networkx_edge_labels(G, pos, edge_labels=edge_labels)
plt.show()