In [None]:
import cudf
import cugraph as cnx
import cuxfilter
import pandas as pd
from cugraph.structure import NumberMap
from matplotlib import pyplot as plt
from tqdm import tqdm

# Register tqdm's pandas integration (per the tqdm docs this provides `progress_apply`).
# NOTE(review): no `progress_*` call is visible in this notebook — confirm this is still needed.
tqdm.pandas()

In [None]:
# Load the checkpointed frame and store the repeated string columns as
# categoricals in a single cast.
vn = pd.read_parquet("checkpoints/vn_since_20200101.parquet").astype(
    {
        "sector_name": "category",
        "geocode_country_name": "category",
        "activity_name": "category",
        "tags": "category",
    }
)
vn.info()

In [None]:
# The frame must not contain fully duplicated rows.
assert not vn.duplicated().any()

In [None]:
# Number of distinct projects.
vn["project_id"].nunique()

In [None]:
# Number of distinct lenders.
vn["lender_id"].nunique()

In [None]:
# Sanity check: once `tags` is dropped, every remaining distinct row must map to
# exactly one loan, i.e. rows repeat per loan only because of multiple tags.
# The comparison used to be a bare expression whose result was discarded (only
# the last expression of a cell is displayed), so it is now an explicit assert.
n_loans = vn["loan_id"].nunique()
n_rows_without_tags = len(vn.drop(columns=["tags"]).drop_duplicates())
assert n_loans == n_rows_without_tags, (
    f"{n_loans} distinct loans vs {n_rows_without_tags} distinct rows without tags"
)
n_loans

# Create the graph

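When we consider only the `#Vegan` tag, each lender is summarized by one row, where `F = Vegan_contrib / all_contrib` is the share of the lender's contributions that are `#Vegan`: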


| lender_id | Vegan_contrib | all_contrib | F |
|-----------|--------|---|--|
| 123       |        |   |  |

In [None]:
# Per-lender summary: total contributions, #Vegan contributions and the #Vegan share F.
#
# `vn` appears to hold one row per (loan, tag) pair (see the loan_id sanity check
# earlier in the notebook), so loans are counted with nunique(). A plain row
# count() would count a loan once per tag, inflating `all_contrib` for lenders
# whose loans carry many tags and biasing F downwards.
loans_per_lender = vn.groupby("lender_id")["loan_id"].nunique().rename("all_contrib")
vegan_loans_per_lender = (
    vn.loc[vn["tags"] == "#Vegan"].groupby("lender_id")["loan_id"].nunique().rename("Vegan_contrib")
)

ds = (
    loans_per_lender.to_frame()
    # inner join: only keep lenders that have at least one #Vegan contribution
    .join(vegan_loans_per_lender, how="inner")
    .assign(F=lambda stats: stats["Vegan_contrib"] / stats["all_contrib"])
    .reset_index()
)
print(len(ds))
ds.head()

## Now actually create the graph

In [None]:
# Debug aid: uncomment to restrict the run to the first 100 lenders; the cross
# join in the next step produces len(ds) ** 2 rows before filtering.
# ds = ds.head(100)

In [None]:
# Pair every lender with every other lender, keeping each unordered pair once
# (x > y also removes the self-pairs).
merged = ds.merge(ds, how="cross").loc[
    lambda pairs: pairs["lender_id_x"] > pairs["lender_id_y"]
]
merged.head()

In [None]:
# Edge weight = smaller of the two #Vegan contribution counts, scaled by how
# similar the two #Vegan shares are: 1 - |F_x - F_y| / (F_x + F_y).
merged["weight"] = (
    merged[["Vegan_contrib_x", "Vegan_contrib_y"]]
    .min(axis=1)
    .mul(1 - merged["F_x"].sub(merged["F_y"]).abs().div(merged["F_x"].add(merged["F_y"])))
)

In [None]:
# Reduce the pair table to an edge list: keep only the two lender ids and the
# weight, and name the endpoints source/target.
merged = merged[["lender_id_x", "lender_id_y", "weight"]].rename(
    columns={"lender_id_x": "source", "lender_id_y": "target"}
)
merged

In [None]:
# Downcast the edge list to 32-bit types in one pass.
merged = merged.astype({"source": "int32", "target": "int32", "weight": "float32"})
merged.info()

In [None]:
# Hand the pandas edge list over to cudf, then drop the pandas name so the
# pairwise frame is no longer referenced from the notebook namespace.
# NOTE(review): this goes through `cudf.from_dataframe` rather than `cudf.from_pandas`;
# presumably intentional — confirm.
merged_ds = cudf.from_dataframe(merged, allow_copy=True)
del merged

In [None]:
# Renumber the lender ids into cugraph vertex ids. `number_map` is kept so that
# results can be translated back to lender ids with `number_map.unrenumber`.
# `NumberMap` is imported in the notebook's import cell.
mapped_ds, number_map = NumberMap.renumber(merged_ds, ["source"], ["target"])
# Put the plain source/target names back on the renumbered columns; rebinding
# instead of `inplace=True` keeps the cell free of in-place mutation.
mapped_ds = mapped_ds.rename(columns={"renumbered_src": "source", "renumbered_dst": "target"})

In [None]:
# Build the lender graph from the renumbered, weighted edge list.
# renumber=False: the vertex ids were already renumbered with NumberMap in the previous step.
G = cnx.Graph()
G.from_cudf_edgelist(mapped_ds, source="source", destination="target", weight="weight", renumber=False)
# Display node count, edge count and the graph's own renumbering flag.
len(G.nodes()), len(G.edges()), G.is_renumbered()

# Visualize

## `cuxfilter`

In [None]:
# Compute a ForceAtlas2 layout for G; the result is passed to cuxfilter's
# graph loader together with the edge list further down.
# NOTE(review): the layout parameters are hand-picked with no rationale recorded — confirm before tuning.
pos_gdf = cnx.force_atlas2(
    G,
    edge_weight_influence=1,
    scaling_ratio=20.0,
    strong_gravity_mode=False,
    gravity=1.0,
    verbose=False,
)
pos_gdf

In [None]:
# Extract the edge data as a cudf DataFrame; it is paired with the layout
# positions when the graph is loaded into cuxfilter.
edges_ds = G.view_edge_list()

In [None]:
# Load the layout positions and the edge list into cuxfilter and render them
# as an interactive datashader graph dashboard.
# `cuxfilter` is imported in the notebook's import cell.
cux_df = cuxfilter.DataFrame.load_graph((pos_gdf, edges_ds))

chart0 = cuxfilter.charts.datashader.graph(node_pixel_shade_type="linear", unselected_alpha=0.2)

d = cux_df.dashboard([chart0], layout=cuxfilter.layouts.double_feature)

d.app()

# Analysis

In [None]:
# PageRank settings used by the analysis below.
max_iter = 100  # upper bound on the number of iterations
tol = 1e-5  # convergence tolerance
alpha = 0.85  # value passed as PageRank's `alpha`

In [None]:
# PageRank per vertex, translated back to the original lender ids and indexed
# by `vertex` so it can be aligned with the community tables.
pagerank = (
    number_map.unrenumber(
        cnx.pagerank(G, alpha=alpha, max_iter=max_iter, tol=tol),
        column_name="vertex",
    )
    .rename(columns={"pagerank": "share_vegan_pagerank"})
    .set_index("vertex")
)
pagerank.head()

In [None]:
# Louvain communities: report the modularity and the number of communities,
# then map vertices back to lender ids and index by `vertex`.
louvain, mod_louvain = cnx.louvain(G)
print(mod_louvain)
print(louvain["partition"].nunique())
louvain = (
    number_map.unrenumber(louvain, column_name="vertex")
    .rename(columns={"partition": "louvain_id"})
    .set_index("vertex")
)
louvain.head()

In [None]:
# Leiden communities: report the modularity and the number of communities,
# then map vertices back to lender ids and index by `vertex`.
leiden, mod_leiden = cnx.leiden(G)
print(mod_leiden)
print(leiden["partition"].nunique())
leiden = (
    number_map.unrenumber(leiden, column_name="vertex")
    .rename(columns={"partition": "leiden_id"})
    .set_index("vertex")
)
leiden.head()

In [None]:
# Preview: PageRank and both community assignments side by side, aligned on the shared `vertex` index.
cudf.concat([pagerank, leiden, louvain], axis=1)

In [None]:
# Combine the three per-lender results into one table keyed by `lender_id`
# and write it out as CSV.
ret = (
    cudf.concat([pagerank, leiden, louvain], axis=1)
    .reset_index()
    .rename(columns={"vertex": "lender_id"})
)
ret.to_csv("../data/gen/lender_community.csv", index=False)
ret

In [None]:
%%script false --no-raise-error
# The `%%script false` magic above disables this cell; the commands are kept as a manual recipe.
# They copy the exported CSV into ../data/neo4jtry and recursively hand that directory to uid:gid 101:101
# (presumably the Neo4j container user — confirm).
!sudo cp ../data/gen/lender_community.csv ../data/neo4jtry/lender_community.csv
!sudo chown -R 101:101 ../data/neo4jtry   

```cypher
// Attach the community ids and the PageRank score from the exported CSV to the
// matching Lender nodes.
// LOAD CSV takes the form `LOAD CSV WITH HEADERS FROM <url> AS <row>` and needs
// a URL scheme for local files.
// NOTE(review): how the file URL maps to a directory depends on the Neo4j import settings — confirm the path.
LOAD CSV WITH HEADERS FROM "file:///csv_data/lender_community.csv" AS ROW
MATCH (l:Lender {id: toInteger(ROW['lender_id'])})
SET l.leiden_id = toInteger(ROW['leiden_id'])
SET l.louvain_id = toInteger(ROW['louvain_id'])
SET l.share_vegan_pagerank = toFloat(ROW['share_vegan_pagerank']);
```