In [None]:
# GPU selection string; it is written to CUDA_VISIBLE_DEVICES in the import cell below.
DEVICES = "2"

In [None]:
import os

# Set before cudf is imported — presumably so that CUDA initialisation only
# sees the device(s) listed in DEVICES (NOTE(review): confirm the ordering requirement).
os.environ["CUDA_VISIBLE_DEVICES"] = DEVICES

import numpy as np
import pandas as pd
import cudf
import plotly.express as px
from tqdm import tqdm
import plotly.graph_objects as go
from plotly.subplots import make_subplots

# Registers `progress_apply` on pandas objects; it is used for the row-wise
# month expansion in the "Number of Tags vs Time" section.
tqdm.pandas()

# Import data

In [None]:
# Read the preprocessed parquet snapshot into a cudf DataFrame.
ds = cudf.read_parquet("../data/gen/preprocessed_2023-08-28T11-09-39.parquet")

In [None]:
# Exclude rows whose `tags` value is "", "empty" or "reserved_crisis_support_loan".
ds = ds[(ds["tags"] != "") & (ds["tags"] != "empty") & (ds["tags"] != "reserved_crisis_support_loan")]

## We keep only the successfully funded loans

In [None]:
# A row counts as successful when the funded amount equals the requested loan amount.
success = ds.loanAmount == ds.loanFundraisingInfo_fundedAmount
counts = success.value_counts()
# Displays (share of successful rows, total number of rows).
# NOTE(review): `counts[False]` presumably raises if every row is successful — confirm.
counts[True] / (counts[True] + counts[False]), len(ds)

In [None]:
# Keep only the rows flagged by the `success` mask computed in the previous cell.
ds = ds[success]

In [None]:
# Remove rows without a `raisedDate`; that column is later used as the end of
# each project's monthly period range.
ds.dropna(subset=["raisedDate"], inplace=True)

In [None]:
# ds = ds.sample(1000)

In [None]:
# Project/tag working table: the subset of columns used by every aggregation
# below, copied so later changes do not touch `ds`, with exact duplicate rows removed.
PT = ds[
    ["project_id", "fundraisingDate", "raisedDate", "loanAmount", "sector_name", "geocode_country_name", "tags"]
].copy()
PT.drop_duplicates(inplace=True)

In [None]:
# Sanity checks on `loanAmount`: no missing values and no negative values.
assert 0 == ds.loanAmount.isna().sum()
assert 0 == (ds.loanAmount < 0).sum()

# Distribution of Projects across Tags 

In [None]:
# Per tag: number of distinct projects plus mean and standard deviation of the loan amount.
project_per_tag = PT.groupby("tags").agg({"project_id": "nunique", "loanAmount": ["mean", "std"]})
# Flatten the (column, aggregation) pairs into names such as "loanAmount_mean".
project_per_tag.columns = project_per_tag.columns.map(lambda x: f"{x[0]}_{x[1]}" if x[0] else x[1])
# Turn the `tags` group key back into a regular column.
project_per_tag.reset_index(inplace=True)
project_per_tag.rename(columns={"project_id_nunique": "project_count"}, inplace=True)
# Most frequent tags first; the plots and the LaTeX table rely on this order.
project_per_tag = project_per_tag.sort_values("project_count", ascending=False)

In [None]:
# Coefficient of variation of the loan amount per tag, as a percentage (std / mean * 100).
project_per_tag["loanAmount_CV"] = project_per_tag["loanAmount_std"] / project_per_tag["loanAmount_mean"] * 100

In [None]:
# Convert the cudf aggregate to a pandas DataFrame; the plotting and LaTeX cells below consume it.
project_per_tag = project_per_tag.to_pandas()

In [None]:
# Dual-axis chart per tag: bars carry the mean loan amount (secondary axis),
# the line carries the number of distinct projects (primary axis).
tag_axis = project_per_tag.tags
loan_bars = go.Bar(x=tag_axis, y=project_per_tag.loanAmount_mean, name="Average Loan Amount")
count_line = go.Scatter(x=tag_axis, y=project_per_tag.project_count, name="Project Count", mode="lines")

fig = make_subplots(specs=[[{"secondary_y": True}]])
# Trace order is kept (bars first, then line) so legend and draw order are unchanged.
fig.add_trace(loan_bars, secondary_y=True)
fig.add_trace(count_line, secondary_y=False)

# Axis titles.
fig.update_xaxes(title_text="Tags")
fig.update_yaxes(title_text="Project Count", secondary_y=False)
fig.update_yaxes(title_text="Average Loan Amount", secondary_y=True)

# Size, margins, legend position and font in a single layout update.
fig.update_layout(
    height=500,
    width=1000,
    margin=dict(l=5, r=5, t=5, b=5),
    legend=dict(x=0.1, y=0.9),
    font_family="Computer Modern",
    font_size=14,
)

fig.show()

In [None]:
# Export the tag figure built in the previous cell as a PDF.
# NOTE(review): assumes the `images/` directory already exists — confirm.
fig.write_image("images/project-vs-tag.pdf", format="pdf")

In [None]:
# convert to latex
# merged = project_per_tag.round(2).rename(
#     columns={
#         "tags": "Tag",
#         "project_count": "Project Count",
#         "loanAmount_mean": "Average Loan Amount",
#         "loanAmount_std": "Loan Amount Std",
#         "loanAmount_CV": "Loan Amount Coefficient of Variation (percentage)",
#     }
# ).reset_index(drop=True)

# Publication table for the tags: rounded values, human-readable headers and a
# 1-based running number as the first column.
tag_table_headers = {"tags": "Tag", "project_count": "Project Count", "loanAmount_mean": "Average Loan Amount"}
merged = (
    project_per_tag.loc[:, list(tag_table_headers)]
    .round(2)
    .rename(columns=tag_table_headers)
    .reset_index(drop=True)
)
merged.insert(0, "No.", range(1, len(merged) + 1))

In [None]:
# Emit the table as LaTeX source (without the index), showing missing values as "-".
print(merged.to_latex(index=False).replace("<NA>", "-").replace("NaN", "-"))

# Distribution of Projects across Sectors

In [None]:
# Per sector: number of distinct projects and mean loan amount.
psec = PT.groupby("sector_name").agg({"project_id": "nunique", "loanAmount": "mean"}).reset_index()
psec.rename(columns={"project_id": "project_count", "loanAmount": "avg_loanAmount"}, inplace=True)
# Largest sectors first, then convert to pandas for the plotting cell below.
psec = psec.sort_values("project_count", ascending=False)
psec = psec.to_pandas()

In [None]:
from plotly.subplots import make_subplots
import plotly.graph_objects as go

# Dual-axis chart per sector: bars carry the mean loan amount (secondary axis),
# the line carries the number of distinct projects (primary axis).
sector_axis = psec.sector_name
loan_bars = go.Bar(x=sector_axis, y=psec.avg_loanAmount, name="Average Loan Amount")
count_line = go.Scatter(x=sector_axis, y=psec.project_count, name="Project Count", mode="lines")

fig = make_subplots(specs=[[{"secondary_y": True}]])
# Trace order is kept (bars first, then line) so legend and draw order are unchanged.
fig.add_trace(loan_bars, secondary_y=True)
fig.add_trace(count_line, secondary_y=False)

# Axis titles.
fig.update_xaxes(title_text="Sectors")
fig.update_yaxes(title_text="Project Count", secondary_y=False)
fig.update_yaxes(title_text="Average Loan Amount", secondary_y=True)

# Size, margins, legend position and font in a single layout update.
fig.update_layout(
    height=400,
    width=1000,
    margin=dict(l=5, r=5, t=5, b=5),
    legend=dict(x=0.1, y=0.9),
    font_family="Computer Modern",
    font_size=14,
)

# Export first, then render inline.
fig.write_image("images/project-vs-sector.pdf", format="pdf")
fig.show()

# Distribution of Projects across Countries

In [None]:
# Per country: number of distinct projects and mean loan amount.
pcountry = PT.groupby("geocode_country_name").agg({"project_id": "nunique", "loanAmount": "mean"}).reset_index()
pcountry.rename(
    columns={"geocode_country_name": "country", "project_id": "project_count", "loanAmount": "avg_loanAmount"},
    inplace=True,
)
# Countries with the most projects first, then convert to pandas for plotting and export.
pcountry = pcountry.sort_values("project_count", ascending=False)
pcountry = pcountry.to_pandas()

In [None]:
# Dual-axis chart per country: bars carry the mean loan amount (secondary axis),
# the line carries the number of distinct projects (primary axis).
country_axis = pcountry.country
loan_bars = go.Bar(x=country_axis, y=pcountry.avg_loanAmount, name="Average Loan Amount")
count_line = go.Scatter(x=country_axis, y=pcountry.project_count, name="Project Count", mode="lines")

fig = make_subplots(specs=[[{"secondary_y": True}]])
# Trace order is kept (bars first, then line) so legend and draw order are unchanged.
fig.add_trace(loan_bars, secondary_y=True)
fig.add_trace(count_line, secondary_y=False)

# Axis titles.
fig.update_xaxes(title_text="Countries")
fig.update_yaxes(title_text="Project Count", secondary_y=False)
fig.update_yaxes(title_text="Average Loan Amount", secondary_y=True)

# Size, margins, legend position and font in a single layout update.
fig.update_layout(
    height=400,
    width=1000,
    margin=dict(l=5, r=5, t=5, b=5),
    legend=dict(x=0.1, y=0.9),
    font_family="Computer Modern",
    font_size=14,
)

# Export first, then render inline.
fig.write_image("images/project-vs-country.pdf", format="pdf")
fig.show()

In [None]:
# Numbered, rounded copy of the per-country table for the LaTeX export.
# The "country.iso3" helper column is only added by the choropleth cell further
# down, so on a fresh top-to-bottom run it does not exist yet. errors="ignore"
# makes the drop a no-op in that case instead of raising KeyError, while still
# removing the column when the notebook is re-run after the map cell.
pcountry_rounded = pcountry.drop(columns=["country.iso3"], errors="ignore").reset_index(drop=True).round(2)
# Materialise the positional index as a column and turn it into a 1-based "No.".
pcountry_rounded.reset_index(inplace=True)
pcountry_rounded.rename(columns={"index": "No."}, inplace=True)
pcountry_rounded["No."] = pcountry_rounded["No."] + 1
pcountry_rounded.head()

In [None]:
# Lay the country table out as two side-by-side halves so it fits on one page.
half = len(pcountry_rounded) // 2
pcountry_rounded_1 = pcountry_rounded.iloc[:half].reset_index(drop=True)
pcountry_rounded_2 = pcountry_rounded.iloc[half:].reset_index(drop=True)
assert len(pcountry_rounded_1) + len(pcountry_rounded_2) == len(pcountry_rounded)

# Column-wise concatenation aligned on the fresh positional index; convert_dtypes
# switches to nullable dtypes so a shorter half is padded with <NA>.
merged = pd.concat([pcountry_rounded_1, pcountry_rounded_2], axis=1).convert_dtypes()
merged.tail()

In [None]:
# Emit the two-column country table as LaTeX source, blanking out <NA> padding cells.
print(merged.to_latex(index=False).replace("<NA>", ""))

In [None]:
import country_converter as coco

# Rename "Congo (Rep.)" to "Congo" before the ISO3 lookup, then attach the
# ISO3 code that the choropleth uses as its location key.
pcountry["country"] = pcountry["country"].replace({"Congo (Rep.)": "Congo"})
pcountry["country.iso3"] = coco.convert(pcountry.country, to="ISO3")

# World map coloured by the number of projects per country.
fig = px.choropleth(
    pcountry,
    locations="country.iso3",
    color="project_count",
    hover_name="country",
    projection="natural earth",
    title="Number of Projects by Country",
)

# Size, margins (extra top margin for the title) and font in one layout update.
fig.update_layout(
    height=600,
    width=1000,
    margin=dict(l=5, r=5, t=50, b=5),
    font_family="Computer Modern",
    font_size=18,
)

# Export first, then render inline.
fig.write_image("images/project-vs-country-map.pdf", format="pdf")
fig.show()

# Number of Tags vs Time

In [None]:
def _months_between(row):
    """Return the monthly periods from a row's `fundraisingDate` to its `raisedDate`."""
    return list(pd.period_range(row["fundraisingDate"], row["raisedDate"], freq="M"))


# One row per (project/tag row, month) between the fundraising and raised dates.
time_df = PT.to_pandas()
time_df["date"] = time_df.progress_apply(_months_between, axis=1)
time_df = time_df.drop(columns=["fundraisingDate", "raisedDate"]).explode("date")
# Convert the monthly periods to timestamps.
time_df["date"] = time_df["date"].dt.to_timestamp()

In [None]:
# Number of tag rows per month, ignoring the "empty" and "" tag values.
has_tag = time_df["tags"].ne("empty") & time_df["tags"].ne("")
tag_counts = time_df.loc[has_tag].groupby("date")[["tags"]].count()
tag_counts.head(1)

In [None]:
# Number of distinct projects per month.
project_count = time_df.groupby("date")[["project_id"]].nunique()
project_count.head(1)

In [None]:
# Align the two monthly series on their shared `date` index (inner join) and
# expose `date` as a regular column for plotting.
project_tag_count = tag_counts.join(project_count, how="inner").reset_index()
project_tag_count.head(1)

In [None]:
# Give the aggregated columns descriptive names; the line chart below plots
# `project_count` and `tag_count`.
project_tag_count.rename(columns={"tags": "tag_count", "project_id": "project_count"}, inplace=True)

In [None]:
# Grouped bars of the monthly project and tag counts.
# The previous cell renames the columns to `project_count` / `tag_count`, so the
# old names ("project_id", "tags") no longer exist in `project_tag_count` and
# made this cell fail on a top-to-bottom run.
fig = px.histogram(
    project_tag_count,
    x="date",
    y=["project_count", "tag_count"],
    histfunc="avg",
    barmode="group",
)

In [None]:
# Monthly number of projects and tags over time.
fig = px.line(project_tag_count, x="date", y=["project_count", "tag_count"], title="")
fig.update_yaxes(title_text="Count")
# One tick per year on the date axis. This is applied before the export so the
# saved PDF matches the figure shown inline (previously it was set only after
# `write_image`, so the PDF was exported without it).
fig.update_xaxes(tickmode="linear", dtick="M12")
fig.update_layout(height=400, width=1000, margin=dict(l=5, r=5, t=5, b=5))
fig.update_layout(legend=dict(x=0.1, y=0.9))
fig.update_layout(font_family="Computer Modern", font_size=14)
fig.write_image("images/tag-project-vs-time.pdf", format="pdf")
fig.show()

In [None]:
# Preview the exploded per-month table.
time_df.head()

In [None]:
# Histogram over the per-month table, coloured by tag.
fig = px.histogram(
    time_df,
    x="date",
    y=["project_id", "tags"],
    color="tags",
    title="",
)
fig.show()