# Kiva - overview tags

In [None]:
import os

# Set before cudf is imported in the next cell.
# NOTE(review): presumably limits CUDA to GPUs 0 and 1 — confirm for the target machine.
os.environ["CUDA_VISIBLE_DEVICES"] = "0,1"

In [None]:
import numpy as np
import pandas as pd
import cudf
import plotly.express as px
from tqdm import tqdm

# Register tqdm with pandas; this provides the `progress_apply` method used later on.
tqdm.pandas()

# Import raw data
The raw data was originally in `.jsonl` format and has been stored as a `.parquet` file for easy access.
Here we read that `.parquet` file into a cuDF data frame.

In [None]:
# Earlier-dated file, kept for reference:
# ds = cudf.read_parquet("../fulldata/kiva_2023-08-10T17-57-12.parquet")
# Load the parquet file into a cuDF data frame.
ds = cudf.read_parquet("../fulldata/kiva_2023-08-20T16-16-43.parquet")

In [None]:
# Count the rows in which every column is missing.
ds.isna().all(axis=1).sum()

In [None]:
# Remove the rows in which every column is missing.
ds = ds.dropna(axis=0, how="all")

Store the names of the interesting columns for easy access.

In [None]:
class COL:
    """Names of the data-frame columns used throughout this notebook."""

    # Loan amounts
    FUNDED_AMOUNT = "loanFundraisingInfo.fundedAmount"
    LOAN_AMOUNT = "loanAmount"

    # Timestamps
    POSTED_DATE = "fundraisingDate"
    RAISED_DATE = "raisedDate"

    # Location
    COUNTRY = "geocode.country.isoCode"
    COUNTRY_NAME = "geocode.country.name"
    REGION = "geocode.country.region"
    STATE = "geocode.state"
    LAT = "geocode.latitude"
    LONG = "geocode.longitude"

    # Tags attached to a loan
    TAGS = "tags"

    # Column computed in the "Collection Speed" section of this notebook
    SPEED = "collection_speed"

In [None]:
# List the columns available in the loaded data.
ds.columns

In [None]:
# Count duplicated index labels.
ds.index.duplicated().sum()

In [None]:
# Columns kept for the analysis; everything else is dropped.
kept_columns = [
    COL.LOAN_AMOUNT,
    COL.FUNDED_AMOUNT,
    COL.RAISED_DATE,
    COL.POSTED_DATE,
    # "disbursalDate",
    COL.COUNTRY_NAME,
    COL.COUNTRY,
    COL.STATE,
    COL.REGION,
    COL.LAT,
    COL.LONG,
    COL.TAGS,
]
ds = ds[kept_columns]

ds.tail()

In [None]:
# Cast every kept column (except the tags) to an explicit dtype, grouped by target type.
DATE_FORMAT = "%Y-%m-%dT%H:%M:%SZ"

for float_col in (COL.LOAN_AMOUNT, COL.FUNDED_AMOUNT, COL.LAT, COL.LONG):
    ds[float_col] = ds[float_col].astype("float32")

for date_col in (COL.RAISED_DATE, COL.POSTED_DATE):
    ds[date_col] = cudf.to_datetime(ds[date_col], format=DATE_FORMAT)

for category_col in (COL.COUNTRY, COL.COUNTRY_NAME, COL.REGION, COL.STATE):
    ds[category_col] = ds[category_col].astype("category")

In [None]:
# Check the dtypes after the casts.
ds.info()

In [None]:
# Count duplicated index labels.
ds.index.duplicated().sum()

# Preprocessing

## Keep only the successful loans

In [None]:
# A loan is considered successful when the funded amount equals the requested amount.
success = ds[COL.LOAN_AMOUNT] == ds[COL.FUNDED_AMOUNT]
counts = success.value_counts()
# Share of successful loans among the rows where the comparison is defined.
# `sum() / count()` is the same ratio as counts[True] / (counts[True] + counts[False])
# but does not raise a KeyError when one of the two outcomes never occurs.
success_rate = success.sum() / success.count()
success_rate, len(ds)

In [None]:
# Keep only the successful loans, using the boolean mask `success`.
ds = ds[success]

## Drop some NaN

In [None]:
# Number of missing values per column.
ds.isna().sum()

In [None]:
# Rows missing any of these columns cannot be used to compute a collection speed.
required_columns = [COL.LOAN_AMOUNT, COL.FUNDED_AMOUNT, COL.POSTED_DATE, COL.RAISED_DATE]
ds = ds.dropna(subset=required_columns)

## Collection Speed

In [None]:
# Time between posting and full funding.
ds["funding_duration"] = ds[COL.RAISED_DATE] - ds[COL.POSTED_DATE]
# NOTE(review): the int64 view of the duration is assumed to be in nanoseconds — confirm.
duration_seconds = ds["funding_duration"].astype("int64") / pow(10, 9)
seconds_per_day = 24 * 60 * 60
ds["funding_duration_days"] = duration_seconds / seconds_per_day
# Collection speed = funded amount per day of fundraising.
ds[COL.SPEED] = ds[COL.FUNDED_AMOUNT] / ds["funding_duration_days"]
ds.head()

Some projects were fulfilled before being published.  
Let's show them and then get rid of them.

In [None]:
# Some projects were already fulfilled before being published: their raised date
# precedes their posted date, so their collection speed is negative.
ds[ds[COL.SPEED] < 0]

In [None]:
# Keep only the rows with a non-negative collection speed.
ds = ds[ds[COL.SPEED] >= 0]

## Encode `tags` using MultiLabelBinarizer

In [None]:
# Copy the tags column to a pandas frame for the scikit-learn encoder used in the next cell.
tagdf = ds[[COL.TAGS]].to_pandas()
tagdf.head()

In [None]:
from sklearn.preprocessing import MultiLabelBinarizer

# Encoder that turns each list of tags into a row of 0/1 indicators, one column per distinct tag.
lb = MultiLabelBinarizer()

# NOTE: despite its name, `mlb` holds the encoded matrix, not the binarizer (that is `lb`).
mlb = lb.fit_transform(tagdf["tags"])
mlb.shape

In [None]:
# One indicator column per tag, named "tag_<tag>", aligned on the index of the loans.
tags_columns = [f"tag_{tag}" for tag in lb.classes_]
tag_ds = cudf.DataFrame(mlb, columns=tags_columns, dtype="int8", index=tagdf.index)
# The pandas copy of the tags is no longer needed.
del tagdf
# Number of loans carrying each tag.
tag_ds.sum()

In [None]:
# List the generated tag indicator columns.
tag_ds.columns

In [None]:
# join with the original df
ds = ds.join(tag_ds)
del tag_ds
ds.head()

In [None]:
# sanity check here
tag_columns = [a for a in ds.columns if a.startswith("tag_")]

for testcase in range(50):
    sam = ds.sample(1)
    tags_list = sam["tags"].iloc[0]
    for atag in tag_columns:
        if atag.replace("tag_", "") in tags_list:
            assert sam[atag].iloc[0] == 1
        else:
            assert sam[atag].iloc[0] == 0

In [None]:
# The original `tags` column is now redundant with the indicator columns.
ds = ds.drop(columns=[COL.TAGS])

## Quickly refine `tags`

### merge 'tag_#Eco-friendly' and 'tag_#EcoFriendly'

In [None]:
# Two spellings of the same tag: fold the hyphenated one into the other.
eco_hyphenated, eco_merged = "tag_#Eco-friendly", "tag_#EcoFriendly"
has_eco_tag = (ds[eco_hyphenated] + ds[eco_merged]) > 0
ds[eco_merged] = has_eco_tag.astype("int8")
ds = ds.drop(columns=[eco_hyphenated])

### Keep only the tags visible to users

Take a look at this screenshot. We notice that some tags in the dataframe are not displayed on the website.

![Alt text](images/screenshot_kiva_20230927_filter_tags.png)

These undisplayed tags are `tag_user_favorite`, `tag_user_like`, `tag_volunteer_like`, `tag_volunteer_pick`. It might be because they are used internally on the Kiva platform.

And because those tags are not shown to lenders, we can get rid of them here.

In [None]:
# Tags that carry no meaning for this analysis: the empty tag and the tags not shown on the website.
hidden_tag_columns = [
    "tag_",
    "tag_user_favorite",
    "tag_user_like",
    "tag_volunteer_like",
    "tag_volunteer_pick",
]
ds = ds.drop(columns=hidden_tag_columns)

In [None]:
# Check the columns and dtypes after the tag preprocessing.
ds.info()

In [None]:
# Number of missing values per column, in ascending order.
ds.isna().sum().sort_values()

# Now drawing

## Tags vs time

In [None]:
# For every loan, list the calendar days on which it was fundraising.
# `normalize=True` snaps both ends of the range to midnight, so the day on which the
# loan was raised is always included. Without it the range advances in 24-hour steps
# from the posting *time* and misses the last day whenever the loan was raised at an
# earlier time of day than it was posted.
time_df = ds[[COL.POSTED_DATE, COL.RAISED_DATE]].to_pandas()
time_df["date"] = time_df.progress_apply(
    lambda row: list(pd.date_range(row[COL.POSTED_DATE], row[COL.RAISED_DATE], normalize=True)), axis=1
)
# Move the result back to the GPU and release the pandas copy.
time_ds = cudf.from_pandas(time_df)
del time_df
time_ds.head()

In [None]:
# Attach the per-loan list of days to the loan data (aligned on the index).
# Only the `date` column is merged: `time_ds` also carries the posted/raised dates,
# which already exist in `ds` and would otherwise be duplicated with `_x`/`_y` suffixes.
time_ds = ds.merge(time_ds[["date"]], left_index=True, right_index=True)
time_ds.head()

In [None]:
# Expand the list column: one row per (loan, day) pair.
time_ds = time_ds.explode("date")
# Round every timestamp down to the start of its day.
time_ds["date"] = time_ds["date"].dt.floor("D")
time_ds.head()

In [None]:
# For each tag, count the (loan, day) rows per day among the loans carrying the tag.
tag_columns = [a for a in ds.columns if a.startswith("tag_")]
tag_counts = [
    time_ds[time_ds[tag_col] == 1]["date"].value_counts().rename(tag_col)
    for tag_col in tqdm(tag_columns)
]
# One column per tag, indexed by day.
tag_counts_concat = cudf.concat(tag_counts, axis=1)
tag_counts_concat

In [None]:
# Calendar covering every day from the first posting to the last raised date.
first_day = ds[COL.POSTED_DATE].min()
last_day = ds[COL.RAISED_DATE].max()
date_range = list(pd.date_range(first_day, last_day, normalize=True))

# Align the per-tag daily counts on that calendar.
tag_hist = cudf.DataFrame({"date": date_range}).set_index("date")
tag_hist = tag_hist.merge(tag_counts_concat, left_index=True, right_index=True, how="outer")

# Long format for plotting: one row per (day, tag) pair that has a count.
tag_hist_display = tag_hist.reset_index().melt(id_vars=["date"], var_name="tag")
tag_hist_display = tag_hist_display.dropna().to_pandas()
tag_hist_display

In [None]:
# Overlaid histograms of the daily counts, one per tag; each is normalized to percent.
fig = px.histogram(
    tag_hist_display, x="date", y="value", color="tag", barmode="overlay", opacity=0.3, histnorm="percent", height=800
)
fig.show()

From the above figure, we can see that some tags only occurred within a short timeframe, e.g.:
- `#Married`
- `Post-disbursed` ???
- `Salesforce`
- `beauty`
- `Viral`
- `MUFG`
- `reversed_crisis_support_loan`


## Number of project vs tag

In [None]:
# Names of all the columns whose name starts with "tag_".
tag_columns = [a for a in ds.columns if a.startswith("tag_")]


class My:
    """Figure size shared by the plots of this notebook."""

    # Width passed to plotly; None means no explicit width is set.
    FIG_W = None
    # Height passed to plotly.
    FIG_H = 800

In [None]:
# Number of projects carrying each tag, most frequent first.
project_count_per_tag = (
    ds[tag_columns].sum().sort_values(ascending=False).to_pandas().rename("number of project")
)

# Bar chart of the 20 most frequent tags.
top_tags = project_count_per_tag.head(20)
fig = px.bar(
    top_tags,
    orientation="v",
    text_auto=True,
    title="Number of Projects per Tag",
    height=My.FIG_H,
    width=My.FIG_W,
    labels={"x": "Categories", "y": "Number of Loans"},
)
fig.update_layout(xaxis_title=None, yaxis_title="Number of Projects")
fig.update_traces(showlegend=False)
fig.show()

## Number of project vs country

In [None]:
# Number of projects per country, largest first.
proj_per_country = (
    ds.groupby(by=[COL.COUNTRY_NAME])
    .count()[COL.LOAN_AMOUNT]
    .sort_values(ascending=False)
    .to_pandas()
    .rename("number of project")
)

# Bar chart of the 20 countries with the most projects.
top_countries = proj_per_country.head(20)
fig = px.bar(
    top_countries,
    orientation="v",
    text_auto=True,
    title="Number of Projects per Country",
    height=My.FIG_H,
    width=My.FIG_W,
    labels={"x": "Categories", "y": "Values"},
)
fig.update_layout(xaxis_title="Country", yaxis_title="Number of Projects")
fig.update_traces(showlegend=False)
fig.show()

## Collection Speed vs Tag

In [None]:
def get_tag_performance(_df: pd.DataFrame, num_tag: int = 10) -> pd.DataFrame:
    """Rank the tags by the mean collection speed of the loans carrying them.

    Args:
        _df: Loan data holding one 0/1 column per entry of the notebook-level
            `tag_columns` list, plus the `COL.SPEED` column. The notebook passes
            cuDF frames here; only boolean-mask selection and the
            `mean`/`std`/`count` reductions are used.
        num_tag: Number of top-ranked tags to keep.

    Returns:
        A pandas DataFrame indexed by `tag` with the columns `speed_mean`,
        `speed_std` and `count`, sorted by `speed_mean` in descending order and
        limited to the first `num_tag` rows. Tags without a mean speed are
        omitted; a missing `speed_std` is reported as 0.
    """
    stat_columns = ["tag", "speed_mean", "speed_std", "count"]
    tags_performances = []

    for atag in tag_columns:
        # Select the speeds of the tagged loans once and reuse them for all statistics.
        tagged_speed = _df[_df[atag] == 1][COL.SPEED]
        tags_performances.append(
            {
                "tag": atag,
                "speed_mean": tagged_speed.mean(),
                "speed_std": tagged_speed.std(),
                "count": tagged_speed.count(),
            }
        )

    # Naming the columns explicitly keeps the frame well-formed when there is no tag at all.
    tags_performances = pd.DataFrame(tags_performances, columns=stat_columns)
    tags_performances = (
        tags_performances.dropna(subset=["speed_mean"])
        .fillna(0)
        .sort_values("speed_mean", ascending=False)
        .set_index("tag")
    )
    return tags_performances.head(num_tag)


# Mean collection speed of the 20 best-performing tags.
fig = px.bar(
    get_tag_performance(ds, 20),
    y="speed_mean",
    text_auto=True,
    title="Collection Speed Mean per Tag",
    width=My.FIG_W,
    height=My.FIG_H,
)
fig.update_traces(textfont_size=12, textangle=0, textposition="outside", cliponaxis=False)
fig.show()

In [None]:
# Mean collection speed of the 20 best-performing tags.
fig = px.bar(
    get_tag_performance(ds, 20),
    y="speed_mean",
    text_auto=True,
    title="Collection Speed Mean per Tag",
    width=My.FIG_W,
    height=My.FIG_H,
)
# The keyword `textfont_size` was split by stray whitespace, which was a syntax error.
fig.update_traces(textfont_size=12, textangle=0, textposition="outside", cliponaxis=False)
fig.show()

### Collection speed vs Tags for Countries

In [None]:
"""convert coutry code to country name"""
# Map each ISO country code to its country name.
# The frame is converted to pandas before building the dict, so this does not depend
# on `to_dict()` being available on cuDF frames.
code_to_name = (
    ds[[COL.COUNTRY_NAME, COL.COUNTRY]]
    .drop_duplicates()
    .to_pandas()
    .set_index(COL.COUNTRY)[COL.COUNTRY_NAME]
    .to_dict()
)
assert code_to_name["VN"] == "Vietnam"

In [None]:
country_code = "VN"
# Loans of the selected country only.
country_df = ds[ds[COL.COUNTRY] == country_code]
fig = px.bar(
    get_tag_performance(country_df),
    # `get_tag_performance` names its mean column "speed_mean"; there is no "mean" column.
    y="speed_mean",
    text_auto=True,
    title=f"Mean Collection Speed for {code_to_name[country_code]}",
    width=My.FIG_W,
    height=My.FIG_H,
)
fig.update_traces(textfont_size=26, textangle=0, textposition="outside", cliponaxis=False)
fig.show()

In [None]:
country_code = "KE"
# Loans of the selected country only.
country_df = ds[ds[COL.COUNTRY] == country_code]
fig = px.bar(
    get_tag_performance(country_df),
    # `get_tag_performance` names its mean column "speed_mean"; there is no "mean" column.
    y="speed_mean",
    text_auto=True,
    title=f"Mean Collection Speed for {code_to_name[country_code]}",
    width=My.FIG_W,
    height=My.FIG_H,
)
fig.update_traces(textfont_size=12, textangle=0, textposition="outside", cliponaxis=False)
fig.show()

In [None]:
country_code = "KH"
# Loans of the selected country only.
country_df = ds[ds[COL.COUNTRY] == country_code]
fig = px.bar(
    get_tag_performance(country_df),
    # `get_tag_performance` names its mean column "speed_mean"; there is no "mean" column.
    y="speed_mean",
    text_auto=True,
    title=f"Mean Collection Speed for {code_to_name[country_code]}",
    width=My.FIG_W,
    height=My.FIG_H,
)
fig.update_traces(textfont_size=12, textangle=0, textposition="outside", cliponaxis=False)
fig.show()

In [None]:
country_code = "PK"
# Loans of the selected country only.
country_df = ds[ds[COL.COUNTRY] == country_code]
fig = px.bar(
    get_tag_performance(country_df),
    # `get_tag_performance` names its mean column "speed_mean"; there is no "mean" column.
    y="speed_mean",
    text_auto=True,
    title=f"Mean Collection Speed for {code_to_name[country_code]}",
    width=My.FIG_W,
    height=My.FIG_H,
)
fig.update_traces(textfont_size=12, textangle=0, textposition="outside", cliponaxis=False)
fig.show()

In [None]:
country_code = "SV"
# Loans of the selected country only.
country_df = ds[ds[COL.COUNTRY] == country_code]
fig = px.bar(
    get_tag_performance(country_df),
    # `get_tag_performance` names its mean column "speed_mean"; there is no "mean" column.
    y="speed_mean",
    text_auto=True,
    title=f"Mean Collection Speed for {code_to_name[country_code]}",
    width=My.FIG_W,
    height=My.FIG_H,
)
fig.update_traces(textfont_size=12, textangle=0, textposition="outside", cliponaxis=False)
fig.show()

Correlation between tags and collection speed

In [None]:
# Kendall correlation between each tag indicator column and the collection speed,
# computed with pandas.
tag_df = ds[tag_columns].to_pandas()
speed = ds[COL.SPEED].to_pandas()
corr = tag_df.corrwith(speed, method="kendall", drop=True).rename("correlation")

fig = px.bar(
    corr.sort_values(ascending=False),
    orientation="h",
    text_auto=True,
    # Title typo fixed ("betwene" -> "between").
    title="Correlation between Tags and Collection Speed",
    height=My.FIG_H,
)
fig.update_traces(
    textfont_size=12, textangle=0, textposition="outside", cliponaxis=False, showlegend=False
)
fig.update_layout(xaxis_title="kendall correlation score", yaxis_title=None)
fig.show()

### Influence of the number of tags

Above-average speed vs number of tags

In [None]:
# Number of tags attached to each project, with its mean and standard deviation.
tag_count = ds[tag_columns].sum(axis=1)
tag_count_mean = tag_count.mean()
tag_count_std = tag_count.std()
# How many projects carry each number of tags.
tag_count_df = tag_count.value_counts()

fig = px.bar(
    tag_count_df.to_pandas(),
    text_auto=True,
    title="Distribution of Number of Tags per Project",
    width=My.FIG_W,
    height=My.FIG_H,
)
fig.update_traces(
    textfont_size=12, textangle=0, textposition="outside", cliponaxis=False, showlegend=False
)
fig.update_layout(xaxis_title="Number of Tags", yaxis_title="Number of Project")
fig.update_xaxes(tickmode="linear")
# Vertical line at the mean number of tags per project.
fig.add_vline(x=tag_count_mean)
fig.show()

In [None]:
# Number of above-average speed collection vs number of tags
# Flag every project whose collection speed is at least the global mean speed.
# NOTE(review): a project with a zero funding duration would get an infinite speed and
# push this mean to infinity — confirm that no such rows remain.
speed_mean = ds[COL.SPEED].mean()
is_above_average = ds[COL.SPEED] >= speed_mean

In [None]:
# One row per project: its number of tags and its above-average flag (joined on the index).
# NOTE: this reuses the name `tag_count_df`, which held the value counts in an earlier cell.
tag_count_df = tag_count.to_frame(name="tag_count").join(is_above_average.rename("is_above_average"))

In [None]:
# Per number of tags: `sum` of the above-average flags and `count` of the projects.
mn = tag_count_df.groupby("tag_count").agg(["sum", "count"])

In [None]:
# Percentage of projects with an above-average speed, per number of tags.
mn["percentage"] = mn["is_above_average"]["sum"] / mn["is_above_average"]["count"] * 100

In [None]:
# Plotly works on host data: convert to pandas first, as the other plotting cells
# of this notebook do, instead of handing it a cuDF object.
percentage = mn["percentage"]
if hasattr(percentage, "to_pandas"):
    percentage = percentage.to_pandas()

fig = px.bar(
    percentage,
    text_auto=True,
    title="Projects with above-average collection speed based on the number of tags",
    labels={"value": "Percentage of Project with Collection Speed above Global Average"},
    height=My.FIG_H,
)
# fig.update_traces(textfont_size=12, textangle=0, textposition="outside", cliponaxis=False)
fig.update_traces(showlegend=False)
fig.update_xaxes(tickmode="linear")
fig.show()

Effectiveness of Tags on Top 5% Collection Speed

## Pair of tags Performance

In [None]:
from itertools import combinations

In [None]:
# Add one indicator column per unordered pair of tags: 1 when a project carries both tags.
# NOTE: the pair columns are named "<tag_a>__<tag_b>" and therefore also start with
# "tag_"; re-running a `tag_columns = [...]` cell after this one would pick them up.
tag2_columns = []
for first_tag, second_tag in combinations(tag_columns, 2):
    pair_column = f"{first_tag}__{second_tag}"
    ds[pair_column] = ds[first_tag] & ds[second_tag]
    tag2_columns.append(pair_column)

# The pair indicators must span exactly the values 0 and 1.
assert ds[tag2_columns].max().max() == 1
assert ds[tag2_columns].min().min() == 0

In [None]:
# Keep only the projects whose collection speed is at least the global mean speed.
speed_mean = ds[COL.SPEED].mean()
above_average = ds[ds[COL.SPEED] >= speed_mean]

In [None]:
# For each pair of tags, take the speed of the above-average projects carrying the pair
# (and 0 for the projects that do not), then record the mean and std over all of them.
above_average_speed = above_average[COL.SPEED]
tag2_performances = []

for pair_column in tqdm(tag2_columns):
    pair_speed = above_average[pair_column] * above_average_speed
    tag2_performances.append({"tag": pair_column, "mean": pair_speed.mean(), "std": pair_speed.std()})

In [None]:
# The 20 pairs of tags with the highest mean, in ascending order.
la = (
    pd.DataFrame(tag2_performances)
    .dropna(subset=["mean"])
    .sort_values("mean", ascending=True)
    .tail(20)
)
la

In [None]:
# Horizontal bar chart of the pairs of tags selected in `la`.
fig = px.bar(la, y="tag", x="mean", title="Effectiveness of Tags on Collection Speed", width=My.FIG_W, height=My.FIG_H)
# fig.update_traces(textfont_size=12, textangle=0, textposition="outside", cliponaxis=False)
# fig.update_xaxes(tickmode='linear')
# fig.add_vline(x=tag_count_mean)
fig.show()

# Save the results

In [None]:
# Convert the notebook to HTML so that it is easy to read.
!jupyter nbconvert --to html alldata_cudf.ipynb