In [None]:
import pandas as pd
from typing import Dict, Set
import seaborn as sns
import matplotlib.pyplot as plt

from tqdm import tqdm

tqdm.pandas()

In [None]:
# Load the preprocessed dataset. The cells below read the
# geocode_country_name, fundraisingDate and lender_id columns from this frame.
df = pd.read_parquet("../data/gen/preprocessed_2023-08-28T11-09-39.parquet")

In [None]:
# Country analysed by the single-country cells below (row filter and plot titles).
COUNTRY = "Philippines"

In [None]:
# Keep only the rows whose country equals COUNTRY, then preview the first two.
is_selected_country = df["geocode_country_name"].eq(COUNTRY)
vn = df.loc[is_selected_country]
vn.head(2)

In [None]:
# One row per fundraising year: the set of lender_id values seen in that year
# (lender_id) and the size of that set (lender_count).
# The year key is derived from `vn` itself, so the year is only computed for the
# selected rows and the grouping does not rely on index alignment with `df`.
each_year = (
    vn.groupby(vn["fundraisingDate"].dt.year)
    .agg({"lender_id": set})
    .assign(lender_count=lambda yearly: yearly["lender_id"].apply(len))
    .reset_index()  # expose the year as a regular "fundraisingDate" column
)
each_year.tail()

In [None]:
# Compare every year with every year (itself included) and record how many
# lenders the two years share and how many they have in total.
pair_rows = [
    (
        row_a.fundraisingDate,
        row_b.fundraisingDate,
        len(row_a.lender_id.intersection(row_b.lender_id)),
        len(row_a.lender_id.union(row_b.lender_id)),
    )
    for row_a in tqdm(each_year.itertuples())
    for row_b in each_year.itertuples()
]

pairs = pd.DataFrame(pair_rows, columns=["first", "second", "intersec", "union"])
# Intersection-over-union of the two years' lender sets.
pairs["iou"] = pairs["intersec"] / pairs["union"]
pairs

In [None]:
# Year-by-year matrix of IoU values: rows are the first year, columns the second.
heatmap_data = pairs.pivot(index="first", columns="second", values="iou")

# Draw the matrix on an explicitly created axes object.
fig, ax = plt.subplots(figsize=(10, 8))
sns.heatmap(heatmap_data, cmap="coolwarm", annot=True, fmt=".2f", ax=ax)
ax.set_title("Lender IOU " + COUNTRY)
plt.show()

In [None]:
# Memo tables for fintersection / funion below, keyed by a (first, second) year
# tuple and holding the lender set computed for that year range.
many_intersec: Dict = {}
many_union: Dict = {}


def fintersection(first: int, second: int) -> Set:
    """Return the lenders present in every year from ``first`` to ``second`` inclusive.

    The two years may be given in either order. Results are memoised in the
    module-level ``many_intersec`` dict under the ``(earlier, later)`` key, and
    each range is built from the range that ends one year earlier.

    A year with no row in ``each_year`` is treated as having no lenders, so any
    range that contains such a year has an empty intersection.

    The returned set may be the very object stored in ``each_year`` or in the
    cache; treat it as read-only.
    """
    # Normalise the order up front. Without this, a reversed call that misses
    # the cache keeps decrementing ``second`` and never reaches ``first``.
    if first > second:
        first, second = second, first

    key = (first, second)
    if key in many_intersec:
        return many_intersec[key]

    # Lender set of the last year in the range (empty if that year has no row).
    rows = each_year.loc[each_year["fundraisingDate"] == second, "lender_id"]
    lenders: Set = rows.iloc[0] if len(rows) else set()

    if first == second:
        result = lenders
    else:
        # Extend the range ending at ``second - 1`` by one more year.
        result = fintersection(first, second - 1).intersection(lenders)

    many_intersec[key] = result
    return result


def funion(first: int, second: int) -> Set:
    """Return the lenders present in at least one year from ``first`` to ``second`` inclusive.

    The two years may be given in either order. Results are memoised in the
    module-level ``many_union`` dict under the ``(earlier, later)`` key, and
    each range is built from the range that ends one year earlier.

    A year with no row in ``each_year`` is treated as having no lenders, so it
    contributes nothing to the union.

    The returned set may be the very object stored in ``each_year`` or in the
    cache; treat it as read-only.
    """
    # Normalise the order up front. Without this, a reversed call that misses
    # the cache keeps decrementing ``second`` and never reaches ``first``.
    if first > second:
        first, second = second, first

    key = (first, second)
    if key in many_union:
        return many_union[key]

    # Lender set of the last year in the range (empty if that year has no row).
    rows = each_year.loc[each_year["fundraisingDate"] == second, "lender_id"]
    lenders: Set = rows.iloc[0] if len(rows) else set()

    if first == second:
        result = lenders
    else:
        # Extend the range ending at ``second - 1`` by one more year.
        result = funion(first, second - 1).union(lenders)

    many_union[key] = result
    return result


# For every (start, end) pair of years in 2010..2023, record the size of the
# lender intersection and union over the whole range of years between them.
range_intersection = pd.DataFrame(
    [
        (
            start_year,
            end_year,
            len(fintersection(start_year, end_year)),
            len(funion(start_year, end_year)),
        )
        for start_year in range(2010, 2024)
        for end_year in range(2010, 2024)
    ],
    columns=["start", "end", "intersec", "union"],
)
range_intersection["iou"] = range_intersection["intersec"] / range_intersection["union"]

# Matrix form for plotting: rows are start years, columns are end years.
heatmap_data = range_intersection.pivot(index="start", columns="end", values="iou")

In [None]:
# Heatmap of the range-based IoU matrix, drawn on an explicit axes object.
fig, ax = plt.subplots(figsize=(10, 8))
sns.heatmap(heatmap_data, cmap="coolwarm", annot=True, fmt=".2f", ax=ax)
ax.set_title("Lender IOU (from Start to End Year) " + COUNTRY)
ax.set_xlabel("End Year")
ax.set_ylabel("Start Year")
plt.show()

# Try again for all countries: find the countries with the highest lender IoU between any two years

In [None]:
def maximum_year_correlation_for_country(country: str, data: "pd.DataFrame | None" = None):
    """Return the highest lender-set IoU between any two distinct years for ``country``.

    For each fundraising year the set of ``lender_id`` values is collected; every
    unordered pair of different years is then scored as
    ``len(intersection) / len(union)`` and the largest score is returned.

    Parameters
    ----------
    country:
        Value matched against the ``geocode_country_name`` column.
    data:
        Frame with ``geocode_country_name``, ``fundraisingDate`` (datetime) and
        ``lender_id`` columns. Defaults to the notebook-level ``df``.

    Returns
    -------
    float
        The maximum IoU, or NaN when the country has fewer than two years of
        data (including when it has no rows at all).
    """
    from itertools import combinations  # local import keeps this cell self-contained

    source = df if data is None else data
    loans = source[source["geocode_country_name"] == country]
    if loans.empty:
        return float("nan")

    # Group by a key taken from the filtered frame itself, so nothing depends on
    # index alignment with the unfiltered data.
    years = loans["fundraisingDate"].dt.year
    lenders_by_year = [set(group) for _, group in loans.groupby(years)["lender_id"]]

    # combinations() yields each unordered pair of distinct years exactly once.
    # Every group has at least one row, so the union is never empty.
    return max(
        (len(a & b) / len(a | b) for a, b in combinations(lenders_by_year, 2)),
        default=float("nan"),
    )


# Spot-check the helper on a single country before running it for every country.
maximum_year_correlation_for_country("Vietnam")

In [None]:
# Score every country found in the data, then rank countries by their best
# year-pair IoU (highest first) and show the top ten.
country_correlations = pd.DataFrame(
    [
        (name, maximum_year_correlation_for_country(name))
        for name in tqdm(df.geocode_country_name.unique())
    ],
    columns=["country", "max_iou_yearpair"],
).sort_values(by="max_iou_yearpair", ascending=False)
country_correlations.head(10)

In [None]:
# Print the top-10 table as Markdown text (presumably for pasting into a write-up).
print(country_correlations.head(10).to_markdown())