In [53]:
import numpy as np
import pandas as pd
from sklearn.linear_model import LinearRegression

In [54]:
# Load the processed KPI table; the relative path is resolved against the
# notebook's current working directory.
kpi_processed_file = '../../data/KPIs_data_processed_2025-09-01.csv'
df = pd.read_csv(kpi_processed_file)
# NOTE(review): the preview of this frame shows an "Unnamed: 0" column, which
# looks like a row index saved by an earlier to_csv call — confirm whether it
# should be read with index_col=0 or dropped before the frame is written out.
df.head()

Unnamed: 0,accounts_group,reg_month,country,product_group,area,primary_product,count_units,avg_planned_repayment_days,total_follow_on_revenue_current_usd,total_follow_on_revenue,...,disabled_rate_630,disabled_rate_720,age_in_months,exchange,total_follow_on_revenue_usd_cal,total_follow_on_revenue_usd_final,avg_upfront_price_usd,avg_unlock_price_usd,backtesting_limit,cohort_age_backtesting
0,2016-01_Kenya_Lanterns_Kakamega_Sun King Pro E...,2016-01,Kenya,Lanterns,Kakamega,Sun King Pro EasyBuy,57,77,1543.400856,199500.0,...,0.087719,0.052632,115,152.85,1305.2012,1305.2012,7.0,30.0,30,30
1,2016-02_Kenya_Lanterns_Bungoma_Sun King Pro Ea...,2016-02,Kenya,Lanterns,Bungoma,Sun King Pro EasyBuy,18,77,487.389744,63000.0,...,0.222222,0.055556,114,152.85,412.1688,412.1688,7.0,30.0,30,30
2,2016-02_Kenya_Lanterns_Kakamega_Sun King Pro E...,2016-02,Kenya,Lanterns,Kakamega,Sun King Pro EasyBuy,287,77,7771.158696,1004500.0,...,0.108014,0.062718,114,152.85,6571.8024,6571.8024,7.0,30.0,30,30
3,2016-03_Kenya_Lanterns_Bungoma_Sun King Pro Ea...,2016-03,Kenya,Lanterns,Bungoma,Sun King Pro EasyBuy,114,77,3086.801712,399000.0,...,0.131579,0.096491,113,152.85,2610.4024,2610.4024,7.0,30.0,30,30
4,2016-03_Kenya_Lanterns_Kakamega_Sun King Pro E...,2016-03,Kenya,Lanterns,Kakamega,Sun King Pro EasyBuy,142,77,3844.963536,497000.0,...,0.133803,0.112676,113,152.85,3251.5538,3251.5538,7.0,30.0,30,30


In [55]:


# Example horizon checkpoints (update if yours differ).
# Each value is the numeric suffix of a metric column named
# "<prefix>_<horizon>" (e.g. "frr_90"). The feature helpers pair consecutive
# entries in list order, so the order of each list matters.
# NOTE(review): the unit looks like days of cohort age — confirm.
FRR_HORIZONS = [30, 60, 90, 180, 270, 360, 450, 540]
RS_HORIZONS  = [30, 60, 90, 180, 270, 360, 450, 540]  # repayment_speed
AR_HORIZONS  = [30, 60, 90, 180, 270, 360, 450, 540]  # at_risk_rate

def add_delta_and_growth_features(df, prefix, horizons):
    """
    Append step-to-step change features for one metric family.

    Walks consecutive checkpoint pairs (t1, t2) of ``horizons`` and, whenever
    both ``{prefix}_{t1}`` and ``{prefix}_{t2}`` exist in ``df``, writes:
      - ``{prefix}_delta_{t2}_{t1}``: absolute change x_t2 - x_t1
      - ``{prefix}_growth_{t1}_{t2}``: relative change (x_t2 - x_t1) / x_t1,
        NaN where x_t1 is missing or zero

    Note that the horizon order differs between the two column names (delta is
    ``t2_t1``, growth is ``t1_t2``). Pairs with a missing column are skipped.
    The columns are added to ``df`` itself, which is also returned.
    """
    for earlier, later in zip(horizons, horizons[1:]):
        base_col = f"{prefix}_{earlier}"
        next_col = f"{prefix}_{later}"
        if base_col not in df.columns or next_col not in df.columns:
            continue

        base = df[base_col]
        change = df[next_col] - base
        df[f"{prefix}_delta_{later}_{earlier}"] = change

        # Relative change is undefined for a missing or zero baseline.
        usable_base = base.notna() & base.ne(0)
        df[f"{prefix}_growth_{earlier}_{later}"] = np.where(
            usable_base, change / base, np.nan
        )
    return df

def add_slope_features(df, prefix, horizons):
    """
    Fit a simple OLS line per row: metric_t ~ horizon.

    Adds ``{prefix}_slope`` and ``{prefix}_intercept`` to ``df`` (the frame is
    modified in place and also returned).

    Parameters
    ----------
    df : pandas.DataFrame
        Frame holding the metric columns ``{prefix}_{h}``.
    prefix : str
        Metric name used as the column prefix (e.g. ``"frr"``).
    horizons : sequence of numbers
        Checkpoints used as the regression x-values. Horizons whose column is
        absent from ``df`` are ignored, matching how the delta/growth and
        ratio helpers skip missing columns instead of raising ``KeyError``.

    Notes
    -----
    The fit uses the closed-form least-squares solution
    ``slope = Sxy / Sxx`` and ``intercept = mean(y) - slope * mean(x)``,
    evaluated for all rows at once over each row's non-NaN observations.
    Rows with fewer than two observed values get NaN for both outputs, as do
    rows whose observed x-values have zero spread. A row containing an
    infinite value yields NaN rather than aborting the whole computation.
    """
    available = [h for h in horizons if f"{prefix}_{h}" in df.columns]

    n_rows = len(df)
    slopes = np.full(n_rows, np.nan)
    intercepts = np.full(n_rows, np.nan)

    # A line needs at least two checkpoints; otherwise every row stays NaN.
    if len(available) >= 2:
        x = np.asarray(available, dtype=float)
        y = df[[f"{prefix}_{h}" for h in available]].to_numpy(dtype=float)

        observed = ~np.isnan(y)
        n_obs = observed.sum(axis=1)

        # Rows with no observations divide by zero here; they are masked out
        # below, so the floating-point warnings are silenced.
        with np.errstate(divide="ignore", invalid="ignore"):
            x_mean = (observed * x).sum(axis=1) / n_obs
            y_mean = np.where(observed, y, 0.0).sum(axis=1) / n_obs

            # Centered values, zeroed where the row has no observation so
            # they do not contribute to the sums.
            dx = np.where(observed, x - x_mean[:, None], 0.0)
            dy = np.where(observed, y - y_mean[:, None], 0.0)

            sxx = (dx * dx).sum(axis=1)
            sxy = (dx * dy).sum(axis=1)

            slope_all = sxy / sxx
            intercept_all = y_mean - slope_all * x_mean

        fit_ok = (n_obs >= 2) & (sxx > 0)
        slopes = np.where(fit_ok, slope_all, np.nan)
        intercepts = np.where(fit_ok, intercept_all, np.nan)

    df[f"{prefix}_slope"] = slopes
    df[f"{prefix}_intercept"] = intercepts
    return df

def add_early_vs_late_ratio(df, prefix, early=90, late=360):
    """
    Add the ratio of an early checkpoint value to a later one.

    Writes ``{prefix}_ratio_{early}_{late}`` = ``{prefix}_{early}`` /
    ``{prefix}_{late}`` (e.g. frr_90 / frr_360), with NaN wherever the later
    value is missing or zero. If either source column is absent, ``df`` is
    returned without changes. The column is added to ``df`` itself, which is
    also returned.
    """
    numerator_col = f"{prefix}_{early}"
    denominator_col = f"{prefix}_{late}"
    if numerator_col not in df.columns or denominator_col not in df.columns:
        return df

    denominator = df[denominator_col]
    # The ratio is only defined for a present, non-zero later value.
    has_denominator = denominator.notna() & denominator.ne(0)
    df[f"{prefix}_ratio_{early}_{late}"] = np.where(
        has_denominator, df[numerator_col] / denominator, np.nan
    )
    return df

# -----------------------------
# Apply pipeline
# -----------------------------
def engineer_features(df):
    """
    Run the feature-engineering steps for the ``frr`` metric and return the frame.

    Steps, in order:
      1. deltas and growth rates between consecutive FRR_HORIZONS checkpoints
      2. per-row slope and intercept across FRR_HORIZONS
      3. early-vs-late ratio frr_90 / frr_360

    Only ``frr`` is processed; the equivalent steps for ``repayment_speed``
    and ``at_risk_rate`` are not enabled here.
    """
    metric = "frr"
    with_changes = add_delta_and_growth_features(df, metric, FRR_HORIZONS)
    with_trend = add_slope_features(with_changes, metric, FRR_HORIZONS)
    return add_early_vs_late_ratio(with_trend, metric, 90, 360)

In [56]:
# Add the engineered frr feature columns; `df` is rebound to the returned frame.
df = engineer_features(df)

In [57]:
# Output path for the feature-enriched table. It is a bare filename, so it
# resolves against the current working directory rather than ../../data.
# NOTE(review): this reuses the variable name that held the input path in the
# load cell; a distinct name would make the two roles clearer.
kpi_processed_file = 'new2_KPIs_data_processed_2025-09-01.csv'

In [58]:
# Write the frame to CSV; index=False leaves the row index out of the file.
df.to_csv(kpi_processed_file, index=False)