In [40]:
from setup_notebook import setup_project_path
setup_project_path()

In [41]:
from src.config import RAW_DATA_DIR, PROCESSED_DATA_DIR, MODELS_DIR

In [42]:
import pandas as pd
import argparse
from datetime import datetime, date, timedelta
from dateutil.relativedelta import relativedelta
import numpy as np
import boto3
import gc
import datetime as dt
import io
from io import StringIO
import os
import dask
from dask import delayed
pd.set_option('display.max_columns',100)

In [43]:
data_dir = PROCESSED_DATA_DIR
models_dir = MODELS_DIR

os.makedirs(data_dir, exist_ok=True)
os.makedirs(models_dir, exist_ok=True)

In [44]:
def get_training_cut_off_month(df):
    
    last_day_of_prev_month = date.today().replace(day=1) - timedelta(days=2)
    cut_off_date_3_years = last_day_of_prev_month - timedelta(days=365*3)
    
    training_cut_off_date = cut_off_date_3_years - relativedelta(months=6)
    
    training_cut_off_date = str(training_cut_off_date).split('-')
    training_cut_off_month = training_cut_off_date[:2]
    
    training_cut_off_month = '-'.join(training_cut_off_month)
    
    print('Cut off month for training data: ',training_cut_off_month)

    return training_cut_off_month, cut_off_date_3_years


def preprocessing_common(df_KPI):

    #Removing account_group duplicates 
    df_KPI.drop_duplicates(subset=['accounts_group'], keep='last', ignore_index=True)
    
    # Get cohort details
    cohort_details = df_KPI["accounts_group"].str.split("_", n = 5, expand = True)
    df_KPI["reg_month"] = cohort_details[0]
    df_KPI["country"] = cohort_details[1]
    df_KPI["product_group"] = cohort_details[2]
    df_KPI["area"] = cohort_details[3]
    df_KPI["primary_product"] = cohort_details[4]
    
    
    df_KPI['avg_planned_repayment_days'] = df_KPI['avg_planned_repayment_days'].fillna(-1)
    df_KPI['avg_planned_repayment_days'] = df_KPI['avg_planned_repayment_days'].astype(int)
    df_KPI.loc[df_KPI['frr_3_years']>1, 'frr_3_years'] = 1
    print('Shape of cohorts having frr_3_years <=0 is {}'.format(df_KPI[df_KPI['frr_3_years']<=0].shape[0]))
    df_KPI = df_KPI.loc[df_KPI['frr_3_years']>0]
    
    # Removing records having null FRR at 30 days
    print('Shape of cohorts having frr_30 null is {}'.format(df_KPI[df_KPI['frr_30'].isna()==True].shape[0]))
    df_KPI = df_KPI.loc[df_KPI['frr_30'].notnull()].copy()
    df_KPI.reset_index(drop=True, inplace=True)

    for limit in [30,60,90,180,270,360,450,540,630,720]:
        df_KPI.loc[df_KPI['frr_'+str(limit)+'']>1, 'frr_'+str(limit)+''] = 1
        df_KPI.loc[(df_KPI['at_risk_rate_'+str(limit)+''].isnull()) &(df_KPI['frr_'+str(limit)+'']==1),'at_risk_rate_'+str(limit)+''] = 0
    
    df_KPI.reset_index(drop=True, inplace=True)
    
    return df_KPI


def backtesting_split(data, backtesting):

    df_backtesting = data.copy()
    df_backtesting['backtesting'] = backtesting

    for col_prefix in ['frr', 'repayment_speed', 'avg_cum_days_disabled',
                  'at_risk_rate', 'disabled_gt_two_week_rate', 'unlocked_rate', 'disabled_rate']:
    
        df_backtesting[col_prefix] = np.nan

        df_backtesting.loc[df_backtesting['backtesting']==30, col_prefix] = df_backtesting[col_prefix+'_'+str(30)]
        df_backtesting.loc[df_backtesting['backtesting']==60, col_prefix] = df_backtesting[col_prefix+'_'+str(60)]
        df_backtesting.loc[df_backtesting['backtesting']==90, col_prefix] = df_backtesting[col_prefix+'_'+str(90)]
        df_backtesting.loc[df_backtesting['backtesting']==180, col_prefix] = df_backtesting[col_prefix+'_'+str(180)]
        df_backtesting.loc[df_backtesting['backtesting']==270, col_prefix] = df_backtesting[col_prefix+'_'+str(270)]
        df_backtesting.loc[df_backtesting['backtesting']==360, col_prefix] = df_backtesting[col_prefix+'_'+str(360)]
        df_backtesting.loc[df_backtesting['backtesting']==450, col_prefix] = df_backtesting[col_prefix+'_'+str(450)]
        df_backtesting.loc[df_backtesting['backtesting']==540, col_prefix] = df_backtesting[col_prefix+'_'+str(540)]
        df_backtesting.loc[df_backtesting['backtesting']==630, col_prefix] = df_backtesting[col_prefix+'_'+str(630)]
        df_backtesting.loc[df_backtesting['backtesting']==720, col_prefix] = df_backtesting[col_prefix+'_'+str(720)]


    # Removing the old temporal features
    cols_to_remove = [col for col in df_backtesting.columns if '30' in col or '60' in col or '90' in col or '180' in col or '270' in col or '360' in col or '450' in col or '540' in col or '630' in col or '720' in col]
    df_backtesting.drop(cols_to_remove, axis=1, inplace=True)
    print('Shape of {} is {}'.format(backtesting, df_backtesting.shape))
    print(df_backtesting.head())

    return df_backtesting

def feature_engineering(data):
    
    # Creating column for Unlock price
    data['unlock_price_usd'] = data['upfront_price_usd'] + data['total_follow_on_revenue_usd']
    
    # Calculating averge unlock and upfront price

    data['avg_upfront_price_usd'] = np.round((data['upfront_price_usd']/data['count_units']),0)
    data['avg_unlock_price_usd'] = np.round((data['unlock_price_usd']/data['count_units']),0)
    
    # Removing unnecessary columns
    cols_to_remove = []
    cols_to_remove.append('upfront_price_usd')
    cols_to_remove.append('unlock_price_usd')

    print('columns to remove: ',cols_to_remove)
    
    data.drop(cols_to_remove, axis=1, inplace=True)
    
    print('Sample from data------------')
    print(data.sample(1))


    data = data.reset_index().drop(columns = 'index')
    
    return data
    

def create_train_test_validation_split(df_KPI):
    
    # Random shuffling
    df_KPI = df_KPI.sample(frac = 1)
    
    training_cut_off_date, cut_off_date_3_years = get_training_cut_off_month(df_KPI)
    training_cut_off_date = str(training_cut_off_date)
    cut_off_date_3_years = str(cut_off_date_3_years)
    df_train_test = df_KPI.loc[df_KPI['reg_month']<=training_cut_off_date]
    df_oot_validation = df_KPI.loc[(df_KPI['reg_month']>training_cut_off_date) & (df_KPI['reg_month']<=cut_off_date_3_years)]
    
    # Create train set using 85% rows randomly
    df_train = df_train_test.sample(frac = 0.85, random_state=100)

    # Create test set using remaining 15% rows
    df_test = df_train_test.drop(df_train.index)
    
    print('Num of rows in train subset: ',df_train.shape[0])
    print('Num of rows in test subset: ',df_test.shape[0])
    print('Num of rows in validation subset: ',df_oot_validation.shape[0])
    
    print('Last registration month in training: ',list(df_train.sort_values(by='reg_month', ascending=False)['reg_month'])[0])
    print('Last registration month in validation: ',list(df_oot_validation.sort_values(by='reg_month', ascending=False)['reg_month'])[0])
    
    df_train.reset_index(drop=True, inplace=True)
    df_test.reset_index(drop=True, inplace=True)
    df_oot_validation.reset_index(drop=True, inplace=True)
    
    return df_train, df_test, df_oot_validation

## Main flow

In [45]:
df_KPI = pd.read_csv(os.path.join(data_dir,'KPIs_data_modelling_2025-09-18.csv'))

In [46]:
print('Shape of the combined dataset: ',df_KPI.shape)

Shape of the combined dataset:  (85712, 77)


In [47]:
df_KPI.head()

Unnamed: 0,accounts_group,count_units,upfront_price_usd,avg_planned_repayment_days,frr_30,frr_60,frr_90,frr_180,frr_270,frr_360,frr_450,frr_540,frr_630,frr_720,repayment_speed_30,repayment_speed_60,repayment_speed_90,repayment_speed_180,repayment_speed_270,repayment_speed_360,repayment_speed_450,repayment_speed_540,repayment_speed_630,repayment_speed_720,avg_cum_days_disabled_30,avg_cum_days_disabled_60,avg_cum_days_disabled_90,avg_cum_days_disabled_180,avg_cum_days_disabled_270,avg_cum_days_disabled_360,avg_cum_days_disabled_450,avg_cum_days_disabled_540,avg_cum_days_disabled_630,avg_cum_days_disabled_720,at_risk_rate_30,at_risk_rate_60,at_risk_rate_90,at_risk_rate_180,at_risk_rate_270,at_risk_rate_360,at_risk_rate_450,at_risk_rate_540,at_risk_rate_630,at_risk_rate_720,disabled_gt_two_week_rate_30,disabled_gt_two_week_rate_60,disabled_gt_two_week_rate_90,disabled_gt_two_week_rate_180,disabled_gt_two_week_rate_270,disabled_gt_two_week_rate_360,disabled_gt_two_week_rate_450,disabled_gt_two_week_rate_540,disabled_gt_two_week_rate_630,disabled_gt_two_week_rate_720,unlocked_rate_30,unlocked_rate_60,unlocked_rate_90,unlocked_rate_180,unlocked_rate_270,unlocked_rate_360,unlocked_rate_450,unlocked_rate_540,unlocked_rate_630,unlocked_rate_720,disabled_rate_30,disabled_rate_60,disabled_rate_90,disabled_rate_180,disabled_rate_270,disabled_rate_360,disabled_rate_450,disabled_rate_540,disabled_rate_630,disabled_rate_720,frr_3_years,actual_fr,total_follow_on_revenue_usd
0,2016-01_Kenya_Lanterns_Kakamega_Sun King Pro E...,57,403.243821,77.0,0.35218,0.626566,0.785424,0.888827,0.924767,0.935544,0.945594,0.963138,0.964892,0.971333,1.0718,0.8275,0.7854,0.8888,0.9247,0.9355,0.9455,0.9631,0.9648,0.9713,1.0,5.0,11.0,25.0,33.0,42.0,43.0,51.0,57.0,69.0,0.181818,0.08,0.148148,0.125,0.222222,0.25,0.333333,0.166667,0.2,0.333333,0.017544,0.122807,0.175439,0.157895,0.157895,0.105263,0.105263,0.087719,0.087719,0.052632,0.035088,0.122807,0.526316,0.719298,0.842105,0.859649,0.894737,0.894737,0.912281,0.947368,0.263158,0.315789,0.368421,0.22807,0.157895,0.122807,0.105263,0.105263,0.087719,0.052632,0.977148,1386.031419,1418.445741
1,2016-02_Kenya_Lanterns_Bungoma_Sun King Pro Ea...,18,127.340154,77.0,0.425714,0.686587,0.807381,0.849048,0.856984,0.856984,0.856984,0.856984,0.892698,0.90381,1.2956,0.9068,0.8073,0.849,0.8569,0.8569,0.8569,0.8569,0.8926,0.9038,0.0,3.0,8.0,14.0,14.0,14.0,14.0,14.0,102.0,105.0,0.055556,0.0625,0.090909,0.166667,0.166667,0.166667,0.166667,0.166667,0.25,1.0,0.0,0.111111,0.277778,0.333333,0.333333,0.333333,0.333333,0.333333,0.166667,0.055556,0.0,0.111111,0.388889,0.666667,0.666667,0.666667,0.666667,0.666667,0.777778,0.833333,0.277778,0.388889,0.444444,0.333333,0.333333,0.333333,0.333333,0.333333,0.222222,0.055556,0.90381,404.843825,447.930234
2,2016-02_Kenya_Lanterns_Kakamega_Sun King Pro E...,287,2030.368011,77.0,0.365668,0.618819,0.763085,0.867551,0.897489,0.917509,0.924428,0.933954,0.938175,0.951517,1.1129,0.8173,0.763,0.8675,0.8974,0.9175,0.9244,0.9339,0.9381,0.9515,1.0,7.0,13.0,32.0,43.0,55.0,63.0,72.0,80.0,90.0,0.156364,0.113725,0.130435,0.166667,0.196721,0.22449,0.244444,0.263158,0.272727,0.2,0.045296,0.087108,0.198606,0.205575,0.163763,0.139373,0.132404,0.114983,0.101045,0.059233,0.041812,0.111498,0.439024,0.707317,0.787456,0.829268,0.843206,0.867596,0.878049,0.909408,0.229965,0.341463,0.38676,0.268293,0.188153,0.163763,0.15331,0.12892,0.108014,0.062718,0.958262,6843.905988,7141.998731
3,2016-03_Kenya_Lanterns_Bungoma_Sun King Pro Ea...,114,806.487642,77.0,0.341451,0.566825,0.706045,0.812436,0.847875,0.866647,0.876922,0.888125,0.903764,0.90998,1.0391,0.7486,0.706,0.8124,0.8478,0.8666,0.8769,0.8881,0.9037,0.9099,1.0,5.0,11.0,29.0,44.0,54.0,63.0,80.0,93.0,101.0,0.157407,0.153846,0.176471,0.243902,0.28125,0.346154,0.32,0.217391,0.2,0.181818,0.035088,0.263158,0.280702,0.254386,0.219298,0.210526,0.201754,0.192982,0.087719,0.096491,0.052632,0.087719,0.403509,0.640351,0.719298,0.77193,0.780702,0.798246,0.842105,0.850877,0.307018,0.482456,0.438596,0.307018,0.263158,0.219298,0.210526,0.201754,0.131579,0.096491,0.92503,2624.209728,2836.891482
4,2016-03_Kenya_Lanterns_Kakamega_Sun King Pro E...,142,1004.572326,77.0,0.358523,0.615414,0.759101,0.854278,0.878097,0.892322,0.905249,0.915813,0.925324,0.932185,1.0911,0.8128,0.7591,0.8542,0.878,0.8923,0.9052,0.9158,0.9253,0.9321,1.0,6.0,12.0,27.0,40.0,52.0,68.0,73.0,88.0,108.0,0.181159,0.141732,0.168831,0.225,0.147059,0.137931,0.153846,0.130435,0.157895,0.176471,0.035211,0.112676,0.211268,0.197183,0.183099,0.169014,0.133803,0.140845,0.098592,0.098592,0.028169,0.105634,0.457746,0.71831,0.753521,0.788732,0.809859,0.830986,0.852113,0.866197,0.225352,0.359155,0.380282,0.260563,0.225352,0.197183,0.169014,0.161972,0.133803,0.112676,0.945294,3340.358794,3533.671846


In [48]:
df_KPI.columns

Index(['accounts_group', 'count_units', 'upfront_price_usd',
       'avg_planned_repayment_days', 'frr_30', 'frr_60', 'frr_90', 'frr_180',
       'frr_270', 'frr_360', 'frr_450', 'frr_540', 'frr_630', 'frr_720',
       'repayment_speed_30', 'repayment_speed_60', 'repayment_speed_90',
       'repayment_speed_180', 'repayment_speed_270', 'repayment_speed_360',
       'repayment_speed_450', 'repayment_speed_540', 'repayment_speed_630',
       'repayment_speed_720', 'avg_cum_days_disabled_30',
       'avg_cum_days_disabled_60', 'avg_cum_days_disabled_90',
       'avg_cum_days_disabled_180', 'avg_cum_days_disabled_270',
       'avg_cum_days_disabled_360', 'avg_cum_days_disabled_450',
       'avg_cum_days_disabled_540', 'avg_cum_days_disabled_630',
       'avg_cum_days_disabled_720', 'at_risk_rate_30', 'at_risk_rate_60',
       'at_risk_rate_90', 'at_risk_rate_180', 'at_risk_rate_270',
       'at_risk_rate_360', 'at_risk_rate_450', 'at_risk_rate_540',
       'at_risk_rate_630', 'at_risk_rate

In [49]:
df_KPI = preprocessing_common(df_KPI)

Shape of cohorts having frr_3_years <=0 is 177
Shape of cohorts having frr_30 null is 0


In [50]:
df_KPI.head()

Unnamed: 0,accounts_group,count_units,upfront_price_usd,avg_planned_repayment_days,frr_30,frr_60,frr_90,frr_180,frr_270,frr_360,frr_450,frr_540,frr_630,frr_720,repayment_speed_30,repayment_speed_60,repayment_speed_90,repayment_speed_180,repayment_speed_270,repayment_speed_360,repayment_speed_450,repayment_speed_540,repayment_speed_630,repayment_speed_720,avg_cum_days_disabled_30,avg_cum_days_disabled_60,avg_cum_days_disabled_90,avg_cum_days_disabled_180,avg_cum_days_disabled_270,avg_cum_days_disabled_360,avg_cum_days_disabled_450,avg_cum_days_disabled_540,avg_cum_days_disabled_630,avg_cum_days_disabled_720,at_risk_rate_30,at_risk_rate_60,at_risk_rate_90,at_risk_rate_180,at_risk_rate_270,at_risk_rate_360,at_risk_rate_450,at_risk_rate_540,at_risk_rate_630,at_risk_rate_720,disabled_gt_two_week_rate_30,disabled_gt_two_week_rate_60,disabled_gt_two_week_rate_90,disabled_gt_two_week_rate_180,disabled_gt_two_week_rate_270,disabled_gt_two_week_rate_360,disabled_gt_two_week_rate_450,disabled_gt_two_week_rate_540,disabled_gt_two_week_rate_630,disabled_gt_two_week_rate_720,unlocked_rate_30,unlocked_rate_60,unlocked_rate_90,unlocked_rate_180,unlocked_rate_270,unlocked_rate_360,unlocked_rate_450,unlocked_rate_540,unlocked_rate_630,unlocked_rate_720,disabled_rate_30,disabled_rate_60,disabled_rate_90,disabled_rate_180,disabled_rate_270,disabled_rate_360,disabled_rate_450,disabled_rate_540,disabled_rate_630,disabled_rate_720,frr_3_years,actual_fr,total_follow_on_revenue_usd,reg_month,country,product_group,area,primary_product
0,2016-01_Kenya_Lanterns_Kakamega_Sun King Pro E...,57,403.243821,77,0.35218,0.626566,0.785424,0.888827,0.924767,0.935544,0.945594,0.963138,0.964892,0.971333,1.0718,0.8275,0.7854,0.8888,0.9247,0.9355,0.9455,0.9631,0.9648,0.9713,1.0,5.0,11.0,25.0,33.0,42.0,43.0,51.0,57.0,69.0,0.181818,0.08,0.148148,0.125,0.222222,0.25,0.333333,0.166667,0.2,0.333333,0.017544,0.122807,0.175439,0.157895,0.157895,0.105263,0.105263,0.087719,0.087719,0.052632,0.035088,0.122807,0.526316,0.719298,0.842105,0.859649,0.894737,0.894737,0.912281,0.947368,0.263158,0.315789,0.368421,0.22807,0.157895,0.122807,0.105263,0.105263,0.087719,0.052632,0.977148,1386.031419,1418.445741,2016-01,Kenya,Lanterns,Kakamega,Sun King Pro EasyBuy
1,2016-02_Kenya_Lanterns_Bungoma_Sun King Pro Ea...,18,127.340154,77,0.425714,0.686587,0.807381,0.849048,0.856984,0.856984,0.856984,0.856984,0.892698,0.90381,1.2956,0.9068,0.8073,0.849,0.8569,0.8569,0.8569,0.8569,0.8926,0.9038,0.0,3.0,8.0,14.0,14.0,14.0,14.0,14.0,102.0,105.0,0.055556,0.0625,0.090909,0.166667,0.166667,0.166667,0.166667,0.166667,0.25,1.0,0.0,0.111111,0.277778,0.333333,0.333333,0.333333,0.333333,0.333333,0.166667,0.055556,0.0,0.111111,0.388889,0.666667,0.666667,0.666667,0.666667,0.666667,0.777778,0.833333,0.277778,0.388889,0.444444,0.333333,0.333333,0.333333,0.333333,0.333333,0.222222,0.055556,0.90381,404.843825,447.930234,2016-02,Kenya,Lanterns,Bungoma,Sun King Pro EasyBuy
2,2016-02_Kenya_Lanterns_Kakamega_Sun King Pro E...,287,2030.368011,77,0.365668,0.618819,0.763085,0.867551,0.897489,0.917509,0.924428,0.933954,0.938175,0.951517,1.1129,0.8173,0.763,0.8675,0.8974,0.9175,0.9244,0.9339,0.9381,0.9515,1.0,7.0,13.0,32.0,43.0,55.0,63.0,72.0,80.0,90.0,0.156364,0.113725,0.130435,0.166667,0.196721,0.22449,0.244444,0.263158,0.272727,0.2,0.045296,0.087108,0.198606,0.205575,0.163763,0.139373,0.132404,0.114983,0.101045,0.059233,0.041812,0.111498,0.439024,0.707317,0.787456,0.829268,0.843206,0.867596,0.878049,0.909408,0.229965,0.341463,0.38676,0.268293,0.188153,0.163763,0.15331,0.12892,0.108014,0.062718,0.958262,6843.905988,7141.998731,2016-02,Kenya,Lanterns,Kakamega,Sun King Pro EasyBuy
3,2016-03_Kenya_Lanterns_Bungoma_Sun King Pro Ea...,114,806.487642,77,0.341451,0.566825,0.706045,0.812436,0.847875,0.866647,0.876922,0.888125,0.903764,0.90998,1.0391,0.7486,0.706,0.8124,0.8478,0.8666,0.8769,0.8881,0.9037,0.9099,1.0,5.0,11.0,29.0,44.0,54.0,63.0,80.0,93.0,101.0,0.157407,0.153846,0.176471,0.243902,0.28125,0.346154,0.32,0.217391,0.2,0.181818,0.035088,0.263158,0.280702,0.254386,0.219298,0.210526,0.201754,0.192982,0.087719,0.096491,0.052632,0.087719,0.403509,0.640351,0.719298,0.77193,0.780702,0.798246,0.842105,0.850877,0.307018,0.482456,0.438596,0.307018,0.263158,0.219298,0.210526,0.201754,0.131579,0.096491,0.92503,2624.209728,2836.891482,2016-03,Kenya,Lanterns,Bungoma,Sun King Pro EasyBuy
4,2016-03_Kenya_Lanterns_Kakamega_Sun King Pro E...,142,1004.572326,77,0.358523,0.615414,0.759101,0.854278,0.878097,0.892322,0.905249,0.915813,0.925324,0.932185,1.0911,0.8128,0.7591,0.8542,0.878,0.8923,0.9052,0.9158,0.9253,0.9321,1.0,6.0,12.0,27.0,40.0,52.0,68.0,73.0,88.0,108.0,0.181159,0.141732,0.168831,0.225,0.147059,0.137931,0.153846,0.130435,0.157895,0.176471,0.035211,0.112676,0.211268,0.197183,0.183099,0.169014,0.133803,0.140845,0.098592,0.098592,0.028169,0.105634,0.457746,0.71831,0.753521,0.788732,0.809859,0.830986,0.852113,0.866197,0.225352,0.359155,0.380282,0.260563,0.225352,0.197183,0.169014,0.161972,0.133803,0.112676,0.945294,3340.358794,3533.671846,2016-03,Kenya,Lanterns,Kakamega,Sun King Pro EasyBuy


In [51]:
print(df_KPI.reg_month.max())
print(df_KPI.reg_month.min())

2022-10
2016-01


In [52]:
df_features = df_KPI.copy()
# Backtesting-wise spliting features data
inputs = [30, 60, 90, 180, 270, 360, 450, 540, 630, 720]
tasks = [delayed(backtesting_split)(df_features, i) for i in inputs]
results = dask.compute(*tasks) # Compute the tasks in parallel
df_features = pd.concat(results, ignore_index=True) # Concatenate all DataFrames

In [53]:
df_features.columns

Index(['accounts_group', 'count_units', 'upfront_price_usd',
       'avg_planned_repayment_days', 'frr_3_years', 'actual_fr',
       'total_follow_on_revenue_usd', 'reg_month', 'country', 'product_group',
       'area', 'primary_product', 'backtesting', 'frr', 'repayment_speed',
       'avg_cum_days_disabled', 'at_risk_rate', 'disabled_gt_two_week_rate',
       'unlocked_rate', 'disabled_rate'],
      dtype='object')

In [54]:
df_features.shape

(855340, 20)

In [55]:
df_KPIs_final = feature_engineering(df_features)

columns to remove:  ['upfront_price_usd', 'unlock_price_usd']
Sample from data------------
                                          accounts_group  count_units  \
43425  2021-07_Tanzania_SHS with TV_Tanga_Sun King Ho...            8   

       avg_planned_repayment_days  frr_3_years    actual_fr  \
43425                         555      0.79086  5389.309856   

       total_follow_on_revenue_usd reg_month   country product_group   area  \
43425                  6814.492901   2021-07  Tanzania   SHS with TV  Tanga   

                     primary_product  backtesting       frr  repayment_speed  \
43425  Sun King Home 400 Easybuy GSM           30  0.045407            1.083   

       avg_cum_days_disabled  at_risk_rate  disabled_gt_two_week_rate  \
43425                    2.0           0.0                        0.0   

       unlocked_rate  disabled_rate  avg_upfront_price_usd  \
43425            0.0            0.0                   51.0   

       avg_unlock_price_usd  
43425        

In [56]:
(df_KPIs_final[["accounts_group", "frr_3_years"]]
 .sort_values(by="accounts_group")
 .drop_duplicates()
 .reset_index(drop=True)
 .to_csv("../data/processed/truths_1080.csv", index=False))

In [57]:
df_train, df_test, df_oot_validation = create_train_test_validation_split(df_KPIs_final)

Cut off month for training data:  2022-02
Num of rows in train subset:  509278
Num of rows in test subset:  89872
Num of rows in validation subset:  201510
Last registration month in training:  2022-02
Last registration month in validation:  2022-08


In [58]:
df_oot_validation['reg_month'].unique()

array(['2022-05', '2022-06', '2022-04', '2022-08', '2022-07', '2022-03'],
      dtype=object)

In [59]:
df_oot_validation.columns

Index(['accounts_group', 'count_units', 'avg_planned_repayment_days',
       'frr_3_years', 'actual_fr', 'total_follow_on_revenue_usd', 'reg_month',
       'country', 'product_group', 'area', 'primary_product', 'backtesting',
       'frr', 'repayment_speed', 'avg_cum_days_disabled', 'at_risk_rate',
       'disabled_gt_two_week_rate', 'unlocked_rate', 'disabled_rate',
       'avg_upfront_price_usd', 'avg_unlock_price_usd'],
      dtype='object')

In [60]:
df_oot_validation.head()

Unnamed: 0,accounts_group,count_units,avg_planned_repayment_days,frr_3_years,actual_fr,total_follow_on_revenue_usd,reg_month,country,product_group,area,primary_product,backtesting,frr,repayment_speed,avg_cum_days_disabled,at_risk_rate,disabled_gt_two_week_rate,unlocked_rate,disabled_rate,avg_upfront_price_usd,avg_unlock_price_usd
0,2022-05_Kenya_SHS Entry-Level Upgrade_Kiserian...,4,371,0.929904,582.173444,626.057576,2022-05,Kenya,SHS Entry-Level Upgrade,Kiserian,Sun King Home 120 Plus EasyBuy,30,0.066305,1.0493,1.0,0.0,0.0,0.0,0.0,5.0,162.0
1,2022-06_Kenya_Lanterns_Olkalau_Sun King Pro Ea...,4,368,0.935388,174.600736,186.661296,2022-06,Kenya,Lanterns,Olkalau,Sun King Pro EasyBuy,540,0.880822,0.8808,142.0,0.0,0.0,0.75,0.0,3.0,50.0
2,2022-04_Kenya_SHS Entry-Level_Mandera_Sun King...,131,365,0.854693,21057.981777,24638.065103,2022-04,Kenya,SHS Entry-Level,Mandera,Sun King Home 120 Plus EasyBuy,630,0.780187,0.7801,218.0,0.032787,0.183206,0.519084,0.366412,14.0,202.0
3,2022-08_Nigeria_SHS Entry-Level_Sango_Sun King...,351,203,0.994699,23294.567939,23418.710523,2022-08,Nigeria,SHS Entry-Level,Sango,Sun King Home 60 EasyBuy,30,0.170461,1.4511,0.0,0.002857,0.002849,0.002849,0.125356,9.0,76.0
4,2022-06_Kenya_Phones_Saboti_Tecno Spark 8,123,368,0.67495,18079.140655,26785.896222,2022-06,Kenya,Phones,Saboti,Tecno Spark 8,90,0.163059,0.684,16.0,0.105691,0.186992,0.0,0.463415,26.0,243.0


In [61]:
df_train.to_csv(os.path.join(data_dir,'KPIs_data_modelling_train_days.csv'), index = False)
df_test.to_csv(os.path.join(data_dir,'KPIs_data_modelling_test_days.csv'), index = False)
df_oot_validation.to_csv(os.path.join(data_dir,'KPIs_data_modelling_oot_validation_days.csv'), index = False)

In [62]:
df_features['backtesting'].value_counts()

backtesting
30     85534
60     85534
90     85534
180    85534
270    85534
360    85534
450    85534
540    85534
630    85534
720    85534
Name: count, dtype: int64

In [63]:
df_features['accounts_group'][618939]

'2020-02_Kenya_SHS Entry-Level_CBD_Sun King Home 120 Plus EasyBuy'

In [64]:
df_features[df_features['accounts_group']=='2022-03_Zambia_SHS with TV_Ndola_Sun King Home 400 Easybuy GSM']

Unnamed: 0,accounts_group,count_units,avg_planned_repayment_days,frr_3_years,actual_fr,total_follow_on_revenue_usd,reg_month,country,product_group,area,primary_product,backtesting,frr,repayment_speed,avg_cum_days_disabled,at_risk_rate,disabled_gt_two_week_rate,unlocked_rate,disabled_rate,avg_upfront_price_usd,avg_unlock_price_usd
63195,2022-03_Zambia_SHS with TV_Ndola_Sun King Home...,2,732,0.473011,1207.141136,2552.036076,2022-03,Zambia,SHS with TV,Ndola,Sun King Home 400 Easybuy GSM,30,0.045127,1.281,0.0,0.0,0.0,0.0,0.0,67.0,1343.0
148729,2022-03_Zambia_SHS with TV_Ndola_Sun King Home...,2,732,0.473011,1207.141136,2552.036076,2022-03,Zambia,SHS with TV,Ndola,Sun King Home 400 Easybuy GSM,60,0.078781,1.0383,4.0,0.0,0.0,0.0,0.0,67.0,1343.0
234263,2022-03_Zambia_SHS with TV_Ndola_Sun King Home...,2,732,0.473011,1207.141136,2552.036076,2022-03,Zambia,SHS with TV,Ndola,Sun King Home 400 Easybuy GSM,90,0.112434,0.9649,6.0,0.0,0.0,0.0,0.5,67.0,1343.0
319797,2022-03_Zambia_SHS with TV_Ndola_Sun King Home...,2,732,0.473011,1207.141136,2552.036076,2022-03,Zambia,SHS with TV,Ndola,Sun King Home 400 Easybuy GSM,180,0.218204,0.915,21.0,0.0,0.0,0.0,0.5,67.0,1343.0
405331,2022-03_Zambia_SHS with TV_Ndola_Sun King Home...,2,732,0.473011,1207.141136,2552.036076,2022-03,Zambia,SHS with TV,Ndola,Sun King Home 400 Easybuy GSM,270,0.299934,0.8322,48.0,0.0,0.0,0.0,0.5,67.0,1343.0
490865,2022-03_Zambia_SHS with TV_Ndola_Sun King Home...,2,732,0.473011,1207.141136,2552.036076,2022-03,Zambia,SHS with TV,Ndola,Sun King Home 400 Easybuy GSM,360,0.386473,0.8012,61.0,0.0,0.0,0.0,0.0,67.0,1343.0
576399,2022-03_Zambia_SHS with TV_Ndola_Sun King Home...,2,732,0.473011,1207.141136,2552.036076,2022-03,Zambia,SHS with TV,Ndola,Sun King Home 400 Easybuy GSM,450,0.458588,0.7588,80.0,0.0,0.0,0.0,0.0,67.0,1343.0
661933,2022-03_Zambia_SHS with TV_Ndola_Sun King Home...,2,732,0.473011,1207.141136,2552.036076,2022-03,Zambia,SHS with TV,Ndola,Sun King Home 400 Easybuy GSM,540,0.473011,0.6513,80.0,0.0,0.5,0.0,0.5,67.0,1343.0
747467,2022-03_Zambia_SHS with TV_Ndola_Sun King Home...,2,732,0.473011,1207.141136,2552.036076,2022-03,Zambia,SHS with TV,Ndola,Sun King Home 400 Easybuy GSM,630,0.473011,0.5576,80.0,0.0,0.5,0.0,0.5,67.0,1343.0
833001,2022-03_Zambia_SHS with TV_Ndola_Sun King Home...,2,732,0.473011,1207.141136,2552.036076,2022-03,Zambia,SHS with TV,Ndola,Sun King Home 400 Easybuy GSM,720,0.473011,0.4875,80.0,0.0,0.5,0.0,0.5,67.0,1343.0


In [65]:
df_KPI[df_KPI['accounts_group']=='2022-03_Zambia_SHS with TV_Ndola_Sun King Home 400 Easybuy GSM']

Unnamed: 0,accounts_group,count_units,upfront_price_usd,avg_planned_repayment_days,frr_30,frr_60,frr_90,frr_180,frr_270,frr_360,frr_450,frr_540,frr_630,frr_720,repayment_speed_30,repayment_speed_60,repayment_speed_90,repayment_speed_180,repayment_speed_270,repayment_speed_360,repayment_speed_450,repayment_speed_540,repayment_speed_630,repayment_speed_720,avg_cum_days_disabled_30,avg_cum_days_disabled_60,avg_cum_days_disabled_90,avg_cum_days_disabled_180,avg_cum_days_disabled_270,avg_cum_days_disabled_360,avg_cum_days_disabled_450,avg_cum_days_disabled_540,avg_cum_days_disabled_630,avg_cum_days_disabled_720,at_risk_rate_30,at_risk_rate_60,at_risk_rate_90,at_risk_rate_180,at_risk_rate_270,at_risk_rate_360,at_risk_rate_450,at_risk_rate_540,at_risk_rate_630,at_risk_rate_720,disabled_gt_two_week_rate_30,disabled_gt_two_week_rate_60,disabled_gt_two_week_rate_90,disabled_gt_two_week_rate_180,disabled_gt_two_week_rate_270,disabled_gt_two_week_rate_360,disabled_gt_two_week_rate_450,disabled_gt_two_week_rate_540,disabled_gt_two_week_rate_630,disabled_gt_two_week_rate_720,unlocked_rate_30,unlocked_rate_60,unlocked_rate_90,unlocked_rate_180,unlocked_rate_270,unlocked_rate_360,unlocked_rate_450,unlocked_rate_540,unlocked_rate_630,unlocked_rate_720,disabled_rate_30,disabled_rate_60,disabled_rate_90,disabled_rate_180,disabled_rate_270,disabled_rate_360,disabled_rate_450,disabled_rate_540,disabled_rate_630,disabled_rate_720,frr_3_years,actual_fr,total_follow_on_revenue_usd,reg_month,country,product_group,area,primary_product
63195,2022-03_Zambia_SHS with TV_Ndola_Sun King Home...,2,133.848044,732,0.045127,0.078781,0.112434,0.218204,0.299934,0.386473,0.458588,0.473011,0.473011,0.473011,1.281,1.0383,0.9649,0.915,0.8322,0.8012,0.7588,0.6513,0.5576,0.4875,0.0,4.0,6.0,21.0,48.0,61.0,80.0,80.0,80.0,80.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.5,0.5,0.5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.5,0.5,0.5,0.0,0.0,0.5,0.5,0.5,0.473011,1207.141136,2552.036076,2022-03,Zambia,SHS with TV,Ndola,Sun King Home 400 Easybuy GSM
