In [1]:
import pandas as pd
import argparse
from datetime import datetime, date, timedelta
from dateutil.relativedelta import relativedelta
import numpy as np
import boto3
import gc
import datetime as dt
import io
from io import StringIO
import os
pd.set_option('display.max_columns',100)

In [2]:
curr_dir = os.path.abspath(os.getcwd())
data_dir = os.path.join(curr_dir,'Data')
os.makedirs(data_dir, exist_ok=True)

In [3]:
def get_training_cut_off_month(df):
    
    last_day_of_prev_month = date.today().replace(day=1) - timedelta(days=2)
    cut_off_date_3_years = last_day_of_prev_month - timedelta(days=365*3)
    
    training_cut_off_date = cut_off_date_3_years - relativedelta(months=6)
    
    training_cut_off_date = str(training_cut_off_date).split('-')
    training_cut_off_month = training_cut_off_date[:2]
    
    training_cut_off_month = '-'.join(training_cut_off_month)
    
    print('Cut off month for training data: ',training_cut_off_month)

    return training_cut_off_month, cut_off_date_3_years


def get_cohort_age_in_months(row):
    
    reg_year = int(row['reg_month'].split('-')[0])
    reg_month = int(row['reg_month'].split('-')[1])
    first_dt_of_mon = date(reg_year, reg_month, 1)
    next_month = first_dt_of_mon.replace(day=28) + timedelta(days=4)
    last_dt_of_mon = next_month - timedelta(days=next_month.day)
    # Last day of previous month for reference
    last_day_of_prev_month = date.today().replace(day=1) - timedelta(days=1)
    # Get the relativedelta between two dates
    delta = relativedelta(last_day_of_prev_month, last_dt_of_mon)
    # get months difference
    res_months = delta.months + (delta.years * 12)
    
    return res_months

def preprocessing_common(df_KPI):

    #Removing account_group duplicates 
    df_KPI.drop_duplicates(subset=['accounts_group'], keep='last', ignore_index=True)
    
    # Get cohort details
    cohort_details = df_KPI["accounts_group"].str.split("_", n = 5, expand = True)
    df_KPI["reg_month"] = cohort_details[0]
    df_KPI["country"] = cohort_details[1]
    df_KPI["product_group"] = cohort_details[2]
    df_KPI["area"] = cohort_details[3]
    df_KPI["primary_product"] = cohort_details[4]
    
    #Get age_in_months of cohorts
    df_KPI['age_in_months'] = df_KPI.apply(lambda row: get_cohort_age_in_months(row), 
                                               axis=1)
    
    df_KPI['avg_planned_repayment_days'] = df_KPI['avg_planned_repayment_days'].fillna(-1)
    df_KPI['avg_planned_repayment_days'] = df_KPI['avg_planned_repayment_days'].astype(int)
    df_KPI.loc[df_KPI['frr_3_years']>1, 'frr_3_years'] = 1
    print('Shape of cohorts having frr_3_years <=0 is {}'.format(df_KPI[df_KPI['frr_3_years']<=0].shape[0]))
    df_KPI = df_KPI.loc[df_KPI['frr_3_years']>0]
    
    # Removing records having null FRR at 30 days
    print('Shape of cohorts having frr_30 null is {}'.format(df_KPI[df_KPI['frr_30'].isna()==True].shape[0]))
    df_KPI = df_KPI.loc[df_KPI['frr_30'].notnull()].copy()
    df_KPI.reset_index(drop=True, inplace=True)

    for limit in [30,60,90,180,270,360,450,540,630,720]:
        df_KPI.loc[df_KPI['frr_'+str(limit)+'']>1, 'frr_'+str(limit)+''] = 1
        df_KPI.loc[(df_KPI['at_risk_rate_'+str(limit)+''].isnull()) &(df_KPI['frr_'+str(limit)+'']==1),'at_risk_rate_'+str(limit)+''] = 0
    
    df_KPI.reset_index(drop=True, inplace=True)
    
    return df_KPI

def preprocessing_backtesting(df_KPI, limit):
    
    #dict_backtesting_age_in_months = { 30 : 1, 60 : 2, 90 : 3, 180 : 6, 270: 9 , 360 : 12, 450: 15, 540: 18, 630: 21, 720: 24}
    backtesting_limits = [30, 60, 90, 180, 270, 360]
    limit_list = [item for item in backtesting_limits if item <= limit]
    
    #df_KPI_backtesting = df_KPI.loc[df_KPI['age_in_months']>=dict_backtesting_age_in_months[limit]]
    #Taking columns according to limit
    
    if limit==360:
        
        return df_KPI
    
    else:
        cols = ['accounts_group', 'count_units', 'upfront_price_usd','avg_planned_repayment_days','frr_3_years', 'total_follow_on_revenue_usd','reg_month','country','product_group','area',
                                                            'primary_product','age_in_months']
        col_temp = []
        for l in limit_list:

            col_list = [col for col in df_KPI.columns if ('_'+str(l) in col)]
            col_temp.extend(col_list)

        cols.extend(col_temp)
        
        df_KPI = df_KPI[cols]

        return df_KPI

def compare_KPIs_accounts(df_KPI, df_accounts):
    
    ratio_kpis_accounts_cohorts = np.round(df_KPI.shape[0]/df_accounts.shape[0],4)
    ratio_kpis_accounts_units = np.round(df_KPI['count_units'].sum()/df_accounts['num_accounts'].sum(),4)
    
    print('Percent of total cohorts present in training data: ',ratio_kpis_accounts_cohorts)
    print('Percent of total accounts present in training data: ',ratio_kpis_accounts_units)
    
    if ratio_kpis_accounts_cohorts >= 0.98 and ratio_kpis_accounts_units >= 0.99:
        return True
    else:
        return False


def split_cohorts_by_age(df_KPI):
    
    # Define Unit Age Days Limits
    list_unit_age_days_limit = [360, 450, 540, 630, 720]
    
    # Reshuffle the data
    df_KPI = df_KPI.sample(frac = 1)
    
    subset_size = {}
    subset_size[360] = int(0.1 * df_KPI.shape[0])
    subset_size[450] = int(0.1 * df_KPI.shape[0])
    subset_size[540] = int(0.1 * df_KPI.shape[0])
    subset_size[630] = int(0.1 * df_KPI.shape[0])
    subset_size[720] = int(0.6 * df_KPI.shape[0])

    # Creating subsets by Cohorts
    df_accounts_comb = pd.DataFrame()

    subset_start =0
    for unit_age_days_limit in list_unit_age_days_limit:

        subset_end = subset_start + subset_size[unit_age_days_limit]
        print('unit age days limit: ',unit_age_days_limit)
        print('subset start: {0}. subset end: {1}'.format(subset_start,subset_end ))
        print('Number of accounts in subset: ',df_KPI.iloc[subset_start:subset_end].shape[0])

        df_subset = df_KPI.iloc[subset_start:subset_end]
        df_subset['cohort_age'] = unit_age_days_limit

        if df_accounts_comb.shape[0] > 0:
            df_accounts_comb = pd.concat([df_accounts_comb, df_subset])
        else:
            df_accounts_comb = df_subset

        subset_start = subset_end
    
    # Removing the insignificant columns based on Cohort age
    for unit_age_days_limit in list_unit_age_days_limit:
        
        unit_age_limits_to_null = [limit for limit in list_unit_age_days_limit if limit > unit_age_days_limit]
        unit_age_limits_to_null = list(map(str, unit_age_limits_to_null))
    
        # Identify the columns to nullify
        cols_to_null = [col for col in df_accounts_comb.columns if any(limit in col for limit in unit_age_limits_to_null)]
    
        # Nullify the identified columns
        df_accounts_comb.loc[df_accounts_comb['cohort_age']==unit_age_days_limit, cols_to_null] = np.nan
        
    return df_accounts_comb

def create_train_test_validation_split(df_KPI):
    
    # Random shuffling
    df_KPI = df_KPI.sample(frac = 1)
    
    training_cut_off_date, cut_off_date_3_years = get_training_cut_off_month(df_KPI)
    training_cut_off_date = str(training_cut_off_date)
    cut_off_date_3_years = str(cut_off_date_3_years)
    df_train_test = df_KPI.loc[df_KPI['reg_month']<=training_cut_off_date]
    df_oot_validation = df_KPI.loc[(df_KPI['reg_month']>training_cut_off_date) & (df_KPI['reg_month']<=cut_off_date_3_years)]
    
    # Create train set using 85% rows randomly
    df_train = df_train_test.sample(frac = 0.85, random_state=100)

    # Create test set using remaining 15% rows
    df_test = df_train_test.drop(df_train.index)
    
    print('Num of rows in train subset: ',df_train.shape[0])
    print('Num of rows in test subset: ',df_test.shape[0])
    print('Num of rows in validation subset: ',df_oot_validation.shape[0])
    
    print('Last registration month in training: ',list(df_train.sort_values(by='reg_month', ascending=False)['reg_month'])[0])
    print('Last registration month in validation: ',list(df_oot_validation.sort_values(by='reg_month', ascending=False)['reg_month'])[0])
    
    return df_train, df_test, df_oot_validation

## Main flow

In [4]:
df_KPI = pd.read_csv(os.path.join(data_dir,'KPIs_data_modelling_2025-02-19.csv'))
df_accounts = pd.read_csv(os.path.join(data_dir,'accounts_data_modelling.csv'))

In [5]:
# df_KPI = merge_KPIs_target(df_merged_KPI_target)
print('Shape of the combined dataset: ',df_KPI.shape)

Shape of the combined dataset:  (62048, 77)


In [6]:
df_KPI

Unnamed: 0,accounts_group,count_units,upfront_price_usd,avg_planned_repayment_days,frr_30,frr_60,frr_90,frr_180,frr_270,frr_360,frr_450,frr_540,frr_630,frr_720,repayment_speed_30,repayment_speed_60,repayment_speed_90,repayment_speed_180,repayment_speed_270,repayment_speed_360,repayment_speed_450,repayment_speed_540,repayment_speed_630,repayment_speed_720,avg_cum_days_disabled_30,avg_cum_days_disabled_60,avg_cum_days_disabled_90,avg_cum_days_disabled_180,avg_cum_days_disabled_270,avg_cum_days_disabled_360,avg_cum_days_disabled_450,avg_cum_days_disabled_540,avg_cum_days_disabled_630,avg_cum_days_disabled_720,at_risk_rate_30,at_risk_rate_60,at_risk_rate_90,at_risk_rate_180,at_risk_rate_270,at_risk_rate_360,at_risk_rate_450,at_risk_rate_540,at_risk_rate_630,at_risk_rate_720,disabled_gt_two_week_rate_30,disabled_gt_two_week_rate_60,disabled_gt_two_week_rate_90,disabled_gt_two_week_rate_180,disabled_gt_two_week_rate_270,disabled_gt_two_week_rate_360,disabled_gt_two_week_rate_450,disabled_gt_two_week_rate_540,disabled_gt_two_week_rate_630,disabled_gt_two_week_rate_720,unlocked_rate_30,unlocked_rate_60,unlocked_rate_90,unlocked_rate_180,unlocked_rate_270,unlocked_rate_360,unlocked_rate_450,unlocked_rate_540,unlocked_rate_630,unlocked_rate_720,disabled_rate_30,disabled_rate_60,disabled_rate_90,disabled_rate_180,disabled_rate_270,disabled_rate_360,disabled_rate_450,disabled_rate_540,disabled_rate_630,disabled_rate_720,frr_3_years,actual_fr,total_follow_on_revenue_usd
0,2016-01_Kenya_Lanterns_Kakamega_Sun King Pro E...,57,403.243821,77.0,0.352180,0.626566,0.785424,0.888827,0.924767,0.935544,0.945594,0.963138,0.964892,0.971333,1.0718,0.8275,0.7854,0.8888,0.9247,0.9355,0.9455,0.9631,0.9648,0.9713,1.0,5.0,11.0,25.0,33.0,42.0,43.0,51.0,57.0,69.0,0.181818,0.080000,0.148148,0.125000,0.222222,0.250000,0.333333,0.166667,0.200000,0.333333,0.017544,0.122807,0.175439,0.157895,0.157895,0.105263,0.105263,0.087719,0.087719,0.052632,0.035088,0.122807,0.526316,0.719298,0.842105,0.859649,0.894737,0.894737,0.912281,0.947368,0.263158,0.315789,0.368421,0.228070,0.157895,0.122807,0.105263,0.105263,0.087719,0.052632,0.977148,1386.031419,1418.445741
1,2016-02_Kenya_Lanterns_Bungoma_Sun King Pro Ea...,18,127.340154,77.0,0.425714,0.686587,0.807381,0.849048,0.856984,0.856984,0.856984,0.856984,0.892698,0.903810,1.2956,0.9068,0.8073,0.8490,0.8569,0.8569,0.8569,0.8569,0.8926,0.9038,0.0,3.0,8.0,14.0,14.0,14.0,14.0,14.0,102.0,105.0,0.055556,0.062500,0.090909,0.166667,0.166667,0.166667,0.166667,0.166667,0.250000,1.000000,0.000000,0.111111,0.277778,0.333333,0.333333,0.333333,0.333333,0.333333,0.166667,0.055556,0.000000,0.111111,0.388889,0.666667,0.666667,0.666667,0.666667,0.666667,0.777778,0.833333,0.277778,0.388889,0.444444,0.333333,0.333333,0.333333,0.333333,0.333333,0.222222,0.055556,0.903810,404.843825,447.930234
2,2016-02_Kenya_Lanterns_Kakamega_Sun King Pro E...,287,2030.368011,77.0,0.365668,0.618819,0.763085,0.867551,0.897489,0.917509,0.924428,0.933954,0.938175,0.951517,1.1129,0.8173,0.7630,0.8675,0.8974,0.9175,0.9244,0.9339,0.9381,0.9515,1.0,7.0,13.0,32.0,43.0,55.0,63.0,72.0,80.0,90.0,0.156364,0.113725,0.130435,0.166667,0.196721,0.224490,0.244444,0.263158,0.272727,0.200000,0.045296,0.087108,0.198606,0.205575,0.163763,0.139373,0.132404,0.114983,0.101045,0.059233,0.041812,0.111498,0.439024,0.707317,0.787456,0.829268,0.843206,0.867596,0.878049,0.909408,0.229965,0.341463,0.386760,0.268293,0.188153,0.163763,0.153310,0.128920,0.108014,0.062718,0.958262,6843.905988,7141.998731
3,2016-03_Kenya_Lanterns_Bungoma_Sun King Pro Ea...,114,806.487642,77.0,0.341451,0.566825,0.706045,0.812436,0.847875,0.866647,0.876922,0.888125,0.903764,0.909980,1.0391,0.7486,0.7060,0.8124,0.8478,0.8666,0.8769,0.8881,0.9037,0.9099,1.0,5.0,11.0,29.0,44.0,54.0,63.0,80.0,93.0,101.0,0.157407,0.153846,0.176471,0.243902,0.281250,0.346154,0.320000,0.217391,0.200000,0.181818,0.035088,0.263158,0.280702,0.254386,0.219298,0.210526,0.201754,0.192982,0.087719,0.096491,0.052632,0.087719,0.403509,0.640351,0.719298,0.771930,0.780702,0.798246,0.842105,0.850877,0.307018,0.482456,0.438596,0.307018,0.263158,0.219298,0.210526,0.201754,0.131579,0.096491,0.925030,2624.209728,2836.891482
4,2016-03_Kenya_Lanterns_Kakamega_Sun King Pro E...,142,1004.572326,77.0,0.358523,0.615414,0.759101,0.854278,0.878097,0.892322,0.905249,0.915813,0.925324,0.932185,1.0911,0.8128,0.7591,0.8542,0.8780,0.8923,0.9052,0.9158,0.9253,0.9321,1.0,6.0,12.0,27.0,40.0,52.0,68.0,73.0,88.0,108.0,0.181159,0.141732,0.168831,0.225000,0.147059,0.137931,0.153846,0.130435,0.157895,0.176471,0.035211,0.112676,0.211268,0.197183,0.183099,0.169014,0.133803,0.140845,0.098592,0.098592,0.028169,0.105634,0.457746,0.718310,0.753521,0.788732,0.809859,0.830986,0.852113,0.866197,0.225352,0.359155,0.380282,0.260563,0.225352,0.197183,0.169014,0.161972,0.133803,0.112676,0.945294,3340.358794,3533.671846
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
62043,2022-03_Zambia_SHS Entry-Level_Kitwe_Sun King ...,20,267.696080,382.0,0.067593,0.117694,0.169428,0.304983,0.432340,0.557559,0.650152,0.693687,0.714074,0.718636,0.9651,0.7802,0.7313,0.6433,0.6034,0.5814,0.6501,0.6936,0.7140,0.7186,0.0,5.0,13.0,41.0,61.0,81.0,98.0,127.0,136.0,137.0,0.200000,0.100000,0.050000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.100000,0.150000,0.100000,0.150000,0.150000,0.150000,0.150000,0.150000,0.200000,0.200000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.300000,0.400000,0.500000,0.550000,0.250000,0.400000,0.300000,0.300000,0.350000,0.300000,0.250000,0.250000,0.250000,0.200000,0.754581,2374.743498,3147.102164
62044,2022-03_Zambia_SHS Entry-Level_Ndola_Sun King ...,13,217.503065,382.0,0.072299,0.138023,0.199496,0.359237,0.501425,0.636675,0.734528,0.793897,0.813007,0.821554,1.0844,0.9612,0.9046,0.7960,0.7351,0.6974,0.7345,0.7938,0.8130,0.8215,1.0,4.0,13.0,31.0,55.0,78.0,101.0,134.0,142.0,144.0,0.076923,0.076923,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.076923,0.076923,0.000000,0.076923,0.153846,0.307692,0.384615,0.153846,0.307692,0.307692,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.307692,0.538462,0.615385,0.615385,0.153846,0.076923,0.230769,0.307692,0.384615,0.384615,0.461538,0.307692,0.307692,0.384615,0.901852,2295.020318,2544.785971
62045,2022-03_Zambia_SHS Entry-Level_Ndola_Sun King ...,24,321.235296,382.0,0.076291,0.132562,0.192270,0.360171,0.491975,0.643764,0.731657,0.780135,0.806439,0.828255,1.0893,0.8788,0.8300,0.7597,0.6866,0.6713,0.7316,0.7801,0.8064,0.8282,1.0,6.0,15.0,34.0,54.0,72.0,95.0,110.0,125.0,133.0,0.083333,0.041667,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.125000,0.125000,0.125000,0.166667,0.208333,0.208333,0.208333,0.208333,0.250000,0.000000,0.000000,0.000000,0.000000,0.000000,0.083333,0.416667,0.500000,0.541667,0.625000,0.166667,0.250000,0.291667,0.375000,0.291667,0.333333,0.291667,0.333333,0.333333,0.291667,0.873064,3036.844930,3478.376076
62046,2022-03_Zambia_SHS with TV_Kitwe_Sun King Home...,5,334.620110,732.0,0.040647,0.069493,0.096416,0.188724,0.275262,0.347080,0.458531,0.512421,0.560498,0.608837,1.1538,0.9158,0.8274,0.7914,0.7637,0.7195,0.7588,0.7055,0.6608,0.6275,1.0,5.0,11.0,29.0,52.0,86.0,118.0,151.0,161.0,238.0,0.200000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.200000,0.000000,0.000000,0.200000,0.000000,0.000000,0.200000,0.400000,0.200000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.200000,0.200000,0.200000,0.600000,0.400000,0.600000,0.600000,0.400000,0.800000,0.600000,0.775350,4946.802929,6380.090190


In [None]:
df_KPI = preprocessing_common(df_KPI)

Shape of cohorts having frr_3_years <=0 is 154
Shape of cohorts having frr_30 null is 0


In [8]:
df_KPI

Unnamed: 0,accounts_group,count_units,upfront_price_usd,avg_planned_repayment_days,frr_30,frr_60,frr_90,frr_180,frr_270,frr_360,frr_450,frr_540,frr_630,frr_720,repayment_speed_30,repayment_speed_60,repayment_speed_90,repayment_speed_180,repayment_speed_270,repayment_speed_360,repayment_speed_450,repayment_speed_540,repayment_speed_630,repayment_speed_720,avg_cum_days_disabled_30,avg_cum_days_disabled_60,avg_cum_days_disabled_90,avg_cum_days_disabled_180,avg_cum_days_disabled_270,avg_cum_days_disabled_360,avg_cum_days_disabled_450,avg_cum_days_disabled_540,avg_cum_days_disabled_630,avg_cum_days_disabled_720,at_risk_rate_30,at_risk_rate_60,at_risk_rate_90,at_risk_rate_180,at_risk_rate_270,at_risk_rate_360,at_risk_rate_450,at_risk_rate_540,at_risk_rate_630,at_risk_rate_720,disabled_gt_two_week_rate_30,disabled_gt_two_week_rate_60,disabled_gt_two_week_rate_90,disabled_gt_two_week_rate_180,disabled_gt_two_week_rate_270,disabled_gt_two_week_rate_360,disabled_gt_two_week_rate_450,disabled_gt_two_week_rate_540,disabled_gt_two_week_rate_630,disabled_gt_two_week_rate_720,unlocked_rate_30,unlocked_rate_60,unlocked_rate_90,unlocked_rate_180,unlocked_rate_270,unlocked_rate_360,unlocked_rate_450,unlocked_rate_540,unlocked_rate_630,unlocked_rate_720,disabled_rate_30,disabled_rate_60,disabled_rate_90,disabled_rate_180,disabled_rate_270,disabled_rate_360,disabled_rate_450,disabled_rate_540,disabled_rate_630,disabled_rate_720,frr_3_years,actual_fr,total_follow_on_revenue_usd,reg_month,country,product_group,area,primary_product,age_in_months
0,2016-01_Kenya_Lanterns_Kakamega_Sun King Pro E...,57,403.243821,77,0.352180,0.626566,0.785424,0.888827,0.924767,0.935544,0.945594,0.963138,0.964892,0.971333,1.0718,0.8275,0.7854,0.8888,0.9247,0.9355,0.9455,0.9631,0.9648,0.9713,1.0,5.0,11.0,25.0,33.0,42.0,43.0,51.0,57.0,69.0,0.181818,0.080000,0.148148,0.125000,0.222222,0.250000,0.333333,0.166667,0.200000,0.333333,0.017544,0.122807,0.175439,0.157895,0.157895,0.105263,0.105263,0.087719,0.087719,0.052632,0.035088,0.122807,0.526316,0.719298,0.842105,0.859649,0.894737,0.894737,0.912281,0.947368,0.263158,0.315789,0.368421,0.228070,0.157895,0.122807,0.105263,0.105263,0.087719,0.052632,0.977148,1386.031419,1418.445741,2016-01,Kenya,Lanterns,Kakamega,Sun King Pro EasyBuy,108
1,2016-02_Kenya_Lanterns_Bungoma_Sun King Pro Ea...,18,127.340154,77,0.425714,0.686587,0.807381,0.849048,0.856984,0.856984,0.856984,0.856984,0.892698,0.903810,1.2956,0.9068,0.8073,0.8490,0.8569,0.8569,0.8569,0.8569,0.8926,0.9038,0.0,3.0,8.0,14.0,14.0,14.0,14.0,14.0,102.0,105.0,0.055556,0.062500,0.090909,0.166667,0.166667,0.166667,0.166667,0.166667,0.250000,1.000000,0.000000,0.111111,0.277778,0.333333,0.333333,0.333333,0.333333,0.333333,0.166667,0.055556,0.000000,0.111111,0.388889,0.666667,0.666667,0.666667,0.666667,0.666667,0.777778,0.833333,0.277778,0.388889,0.444444,0.333333,0.333333,0.333333,0.333333,0.333333,0.222222,0.055556,0.903810,404.843825,447.930234,2016-02,Kenya,Lanterns,Bungoma,Sun King Pro EasyBuy,107
2,2016-02_Kenya_Lanterns_Kakamega_Sun King Pro E...,287,2030.368011,77,0.365668,0.618819,0.763085,0.867551,0.897489,0.917509,0.924428,0.933954,0.938175,0.951517,1.1129,0.8173,0.7630,0.8675,0.8974,0.9175,0.9244,0.9339,0.9381,0.9515,1.0,7.0,13.0,32.0,43.0,55.0,63.0,72.0,80.0,90.0,0.156364,0.113725,0.130435,0.166667,0.196721,0.224490,0.244444,0.263158,0.272727,0.200000,0.045296,0.087108,0.198606,0.205575,0.163763,0.139373,0.132404,0.114983,0.101045,0.059233,0.041812,0.111498,0.439024,0.707317,0.787456,0.829268,0.843206,0.867596,0.878049,0.909408,0.229965,0.341463,0.386760,0.268293,0.188153,0.163763,0.153310,0.128920,0.108014,0.062718,0.958262,6843.905988,7141.998731,2016-02,Kenya,Lanterns,Kakamega,Sun King Pro EasyBuy,107
3,2016-03_Kenya_Lanterns_Bungoma_Sun King Pro Ea...,114,806.487642,77,0.341451,0.566825,0.706045,0.812436,0.847875,0.866647,0.876922,0.888125,0.903764,0.909980,1.0391,0.7486,0.7060,0.8124,0.8478,0.8666,0.8769,0.8881,0.9037,0.9099,1.0,5.0,11.0,29.0,44.0,54.0,63.0,80.0,93.0,101.0,0.157407,0.153846,0.176471,0.243902,0.281250,0.346154,0.320000,0.217391,0.200000,0.181818,0.035088,0.263158,0.280702,0.254386,0.219298,0.210526,0.201754,0.192982,0.087719,0.096491,0.052632,0.087719,0.403509,0.640351,0.719298,0.771930,0.780702,0.798246,0.842105,0.850877,0.307018,0.482456,0.438596,0.307018,0.263158,0.219298,0.210526,0.201754,0.131579,0.096491,0.925030,2624.209728,2836.891482,2016-03,Kenya,Lanterns,Bungoma,Sun King Pro EasyBuy,106
4,2016-03_Kenya_Lanterns_Kakamega_Sun King Pro E...,142,1004.572326,77,0.358523,0.615414,0.759101,0.854278,0.878097,0.892322,0.905249,0.915813,0.925324,0.932185,1.0911,0.8128,0.7591,0.8542,0.8780,0.8923,0.9052,0.9158,0.9253,0.9321,1.0,6.0,12.0,27.0,40.0,52.0,68.0,73.0,88.0,108.0,0.181159,0.141732,0.168831,0.225000,0.147059,0.137931,0.153846,0.130435,0.157895,0.176471,0.035211,0.112676,0.211268,0.197183,0.183099,0.169014,0.133803,0.140845,0.098592,0.098592,0.028169,0.105634,0.457746,0.718310,0.753521,0.788732,0.809859,0.830986,0.852113,0.866197,0.225352,0.359155,0.380282,0.260563,0.225352,0.197183,0.169014,0.161972,0.133803,0.112676,0.945294,3340.358794,3533.671846,2016-03,Kenya,Lanterns,Kakamega,Sun King Pro EasyBuy,106
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
61889,2022-03_Zambia_SHS Entry-Level_Kitwe_Sun King ...,20,267.696080,382,0.067593,0.117694,0.169428,0.304983,0.432340,0.557559,0.650152,0.693687,0.714074,0.718636,0.9651,0.7802,0.7313,0.6433,0.6034,0.5814,0.6501,0.6936,0.7140,0.7186,0.0,5.0,13.0,41.0,61.0,81.0,98.0,127.0,136.0,137.0,0.200000,0.100000,0.050000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.100000,0.150000,0.100000,0.150000,0.150000,0.150000,0.150000,0.150000,0.200000,0.200000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.300000,0.400000,0.500000,0.550000,0.250000,0.400000,0.300000,0.300000,0.350000,0.300000,0.250000,0.250000,0.250000,0.200000,0.754581,2374.743498,3147.102164,2022-03,Zambia,SHS Entry-Level,Kitwe,Sun King Home 60 EasyBuy,34
61890,2022-03_Zambia_SHS Entry-Level_Ndola_Sun King ...,13,217.503065,382,0.072299,0.138023,0.199496,0.359237,0.501425,0.636675,0.734528,0.793897,0.813007,0.821554,1.0844,0.9612,0.9046,0.7960,0.7351,0.6974,0.7345,0.7938,0.8130,0.8215,1.0,4.0,13.0,31.0,55.0,78.0,101.0,134.0,142.0,144.0,0.076923,0.076923,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.076923,0.076923,0.000000,0.076923,0.153846,0.307692,0.384615,0.153846,0.307692,0.307692,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.307692,0.538462,0.615385,0.615385,0.153846,0.076923,0.230769,0.307692,0.384615,0.384615,0.461538,0.307692,0.307692,0.384615,0.901852,2295.020318,2544.785971,2022-03,Zambia,SHS Entry-Level,Ndola,Sun King Home 120 EasyBuy,34
61891,2022-03_Zambia_SHS Entry-Level_Ndola_Sun King ...,24,321.235296,382,0.076291,0.132562,0.192270,0.360171,0.491975,0.643764,0.731657,0.780135,0.806439,0.828255,1.0893,0.8788,0.8300,0.7597,0.6866,0.6713,0.7316,0.7801,0.8064,0.8282,1.0,6.0,15.0,34.0,54.0,72.0,95.0,110.0,125.0,133.0,0.083333,0.041667,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.125000,0.125000,0.125000,0.166667,0.208333,0.208333,0.208333,0.208333,0.250000,0.000000,0.000000,0.000000,0.000000,0.000000,0.083333,0.416667,0.500000,0.541667,0.625000,0.166667,0.250000,0.291667,0.375000,0.291667,0.333333,0.291667,0.333333,0.333333,0.291667,0.873064,3036.844930,3478.376076,2022-03,Zambia,SHS Entry-Level,Ndola,Sun King Home 60 EasyBuy,34
61892,2022-03_Zambia_SHS with TV_Kitwe_Sun King Home...,5,334.620110,732,0.040647,0.069493,0.096416,0.188724,0.275262,0.347080,0.458531,0.512421,0.560498,0.608837,1.1538,0.9158,0.8274,0.7914,0.7637,0.7195,0.7588,0.7055,0.6608,0.6275,1.0,5.0,11.0,29.0,52.0,86.0,118.0,151.0,161.0,238.0,0.200000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.200000,0.000000,0.000000,0.200000,0.000000,0.000000,0.200000,0.400000,0.200000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.200000,0.200000,0.200000,0.600000,0.400000,0.600000,0.600000,0.400000,0.800000,0.600000,0.775350,4946.802929,6380.090190,2022-03,Zambia,SHS with TV,Kitwe,Sun King Home 400 Easybuy GSM,34


In [9]:
# Random shuffling
df_KPI = df_KPI.sample(frac = 1)
training_cut_off_month = str(get_training_cut_off_month(df_KPI))

Cut off month for training data:  2021-07


In [11]:
for btl in [30, 60, 90, 180, 270, 360]:
    
    df = preprocessing_backtesting(df_KPI, btl)
    
    if(compare_KPIs_accounts(df, df_accounts)):
        
        print('Sufficient number of cohorts and units in KPIs data. Continuing with preprocessing.')
        print("Splitting data between train, test and validation...")
        if btl == 360:
            
            df_accounts_comb = split_cohorts_by_age(df_KPI)
            df_train, df_test, df_oot_validation = create_train_test_validation_split(df_accounts_comb)
        
        else:
            
            df_train, df_test, df_oot_validation = create_train_test_validation_split(df)
        
        print("Saving preprocessed data")
        df_train.to_csv(os.path.join(data_dir,'KPIs_data_modelling_train_'+str(btl)+'_days.csv'), index = False)
        df_test.to_csv(os.path.join(data_dir,'KPIs_data_modelling_test_'+str(btl)+'_days.csv'), index = False)
        df_oot_validation.to_csv(os.path.join(data_dir,'KPIs_data_modelling_oot_validation_'+str(btl)+'_days.csv'), index = False)
        
        print("Processing completed")
    
    else: 
        raise Exception("Exception: Either number of cohorts or number of units not sufficient in KPIs data")

Percent of total cohorts present in training data:  1.0275
Percent of total accounts present in training data:  1.0095
Sufficient number of cohorts and units in KPIs data. Continuing with preprocessing.
Splitting data between train, test and validation...
Cut off month for training data:  2021-07
Num of rows in train subset:  36906
Num of rows in test subset:  6513
Num of rows in validation subset:  13847
Last registration month in training:  2021-07
Last registration month in validation:  2022-01
Saving preprocessed data
Processing completed
Percent of total cohorts present in training data:  1.0275
Percent of total accounts present in training data:  1.0095
Sufficient number of cohorts and units in KPIs data. Continuing with preprocessing.
Splitting data between train, test and validation...
Cut off month for training data:  2021-07
Num of rows in train subset:  36906
Num of rows in test subset:  6513
Num of rows in validation subset:  13847
Last registration month in training:  2021

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_subset['cohort_age'] = unit_age_days_limit
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_subset['cohort_age'] = unit_age_days_limit
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_subset['cohort_age'] = unit_age_days_limit
A value is trying to be set on a copy of a slice from a DataFrame.


Num of rows in train subset:  36904
Num of rows in test subset:  6513
Num of rows in validation subset:  13847
Last registration month in training:  2021-07
Last registration month in validation:  2022-01
Saving preprocessed data
Processing completed
