In [1]:
import pandas as pd
import numpy as np

In [2]:
# Load the raw extract and normalise every header to snake_case:
# spaces become underscores and the name is lower-cased.
df = pd.read_csv("data2.csv").rename(
    columns=lambda name: name.replace(" ", "_").lower()
)
df.head()

Unnamed: 0,date_uploaded_date,country,product_group,registration_date_month,predicted_frr_updated,count
0,2025-09-01,Kenya,SHS Entry-Level,2025-02,0.860431,6241
1,2025-09-01,Kenya,SHS Entry-Level,2022-08,0.845198,6377
2,2025-09-01,Kenya,SHS with TV Upgrade,2021-08,0.741954,1250
3,2025-09-01,Uganda,SHS Entry-Level,2022-05,0.717826,1012
4,2025-09-01,Kenya,Lanterns,2025-03,0.87641,4931


In [3]:
# List the distinct upload dates present in the extract.
df["date_uploaded_date"].unique()

array(['2025-09-01', '2025-08-01', '2025-07-01', '2025-06-01',
       '2025-05-01', '2025-04-01', '2025-03-04', '2025-02-01',
       '2025-01-02', '2024-12-01', '2024-11-01', '2024-10-01',
       '2024-09-02', '2024-08-01', '2024-07-01', '2024-06-01',
       '2024-05-02', '2024-04-09', '2024-03-01', '2024-02-05',
       '2024-01-02', '2023-12-05', '2023-11-01', '2023-10-03',
       '2023-09-05', '2023-08-25', '2023-08-02', '2023-07-31'],
      dtype=object)

In [4]:
# Reshape to one row per (country, product_group, registration month), with one
# column per upload date for each metric. pivot_table aggregates with the mean
# by default, so repeated keys are averaged.
cohort_keys = ["country", "product_group", "registration_date_month"]
pivoted = df.pivot_table(
    values=["predicted_frr_updated", "count"],
    index=cohort_keys,
    columns="date_uploaded_date",
).reset_index()

# Collapse the two header levels into "<metric>_<upload date>". Empty levels
# are skipped, so the key columns keep their plain names.
flat_names = []
for levels in pivoted.columns.values:
    parts = [str(level) for level in levels if level]
    flat_names.append("_".join(parts))
pivoted.columns = flat_names

# Registration months are parsed from "YYYY-MM" and stored as monthly Periods.
registration_months = pd.to_datetime(
    pivoted["registration_date_month"], format="%Y-%m"
)
pivoted["registration_date_month"] = registration_months.dt.to_period("M")

In [5]:
# Display the reshaped frame built in the previous cell.
pivoted

Unnamed: 0,country,product_group,registration_date_month,count_2023-07-31,count_2023-08-02,count_2023-08-25,count_2023-09-05,count_2023-10-03,count_2023-11-01,count_2023-12-05,...,predicted_frr_updated_2024-12-01,predicted_frr_updated_2025-01-02,predicted_frr_updated_2025-02-01,predicted_frr_updated_2025-03-04,predicted_frr_updated_2025-04-01,predicted_frr_updated_2025-05-01,predicted_frr_updated_2025-06-01,predicted_frr_updated_2025-07-01,predicted_frr_updated_2025-08-01,predicted_frr_updated_2025-09-01
0,Kenya,Lanterns,2021-06,3650.0,3650.0,,3650.0,3650.0,3650.0,3650.0,...,0.917024,0.917095,0.917100,0.923682,0.923682,0.923681,0.923688,0.923694,0.921999,0.922010
1,Kenya,Lanterns,2021-07,3767.0,3767.0,,3767.0,3767.0,3767.0,3767.0,...,0.917091,0.917132,0.917132,0.923794,0.923792,0.923758,0.923758,0.923757,0.922243,0.922247
2,Kenya,Lanterns,2021-08,3715.0,3715.0,,3715.0,3715.0,3715.0,3715.0,...,0.915239,0.915242,0.915242,0.920996,0.921244,0.921187,0.921184,0.921187,0.919908,0.919909
3,Kenya,Lanterns,2021-09,3563.0,3563.0,,3563.0,3563.0,3563.0,3563.0,...,0.915242,0.915316,0.915312,0.920970,0.920973,0.920939,0.920937,0.920938,0.919863,0.919856
4,Kenya,Lanterns,2021-10,3709.0,3709.0,,3709.0,3709.0,3709.0,3709.0,...,0.918335,0.918428,0.918434,0.923380,0.923379,0.923303,0.923303,0.923296,0.922101,0.922100
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1622,Zambia,SHS without TV,2025-03,,,,,,,,...,,,,,,0.976674,0.979620,0.985045,0.985019,0.987361
1623,Zambia,SHS without TV,2025-04,,,,,,,,...,,,,,,,0.978405,0.985699,0.983102,0.986009
1624,Zambia,SHS without TV,2025-05,,,,,,,,...,,,,,,,,0.986601,0.986054,0.984654
1625,Zambia,SHS without TV,2025-06,,,,,,,,...,,,,,,,,,0.971452,0.979647


## Establish Backtesting Checkpoints

In [6]:
# Backtesting horizons in days, and the same horizons as whole 30-day months.
backtestings = np.array([30, 60, 90, 180, 270, 360, 450, 540, 630, 720], dtype=int)
months = backtestings // 30

# Anchor the checkpoints to the upload snapshot under analysis instead of the
# wall clock. The diff computed further down compares the 2025-09-01 upload
# against the 2025-08-01 one, so deriving the month from "today" would silently
# shift every checkpoint whenever the notebook is re-run in a later month.
SNAPSHOT_UPLOAD_DATE = "2025-09-01"
current_month = pd.Timestamp(SNAPSHOT_UPLOAD_DATE).to_period("M")

In [7]:
# Element-wise Period arithmetic: each horizon maps to the month that lies
# (months + 1) periods before current_month, and the "prev" month sits one
# period further back.
offsets_current = months + 1
offsets_prev = months + 2
backtesting_month_current = current_month - offsets_current
backtesting_month_prev = current_month - offsets_prev

print(backtesting_month_current)

# Lookup table with one row per backtesting horizon.
data = dict(
    Months=months,
    backtesting_month_current=backtesting_month_current,
    backtesting_month_prev=backtesting_month_prev,
    backtesting_shift=backtestings,
)
backtestings_df = pd.DataFrame(data)

[Period('2025-07', 'M') Period('2025-06', 'M') Period('2025-05', 'M')
 Period('2025-02', 'M') Period('2024-11', 'M') Period('2024-08', 'M')
 Period('2024-05', 'M') Period('2024-02', 'M') Period('2023-11', 'M')
 Period('2023-08', 'M')]


In [8]:
# Display the checkpoint lookup table (one row per backtesting horizon).
backtestings_df

Unnamed: 0,Months,backtesting_month_current,backtesting_month_prev,backtesting_shift
0,1,2025-07,2025-06,30
1,2,2025-06,2025-05,60
2,3,2025-05,2025-04,90
3,6,2025-02,2025-01,180
4,9,2024-11,2024-10,270
5,12,2024-08,2024-07,360
6,15,2024-05,2024-04,450
7,18,2024-02,2024-01,540
8,21,2023-11,2023-10,630
9,24,2023-08,2023-07,720


In [9]:
# Left join keeps every row of `pivoted`; only rows whose registration month
# equals a checkpoint month receive the horizon columns.
discrepancies = pivoted.merge(
    backtestings_df,
    how="left",
    left_on="registration_date_month",
    right_on="backtesting_month_current",
)

# Unmatched rows carry missing values, hence the nullable integer dtype.
discrepancies["backtesting_shift"] = discrepancies["backtesting_shift"].astype("Int64")

In [10]:
# Display the merged frame; rows without a matching checkpoint show NaT / <NA>.
discrepancies

Unnamed: 0,country,product_group,registration_date_month,count_2023-07-31,count_2023-08-02,count_2023-08-25,count_2023-09-05,count_2023-10-03,count_2023-11-01,count_2023-12-05,...,predicted_frr_updated_2025-04-01,predicted_frr_updated_2025-05-01,predicted_frr_updated_2025-06-01,predicted_frr_updated_2025-07-01,predicted_frr_updated_2025-08-01,predicted_frr_updated_2025-09-01,Months,backtesting_month_current,backtesting_month_prev,backtesting_shift
0,Kenya,Lanterns,2021-06,3650.0,3650.0,,3650.0,3650.0,3650.0,3650.0,...,0.923682,0.923681,0.923688,0.923694,0.921999,0.922010,,NaT,NaT,
1,Kenya,Lanterns,2021-07,3767.0,3767.0,,3767.0,3767.0,3767.0,3767.0,...,0.923792,0.923758,0.923758,0.923757,0.922243,0.922247,,NaT,NaT,
2,Kenya,Lanterns,2021-08,3715.0,3715.0,,3715.0,3715.0,3715.0,3715.0,...,0.921244,0.921187,0.921184,0.921187,0.919908,0.919909,,NaT,NaT,
3,Kenya,Lanterns,2021-09,3563.0,3563.0,,3563.0,3563.0,3563.0,3563.0,...,0.920973,0.920939,0.920937,0.920938,0.919863,0.919856,,NaT,NaT,
4,Kenya,Lanterns,2021-10,3709.0,3709.0,,3709.0,3709.0,3709.0,3709.0,...,0.923379,0.923303,0.923303,0.923296,0.922101,0.922100,,NaT,NaT,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1622,Zambia,SHS without TV,2025-03,,,,,,,,...,,0.976674,0.979620,0.985045,0.985019,0.987361,,NaT,NaT,
1623,Zambia,SHS without TV,2025-04,,,,,,,,...,,,0.978405,0.985699,0.983102,0.986009,,NaT,NaT,
1624,Zambia,SHS without TV,2025-05,,,,,,,,...,,,,0.986601,0.986054,0.984654,3.0,2025-05,2025-04,90
1625,Zambia,SHS without TV,2025-06,,,,,,,,...,,,,,0.971452,0.979647,2.0,2025-06,2025-05,60


In [11]:
# Change in predicted FRR between the 2025-08-01 and 2025-09-01 uploads,
# expressed in percentage points with two decimals.
# Scale first and round last: rounding to 4 decimals and then multiplying by
# 100 reintroduces binary floating-point residue (much as 0.07 * 100 evaluates
# to 7.000000000000001), so the stored values would not be clean two-decimal
# numbers.
discrepancies["diff"] = (
    discrepancies["predicted_frr_updated_2025-09-01"]
    - discrepancies["predicted_frr_updated_2025-08-01"]
).mul(100).round(2)

# Disabled draft: label rows whose absolute diff exceeds 0.75 with the
# backtesting-horizon shift that applies to them.
# backtesting_reasons = {
#   30: "Due to backtesting shift to 30",
#   60: "Due to backtesting shift from 30 to 60",
#   90: "Due to backtesting shift from 60 to 90",
#   180: "Due to backtesting shift from 90 to 180",
#   270: "Due to backtesting shift from 180 to 270",
#   360: "Due to backtesting shift from 270 to 360",
#   450: "Due to backtesting shift from 360 to 450",
#   540: "Due to backtesting shift from 450 to 540",
#   630: "Due to backtesting shift from 540 to 630",
#   720: "Due to backtesting shift from 630 to 720"
# }
# discrepancies = discrepancies.copy()
# discrepancies["reason"] = (
#     discrepancies["backtesting_shift"].map(backtesting_reasons)
#     .where(discrepancies["diff"].abs() > 0.75)
# )
# discrepancies.head()

In [12]:
# Keep only the rows matched to the 360-day-and-longer backtesting horizons.
long_horizon_shifts = [360, 450, 540, 630, 720]
discrepancies_df = discrepancies.loc[
    discrepancies["backtesting_shift"].isin(long_horizon_shifts)
]


In [13]:
# Spot-check a single segment: Lanterns sold in Kenya.
is_kenya_lanterns = (
    discrepancies_df["product_group"].eq("Lanterns")
    & discrepancies_df["country"].eq("Kenya")
)
discrepancies_df[is_kenya_lanterns]

Unnamed: 0,country,product_group,registration_date_month,count_2023-07-31,count_2023-08-02,count_2023-08-25,count_2023-09-05,count_2023-10-03,count_2023-11-01,count_2023-12-05,...,predicted_frr_updated_2025-05-01,predicted_frr_updated_2025-06-01,predicted_frr_updated_2025-07-01,predicted_frr_updated_2025-08-01,predicted_frr_updated_2025-09-01,Months,backtesting_month_current,backtesting_month_prev,backtesting_shift,diff
26,Kenya,Lanterns,2023-08,,,,,3889.0,3889.0,3889.0,...,0.89783,0.897294,0.8973,0.898212,0.895809,24.0,2023-08,2023-07,720,-0.24
29,Kenya,Lanterns,2023-11,,,,,,,,...,0.887175,0.877889,0.877889,0.878588,0.878636,21.0,2023-11,2023-10,630,0.0
32,Kenya,Lanterns,2024-02,,,,,,,,...,0.894796,0.880007,0.880007,0.880007,0.866933,18.0,2024-02,2024-01,540,-1.31
35,Kenya,Lanterns,2024-05,,,,,,,,...,0.88008,0.884859,0.884857,0.884857,0.869871,15.0,2024-05,2024-04,450,-1.5
38,Kenya,Lanterns,2024-08,,,,,,,,...,0.88277,0.878815,0.878416,0.877977,0.885024,12.0,2024-08,2024-07,360,0.7


In [14]:
# Per (country, product_group) summary of the filtered rows:
#   total_records    - number of rows in the group
#   count_average    - mean of the 2025-09-01 count, truncated to an int
#   count_median     - median of the 2025-09-01 count, truncated to an int
#   mean_diff        - mean diff, rounded to 2 decimals
#   discrepancies_75 - number of rows whose absolute diff exceeds 0.75
# NOTE(review): int() cannot convert NaN, so a group with no 2025-09-01 count
# at all would raise here - confirm such groups cannot occur.
summary_spec = {
    "total_records": pd.NamedAgg(column="diff", aggfunc="size"),
    "count_average": pd.NamedAgg(
        column="count_2025-09-01", aggfunc=lambda x: int(x.mean())
    ),
    "count_median": pd.NamedAgg(
        column="count_2025-09-01", aggfunc=lambda x: int(x.median())
    ),
    "mean_diff": pd.NamedAgg(column="diff", aggfunc=lambda x: x.mean().round(2)),
    "discrepancies_75": pd.NamedAgg(
        column="diff", aggfunc=lambda x: (x.abs() > 0.75).sum()
    ),
}

grouped = discrepancies_df.groupby(["country", "product_group"])
discrepancies_summary = grouped.agg(**summary_spec).reset_index()

discrepancies_summary

Unnamed: 0,country,product_group,total_records,count_average,count_median,mean_diff,discrepancies_75
0,Kenya,Lanterns,5,4682,4665,-0.47,2
1,Kenya,Phones,5,3603,2830,-0.21,1
2,Kenya,SHS Entry-Level,5,4724,4637,-0.38,3
3,Kenya,SHS Entry-Level Upgrade,5,531,434,-0.25,0
4,Kenya,SHS with TV,5,728,665,-0.74,3
5,Kenya,SHS with TV Upgrade,5,498,522,-0.11,1
6,Kenya,SHS without TV,5,1939,1844,-0.86,4
7,Malawi,Lanterns,3,174,119,0.0,0
8,Malawi,SHS Entry-Level,5,263,239,-0.08,0
9,Malawi,SHS with TV,5,25,25,-0.24,2
