In [2]:
import pandas as pd
import numpy as np
pd.set_option('display.max_columns', None)
import datetime
from sklearn.model_selection import TimeSeriesSplit
from sklearn.metrics import mean_squared_error
from sklearn.metrics import mean_absolute_error

Load DataFrames and Preprocess

In [3]:
df = pd.read_csv("https://raw.githubusercontent.com/chencba/ContracetivesConsumptionPrediction/main/Train.csv")
product = pd.read_csv("https://raw.githubusercontent.com/chencba/ContracetivesConsumptionPrediction/main/product.csv")
df = df.merge(product, on = "product_code")

site = pd.read_csv("https://raw.githubusercontent.com/chencba/ContracetivesConsumptionPrediction/main/service_delivery_site_data.csv")
df = df.merge(site, on = "site_code")

df.drop(["site_region", "site_district"], axis = 1, inplace = True)

df.drop(["stock_stockout_days", "stock_adjustment", "site_latitude", "site_longitude", "product_name", "stock_ordered", "stock_end", "stock_initial", "stock_received"], axis = 1, inplace = True)

df[:5]

Unnamed: 0,year,month,region,district,site_code,product_code,stock_distributed,average_monthly_consumption,product_type,site_type
0,2019,1,INDENIE-DJUABLIN,ABENGOUROU,C4001,AS27134,21,18,Injectable Contraceptive,Hospital
1,2019,2,INDENIE-DJUABLIN,ABENGOUROU,C4001,AS27134,0,18,Injectable Contraceptive,Hospital
2,2019,3,INDENIE-DJUABLIN,ABENGOUROU,C4001,AS27134,0,7,Injectable Contraceptive,Hospital
3,2019,4,INDENIE-DJUABLIN,ABENGOUROU,C4001,AS27134,2,1,Injectable Contraceptive,Hospital
4,2019,5,INDENIE-DJUABLIN,ABENGOUROU,C4001,AS27134,31,11,Injectable Contraceptive,Hospital


In [4]:
df["year"] = df["year"].astype(str)
df["month"] = df["month"].astype(str)

df["year_month"] = df["year"] + df["month"]
df["year_month"] = df["year_month"].apply(lambda x: datetime.datetime.strptime(x, "%Y%m"))

df["site_product"] = df["site_code"] + " X " + df["product_code"]

df[:5]

Unnamed: 0,year,month,region,district,site_code,product_code,stock_distributed,average_monthly_consumption,product_type,site_type,year_month,site_product
0,2019,1,INDENIE-DJUABLIN,ABENGOUROU,C4001,AS27134,21,18,Injectable Contraceptive,Hospital,2019-01-01,C4001 X AS27134
1,2019,2,INDENIE-DJUABLIN,ABENGOUROU,C4001,AS27134,0,18,Injectable Contraceptive,Hospital,2019-02-01,C4001 X AS27134
2,2019,3,INDENIE-DJUABLIN,ABENGOUROU,C4001,AS27134,0,7,Injectable Contraceptive,Hospital,2019-03-01,C4001 X AS27134
3,2019,4,INDENIE-DJUABLIN,ABENGOUROU,C4001,AS27134,2,1,Injectable Contraceptive,Hospital,2019-04-01,C4001 X AS27134
4,2019,5,INDENIE-DJUABLIN,ABENGOUROU,C4001,AS27134,31,11,Injectable Contraceptive,Hospital,2019-05-01,C4001 X AS27134


In [5]:
df.sort_values("year_month", inplace = True)

In [6]:
df["previous"] = df.groupby(["site_product"])["stock_distributed"].shift()

In [7]:
df = df.fillna(0)

In [8]:
df = df.reset_index()

In [9]:
y_true = df["stock_distributed"]
y_estimate1 = df["previous"]
y_estimate2 = df["average_monthly_consumption"]

Baseline Model 1

In [10]:
# Use values from previous month for each site_product as estimated values
mses = []

cv = TimeSeriesSplit(n_splits = 8, max_train_size = 20000, test_size = 4000)

for train_index, val_index in cv.split(y_true):
    print("Index:", train_index, "Length: ", len(train_index))
    y_1, y_2 = y_true[train_index], y_estimate1[train_index]
    mse = mean_squared_error(y_1, y_2)
    mses.append(mse)
     
print("MSE's:", mses)

average_mse = np.mean(mses)
std_mse = np.std(mses)

print("AVG MSE: ", average_mse)
print("STD MSE: ", std_mse)

Index: [   0    1    2 ... 3750 3751 3752] Length:  3753
Index: [   0    1    2 ... 7750 7751 7752] Length:  7753
Index: [    0     1     2 ... 11750 11751 11752] Length:  11753
Index: [    0     1     2 ... 15750 15751 15752] Length:  15753
Index: [    0     1     2 ... 19750 19751 19752] Length:  19753
Index: [ 3753  3754  3755 ... 23750 23751 23752] Length:  20000
Index: [ 7753  7754  7755 ... 27750 27751 27752] Length:  20000
Index: [11753 11754 11755 ... 31750 31751 31752] Length:  20000
MSE's: [2046.3037569944045, 1841.2454533728878, 1695.5124649025781, 1887.9581032184346, 1768.8733356958437, 1540.73425, 1590.17165, 1671.7651]
AVG MSE:  1755.3205142730185
STD MSE:  155.6079504246537


Baseline Model 2

In [11]:
# Use values from previous three months' average for each site_product as estimated values
mses = []
maes = []

cv = TimeSeriesSplit(n_splits = 8, max_train_size = 20000, test_size = 4000)

for train_index, val_index in cv.split(y_true):
    print("Index:", train_index, "Length: ", len(train_index))
    y_1, y_2 = y_true[train_index], y_estimate2[train_index]
    mse = mean_squared_error(y_1, y_2)
    mae = mean_absolute_error(y_1, y_2)
    mses.append(mse)
    maes.append(mae)
     
print("MSE's:", mses)
print("MAE's:", maes)

average_mse = np.mean(mses)
std_mse = np.std(mses)

average_mae = np.mean(maes)
std_mae = np.std(maes)

print("AVG MSE: ", average_mse)
print("STD MSE: ", std_mse)

print("AVG MAE: ", average_mae)
print("STD MAE: ", std_mae)

Index: [   0    1    2 ... 3750 3751 3752] Length:  3753
Index: [   0    1    2 ... 7750 7751 7752] Length:  7753
Index: [    0     1     2 ... 11750 11751 11752] Length:  11753
Index: [    0     1     2 ... 15750 15751 15752] Length:  15753
Index: [    0     1     2 ... 19750 19751 19752] Length:  19753
Index: [ 3753  3754  3755 ... 23750 23751 23752] Length:  20000
Index: [ 7753  7754  7755 ... 27750 27751 27752] Length:  20000
Index: [11753 11754 11755 ... 31750 31751 31752] Length:  20000
MSE's: [626.8057553956835, 599.2311363343222, 567.5134859184889, 631.0007617596648, 584.5340960866704, 528.26185, 537.4485, 575.1059]
MAE's: [7.9195310418332, 8.25370824197085, 8.228707564026205, 8.079286485113947, 7.8830557383688555, 7.72605, 7.4721, 7.4318]
AVG MSE:  581.2376856868538
STD MSE:  35.06659327225455
AVG MAE:  7.874279883914132
STD MAE:  0.29437358363910127
