# Compute age-specific baselines for the STMF data using our (Karlinsky & Kobak, 2021) model

In [1]:
%matplotlib notebook

import numpy as np
import pandas as pd
import pylab as plt
import seaborn as sns
import matplotlib

from matplotlib.patches import Polygon
from sklearn.linear_model import LinearRegression
import datetime
import statsmodels.api as sm

In [2]:
# Read the STMF table straight from mortality.org. skiprows=2 makes pandas
# drop the first two lines of the file before it starts parsing.
df_stmf = pd.read_csv(
    'https://www.mortality.org/File/GetDocument/Public/STMF/Outputs/stmf.csv',
    skiprows=2,
)

df_stmf

Unnamed: 0,CountryCode,Year,Week,Sex,D0_14,D15_64,D65_74,D75_84,D85p,DTotal,R0_14,R15_64,R65_74,R75_84,R85p,RTotal,Split,SplitSex,Forecast
0,AUS,2015,1,m,17.166833,358.833167,250.0,436.0,413.0,1475.0,0.000386,0.002373,0.013133,0.045680,0.128282,0.006484,1,0,0
1,AUS,2015,1,f,13.968728,199.031272,183.0,350.0,704.0,1450.0,0.000332,0.001309,0.009350,0.031246,0.126006,0.006290,1,0,0
2,AUS,2015,1,b,31.135561,557.864439,433.0,786.0,1117.0,2925.0,0.000360,0.001840,0.011215,0.037886,0.126838,0.006387,1,0,0
3,AUS,2015,2,m,17.473384,334.526616,255.0,386.0,422.0,1415.0,0.000393,0.002212,0.013395,0.040441,0.131078,0.006221,1,0,0
4,AUS,2015,2,f,11.490405,186.509595,162.0,322.0,675.0,1357.0,0.000273,0.001227,0.008277,0.028746,0.120815,0.005887,1,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
127786,USA,2024,48,f,194.000000,4116.000000,4788.0,7311.0,9721.0,26130.0,0.000352,0.001999,0.013353,0.035649,0.128322,0.008042,0,0,1
127787,USA,2024,48,b,423.000000,11177.000000,10974.0,15036.0,16251.0,53861.0,0.000375,0.002702,0.016187,0.040759,0.135249,0.008376,0,0,1
127788,USA,2024,49,m,189.000000,6690.000000,6296.0,7802.0,6537.0,27514.0,0.000328,0.003221,0.019713,0.047626,0.147226,0.008649,0,0,1
127789,USA,2024,49,f,149.000000,4117.000000,4706.0,7229.0,9841.0,26042.0,0.000271,0.001999,0.013125,0.035249,0.129906,0.008015,0,0,1


In [3]:
def predict(X):
    """Fit the pre-2020 baseline regression and extrapolate it to 2020-2023.

    The model is a least-squares fit without intercept on the rows of ``X``
    whose year is in 2015-2019 and whose week number is below 53.  The
    predictors are the year itself (one linear trend shared by all weeks)
    plus one indicator column per week number (a separate level per week).

    Parameters
    ----------
    X : numpy.ndarray, shape (n_rows, 3)
        Column 0 holds the year, column 1 the 1-based week number and
        column 2 the observed value to model.

    Returns
    -------
    tuple of four numpy.ndarray
        Baselines for 2020, 2021, 2022 and 2023, in that order.  With ``m``
        the largest week number in the training rows, the 2020 array has
        ``m + 1`` entries (the last one repeats the week-``m`` value so that
        week 53 has a baseline); the other three arrays have ``m`` entries.

    Raises
    ------
    ValueError
        If ``X`` contains no row inside the training window.
    """
    years, weeks, values = X[:, 0], X[:, 1], X[:, 2]

    # Fit regression model on pre-2020 data from 2015 on
    ind = (years >= 2015) & (years < 2020) & (weeks < 53)
    if not np.any(ind):
        # Fail with an actionable message instead of the opaque
        # "zero-size array" error np.max would raise further down.
        raise ValueError(
            'predict: no rows with 2015 <= year < 2020 and week < 53 '
            'to fit the baseline on'
        )

    train_weeks = weeks[ind].astype(int)
    m = int(train_weeks.max())

    # One indicator column per week number: row i gets a 1 in column week-1.
    onehot = np.zeros((train_weeks.size, m))
    onehot[np.arange(train_weeks.size), train_weeks - 1] = 1
    predictors = np.concatenate((X[ind, :1], onehot), axis=1)
    reg = LinearRegression(fit_intercept=False).fit(predictors, values[ind])

    week_dummies = np.eye(m)

    def baseline_for(year):
        # Design matrix for one full year: constant year column + every week once.
        year_column = np.full((m, 1), float(year))
        return reg.predict(np.concatenate((year_column, week_dummies), axis=1))

    baseline, baseline2021, baseline2022, baseline2023 = (
        baseline_for(year) for year in (2020, 2021, 2022, 2023)
    )

    # Week 53 usually does not have enough data, so we'll use
    # the same baseline value as for week 52
    baseline = np.concatenate((baseline, [baseline[-1]]))

    return baseline, baseline2021, baseline2022, baseline2023

In [4]:
countries = np.unique(df_stmf['CountryCode'])
agebands = ['D0_14', 'D15_64', 'D65_74', 'D75_84', 'D85p', 'DTotal']
sexes = ['m', 'f', 'b']

# Write one line per (country, year, age band, sex, week). Week numbers are
# 1-based and each baseline value is formatted with one decimal.
with open('baselines-stmf.csv', 'w') as out:
    for country in countries:
        print('.', end='')  # progress marker: one dot per country
        in_country = df_stmf['CountryCode'] == country
        for ageband in agebands:
            for sex in sexes:
                rows = in_country & (df_stmf['Sex'] == sex)
                series = df_stmf.loc[rows, ['Year', 'Week', ageband]].values
                yearly = predict(series)

                # predict() returns the baselines ordered 2020, 2021, 2022, 2023
                for year, weekly in zip(range(2020, 2024), yearly):
                    out.writelines(
                        f'{country}, {year}, {ageband}, {sex}, {week}, {value:.1f}\n'
                        for week, value in enumerate(weekly, start=1)
                    )
print('')

......................................
