# Compute age-specific baselines for the STMF data using our (Karlinsky & Kobak, 2021) model

In [1]:
%matplotlib notebook

import numpy as np
import pandas as pd
import pylab as plt
import seaborn as sns
import matplotlib

from matplotlib.patches import Polygon
from sklearn.linear_model import LinearRegression
import datetime
import statsmodels.api as sm

In [3]:
# Short-Term Mortality Fluctuations (STMF) table, fetched from mortality.org
# on every run.
STMF_URL = 'https://www.mortality.org/File/GetDocument/Public/STMF/Outputs/stmf.csv'

# NOTE(review): skiprows=2 presumably drops two preamble lines that precede
# the column header in the published file -- confirm against the current file.
df_stmf = pd.read_csv(filepath_or_buffer=STMF_URL, skiprows=2)

df_stmf

Unnamed: 0,CountryCode,Year,Week,Sex,D0_14,D15_64,D65_74,D75_84,D85p,DTotal,R0_14,R15_64,R65_74,R75_84,R85p,RTotal,Split,SplitSex,Forecast
0,AUS,2015,1,m,17.166833,358.833167,250.0,436.0,413.0,1475.0,0.000386,0.002373,0.013133,0.045643,0.127992,0.006484,1,0,0
1,AUS,2015,1,f,13.968728,199.031272,183.0,350.0,704.0,1450.0,0.000332,0.001309,0.009350,0.031185,0.125849,0.006289,1,0,0
2,AUS,2015,1,b,31.135561,557.864439,433.0,786.0,1117.0,2925.0,0.000360,0.001840,0.011215,0.037832,0.126633,0.006386,1,0,0
3,AUS,2015,2,m,17.473384,334.526616,255.0,386.0,422.0,1415.0,0.000393,0.002212,0.013395,0.040409,0.130781,0.006220,1,0,0
4,AUS,2015,2,f,11.490405,186.509595,162.0,322.0,675.0,1357.0,0.000273,0.001227,0.008277,0.028690,0.120665,0.005886,1,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
116482,USA,2022,50,f,224.000000,5373.000000,5417.0,7900.0,11683.0,30597.0,0.000399,0.002609,0.015671,0.042993,0.162771,0.009497,0,0,1
116483,USA,2022,50,b,493.000000,14148.000000,12587.0,16387.0,19144.0,62759.0,0.000429,0.003422,0.019232,0.049365,0.167164,0.009830,0,0,1
116484,USA,2022,51,m,220.000000,8113.000000,7052.0,8276.0,7493.0,31154.0,0.000374,0.003911,0.022834,0.055841,0.175288,0.009851,0,0,1
116485,USA,2022,51,f,194.000000,5028.000000,5256.0,7931.0,11452.0,29861.0,0.000346,0.002442,0.015206,0.043162,0.159553,0.009268,0,0,1


In [6]:
def predict(X):
    """Fit the baseline model on 2015-2019 data and extrapolate it to 2020-2023.

    The model is a least-squares fit without intercept on a linear year term
    plus one indicator column per week number, i.e.
    ``value ~ slope * year + week_effect[week]``.

    Parameters
    ----------
    X : ndarray of shape (n_rows, 3)
        Columns are, in order: year, week number, and the value to model.

    Returns
    -------
    tuple of four 1-D ndarrays
        Baselines for 2020, 2021, 2022 and 2023, in that order. With ``m``
        the largest week number (below 53) found in the training rows, the
        2020 baseline has ``m + 1`` entries (its last value is repeated to
        stand in for week 53) and the other three have ``m`` entries.

    Raises
    ------
    ValueError
        If ``X`` has no rows with 2015 <= year < 2020 and week < 53.
    """
    # Fit regression model on pre-2020 data from 2015 on
    years, weeks = X[:, 0], X[:, 1]
    ind = (years >= 2015) & (years < 2020) & (weeks < 53)
    if not ind.any():
        raise ValueError(
            'predict(): no rows with 2015 <= year < 2020 and week < 53 '
            'to fit the baseline on'
        )

    train_weeks = weeks[ind].astype(int)
    m = int(train_weeks.max())

    # One indicator column per week number: row i gets a 1 in column week-1.
    onehot = np.zeros((train_weeks.size, m))
    onehot[np.arange(train_weeks.size), train_weeks - 1] = 1
    predictors = np.concatenate((X[ind, :1], onehot), axis=1)
    reg = LinearRegression(fit_intercept=False).fit(predictors, X[ind, 2])

    def baseline_for(year):
        # Design matrix for weeks 1..m of `year`: constant year column
        # followed by the identity (one row per week).
        design = np.concatenate((np.full((m, 1), float(year)), np.eye(m)), axis=1)
        return reg.predict(design)

    baseline, baseline2021, baseline2022, baseline2023 = (
        baseline_for(year) for year in (2020, 2021, 2022, 2023)
    )

    # Week 53 usually does not have enough data, so we'll use
    # the same baseline value as for the last fitted week (week 52)
    baseline = np.concatenate((baseline, [baseline[-1]]))

    return baseline, baseline2021, baseline2022, baseline2023

In [7]:
# Write one baseline value per (country, age band, sex, year, week) to a
# header-less text file; fields are written in the order
# country, year, age band, sex, week, baseline.
countries = np.unique(df_stmf['CountryCode'])
agebands = ['D0_14', 'D15_64', 'D65_74', 'D75_84', 'D85p', 'DTotal']
sexes = ['m', 'f', 'b']

with open('baselines-stmf.csv', 'w') as f:
    for country in countries:
        print('.', end='')  # one progress dot per country
        # Select this country's rows once instead of once per (ageband, sex).
        df_country = df_stmf[df_stmf['CountryCode'] == country]
        for ageband in agebands:
            for sex in sexes:
                df_sex = df_country[df_country['Sex'] == sex]
                X = df_sex[['Year', 'Week', ageband]].values
                baselines = predict(X)

                # baselines[0] belongs to 2020, baselines[3] to 2023;
                # week numbers in the file are 1-based.
                for year in range(2020, 2024):
                    yearly = baselines[year - 2020]
                    f.writelines(
                        f'{country}, {year}, {ageband}, {sex}, {i+1}, {b:.1f}\n'
                        for i, b in enumerate(yearly)
                    )
print('')

......................................
