In [30]:
# source:
# https://ourworldindata.org/vaccination
# https://ourworldindata.org/grapher/child-mortality-vs-share-of-children-immunized-against-diphtheria-pertussis-and-tetanus

In [114]:
import statsmodels.formula.api as smf
import pandas as pd
import numpy as np


# Read the Our World in Data export from the working directory
csv_path = 'child-mortality-vs-share-of-children-immunized-against-diphtheria-pertussis-and-tetanus.csv'
df = pd.read_csv(csv_path)
df

Unnamed: 0,Entity,Code,Year,"Mortality rate, under-5 (per 1,000 live births)",DTP3 (% of one-year-olds immunized),Population (historical estimates),Continent
0,Abkhazia,OWID_ABK,2015,,,,Asia
1,Afghanistan,AFG,1962,34.46,,9351442.0,
2,Afghanistan,AFG,1963,33.87,,9543200.0,
3,Afghanistan,AFG,1964,33.31,,9744772.0,
4,Afghanistan,AFG,1965,32.76,,9956318.0,
...,...,...,...,...,...,...,...
57513,Zimbabwe,ZWE,1958,,,3537167.0,
57514,Zimbabwe,ZWE,1959,,,3654172.0,
57515,Zimbabwe,ZWE,2020,,,14862927.0,
57516,Zimbabwe,ZWE,2021,,,15092171.0,


In [115]:
# Number of missing values in each column
df.isna().sum()

Entity                                                 0
Code                                                3439
Year                                                   0
Mortality rate, under-5 (per 1,000 live births)    45573
DTP3 (% of one-year-olds immunized)                49855
Population (historical estimates)                   1862
Continent                                          57233
dtype: int64

In [116]:
# Keep only the rows whose Year is 1980 or later
from_1980 = df['Year'] >= 1980
df = df[from_1980]
df

Unnamed: 0,Entity,Code,Year,"Mortality rate, under-5 (per 1,000 live births)",DTP3 (% of one-year-olds immunized),Population (historical estimates),Continent
0,Abkhazia,OWID_ABK,2015,,,,Asia
19,Afghanistan,AFG,1980,24.15,4.0,13356500.0,
20,Afghanistan,AFG,1981,23.51,3.0,13171679.0,
21,Afghanistan,AFG,1982,22.86,5.0,12882518.0,
22,Afghanistan,AFG,1983,22.22,5.0,12537732.0,
...,...,...,...,...,...,...,...
57316,Zimbabwe,ZWE,2018,5.59,89.0,14438812.0,
57317,Zimbabwe,ZWE,2019,5.46,90.0,14645473.0,
57515,Zimbabwe,ZWE,2020,,,14862927.0,
57516,Zimbabwe,ZWE,2021,,,15092171.0,


In [109]:
# Number of missing values in each column
df.isna().sum()

Entity                                                 0
Code                                                2077
Year                                                   0
Mortality rate, under-5 (per 1,000 live births)     2923
DTP3 (% of one-year-olds immunized)                 4373
Population (historical estimates)                   1832
Continent                                          11751
dtype: int64

In [117]:
# Rows with a null Code describe a continent or another aggregate region
# rather than a single country; list them for inspection.
df[df['Code'].isna()]

Unnamed: 0,Entity,Code,Year,"Mortality rate, under-5 (per 1,000 live births)",DTP3 (% of one-year-olds immunized),Population (historical estimates),Continent
260,Africa,,1980,,5.0,476386225.0,
261,Africa,,1981,,11.0,490003904.0,
262,Africa,,1982,,12.0,504034176.0,
263,Africa,,1983,,18.0,518479759.0,
264,Africa,,1984,,24.0,533345151.0,
...,...,...,...,...,...,...,...
56358,Western Pacific,,2015,,94.0,,
56359,Western Pacific,,2016,,97.0,,
56360,Western Pacific,,2017,,95.0,,
56361,Western Pacific,,2018,,93.0,,


In [118]:
# Keep only rows that have a value in every column needed downstream
required_columns = [
    'Code',
    'Mortality rate, under-5 (per 1,000 live births)',
    'DTP3 (% of one-year-olds immunized)',
    'Population (historical estimates)',
]
df = df.dropna(subset=required_columns)
df

Unnamed: 0,Entity,Code,Year,"Mortality rate, under-5 (per 1,000 live births)",DTP3 (% of one-year-olds immunized),Population (historical estimates),Continent
19,Afghanistan,AFG,1980,24.15,4.0,13356500.0,
20,Afghanistan,AFG,1981,23.51,3.0,13171679.0,
21,Afghanistan,AFG,1982,22.86,5.0,12882518.0,
22,Afghanistan,AFG,1983,22.22,5.0,12537732.0,
23,Afghanistan,AFG,1984,21.58,16.0,12204306.0,
...,...,...,...,...,...,...,...
57313,Zimbabwe,ZWE,2015,6.20,87.0,13814642.0,Africa
57314,Zimbabwe,ZWE,2016,5.95,90.0,14030338.0,
57315,Zimbabwe,ZWE,2017,5.82,89.0,14236599.0,
57316,Zimbabwe,ZWE,2018,5.59,89.0,14438812.0,


In [119]:
# Number of missing values in each column
df.isna().sum()

Entity                                                0
Code                                                  0
Year                                                  0
Mortality rate, under-5 (per 1,000 live births)       0
DTP3 (% of one-year-olds immunized)                   0
Population (historical estimates)                     0
Continent                                          6816
dtype: int64

In [120]:
# Give the two modelling columns short, formula-friendly names.
# Rebind the result instead of using inplace=True: `df` was produced by
# boolean filtering, and mutating such a frame in place is what triggers
# pandas' SettingWithCopyWarning. Re-running the cell is harmless because
# rename ignores keys that are no longer present.
df = df.rename(
    columns={
        'Mortality rate, under-5 (per 1,000 live births)': 'mortality_rate',
        'DTP3 (% of one-year-olds immunized)': 'immunized_per_cent',
    }
)
df

Unnamed: 0,Entity,Code,Year,mortality_rate,immunized_per_cent,Population (historical estimates),Continent
19,Afghanistan,AFG,1980,24.15,4.0,13356500.0,
20,Afghanistan,AFG,1981,23.51,3.0,13171679.0,
21,Afghanistan,AFG,1982,22.86,5.0,12882518.0,
22,Afghanistan,AFG,1983,22.22,5.0,12537732.0,
23,Afghanistan,AFG,1984,21.58,16.0,12204306.0,
...,...,...,...,...,...,...,...
57313,Zimbabwe,ZWE,2015,6.20,87.0,13814642.0,Africa
57314,Zimbabwe,ZWE,2016,5.95,90.0,14030338.0,
57315,Zimbabwe,ZWE,2017,5.82,89.0,14236599.0,
57316,Zimbabwe,ZWE,2018,5.59,89.0,14438812.0,


In [122]:
# Ordinary least squares: mortality rate explained by immunization coverage
regression_formula = "mortality_rate ~ immunized_per_cent"
model = smf.ols(formula=regression_formula, data=df).fit()

print("Model trained!")

Model trained!


In [123]:
import joblib
# Persist the fitted model to the current working directory
model_filename = './vaccin_mortality_model.pkl'
joblib.dump(value=model, filename=model_filename)
print("Model saved!")

Model saved!


In [124]:
# Read the saved model back and show its fitted coefficients
model_loaded = joblib.load(filename=model_filename)
print(
    "We have loaded a model with the following parameters:",
    model_loaded.params,
    sep="\n",
)

We have loaded a model with the following parameters:
Intercept             20.029493
immunized_per_cent    -0.184140
dtype: float64


In [125]:
# Print the regression report of the reloaded model
fit_report = model_loaded.summary()
print(fit_report)


                            OLS Regression Results                            
Dep. Variable:         mortality_rate   R-squared:                       0.535
Model:                            OLS   Adj. R-squared:                  0.535
Method:                 Least Squares   F-statistic:                     8047.
Date:                Sat, 02 Apr 2022   Prob (F-statistic):               0.00
Time:                        16:25:39   Log-Likelihood:                -19383.
No. Observations:                7006   AIC:                         3.877e+04
Df Residuals:                    7004   BIC:                         3.878e+04
Df Model:                           1                                         
Covariance Type:            nonrobust                                         
                         coef    std err          t      P>|t|      [0.025      0.975]
--------------------------------------------------------------------------------------
Intercept             20.0295      0

In [128]:
# Let's write a function that loads and uses our model
def load_model_and_predict(vaccine, model_path=None):
    """Load the saved regression model and predict for one coverage value.

    Parameters
    ----------
    vaccine : number
        Value handed to the model as ``immunized_per_cent``.
    model_path : str or path-like, optional
        File to load with ``joblib.load``. When omitted, the notebook-level
        ``model_filename`` is used, so existing one-argument calls behave
        exactly as before.

    Returns
    -------
    The single prediction the model produces for ``vaccine``.

    Notes
    -----
    The model's parameters are printed on every call. ``joblib.load``
    unpickles the file, so only pass paths to files you trust.
    """
    # Fall back to the notebook-level path when no explicit file is given
    if model_path is None:
        model_path = model_filename

    # Load the model from file and print basic information about it
    loaded_model = joblib.load(model_path)
    print("We have loaded a model with the following parameters:")
    print(loaded_model.params)

    # The model is given a mapping from predictor name to a list of values
    inputs = {"immunized_per_cent": [vaccine]}

    # One value went in, so unwrap the one prediction that comes back
    return loaded_model.predict(inputs)[0]
# Try the helper with an immunization coverage of 90
# NOTE(review): the displayed data (e.g. 24.15 for Afghanistan in 1980) looks
# like percentages rather than per-1,000 rates — confirm the unit against the
# data source before trusting the label printed below.
predicted_mortality_rate = load_model_and_predict(vaccine=90)
print("Mortality rate, under-5 (per 1,000 live births):", predicted_mortality_rate)

We have loaded a model with the following parameters:
Intercept             20.029493
immunized_per_cent    -0.184140
dtype: float64
Mortality rate, under-5 (per 1,000 live births): 3.456906879972326
