In [1]:
# Import dependencies
from sklearn.datasets import make_classification
import matplotlib.pyplot as plt
from sklearn.metrics import mean_squared_error, r2_score
import numpy as np
import pandas as pd
import os

In [2]:
# Load the prepared dataset from its CSV file and preview the first rows
csv_path = 'Final_Dataset.csv'
dataset = pd.read_csv(csv_path)
dataset.head()

Unnamed: 0,region,year,number_of_deaths,lifetime_risk,mortality_ratio,private_expenditure,government_expenditure,gdp_capita
0,East Asia & Pacific,2000,34000,0.21,113,29.52,4.34,7178.89
1,Europe & Central Asia,2000,3400,0.05,33,24.49,5.88,22220.7
2,Latin America & Caribbean,2000,12000,0.29,99,56.91,2.53,11458.67
3,Middle East & North Africa,2000,8700,0.38,113,44.46,2.48,13554.73
4,North America,2000,510,0.02,12,54.5,5.56,45146.13


In [9]:
# Pairwise Pearson correlation between the numeric columns.
# `dataset` still holds the string column `region`; pandas >= 2.0 no longer
# drops non-numeric columns silently in DataFrame.corr() and raises instead,
# so restrict to numeric columns explicitly (works on old and new pandas).
dataset.select_dtypes(include="number").corr()


Unnamed: 0,year,number_of_deaths,lifetime_risk,mortality_ratio,private_expenditure,government_expenditure,gdp_capita
year,1.0,-0.081779,-0.11515,-0.126779,-0.133786,0.164555,0.098643
number_of_deaths,-0.081779,1.0,0.961941,0.985009,0.490734,-0.575261,-0.599932
lifetime_risk,-0.11515,0.961941,1.0,0.989129,0.385841,-0.49034,-0.51478
mortality_ratio,-0.126779,0.985009,0.989129,1.0,0.466092,-0.584008,-0.604825
private_expenditure,-0.133786,0.490734,0.385841,0.466092,1.0,-0.684432,-0.362882
government_expenditure,0.164555,-0.575261,-0.49034,-0.584008,-0.684432,1.0,0.863571
gdp_capita,0.098643,-0.599932,-0.51478,-0.604825,-0.362882,0.863571,1.0


In [3]:
# One-hot encode the categorical `region` column (one 0/1 indicator per region).
# The column is named explicitly so that any other text column added to the CSV
# is not silently expanded into indicators, and dtype=int keeps the indicators
# as 0/1 integers regardless of pandas version (the default became bool in 2.0).
# The result keeps the name `region` because the later cells reference it.
# NOTE(review): every region level gets its own indicator (no drop_first), so
# the indicators always sum to 1 - confirm this is intended for a linear model
# fitted with an intercept.
region = pd.get_dummies(dataset, columns=["region"], dtype=int)
region.head(10)

Unnamed: 0,year,number_of_deaths,lifetime_risk,mortality_ratio,private_expenditure,government_expenditure,gdp_capita,region_East Asia & Pacific,region_Europe & Central Asia,region_Latin America & Caribbean,region_Middle East & North Africa,region_North America,region_South Asia,region_Sub-Saharan Africa
0,2000,34000,0.21,113,29.52,4.34,7178.89,1,0,0,0,0,0,0
1,2000,3400,0.05,33,24.49,5.88,22220.7,0,1,0,0,0,0,0
2,2000,12000,0.29,99,56.91,2.53,11458.67,0,0,1,0,0,0,0
3,2000,8700,0.38,113,44.46,2.48,13554.73,0,0,0,1,0,0,0
4,2000,510,0.02,12,54.5,5.56,45146.13,0,0,0,0,1,0,0
5,2000,147000,1.45,388,73.61,0.87,2520.24,0,0,0,0,0,1,0
6,2000,236000,4.72,846,60.33,1.72,2379.29,0,0,0,0,0,0,1
7,2001,32000,0.2,110,29.9,4.36,7386.65,1,0,0,0,0,0,0
8,2001,3300,0.05,32,24.36,6.04,22723.1,0,1,0,0,0,0,0
9,2001,11000,0.27,95,56.79,2.6,11377.76,0,0,1,0,0,0,0


In [4]:
# Target is mortality_ratio; the features are every remaining column.
# NOTE(review): the cells visible below build their own year-based
# X_train/X_test and do not read X or y.
y = region["mortality_ratio"]
X = region.drop(columns="mortality_ratio")

In [5]:
# Import scikit-learn's train/test splitting helper.
# NOTE(review): train_test_split is not called in any cell visible here - the
# train/test split that follows is done by year instead; confirm whether this
# import is still needed.
from sklearn.model_selection import train_test_split

In [6]:
# Temporal hold-out: fit on the years before 2010 and evaluate on 2010 onwards,
# to see whether historical data can predict recent (and potentially future) data.
# NOTE(review): number_of_deaths and lifetime_risk stay in the features and
# correlate > 0.96 with mortality_ratio in dataset.corr(); the near-perfect
# scores may largely reflect that overlap - confirm this is intended.
historical_rows = region[region['year'] < 2010]
recent_rows = region[region['year'] >= 2010]

X_train = historical_rows.drop(columns="mortality_ratio")
y_train = historical_rows['mortality_ratio']
X_test = recent_rows.drop(columns="mortality_ratio")
y_test = recent_rows['mortality_ratio']

In [7]:
# Preview the first five held-out (2010 onwards) feature rows
X_test.head(5)

Unnamed: 0,year,number_of_deaths,lifetime_risk,private_expenditure,government_expenditure,gdp_capita,region_East Asia & Pacific,region_Europe & Central Asia,region_Latin America & Caribbean,region_Middle East & North Africa,region_North America,region_South Asia,region_Sub-Saharan Africa
70,2010,23000,0.13,30.64,4.38,11964.95,1,0,0,0,0,0,0
71,2010,2100,0.03,22.26,7.03,27035.97,0,1,0,0,0,0,0
72,2010,8900,0.19,55.07,3.51,13701.07,0,0,1,0,0,0,0
73,2010,8400,0.28,46.06,2.53,16802.83,0,0,0,1,0,0,0
74,2010,620,0.03,49.87,7.94,48516.51,0,0,0,0,1,0,0


In [9]:
# Create an ordinary least-squares linear regression estimator
# (fit_intercept=True is the scikit-learn default, spelled out for the reader)
from sklearn.linear_model import LinearRegression
model = LinearRegression(fit_intercept=True)

In [10]:
# Train the regression on the pre-2010 rows; the fitted estimator is the cell output
model.fit(X=X_train, y=y_train)

LinearRegression(copy_X=True, fit_intercept=True, n_jobs=None, normalize=False)

In [11]:
# Show the fitted parameters: one weight per feature column, then the intercept
for label, fitted_value in (
    ('Weight coefficients: ', model.coef_),
    ('y-axis intercept: ', model.intercept_),
):
    print(label, fitted_value)

Weight coefficients:  [-9.93152868e-01  7.49245809e-04  1.37133714e+02 -1.34235099e-02
  4.61886496e+00 -4.19819704e-04  5.21435497e+00 -2.98405141e+01
  1.13338245e+01  1.65357837e+01 -3.07601663e+01  4.47595133e+01
 -1.72427961e+01]
y-axis intercept:  2020.8758496293772


In [27]:
# Predict mortality_ratio for the training years and for the held-out years
predictions_train = model.predict(X_train)
predictions_test = model.predict(X_test)

In [44]:
# Coefficient of determination (R squared) on the train and test splits
training_score = model.score(X_train, y_train)
testing_score = model.score(X_test, y_test)
print(
    f"R-squared (R2) Train: {training_score}",
    f"R-squared (R2) Test: {testing_score}",
    sep="\n",
)

R-squared (R2) of Training: 0.9999530487543377
R-squared (R2) of Testing: 0.9991611032326877


In [28]:
# Mean squared error of the predictions on the train and test splits
mse_train = mean_squared_error(y_train, predictions_train)
mse_test = mean_squared_error(y_test, predictions_test)
print(
    f"Mean Squared Error (MSE) Train: {mse_train}",
    f"Mean Squared Error (MSE) Test: {mse_test}",
    sep="\n",
)


Mean Squared Error (MSE) Train: 2.709961365622207
Mean Squared Error (MSE) Test: 29.230555578739903


In [13]:
# TODO: this is a testing space - remove before publishing.
# It recomputes the test-split predictions, MSE and R squared in one place.
predictions = model.predict(X_test)
mse, r2 = mean_squared_error(y_test, predictions), r2_score(y_test, predictions)
print(
    f"Mean Squared Error (MSE): {mse}",
    f"R-squared (R2 ): {r2}",
    sep="\n",
)

Mean Squared Error (MSE): 29.230555578739903
R-squared (R2 ): 0.9991611032326877
