In [1]:
# Import dependencies
from sklearn.datasets import make_classification
import matplotlib.pyplot as plt
from sklearn.metrics import mean_squared_error, r2_score
import numpy as np
import pandas as pd
import os

In [2]:
# Load the source CSV into a DataFrame and preview its first rows
csv_path = 'Final_Dataset.csv'
dataset = pd.read_csv(csv_path)
dataset.head(n=5)

Unnamed: 0,region,year,lifetime_risk,mortality_ratio,private_expenditure,government_expenditure,gdp_capita
0,East Asia & Pacific,2000,0.21,113,29.52,4.34,7178.89
1,Europe & Central Asia,2000,0.05,33,24.49,5.88,22220.7
2,Latin America & Caribbean,2000,0.29,99,56.91,2.53,11458.67
3,Middle East & North Africa,2000,0.38,113,44.46,2.48,13554.73
4,North America,2000,0.02,12,54.5,5.56,45146.13


In [3]:
# Pairwise correlations between the numeric columns only.
# `region` holds strings: pandas >= 2.0 raises on it in a bare .corr(), while
# older versions silently dropped it, so select the numeric columns explicitly
# to get the same table on every pandas version.
dataset.select_dtypes(include="number").corr()


Unnamed: 0,year,lifetime_risk,mortality_ratio,private_expenditure,government_expenditure,gdp_capita
year,1.0,-0.11515,-0.126779,-0.133786,0.164555,0.098643
lifetime_risk,-0.11515,1.0,0.989129,0.385841,-0.49034,-0.51478
mortality_ratio,-0.126779,0.989129,1.0,0.466092,-0.584008,-0.604825
private_expenditure,-0.133786,0.385841,0.466092,1.0,-0.684432,-0.362882
government_expenditure,0.164555,-0.49034,-0.584008,-0.684432,1.0,0.863571
gdp_capita,0.098643,-0.51478,-0.604825,-0.362882,0.863571,1.0


In [4]:
# One-hot encode the categorical `region` column; the numeric columns pass
# through unchanged.
# - columns=["region"] names the encoded column instead of relying on implicit
#   object-dtype detection.
# - dtype=int pins the indicators to 0/1 integers; pandas >= 2.0 would
#   otherwise emit booleans, which no longer matches the 0/1 table shown below.
# NOTE(review): every region indicator is kept while the regression also fits
# an intercept, so the indicators are perfectly collinear with it; consider
# drop_first=True if the individual coefficients are going to be interpreted.
region = pd.get_dummies(dataset, columns=["region"], dtype=int)
region.head(10)

Unnamed: 0,year,lifetime_risk,mortality_ratio,private_expenditure,government_expenditure,gdp_capita,region_East Asia & Pacific,region_Europe & Central Asia,region_Latin America & Caribbean,region_Middle East & North Africa,region_North America,region_South Asia,region_Sub-Saharan Africa
0,2000,0.21,113,29.52,4.34,7178.89,1,0,0,0,0,0,0
1,2000,0.05,33,24.49,5.88,22220.7,0,1,0,0,0,0,0
2,2000,0.29,99,56.91,2.53,11458.67,0,0,1,0,0,0,0
3,2000,0.38,113,44.46,2.48,13554.73,0,0,0,1,0,0,0
4,2000,0.02,12,54.5,5.56,45146.13,0,0,0,0,1,0,0
5,2000,1.45,388,73.61,0.87,2520.24,0,0,0,0,0,1,0
6,2000,4.72,846,60.33,1.72,2379.29,0,0,0,0,0,0,1
7,2001,0.2,110,29.9,4.36,7386.65,1,0,0,0,0,0,0
8,2001,0.05,32,24.36,6.04,22723.1,0,1,0,0,0,0,0
9,2001,0.27,95,56.79,2.6,11377.76,0,0,1,0,0,0,0


In [5]:
# Separate the target (mortality_ratio) from the remaining feature columns
y = region["mortality_ratio"]
X = region.drop(columns=["mortality_ratio"])

In [6]:
# Split between train and test
# NOTE(review): train_test_split is imported here, but none of the cells visible
# in this notebook call it; the train/test split is done chronologically on
# `year` instead. Confirm whether this import is still needed.
from sklearn.model_selection import train_test_split

In [7]:
# Chronological hold-out: can a model fitted on historical rows (before 2010)
# predict the recent rows (2010 onward), and potentially future data?
# NOTE(review): lifetime_risk correlates 0.989 with mortality_ratio in the
# correlation table, so it may be leaking the target into the features —
# confirm that keeping it as a predictor is intended.
before_2010 = region['year'] < 2010
from_2010 = region['year'] >= 2010

X_train = region.loc[before_2010].drop(columns="mortality_ratio")
y_train = region.loc[before_2010, 'mortality_ratio']
X_test = region.loc[from_2010].drop(columns="mortality_ratio")
y_test = region.loc[from_2010, 'mortality_ratio']

In [8]:
# Preview the first rows of the held-out feature matrix
X_test.head(n=5)

Unnamed: 0,year,lifetime_risk,private_expenditure,government_expenditure,gdp_capita,region_East Asia & Pacific,region_Europe & Central Asia,region_Latin America & Caribbean,region_Middle East & North Africa,region_North America,region_South Asia,region_Sub-Saharan Africa
70,2010,0.13,30.64,4.38,11964.95,1,0,0,0,0,0,0
71,2010,0.03,22.26,7.03,27035.97,0,1,0,0,0,0,0
72,2010,0.19,55.07,3.51,13701.07,0,0,1,0,0,0,0
73,2010,0.28,46.06,2.53,16802.83,0,0,0,1,0,0,0
74,2010,0.03,49.87,7.94,48516.51,0,0,0,0,1,0,0


In [9]:
# Create the linear regression estimator (fitted in a later cell)
from sklearn.linear_model import LinearRegression

model = LinearRegression()

In [10]:
# Train the regression on the training split; the fitted estimator is the
# cell's last expression, so its repr is displayed
model.fit(X=X_train, y=y_train)

LinearRegression(copy_X=True, fit_intercept=True, n_jobs=None, normalize=False)

In [11]:
# Report the fitted parameters: one weight per feature column, then the intercept
for label, value in (
    ('Weight coefficients: ', model.coef_),
    ('y-axis intercept: ', model.intercept_),
):
    print(label, value)

Weight coefficients:  [-2.40666256e+00  1.57486205e+02 -1.00061080e+00  5.79612164e+00
  1.23716766e-03 -7.27476217e+00 -9.49951456e+01  2.07680658e+00
 -9.30974071e+00 -1.06615140e+02  1.37856928e+02  7.82610536e+01]
y-axis intercept:  4895.492526476632


In [12]:
# Generate predictions for both splits with the fitted model
predictions_test, predictions_train = (
    model.predict(features) for features in (X_test, X_train)
)

In [16]:
# R-squared on each split, computed with the estimator's own score() method
training_score, testing_score = (
    model.score(features, target)
    for features, target in ((X_train, y_train), (X_test, y_test))
)
print(f"R-squared (R2) Train: {training_score}")
print(f"R-squared (R2) Test: {testing_score}")

R-squared (R2) Train: 0.9997803078696305
R-squared (R2) Test: 0.9933822839298257


In [17]:
# Mean squared error between the actual targets and the predictions, per split
mse_test = mean_squared_error(y_true=y_test, y_pred=predictions_test)
mse_train = mean_squared_error(y_true=y_train, y_pred=predictions_train)

print(f"Mean Squared Error (MSE) Train: {mse_train}")
print(f"Mean Squared Error (MSE) Test: {mse_test}")


Mean Squared Error (MSE) Train: 12.680327800341137
Mean Squared Error (MSE) Test: 230.5879876177151
