In [1]:
# Import dependencies
from sklearn.datasets import make_classification
import matplotlib.pyplot as plt
from sklearn.metrics import mean_squared_error, r2_score
import numpy as np
import pandas as pd
import os

In [2]:
# Load the prepared dataset from disk into a DataFrame and preview the first rows
dataset = pd.read_csv("Final_Dataset.csv")
dataset.head(n=5)

Unnamed: 0,region,year,lifetime_risk,mortality_ratio,private_expenditure,government_expenditure,gdp_capita
0,East Asia & Pacific,2000,0.21,113,29.52,4.34,7178.89
1,Europe & Central Asia,2000,0.05,33,24.49,5.88,22220.7
2,Latin America & Caribbean,2000,0.29,99,56.91,2.53,11458.67
3,Middle East & North Africa,2000,0.38,113,44.46,2.48,13554.73
4,North America,2000,0.02,12,54.5,5.56,45146.13


In [3]:
# Keep only the rows whose `region` value is exactly 'South Asia'
s_asia = dataset[dataset["region"].eq("South Asia")]
s_asia.head(n=5)

Unnamed: 0,region,year,lifetime_risk,mortality_ratio,private_expenditure,government_expenditure,gdp_capita
5,South Asia,2000,1.45,388,73.61,0.87,2520.24
12,South Asia,2001,1.35,369,75.8,0.83,2579.88
19,South Asia,2002,1.25,350,75.69,0.82,2627.92
26,South Asia,2003,1.16,331,77.08,0.75,2767.64
33,South Asia,2004,1.07,314,77.21,0.73,2926.16


In [4]:
# One-hot encode the categorical columns of the South Asia subset.
# NOTE(review): the preview on the last line shows `s_asia`, i.e. the frame
# *before* encoding; inspect `region` to see the encoded result.
region = pd.get_dummies(data=s_asia)
s_asia.head(n=10)

Unnamed: 0,region,year,lifetime_risk,mortality_ratio,private_expenditure,government_expenditure,gdp_capita
5,South Asia,2000,1.45,388,73.61,0.87,2520.24
12,South Asia,2001,1.35,369,75.8,0.83,2579.88
19,South Asia,2002,1.25,350,75.69,0.82,2627.92
26,South Asia,2003,1.16,331,77.08,0.75,2767.64
33,South Asia,2004,1.07,314,77.21,0.73,2926.16
40,South Asia,2005,0.99,296,76.35,0.76,3130.57
47,South Asia,2006,0.92,281,76.6,0.74,3346.11
54,South Asia,2007,0.85,265,76.03,0.75,3588.23
61,South Asia,2008,0.79,252,74.37,0.78,3669.83
68,South Asia,2009,0.74,239,72.51,0.86,3887.94


In [5]:
# Separate the target column (mortality_ratio) from the remaining feature columns
y = region["mortality_ratio"]
X = region.drop(columns="mortality_ratio")

In [6]:
# Split between train and test
from sklearn.model_selection import train_test_split

In [7]:
# Chronological split: can historical data predict recent (and potentially
# future) data? Rows before 2010 are used for fitting; rows from 2010 onwards
# are held out for evaluation.
train_rows = region[region["year"].lt(2010)]
test_rows = region[region["year"].ge(2010)]

X_train = train_rows.drop(columns="mortality_ratio")
y_train = train_rows["mortality_ratio"]
X_test = test_rows.drop(columns="mortality_ratio")
y_test = test_rows["mortality_ratio"]

In [8]:
# Preview the first five rows of the held-out feature matrix
X_test.head(n=5)

Unnamed: 0,year,lifetime_risk,private_expenditure,government_expenditure,gdp_capita,region_South Asia
75,2010,0.69,72.16,0.84,4173.31,1
82,2011,0.64,69.77,0.9,4371.64,1
89,2012,0.6,70.25,0.9,4547.65,1
96,2013,0.56,74.78,0.86,4758.14,1
103,2014,0.53,73.96,0.84,5022.28,1


In [9]:
# Instantiate an unfitted scikit-learn LinearRegression estimator using its
# default settings; fitting happens in a separate cell.
from sklearn.linear_model import LinearRegression
model = LinearRegression()

In [10]:
# Train the regression on the pre-2010 features and targets
model.fit(X=X_train, y=y_train)

LinearRegression(copy_X=True, fit_intercept=True, n_jobs=None, normalize=False)

In [11]:
# Show the fitted parameters: one weight per feature column, then the intercept
print("Weight coefficients: ", model.coef_)
print("y-axis intercept: ", model.intercept_)

Weight coefficients:  [-3.48781047e+00  1.53815690e+02 -6.40578460e-02 -3.89766318e+00
 -6.19859737e-03  0.00000000e+00]
y-axis intercept:  7164.389366935581


In [21]:
# Predict the held-out rows, then compare the predictions with the true targets
predicted = model.predict(X_test)

r2 = r2_score(y_true=y_test, y_pred=predicted)
mse = mean_squared_error(y_true=y_test, y_pred=predicted)

print(f"Mean Squared Error (MSE): {mse}")
print(f"R-squared (R2): {r2}")

Mean Squared Error (MSE): 37.40286205753507
R-squared (R2): 0.8535932332204781


In [13]:
# Predictions for the held-out rows (the training-set equivalent is left disabled)
predictions_test = model.predict(X=X_test)
# predictions_train = model.predict(X_train)


In [16]:
# R-squared on each split, computed with the estimator's own score() method
testing_score = model.score(X=X_test, y=y_test)
training_score = model.score(X=X_train, y=y_train)

print(f"R-squared (R2) Train: {training_score}")
print(f"R-squared (R2) Test: {testing_score}")

R-squared (R2) Train: 0.9999526851447225
R-squared (R2) Test: 0.8535932332204781


In [19]:
# Mean squared error on the held-out rows (the training-set version is left disabled)
mse_test = mean_squared_error(y_true=y_test, y_pred=predictions_test)
# mse_train = mean_squared_error(y_train, predictions_train)

print(f"Mean Squared Error (MSE) Test: {mse_test}")
# print(f"Mean Squared Error (MSE) Train: {mse_train}")


Mean Squared Error (MSE) Test: 37.40286205753507
