In [7]:
# Import dependencies
from sklearn.datasets import make_classification
import matplotlib.pyplot as plt
from sklearn.metrics import mean_squared_error, r2_score
import numpy as np
import pandas as pd
import os

In [8]:
# Read the CSV into a Pandas DataFrame
dataset = pd.read_csv('Final_Dataset.csv')
dataset.head()

Unnamed: 0,region,year,lifetime_risk,mortality_ratio,private_expenditure,government_expenditure,gdp_capita
0,East Asia & Pacific,2000,0.21,113,29.52,4.34,7178.89
1,Europe & Central Asia,2000,0.05,33,24.49,5.88,22220.7
2,Latin America & Caribbean,2000,0.29,99,56.91,2.53,11458.67
3,Middle East & North Africa,2000,0.38,113,44.46,2.48,13554.73
4,North America,2000,0.02,12,54.5,5.56,45146.13


In [9]:
#Refine DataFrame to only have South Asian data
n_america = dataset.loc[dataset['region'] == 'North America']
n_america.head()

Unnamed: 0,region,year,lifetime_risk,mortality_ratio,private_expenditure,government_expenditure,gdp_capita
4,North America,2000,0.02,12,54.5,5.56,45146.13
11,North America,2001,0.02,12,53.65,5.97,45166.49
18,North America,2002,0.02,13,53.72,6.31,45594.67
25,North America,2003,0.02,13,53.78,6.5,46429.4
32,North America,2004,0.02,13,53.38,6.58,47712.99


In [10]:
# Use Pandas get_dummies to convert categorical data
region = pd.get_dummies(n_america)
region.head(10)

Unnamed: 0,year,lifetime_risk,mortality_ratio,private_expenditure,government_expenditure,gdp_capita,region_North America
4,2000,0.02,12,54.5,5.56,45146.13,1
11,2001,0.02,12,53.65,5.97,45166.49,1
18,2002,0.02,13,53.72,6.31,45594.67,1
25,2003,0.02,13,53.78,6.5,46429.4,1
32,2004,0.02,13,53.38,6.58,47712.99,1
39,2005,0.02,13,53.16,6.6,48847.89,1
46,2006,0.03,13,52.21,6.78,49676.06,1
53,2007,0.03,13,52.04,6.9,50091.65,1
60,2008,0.03,14,51.05,7.22,49521.44,1
67,2009,0.03,14,50.25,7.88,47706.76,1


In [11]:
# Set x and y
X = region.drop("mortality_ratio", axis=1)
y = region[("mortality_ratio")]

In [12]:
# Split between train and test
from sklearn.model_selection import train_test_split

In [13]:
# Can I use historical data to predict recent data (and potentially future data?)
X_train = region[region['year'] < 2010].drop("mortality_ratio", axis=1)
y_train = region[region['year'] < 2010]['mortality_ratio']
X_test = region[region['year'] >= 2010].drop("mortality_ratio", axis=1)
y_test = region[region['year'] >= 2010]['mortality_ratio']

In [14]:
X_test.head()

Unnamed: 0,year,lifetime_risk,private_expenditure,government_expenditure,gdp_capita,region_North America
74,2010,0.03,49.87,7.94,48516.51,1
81,2011,0.03,49.8,7.9,48978.08,1
88,2012,0.03,49.88,7.89,49651.33,1
95,2013,0.03,49.55,7.93,50137.02,1
102,2014,0.02,19.09,12.86,51034.87,1


In [15]:
# Run Linear Regression model
from sklearn.linear_model import LinearRegression
model = LinearRegression()

In [16]:
# Fit the model
model.fit(X_train, y_train)

LinearRegression(copy_X=True, fit_intercept=True, n_jobs=None, normalize=False)

In [17]:
# Print the coefficient and the intercept for the model
print('Weight coefficients: ', model.coef_)
print('y-axis intercept: ', model.intercept_)

Weight coefficients:  [ 2.72760356e-01 -1.28479756e+01  2.35544456e-01  5.29566657e-01
 -1.28833567e-04  0.00000000e+00]
y-axis intercept:  -543.2503274917502


In [18]:
# Calculate the model.predict
predicted = model.predict(X_test)

#Calculate and print the mean squared error and r2
mse = mean_squared_error(y_test, predicted)
r2 = r2_score(y_test, predicted)

print(f"Mean Squared Error (MSE): {mse}")
print(f"R-squared (R2): {r2}")

Mean Squared Error (MSE): 2.0513613456815807
R-squared (R2): -8.231126055567113
