Created by: Dylan Eggemeyer

Created on: 4/10/2023



Notebook references code discussed by Ken Jee in the following YouTube video: https://www.youtube.com/watch?v=7O4dpR9QMIM

In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error
from sklearn.metrics import mean_squared_error
from sklearn.metrics import r2_score

df = pd.read_csv('final_dataset.csv')

In [3]:
df.columns

Index(['Unnamed: 0', 'YEAR', 'STATE_AB', 'STATE', 'INFANT_MORTALITY_RATE',
       'INFANT_DEATHS', 'POVERTY_RATE', 'MEDIAN_INCOME', 'UNINSURED_RATE',
       'GRADUATION_RATE', 'VIOLENT_CRIME_RATE'],
      dtype='object')

In [27]:
# Linear Regression for Poverty Rate
X = df[['POVERTY_RATE']]
y = df['INFANT_MORTALITY_RATE']

# Split the Data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

#fit the model
lm = LinearRegression()

lm.fit(X_train, y_train)

#predict training data
y_pred = lm.predict(X)

#training results
print('Results for linear regression on training data')
print('  Default settings')
print('Internal parameters:')
print('   Bias is ', lm.intercept_)
print('   Coefficients', lm.coef_)
print('   Score', lm.score(X,y))

print('MAE is  ', mean_absolute_error(y, y_pred))
print('RMSE is ', np.sqrt(mean_squared_error(y, y_pred)))
print('MSE is ', mean_squared_error(y, y_pred))
print('R^2    ', r2_score(y,y_pred))

#test results
y_test_pred = lm.predict(X_test)
print()
print('Results for linear regression on test data')

print('MAE is  ', mean_absolute_error(y_test, y_test_pred))
print('RMSE is ', np.sqrt(mean_squared_error(y_test, y_test_pred)))
print('MSE is ', mean_squared_error(y_test, y_test_pred))
print('R^2    ', r2_score(y_test,y_test_pred))



Results for linear regression on training data
  Default settings
Internal parameters:
   Bias is  2.6120222498503405
   Coefficients [25.65870828]
   Score 0.39749419188220636
MAE is   0.7419979334591933
RMSE is  0.9122757680932425
MSE is  0.8322470770501157
R^2     0.39749419188220636

Results for linear regression on test data
MAE is   0.6396329784995215
RMSE is  0.7844720034901672
MSE is  0.615396324259877
R^2     0.4533949769395169


In [30]:
# Linear Regression for Median Income
X = df[['MEDIAN_INCOME']]
y = df['INFANT_MORTALITY_RATE']

# Split the Data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

#fit the model
lm = LinearRegression()

lm.fit(X_train, y_train)

#predict training data
y_pred = lm.predict(X)

#training results
print('Results for linear regression on training data')
print('  Default settings')
print('Internal parameters:')
print('   Bias is ', lm.intercept_)
print('   Coefficients', lm.coef_)
print('   Score', lm.score(X,y))

print('MAE is  ', mean_absolute_error(y, y_pred))
print('RMSE is ', np.sqrt(mean_squared_error(y, y_pred)))
print('MSE is ', mean_squared_error(y, y_pred))
print('R^2    ', r2_score(y,y_pred))

#test results
y_test_pred = lm.predict(X_test)
print()
print('Results for linear regression on test data')

print('MAE is  ', mean_absolute_error(y_test, y_test_pred))
print('RMSE is ', np.sqrt(mean_squared_error(y_test, y_test_pred)))
print('MSE is ', mean_squared_error(y_test, y_test_pred))
print('R^2    ', r2_score(y_test,y_test_pred))



Results for linear regression on training data
  Default settings
Internal parameters:
   Bias is  10.475368396940212
   Coefficients [-7.15846517e-05]
   Score 0.41462540903441525
MAE is   0.7364628307880804
RMSE is  0.8992127455516629
MSE is  0.8085835617625596
R^2     0.41462540903441525

Results for linear regression on test data
MAE is   0.6581230515427428
RMSE is  0.8202771991485074
MSE is  0.6728546834429201
R^2     0.40235952789936


In [29]:
# Linear Regression for Uninsured Rate
y = df['INFANT_MORTALITY_RATE']

# Split the Data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

#fit the model
lm = LinearRegression()

lm.fit(X_train, y_train)

#predict training data
y_pred = lm.predict(X)

#training results
print('Results for linear regression on training data')
print('  Default settings')
print('Internal parameters:')
print('   Bias is ', lm.intercept_)
print('   Coefficients', lm.coef_)
print('   Score', lm.score(X,y))

print('MAE is  ', mean_absolute_error(y, y_pred))
print('RMSE is ', np.sqrt(mean_squared_error(y, y_pred)))
print('MSE is ', mean_squared_error(y, y_pred))
print('R^2    ', r2_score(y,y_pred))

#test results
y_test_pred = lm.predict(X_test)
print()
print('Results for linear regression on test data')

print('MAE is  ', mean_absolute_error(y_test, y_test_pred))
print('RMSE is ', np.sqrt(mean_squared_error(y_test, y_test_pred)))
print('MSE is ', mean_squared_error(y_test, y_test_pred))
print('R^2    ', r2_score(y_test,y_test_pred))



Results for linear regression on training data
  Default settings
Internal parameters:
   Bias is  4.83673482925167
   Coefficients [13.26918171]
   Score 0.11376094547611693
MAE is   0.9289446761315049
RMSE is  1.1064224074385254
MSE is  1.2241705436820622
R^2     0.11376094547611693

Results for linear regression on test data
MAE is   0.8579324863755584
RMSE is  1.0351115878041326
MSE is  1.0714559992063926
R^2     0.04831535700368883


In [31]:
# Linear Regression for Graduation Rate
X = df[['GRADUATION_RATE']]
y = df['INFANT_MORTALITY_RATE']

# Split the Data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

#fit the model
lm = LinearRegression()

lm.fit(X_train, y_train)

#predict training data
y_pred = lm.predict(X)

#training results
print('Results for linear regression on training data')
print('  Default settings')
print('Internal parameters:')
print('   Bias is ', lm.intercept_)
print('   Coefficients', lm.coef_)
print('   Score', lm.score(X,y))

print('MAE is  ', mean_absolute_error(y, y_pred))
print('RMSE is ', np.sqrt(mean_squared_error(y, y_pred)))
print('MSE is ', mean_squared_error(y, y_pred))
print('R^2    ', r2_score(y,y_pred))

#test results
y_test_pred = lm.predict(X_test)
print()
print('Results for linear regression on test data')

print('MAE is  ', mean_absolute_error(y_test, y_test_pred))
print('RMSE is ', np.sqrt(mean_squared_error(y_test, y_test_pred)))
print('MSE is ', mean_squared_error(y_test, y_test_pred))
print('R^2    ', r2_score(y_test,y_test_pred))

Results for linear regression on training data
  Default settings
Internal parameters:
   Bias is  5.054349119057871
   Coefficients [1.03859242]
   Score -0.0023946365919218504
MAE is   0.937731715567034
RMSE is  1.1766976536853007
MSE is  1.384617368188492
R^2     -0.0023946365919218504

Results for linear regression on test data
MAE is   0.8101300251423276
RMSE is  1.08941684667199
MSE is  1.186829065812742
R^2     -0.054160877004967745


In [32]:
# Linear Regression for Median Income
X = df[['VIOLENT_CRIME_RATE']]
y = df['INFANT_MORTALITY_RATE']

# Split the Data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

#fit the model
lm = LinearRegression()

lm.fit(X_train, y_train)

#predict training data
y_pred = lm.predict(X)

#training results
print('Results for linear regression on training data')
print('  Default settings')
print('Internal parameters:')
print('   Bias is ', lm.intercept_)
print('   Coefficients', lm.coef_)
print('   Score', lm.score(X,y))

print('MAE is  ', mean_absolute_error(y, y_pred))
print('RMSE is ', np.sqrt(mean_squared_error(y, y_pred)))
print('MSE is ', mean_squared_error(y, y_pred))
print('R^2    ', r2_score(y,y_pred))

#test results
y_test_pred = lm.predict(X_test)
print()
print('Results for linear regression on test data')

print('MAE is  ', mean_absolute_error(y_test, y_test_pred))
print('RMSE is ', np.sqrt(mean_squared_error(y_test, y_test_pred)))
print('MSE is ', mean_squared_error(y_test, y_test_pred))
print('R^2    ', r2_score(y_test,y_test_pred))

Results for linear regression on training data
  Default settings
Internal parameters:
   Bias is  4.988535747626972
   Coefficients [0.00252129]
   Score 0.10605980582759122
MAE is   0.8977700635550363
RMSE is  1.1112192412248054
MSE is  1.234808202068232
R^2     0.10605980582759122

Results for linear regression on test data
MAE is   0.75605539518937
RMSE is  0.9897015407699402
MSE is  0.9795091398023936
R^2     0.1299840528076599


In [33]:
# Multiple Linear Regression for All Independent Variables
X = df[['POVERTY_RATE', 'MEDIAN_INCOME', 'UNINSURED_RATE',
       'GRADUATION_RATE', 'VIOLENT_CRIME_RATE']]
y = df['INFANT_MORTALITY_RATE']

# Split the Data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

#fit the model
lm = LinearRegression()

lm.fit(X_train, y_train)

#predict training data
y_pred = lm.predict(X)

#training results
print('Results for linear regression on training data')
print('  Default settings')
print('Internal parameters:')
print('   Bias is ', lm.intercept_)
print('   Coefficients', lm.coef_)
print('   Score', lm.score(X,y))

print('MAE is  ', mean_absolute_error(y, y_pred))
print('RMSE is ', np.sqrt(mean_squared_error(y, y_pred)))
print('MSE is ', mean_squared_error(y, y_pred))
print('R^2    ', r2_score(y,y_pred))

#test results
y_test_pred = lm.predict(X_test)
print()
print('Results for linear regression on test data')

print('MAE is  ', mean_absolute_error(y_test, y_test_pred))
print('RMSE is ', np.sqrt(mean_squared_error(y_test, y_test_pred)))
print('MSE is ', mean_squared_error(y_test, y_test_pred))
print('R^2    ', r2_score(y_test,y_test_pred))

Results for linear regression on training data
  Default settings
Internal parameters:
   Bias is  1.7536739264247583
   Coefficients [ 9.44889134e+00 -4.51784406e-05  3.92241810e+00  5.97620958e+00
  1.15004543e-03]
   Score 0.48769837364788315
MAE is   0.6793632159122984
RMSE is  0.8412176720603425
MSE is  0.7076471717866218
R^2     0.48769837364788315

Results for linear regression on test data
MAE is   0.6035842496105147
RMSE is  0.7744951496926002
MSE is  0.5998427368973633
R^2     0.4672099261743613


In [34]:
# Multiple Linear Regression for All Independent Variables less Graduation Rate
X = df[['POVERTY_RATE', 'MEDIAN_INCOME', 'UNINSURED_RATE', 'VIOLENT_CRIME_RATE']]
y = df['INFANT_MORTALITY_RATE']

# Split the Data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

#fit the model
lm = LinearRegression()

lm.fit(X_train, y_train)

#predict training data
y_pred = lm.predict(X)

#training results
print('Results for linear regression on training data')
print('  Default settings')
print('Internal parameters:')
print('   Bias is ', lm.intercept_)
print('   Coefficients', lm.coef_)
print('   Score', lm.score(X,y))

print('MAE is  ', mean_absolute_error(y, y_pred))
print('RMSE is ', np.sqrt(mean_squared_error(y, y_pred)))
print('MSE is ', mean_squared_error(y, y_pred))
print('R^2    ', r2_score(y,y_pred))

#test results
y_test_pred = lm.predict(X_test)
print()
print('Results for linear regression on test data')

print('MAE is  ', mean_absolute_error(y_test, y_test_pred))
print('RMSE is ', np.sqrt(mean_squared_error(y_test, y_test_pred)))
print('MSE is ', mean_squared_error(y_test, y_test_pred))
print('R^2    ', r2_score(y_test,y_test_pred))

Results for linear regression on training data
  Default settings
Internal parameters:
   Bias is  7.648242097460576
   Coefficients [ 6.75493445e+00 -5.00457300e-05  3.72668892e+00  7.39968209e-04]
   Score 0.4558304480369957
MAE is   0.7052831011907893
RMSE is  0.8669871074218076
MSE is  0.751666644435633
R^2     0.4558304480369957

Results for linear regression on test data
MAE is   0.617669415671225
RMSE is  0.7801965396328119
MSE is  0.6087066404550139
R^2     0.4593368628856547


In [35]:
# Multiple Linear Regression for All Independent Variables less Graduation Rate and Unisured Rate
X = df[['POVERTY_RATE', 'MEDIAN_INCOME', 'VIOLENT_CRIME_RATE']]
y = df['INFANT_MORTALITY_RATE']

# Split the Data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

#fit the model
lm = LinearRegression()

lm.fit(X_train, y_train)

#predict training data
y_pred = lm.predict(X)

#training results
print('Results for linear regression on training data')
print('  Default settings')
print('Internal parameters:')
print('   Bias is ', lm.intercept_)
print('   Coefficients', lm.coef_)
print('   Score', lm.score(X,y))

print('MAE is  ', mean_absolute_error(y, y_pred))
print('RMSE is ', np.sqrt(mean_squared_error(y, y_pred)))
print('MSE is ', mean_squared_error(y, y_pred))
print('R^2    ', r2_score(y,y_pred))

#test results
y_test_pred = lm.predict(X_test)
print()
print('Results for linear regression on test data')

print('MAE is  ', mean_absolute_error(y_test, y_test_pred))
print('RMSE is ', np.sqrt(mean_squared_error(y_test, y_test_pred)))
print('MSE is ', mean_squared_error(y_test, y_test_pred))
print('R^2    ', r2_score(y_test,y_test_pred))

Results for linear regression on training data
  Default settings
Internal parameters:
   Bias is  7.999539109826692
   Coefficients [ 6.90093523e+00 -5.25589569e-05  1.00006719e-03]
   Score 0.4525799426777395
MAE is   0.7018334176330111
RMSE is  0.8695726529752568
MSE is  0.7561565988024262
R^2     0.4525799426777395

Results for linear regression on test data
MAE is   0.5997222323306417
RMSE is  0.7672728667521137
MSE is  0.5887076520540069
R^2     0.4771002896160075


In [50]:
# Polynomial Linear Regression (poly = 4)
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import PolynomialFeatures
from sklearn.preprocessing import StandardScaler

X = df[['POVERTY_RATE', 'MEDIAN_INCOME', 'UNINSURED_RATE',
       'GRADUATION_RATE', 'VIOLENT_CRIME_RATE']]
y = df['INFANT_MORTALITY_RATE']

# Split the Data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Run the Model
imp_mean = SimpleImputer(missing_values=np.nan, strategy='mean')
poly4 = PolynomialFeatures(degree=4, include_bias=False)
scale = StandardScaler()
lr_model = LinearRegression()

stages = [('imp_mean', imp_mean),
          ('poly4', poly4),
          ('scale', scale),
          ('lr_model', lr_model),
         ]
pipe_model = Pipeline(stages)

pipe_model.fit(X_train, y_train)

y_pred = pipe_model.predict(X_train)
print('Results for pipeline linear regression on training data')
#print('  Default settings')
#print('Internal parameters:')
#print('   Bias is ', pipe_model.predict([[0]], [[0]], [[0]], [[0]], [[0]]))
#print('   Coefficients', lr_model.coef_)
print('   Score', pipe_model.score(X_train, y_train))

print('MAE is  ', mean_absolute_error(y_train, y_pred))
print('RMSE is ', np.sqrt(mean_squared_error(y_train, y_pred)))
print('MSE is ', mean_squared_error(y_train, y_pred))
print('R^2    ', r2_score(y_train,y_pred))

y_test_pred = pipe_model.predict(X_test)
print()
print('Results for pipeline linear regression on test data')

print('MAE is  ', mean_absolute_error(y_test, y_test_pred))
print('RMSE is ', np.sqrt(mean_squared_error(y_test, y_test_pred)))
print('MSE is ', mean_squared_error(y_test, y_test_pred))
print('R^2    ', r2_score(y_test,y_test_pred))

Results for pipeline linear regression on training data
   Score 0.9368393358466842
MAE is   0.2014403322175146
RMSE is  0.30118612718088594
MSE is  0.09071308320622082
R^2     0.9368393358466842

Results for pipeline linear regression on test data
MAE is   2.438265213681786
RMSE is  5.247254520331766
MSE is  27.533680001142148
R^2     -23.455862342149366


In [47]:
# Polynomial Linear Regression (poly = 3)
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import PolynomialFeatures
from sklearn.preprocessing import StandardScaler

X = df[['POVERTY_RATE', 'MEDIAN_INCOME', 'UNINSURED_RATE',
       'GRADUATION_RATE', 'VIOLENT_CRIME_RATE']]
y = df['INFANT_MORTALITY_RATE']

# Split the Data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Run the Model
imp_mean = SimpleImputer(missing_values=np.nan, strategy='mean')
poly3 = PolynomialFeatures(degree=3, include_bias=False)
scale = StandardScaler()
lr_model = LinearRegression()

stages = [('imp_mean', imp_mean),
          ('poly3', poly3),
          ('scale', scale),
          ('lr_model', lr_model),
         ]
pipe_model = Pipeline(stages)

pipe_model.fit(X_train, y_train)

y_pred = pipe_model.predict(X_train)
print('Results for pipeline linear regression on training data')
#print('  Default settings')
#print('Internal parameters:')
#print('   Bias is ', pipe_model.predict([[0]], [[0]], [[0]], [[0]], [[0]]))
#print('   Coefficients', lr_model.coef_)
print('   Score', pipe_model.score(X_train, y_train))

print('MAE is  ', mean_absolute_error(y_train, y_pred))
print('RMSE is ', np.sqrt(mean_squared_error(y_train, y_pred)))
print('MSE is ', mean_squared_error(y_train, y_pred))
print('R^2    ', r2_score(y_train,y_pred))

y_test_pred = pipe_model.predict(X_test)
print()
print('Results for pipeline linear regression on test data')

print('MAE is  ', mean_absolute_error(y_test, y_test_pred))
print('RMSE is ', np.sqrt(mean_squared_error(y_test, y_test_pred)))
print('MSE is ', mean_squared_error(y_test, y_test_pred))
print('R^2    ', r2_score(y_test,y_test_pred))

Results for pipeline linear regression on training data
   Score 0.7617177973995608
MAE is   0.4564403574761555
RMSE is  0.5850020972280311
MSE is  0.34222745376119473
R^2     0.7617177973995608

Results for pipeline linear regression on test data
MAE is   0.6373930903220107
RMSE is  0.8786631943416208
MSE is  0.7720490090906209
R^2     0.3142535147161649
