In [1]:
import boto3
import pandas as pd; pd.set_option('display.max_column', 100)
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression, Ridge, RidgeCV, Lasso, LassoCV
from sklearn.preprocessing import MinMaxScaler

## define bucket in which you are trying to reach
s3 = boto3.resource('s3')
bucket_name = 'daltondencklau-data445-bucket'
bucket = s3.Bucket(bucket_name)

## define csv file to read in the bucket
file_key= 'College.csv'

## syntax to allow us to read the file
bucket_object = bucket.Object(file_key)
file_object = bucket_object.get()
file_content_stream = file_object.get('Body')

## reading the data file
college = pd.read_csv(file_content_stream)
college.head()

## disabling the 'FutureWarning' warning message
import warnings
warnings.simplefilter(action = 'ignore', category = FutureWarning)

In [2]:
## Changing the variable 'Private' from a categorical to a numerical variable
college['Private'] = np.where(college['Private'] == 'No', 0 ,1)
college.head()

Unnamed: 0,Private,Apps,Accept,Enroll,Top10perc,Top25perc,F.Undergrad,P.Undergrad,Outstate,Room.Board,Books,Personal,PhD,Terminal,S.F.Ratio,perc.alumni,Expend,Grad.Rate
Abilene Christian University,1,1660,1232,721,23,52,2885,537,7440,3300,450,2200,70,78,18.1,12,7041,60
Adelphi University,1,2186,1924,512,16,29,2683,1227,12280,6450,750,1500,29,30,12.2,16,10527,56
Adrian College,1,1428,1097,336,22,50,1036,99,11250,3750,400,1165,53,66,12.9,30,8735,54
Agnes Scott College,1,417,349,137,60,89,510,63,12960,5450,450,875,92,97,7.7,37,19016,59
Alaska Pacific University,1,193,146,55,16,44,249,869,7560,4120,800,1500,76,72,11.9,2,10922,15


In [17]:
## defining the input (x) and target (y) variables
X = college[['Private', 'F.Undergrad', 'P.Undergrad', 'Outstate', 'Room.Board', 'Books', 'Personal', 'S.F.Ratio', 'Grad.Rate']]
Y = college['Apps']
## splitting the data into 80% trianing and 20% testing datasets
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size = .2)

In [18]:
## transforming the input variables in the training and testing dataset to a 0-1 scale
scaler = MinMaxScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.fit_transform(X_test)

## building linear regression model


In [19]:
## fitting the data to a linear regression model
lm_md = LinearRegression().fit(X_train, Y_train)

In [20]:
## predicting on the testing dataset
lm_preds = lm_md.predict(X_test)
lm_preds

array([ 6575.50060966,  2617.99670683,   398.20370707,  4194.57374335,
        4291.97053319,  4579.9400438 ,  2627.88870824,  4612.18466249,
         327.10559673, 10197.02393099,  3833.04574956,  3126.35375777,
         735.92240265,  1688.40043197,  3202.29339385,  4893.10823053,
        3764.53910102,  7998.46566191,  1103.65831189,  1577.15603435,
        4380.9024963 ,   409.15783381, 13189.6425703 ,  1679.15523363,
        2713.14730186,  4568.29401632,  9068.79031226,  6535.43185726,
        2320.14698468,  4492.33940817,   776.6433638 ,  1034.8465673 ,
        4627.34304367,  1296.9833608 ,   500.84877221,  2916.80614448,
         593.43667082,   544.89213503,  7099.16720474,  3116.65740998,
         184.64037536,  1405.53478077,  2975.94343816,  1741.97565164,
       10929.27488608,  2731.66507344,   107.06234189,  4740.3803971 ,
        4750.55906598,  1179.83131143,  -904.14788506,  1975.07509982,
        3074.66814315,  3964.62675579,  2404.7442677 ,  2525.55888361,
      

In [21]:
## calculating the MSE for this model
mse1 = np.mean((Y_test - lm_preds)**2)
print('The MSE of the Linear Regression Model is', mse1)

The MSE of the Linear Regression Model is 2950422.714033583


## building ridge regression model

In [22]:
## estimating the best lambda for ridge model (1st) and LASSO model (2nd)
ridge_cv = RidgeCV(alphas = np.linspace(0.001, 100), cv = 5).fit(X_train, Y_train)
lasso_cv = LassoCV(alphas = np.linspace(0.001, 100), normalize = True, cv = 5).fit(X_train, Y_train)

## extracting the best lambda value for both models
CV_lambda = ridge_cv.alpha_
cv_lambda = lasso_cv.alpha_
print('The optimal lambda for the ridge model is', CV_lambda)
print('The optimal lambda for the lasso model is', cv_lambda)

The optimal lambda for the ridge model is 0.001
The optimal lambda for the lasso model is 2.0417959183673466


In [29]:
## building the ridge model
ridge_md = Ridge(alpha = CV_lambda).fit(X_train, Y_train)

## building the lasso model and capturing coefficients
lasso_md = Lasso(alpha = cv_lambda, normalize = True).fit(X_train, Y_train)
lasso_md.coef_
              
## predicting on the testing dataset for the ridge model
ridge_pred = ridge_md.predict(X_test)
lasso_pred = lasso_md.predict(X_test)

## computing the MSE of both models
mse2 = np.mean(np.power(Y_test - ridge_pred, 2))
mse3 = np.mean(np.power(Y_test - lasso_pred, 2))

## printing MSE results
print('The MSE of the Linear Regression Model is', mse1)
print('The MSE of the Ridge Regression Model is', mse2)
print('The MSE of the LASSO Regression Model is', mse3)

The MSE of the Linear Regression Model is 2950422.714033583
The MSE of the Ridge Regression Model is 2950505.5710513885
The MSE of the LASSO Regression Model is 2974561.7142281863


In [30]:
mse1<mse2

True

In [31]:
mse1<mse3

True

Based on these MSE values, I would choose the Linear Regression model because it has the lowest MSE value.