In [25]:
import boto3
import pandas as pd; pd.set_option('display.max_column', 100)
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression, Ridge, RidgeCV, Lasso, LassoCV
from sklearn.preprocessing import MinMaxScaler

## define bucket in which you are trying to reach
s3 = boto3.resource('s3')
bucket_name = 'daltondencklau-data445-bucket'
bucket = s3.Bucket(bucket_name)

## define csv file to read in the bucket
file_key= 'College.csv'

## syntax to allow us to read the file
bucket_object = bucket.Object(file_key)
file_object = bucket_object.get()
file_content_stream = file_object.get('Body')

## reading the data file
college = pd.read_csv(file_content_stream)
college.head()

## disabling the 'FutureWarning' warning message
import warnings
warnings.simplefilter(action = 'ignore', category = FutureWarning)

In [26]:
## Changing the variable 'Private' from a categorical to a numerical variable
college['Private'] = np.where(college['Private'] == 'No', 0 ,1)
college.head()

Unnamed: 0,Private,Apps,Accept,Enroll,Top10perc,Top25perc,F.Undergrad,P.Undergrad,Outstate,Room.Board,Books,Personal,PhD,Terminal,S.F.Ratio,perc.alumni,Expend,Grad.Rate
Abilene Christian University,1,1660,1232,721,23,52,2885,537,7440,3300,450,2200,70,78,18.1,12,7041,60
Adelphi University,1,2186,1924,512,16,29,2683,1227,12280,6450,750,1500,29,30,12.2,16,10527,56
Adrian College,1,1428,1097,336,22,50,1036,99,11250,3750,400,1165,53,66,12.9,30,8735,54
Agnes Scott College,1,417,349,137,60,89,510,63,12960,5450,450,875,92,97,7.7,37,19016,59
Alaska Pacific University,1,193,146,55,16,44,249,869,7560,4120,800,1500,76,72,11.9,2,10922,15


In [27]:
## defining the input (x) and target (y) variables
X = college[['Private', 'F.Undergrad', 'P.Undergrad', 'Outstate', 'Room.Board', 'Books', 'Personal', 'S.F.Ratio', 'Grad.Rate']]
Y = college['Apps']
## splitting the data into 80% trianing and 20% testing datasets
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size = .2)

In [28]:
## transforming the input variables in the training and testing dataset to a 0-1 scale
scaler = MinMaxScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.fit_transform(X_test)

## building linear regression model


In [29]:
## fitting the data to a linear regression model
lm_md = LinearRegression().fit(X_train, Y_train)

In [30]:
## predicting on the testing dataset
lm_preds = lm_md.predict(X_test)
lm_preds

array([ 3.20107001e+03,  2.72864885e+03,  1.38193619e+04,  1.51607352e+03,
        1.61002571e+03,  4.53227673e+03,  2.80810416e+03,  5.49269762e+03,
        3.75888813e+03,  1.83126568e+03,  1.14155467e+03,  1.72033696e+03,
        5.37812866e+03,  2.37955380e+03,  6.86398854e+02,  1.12705882e+04,
       -2.11708295e+02,  2.38088579e+03,  2.17290893e+03,  2.45200872e+03,
       -6.52241124e+02,  8.67605943e+02,  6.98230823e+02,  1.95804053e+03,
        4.24162561e+03,  4.46412533e+03,  2.60401188e+03, -1.29441595e+03,
        4.09462293e+03,  3.98216572e+03,  1.34319426e+03,  1.31602666e+03,
        2.99698950e+03,  1.19906217e+03,  2.89596467e+03,  7.16089927e+03,
        8.52772515e+02,  4.75221556e+03,  6.29192407e+03,  5.28889554e+03,
        4.39172817e+03,  1.69244654e+03,  3.04980266e+03,  1.41581320e+01,
        5.37085936e+02,  1.23539764e+01,  1.30048988e+03,  1.30390043e+02,
        6.37940508e+02,  7.23192816e+02,  5.06681715e+03,  1.85784791e+03,
        1.47639243e+04,  

In [31]:
## calculating the MSE for this model
mse1 = np.mean((Y_test - lm_preds)**2)
print('The MSE of the Linear Regression Model is', mse1)

The MSE of the Linear Regression Model is 3213814.188961649


## building ridge and lasso regression model

In [39]:
## estimating the best lambda for ridge model (1st) and LASSO model (2nd)
ridge_cv = RidgeCV(alphas = np.linspace(0.001, 100, num = 100), cv = 5).fit(X_train, Y_train)
lasso_cv = LassoCV(alphas = np.linspace(0.001, 100, num = 100), normalize = True, cv = 5).fit(X_train, Y_train)

## extracting the best lambda value for both models
cv_lambda_ridge = ridge_cv.alpha_
cv_lambda_lasso = lasso_cv.alpha_
print('The optimal lambda for the ridge model is', cv_lambda_ridge)
print('The optimal lambda for the lasso model is', cv_lambda_lasso)

The optimal lambda for the ridge model is 0.001
The optimal lambda for the lasso model is 3.0312727272727273


In [40]:
## building the ridge model
ridge_md = Ridge(alpha = CV_lambda).fit(X_train, Y_train)

## building the lasso model and capturing coefficients
lasso_md = Lasso(alpha = cv_lambda, normalize = True).fit(X_train, Y_train)
lasso_md.coef_
              
## predicting on the testing dataset for the ridge model
ridge_pred = ridge_md.predict(X_test)
lasso_pred = lasso_md.predict(X_test)

## computing the MSE of both models
mse2 = np.mean(np.power(Y_test - ridge_pred, 2))
mse3 = np.mean(np.power(Y_test - lasso_pred, 2))

## printing MSE results
print('The MSE of the Linear Regression Model is', mse1)
print('The MSE of the Ridge Regression Model is', mse2)
print('The MSE of the LASSO Regression Model is', mse3)

The MSE of the Linear Regression Model is 3213814.188961649
The MSE of the Ridge Regression Model is 3213713.5716644274
The MSE of the LASSO Regression Model is 3353322.7970733885


In [42]:
mse2<mse1<mse3

True

Based on these MSE values, I would choose the Ridge Regression model to predict the number of applications a university will receive because it has the lowest MSE value.