In [25]:
import boto3
import pandas as pd; pd.set_option('display.max_column', 100)
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression, Ridge, RidgeCV, Lasso, LassoCV
from sklearn.preprocessing import MinMaxScaler

## define bucket in which you are trying to reach
s3 = boto3.resource('s3')
bucket_name = 'daltondencklau-data445-bucket'
bucket = s3.Bucket(bucket_name)

## define csv file to read in the bucket
file_key= 'College.csv'

## syntax to allow us to read the file
bucket_object = bucket.Object(file_key)
file_object = bucket_object.get()
file_content_stream = file_object.get('Body')

## reading the data file
college = pd.read_csv(file_content_stream)
college.head()

Unnamed: 0,Private,Apps,Accept,Enroll,Top10perc,Top25perc,F.Undergrad,P.Undergrad,Outstate,Room.Board,Books,Personal,PhD,Terminal,S.F.Ratio,perc.alumni,Expend,Grad.Rate
Abilene Christian University,Yes,1660,1232,721,23,52,2885,537,7440,3300,450,2200,70,78,18.1,12,7041,60
Adelphi University,Yes,2186,1924,512,16,29,2683,1227,12280,6450,750,1500,29,30,12.2,16,10527,56
Adrian College,Yes,1428,1097,336,22,50,1036,99,11250,3750,400,1165,53,66,12.9,30,8735,54
Agnes Scott College,Yes,417,349,137,60,89,510,63,12960,5450,450,875,92,97,7.7,37,19016,59
Alaska Pacific University,Yes,193,146,55,16,44,249,869,7560,4120,800,1500,76,72,11.9,2,10922,15


In [27]:
## Changing the variable 'Private' from a categorical to a numerical variable
college['Private'] = np.where(college['Private'] == 'No', 0 ,1)
college.head()

Unnamed: 0,Private,Apps,Accept,Enroll,Top10perc,Top25perc,F.Undergrad,P.Undergrad,Outstate,Room.Board,Books,Personal,PhD,Terminal,S.F.Ratio,perc.alumni,Expend,Grad.Rate
Abilene Christian University,1,1660,1232,721,23,52,2885,537,7440,3300,450,2200,70,78,18.1,12,7041,60
Adelphi University,1,2186,1924,512,16,29,2683,1227,12280,6450,750,1500,29,30,12.2,16,10527,56
Adrian College,1,1428,1097,336,22,50,1036,99,11250,3750,400,1165,53,66,12.9,30,8735,54
Agnes Scott College,1,417,349,137,60,89,510,63,12960,5450,450,875,92,97,7.7,37,19016,59
Alaska Pacific University,1,193,146,55,16,44,249,869,7560,4120,800,1500,76,72,11.9,2,10922,15


In [32]:
## defining the input (x) and target (y) variables
X = college[['Private', 'F.Undergrad', 'P.Undergrad', 'Outstate', 'Room.Board', 'Books', 'Personal', 'S.F.Ratio', 'Grad.Rate']]
Y = college['Apps']
## splitting the data into 80% trianing and 20% testing datasets
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size = .2)

In [36]:
## transforming the input variables in the training and testing dataset to a 0-1 scale
scaler = MinMaxScaler()
X = scaler.fit_transform(X)

## building linear regression model


In [37]:
## fitting the data to a linear regression model
lm_md = LinearRegression().fit(X_train, Y_train)

In [38]:
## predicting on the testing dataset
lm_preds = lm_md.predict(X_test)
lm_preds

array([ 1581.44317055,  1265.13740611,   990.4652647 ,  2448.15621274,
        1961.66807393,  5341.68052556,  5048.84262328,  6579.13128536,
        2151.09586621,  1396.49560093,  8388.64827914,  1904.86698277,
         298.01726743,  3991.2530581 ,  1627.37595322,    82.25404747,
        3263.89521202,  2759.11476211,  3899.95189114,  1567.39749821,
        3174.44081517,  3081.01780992,  1912.780899  , 10564.78768582,
          44.817636  ,  1978.31531261,  5609.0786481 ,  2232.28257186,
        1765.55924051,  2780.36502072,  8817.66294072,  3449.34796595,
        2369.73593766,  1552.50404178,  1113.64197531,   675.55128691,
        1319.64891641,   787.83150739,  1951.87227116,   -38.53070915,
        -239.02379606,  5078.81724219,  7405.4383469 ,  1422.62627215,
        3326.44290761,  2131.85657892,   409.13210121,  5972.18335129,
         890.38976797,  2298.10907054,  7430.6857173 ,  6836.21861893,
         445.83403831, 21450.76050157,    76.18325269,  1499.98603227,
      

In [43]:
## calculating the MSE for this model
mse1 = np.mean((Y_test - lm_preds)**2)
print('The MSE of the Linear Regression Model is', mse1)

The MSE of the Linear Regression Model is 1988029.5262185128


## building ridge regression model

In [56]:
## estimating the best lambda
from numpy.linspace import linspace
ridge_cv = RidgeCV(np.linspace = (0.001, 100), cv = 5).fit(X_train, Y_train)

## extracting the best lambda value
CV_lambda = ridge_cv.alpha_
print('The best lambda of the ridge model is', CV_lambda)

SyntaxError: expression cannot contain assignment, perhaps you meant "=="? (2722587422.py, line 3)

In [None]:
## building the ridge model
ridge_md = Ridge(alpha = CV_lambda).fit(X_train, Y_train)

## predicting on the testing dataset
ridge_pred = ridge_md.predict(X_test)

## computing the MSE of the ridge regression model
mse2 = np.mean(np.power(Y_test - ridge_pred, 2))
print('The MSE of the Ridge Regression Model is', mse2)

In [None]:
## building LASSO 