In [1]:
import boto3
import pandas as pd; pd.set_option('display.max_column', 100)
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression, Ridge, RidgeCV, Lasso, LassoCV
from sklearn.preprocessing import MinMaxScaler

## define bucket in which you are trying to reach
s3 = boto3.resource('s3')
bucket_name = 'daltondencklau-data445-bucket'
bucket = s3.Bucket(bucket_name)

## define csv file to read in the bucket
file_key= 'College.csv'

## syntax to allow us to read the file
bucket_object = bucket.Object(file_key)
file_object = bucket_object.get()
file_content_stream = file_object.get('Body')

## reading the data file
college = pd.read_csv(file_content_stream)
college.head()

## disabling the 'FutureWarning' warning message
import warnings
warnings.simplefilter(action = 'ignore', category = FutureWarning)

In [2]:
## Changing the variable 'Private' from a categorical to a numerical variable
college['Private'] = np.where(college['Private'] == 'No', 0 ,1)
college.head()

Unnamed: 0,Private,Apps,Accept,Enroll,Top10perc,Top25perc,F.Undergrad,P.Undergrad,Outstate,Room.Board,Books,Personal,PhD,Terminal,S.F.Ratio,perc.alumni,Expend,Grad.Rate
Abilene Christian University,1,1660,1232,721,23,52,2885,537,7440,3300,450,2200,70,78,18.1,12,7041,60
Adelphi University,1,2186,1924,512,16,29,2683,1227,12280,6450,750,1500,29,30,12.2,16,10527,56
Adrian College,1,1428,1097,336,22,50,1036,99,11250,3750,400,1165,53,66,12.9,30,8735,54
Agnes Scott College,1,417,349,137,60,89,510,63,12960,5450,450,875,92,97,7.7,37,19016,59
Alaska Pacific University,1,193,146,55,16,44,249,869,7560,4120,800,1500,76,72,11.9,2,10922,15


In [3]:
## defining the input (x) and target (y) variables
X = college[['Private', 'F.Undergrad', 'P.Undergrad', 'Outstate', 'Room.Board', 'Books', 'Personal', 'S.F.Ratio', 'Grad.Rate']]
Y = college['Apps']
## splitting the data into 80% trianing and 20% testing datasets
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size = .2)

In [4]:
## transforming the input variables in the training and testing dataset to a 0-1 scale
scaler = MinMaxScaler()
X = scaler.fit_transform(X)

## building linear regression model


In [5]:
## fitting the data to a linear regression model
lm_md = LinearRegression().fit(X_train, Y_train)

In [6]:
## predicting on the testing dataset
lm_preds = lm_md.predict(X_test)
lm_preds

array([ 1.82838620e+03,  3.77413746e+03,  1.21556043e+03,  5.35800892e+03,
       -5.93713739e+02,  1.26733288e+03,  1.44389967e+01,  1.06347328e+04,
        3.31592506e+03, -4.84188029e+02,  4.68994147e+03,  9.96629359e+03,
        3.32083353e+03,  2.32041259e+03,  1.58199449e+03,  7.08438090e+02,
        8.28319684e+03,  7.55310726e+02,  6.73871601e+02,  2.63701887e+03,
        1.61439923e+03,  9.06690689e+02,  1.47855890e+03,  2.01308828e+03,
        1.07343723e+04,  1.36265754e+03,  9.45697078e+02,  1.59332850e+03,
        3.35163345e+03,  1.27579421e+03,  1.49693618e+03, -2.21042907e+02,
        2.81948784e+03,  3.77894432e+03,  2.06879728e+03, -5.00307196e+02,
        2.55808222e+03,  1.02119322e+03,  6.55016922e+02,  2.10732357e+03,
        2.33546281e+03, -1.40751966e+02,  7.12622611e+03,  2.17957228e+03,
        7.65642622e+03,  2.27502031e+03,  6.63102784e+02, -2.25952136e+02,
        2.23343258e+03,  2.44585801e+03,  3.45220346e+03,  3.72550933e+03,
       -1.68194564e+02,  

In [7]:
## calculating the MSE for this model
mse1 = np.mean((Y_test - lm_preds)**2)
print('The MSE of the Linear Regression Model is', mse1)

The MSE of the Linear Regression Model is 2568311.2165202214


## building ridge regression model

In [8]:
## estimating the best lambda for ridge model (1st) and LASSO model (2nd)
#ridge_cv = RidgeCV(alphas = (), np.linspace = (0.001, 100),cv = 5).fit(X_train, Y_train)
ridge_cv = RidgeCV(alphas = np.linspace(0.001, 100), cv = 5).fit(X_train, Y_train)
lasso_cv = LassoCV(alphas = np.linspace(0.001, 100), normalize = True, cv = 5).fit(X_train, Y_train)

## extracting the best lambda value for both models
CV_lambda = ridge_cv.alpha_
cv_lambda = lasso_cv.alpha_
print('The optimal lambda for the ridge model is', CV_lambda)
print('The optimal lambda for the lasso model is', cv_lambda)

The optimal lambda for the ridge model is 6.123387755102041
The optimal lambda for the lasso model is 0.001


In [14]:
## building the ridge model
ridge_md = Ridge(alpha = CV_lambda).fit(X_train, Y_train)

## predicting on the testing dataset for the ridge model
ridge_pred = ridge_md.predict(X_test)

## creating a list to store the results
coeffs= []

## building the lasso model and capturing coefficients
lasso_md = Lasso(alpha = cv_lambda, normalize = True).fit(X_train, Y_train)
coeffs.append(lasso_md.coef_)
              
## computing the MSE of both models
mse2 = np.mean(np.power(Y_test - ridge_pred, 2))
##mse3 =
print('The MSE of the Ridge Regression Model is', mse2)
print('')

The MSE of the Ridge Regression Model is 2565131.7119970093



In [13]:
## creating a dataframe from array/list to store results
df_coeffs = pd.DataFrame(coeffs, columns = [['Private', 'F.Undergrad', 'P.Undergrad', 'Outstate', 'Room.Board', 'Books', 'Personal', 'S.F.Ratio', 'Gradrad.Rate']])
df_coeffs

Unnamed: 0,Private,F.Undergrad,P.Undergrad,Outstate,Room.Board,Books,Personal,S.F.Ratio,Gradrad.Rate
0,-881.90127,0.677146,-0.143619,0.108229,0.368131,0.481247,-0.073894,-39.297468,26.872288


## building LASSO regression model