In [1]:
## 1.a
import boto3
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler
from sklearn.linear_model import LinearRegression, Ridge, RidgeCV, Lasso, LassoCV

## Defining the bucket
s3 = boto3.resource('s3')
bucket_name = 'danhtran358-data-445-bucket'
bucket = s3.Bucket(bucket_name)

## Defining the csv file
file_key = 'College.csv'

bucket_object = bucket.Object(file_key)
file_object = bucket_object.get()
file_content_stream = file_object.get('Body')

## Reading the csv file
college = pd.read_csv(file_content_stream)
college.head()

Unnamed: 0,Private,Apps,Accept,Enroll,Top10perc,Top25perc,F.Undergrad,P.Undergrad,Outstate,Room.Board,Books,Personal,PhD,Terminal,S.F.Ratio,perc.alumni,Expend,Grad.Rate
Abilene Christian University,Yes,1660,1232,721,23,52,2885,537,7440,3300,450,2200,70,78,18.1,12,7041,60
Adelphi University,Yes,2186,1924,512,16,29,2683,1227,12280,6450,750,1500,29,30,12.2,16,10527,56
Adrian College,Yes,1428,1097,336,22,50,1036,99,11250,3750,400,1165,53,66,12.9,30,8735,54
Agnes Scott College,Yes,417,349,137,60,89,510,63,12960,5450,450,875,92,97,7.7,37,19016,59
Alaska Pacific University,Yes,193,146,55,16,44,249,869,7560,4120,800,1500,76,72,11.9,2,10922,15


In [2]:
## 1.b
## Numeralize categorical variable Private
college['Private'] = np.where(college['Private'] == 'No', 0, 1)
college['Private']

Abilene Christian University      1
Adelphi University                1
Adrian College                    1
Agnes Scott College               1
Alaska Pacific University         1
                                 ..
Worcester State College           0
Xavier University                 1
Xavier University of Louisiana    1
Yale University                   1
York College of Pennsylvania      1
Name: Private, Length: 777, dtype: int64

In [3]:
## 1.c
## Input and target variable
X = college[['Private', 'F.Undergrad', 'P.Undergrad', 'Outstate', 'Room.Board', 'Books', 'Personal', 'S.F.Ratio', 'Grad.Rate']]
Y = college['Apps']

## Split data
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size = 0.2)

In [4]:
## 1.d
## Scale input variables to range 0-1
scaler = MinMaxScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.fit_transform(X_test)

In [5]:
## 1.e
## Build linear regression model
linear_md = LinearRegression().fit(X_train, Y_train)

## Predict on test input data
linear_pred = linear_md.predict(X_test)

## Calculate MSE
linear_mse = np.mean(np.power(linear_pred - Y_test, 2))
linear_mse

3096495.422518892

In [6]:
## 1.f
## Initiate lambdas to find optimal lambda
lambdas = np.linspace(0.001, 100, num = 100)

## Find optimal lambda using Ridge cross validation
ridge_cv = RidgeCV(normalize = True, cv = 5, alphas = lambdas).fit(X_train, Y_train)
ridge_optimal_lambda = ridge_cv.alpha_

## Build Ridge model
ridge_md = Ridge(normalize = True, alpha = ridge_optimal_lambda).fit(X_train, Y_train)

## Predict on test input data
ridge_pred = ridge_md.predict(X_test)

## Calculate MSE
ridge_mse = np.mean(np.power(ridge_pred - Y_test, 2))
ridge_mse

3084065.850109443

In [7]:
## 1.g
## Use the same list of lambdas as RidgeCV
## Find optimal lambda use LASSO cross validation
lasso_cv = LassoCV(normalize = True, cv = 5, alphas = lambdas).fit(X_train, Y_train)
lasso_optimal_lambda = lasso_cv.alpha_
lasso_optimal_lambda

## Build LASSO model
lasso_md = Lasso(normalize = True, alpha = lasso_optimal_lambda).fit(X_train, Y_train)

## Predict on test input data
lasso_pred = lasso_md.predict(X_test)

## Calculate MSE
lasso_mse = np.mean(np.power(lasso_pred - Y_test, 2))
lasso_mse

2965259.1466004057

In [8]:
## 1.h
## I would use LASSO model because it has the lowest MSE
print('Linear model MSE: ', linear_mse)
print('Ridge model MSE:  ', ridge_mse)
print('LASSO model MSE:  ', lasso_mse)

Linear model MSE:  3096495.422518892
Ridge model MSE:   3084065.850109443
LASSO model MSE:   2965259.1466004057
