In [1]:
import boto3
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression, Ridge, RidgeCV, Lasso, LassoCV

## Defining the s3 bucket
s3 = boto3.resource('s3')
bucket_name = 'craig-shaffer-data-445-bucket'
bucket = s3.Bucket(bucket_name)

## Defining the file to be read from s3 bucket
file_key = 'CarPrice_Assignment.csv'

bucket_object = bucket.Object(file_key)
file_object = bucket_object.get()
file_content_stream = file_object.get('Body')

# reading the datafile
car_price = pd.read_csv(file_content_stream)
car_price.head()

Unnamed: 0,car_ID,symboling,CarName,fueltype,aspiration,doornumber,carbody,drivewheel,enginelocation,wheelbase,...,enginesize,fuelsystem,boreratio,stroke,compressionratio,horsepower,peakrpm,citympg,highwaympg,price
0,1,3,alfa-romero giulia,gas,std,two,convertible,rwd,front,88.6,...,130,mpfi,3.47,2.68,9.0,111,5000,21,27,13495.0
1,2,3,alfa-romero stelvio,gas,std,two,convertible,rwd,front,88.6,...,130,mpfi,3.47,2.68,9.0,111,5000,21,27,16500.0
2,3,1,alfa-romero Quadrifoglio,gas,std,two,hatchback,rwd,front,94.5,...,152,mpfi,2.68,3.47,9.0,154,5000,19,26,16500.0
3,4,2,audi 100 ls,gas,std,four,sedan,fwd,front,99.8,...,109,mpfi,3.19,3.4,10.0,102,5500,24,30,13950.0
4,5,2,audi 100ls,gas,std,four,sedan,4wd,front,99.4,...,136,mpfi,3.19,3.4,8.0,115,5500,18,22,17450.0


In [2]:
# Defining input and target variables
x= car_price[['wheelbase','enginesize','horsepower','compressionratio','peakrpm','citympg','highwaympg']]
y= car_price['price']

#split into train and test
x_train,x_test,y_train,y_test = train_test_split(x,y,test_size=0.2)

In [8]:
# estimating lambda for lasso
lasso_cv = LassoCV(alphas = [0.001,0.01,0.1,1,10], normalize = True, cv = 5).fit(x_train,y_train)

#extracting best lambda
cv_lambda = lasso_cv.alpha_
print('the estimated lambda for the lasso model is', cv_lambda)

#building lasso
lasso_md = Lasso(alpha = cv_lambda, normalize = True).fit(x_train,y_train)
lasso_md.coef_

the estimated lambda for the lasso model is 10.0


If you wish to scale the data, use Pipeline with a StandardScaler in a preprocessing stage. To reproduce the previous behavior:

from sklearn.pipeline import make_pipeline

model = make_pipeline(StandardScaler(with_mean=False), Lasso())

If you wish to pass a sample_weight parameter, you need to pass it as a fit parameter to each step of the pipeline as follows:

kwargs = {s[0] + '__sample_weight': sample_weight for s in model.steps}
model.fit(X, y, **kwargs)

Set parameter alpha to: original_alpha * np.sqrt(n_samples). 
If you wish to scale the data, use Pipeline with a StandardScaler in a preprocessing stage. To reproduce the previous behavior:

from sklearn.pipeline import make_pipeline

model = make_pipeline(StandardScaler(with_mean=False), Lasso())

If you wish to pass a sample_weight parameter, you need to pass it as a fit parameter to each step of the pipeline as follows:

kwargs = {s[0] + '__sample_weight': sample_weight for s in model.steps}
model.fit(X, y, **kwargs)

Set para

array([ 126.73716034,  106.10429183,   48.83184582,  300.6971973 ,
          1.72825938, -149.59769723,   -0.        ])

In [9]:
#dropping highwaympg
x_train = x_train.drop(columns=['highwaympg'],axis =1)
x_test = x_test.drop(columns=['highwaympg'],axis =1)

def l2_normalization(x):
    x_mean = np.mean(x)
    l2 = np.sqrt(sum(x**2))
    return (x - x_mean) / l2

x_train = x_train.apply(l2_normalization, axis=1)
x_test = x_test.apply(l2_normalization, axis=1)

In [10]:
#linear regression
lm_md = LinearRegression().fit(x_train,y_train)

#predicting on test
lm_pred = lm_md.predict(x_test)

#computing mse of the lm model
mse1 = np.mean(np.power(y_test-lm_pred,2))
print('the mse of the model is',mse1)

the mse of the model is 13589745.134505862


In [11]:
#ridge regression
ridge_cv = RidgeCV(alphas = [0.001,0.01,0.1,1,10], cv = 5).fit(x_train,y_train)

#extract lambda
cv_lambda = ridge_cv.alpha_
print('the best lambda for the ridge model is',cv_lambda)

#build the model
ridge_md = Ridge(alpha= cv_lambda).fit(x_train,y_train)

#predicting on test
ridge_pred = ridge_md.predict(x_test)

#computing the mse of the ridge reg model
mse2 = np.mean(np.power(y_test-ridge_pred,2))
print('the mse of the ridge model is ',mse2)

the best lambda for the ridge model is 0.001
the mse of the ridge model is  13122015.592519142


In [None]:
# the mse of the ridge model is smaller, so it is prefered to predict car price.