In [1]:
import boto3
import pandas as pd; pd.set_option('display.max_column', 100)
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression, Ridge, RidgeCV, Lasso, LassoCV

## Open an S3 resource and get a handle on the bucket that holds the data file
s3 = boto3.resource('s3')
bucket_name = 'daltondencklau-data445-bucket'
bucket = s3.Bucket(bucket_name)

## Key (object name) of the CSV file to read from the bucket
file_key= 'CarPrice_Assignment.csv'

## Request the object and pull the 'Body' entry out of the response;
## that entry is what gets handed to pandas below
bucket_object = bucket.Object(file_key)
file_object = bucket_object.get()
file_content_stream = file_object.get('Body')

## Parse the CSV content into a DataFrame and preview the first rows
car_price = pd.read_csv(file_content_stream)
car_price.head()

Unnamed: 0,car_ID,symboling,CarName,fueltype,aspiration,doornumber,carbody,drivewheel,enginelocation,wheelbase,carlength,carwidth,carheight,curbweight,enginetype,cylindernumber,enginesize,fuelsystem,boreratio,stroke,compressionratio,horsepower,peakrpm,citympg,highwaympg,price
0,1,3,alfa-romero giulia,gas,std,two,convertible,rwd,front,88.6,168.8,64.1,48.8,2548,dohc,four,130,mpfi,3.47,2.68,9.0,111,5000,21,27,13495.0
1,2,3,alfa-romero stelvio,gas,std,two,convertible,rwd,front,88.6,168.8,64.1,48.8,2548,dohc,four,130,mpfi,3.47,2.68,9.0,111,5000,21,27,16500.0
2,3,1,alfa-romero Quadrifoglio,gas,std,two,hatchback,rwd,front,94.5,171.2,65.5,52.4,2823,ohcv,six,152,mpfi,2.68,3.47,9.0,154,5000,19,26,16500.0
3,4,2,audi 100 ls,gas,std,four,sedan,fwd,front,99.8,176.6,66.2,54.3,2337,ohc,four,109,mpfi,3.19,3.4,10.0,102,5500,24,30,13950.0
4,5,2,audi 100ls,gas,std,four,sedan,4wd,front,99.4,176.6,66.4,54.3,2824,ohc,five,136,mpfi,3.19,3.4,8.0,115,5500,18,22,17450.0


In [14]:
## disabling the 'FutureWarning' because 1000 iterations
import warnings
warnings.simplefilter(action = 'ignore', category = FutureWarning)

In [52]:
## Input and target variables are the same on every iteration, so they are
## selected once here instead of being rebuilt inside the loop
X = car_price[['wheelbase', 'enginesize', 'compressionratio', 'horsepower', 'peakrpm', 'citympg', 'highwaympg']]
Y = car_price['price']

## one array of fitted lasso coefficients is appended per iteration
coeffs = []

## NOTE(review): the `normalize` argument used below was deprecated in
## scikit-learn 1.0 and removed in 1.2, so this cell needs scikit-learn < 1.2.
## It is kept as-is so the coefficients stay comparable with earlier runs;
## migrating to an explicit scaling step would change the effective alpha grid.

## for loop to estimate optimal lambda and to use optimal lambda to estimate coefficients
for i in range(0, 1000):

    ## splitting data into training (80%) and testing (20%) datasets;
    ## seeding with the iteration number gives a different split on every
    ## iteration while making the whole cell reproducible on re-run
    X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size = 0.2, random_state = i)

    ## estimating lambda for lasso by CV with 5 folds over a fixed grid
    lasso_cv = LassoCV(alphas = [0.001, 0.01, 0.1, 1, 10, 100], normalize = True, cv = 5).fit(X_train, Y_train)

    ## extracting the best lambda value via cross validation
    cv_lambda = lasso_cv.alpha_

    ## building the lasso model with that lambda and capturing its coefficients
    lasso_md = Lasso(alpha = cv_lambda, normalize = True).fit(X_train, Y_train)
    coeffs.append(lasso_md.coef_)

## creating a dataframe with one row per iteration; columns 0-6 follow the
## order of the feature list used to build X
df_coeffs = pd.DataFrame(coeffs)
df_coeffs

Unnamed: 0,0,1,2,3,4,5,6
0,213.940169,88.650545,272.049756,54.916127,1.590180,-75.844254,-26.825013
1,213.135965,96.970922,267.619753,41.207492,1.587146,-133.737922,-0.000000
2,193.938865,104.456663,308.499687,41.159483,1.962538,-146.295397,-2.853757
3,167.891706,115.877913,245.057715,52.041870,2.198440,-58.896195,0.000000
4,150.661557,96.001840,320.396058,63.747191,1.770147,-60.043177,-59.412079
...,...,...,...,...,...,...,...
995,175.785941,107.229370,345.538413,49.618194,2.034833,-146.579233,-0.000000
996,189.702385,95.501740,262.962847,53.282437,1.538616,-138.128469,0.000000
997,171.692078,94.966206,359.127059,63.424548,1.802358,-188.942672,101.310657
998,169.334167,97.825229,263.709882,58.433036,1.896896,-89.218376,-0.000000


In [56]:
## counting all 0s in each column
count = (df_coeffs ==0).sum()
count

0      0
1      0
2      0
3      0
4      0
5     15
6    533
dtype: int64

In [None]:
## removing feature with 500+ 0
df_co