## setup hyperparameters

In [1]:
# upper bound on how many shuffled rows are used by the split cell; the cap is
# only applied when the value is > 0 and smaller than the number of rows
MAX_ROWS_PARAMETER = 100
# what fraction of the selected rows we train against (the rest becomes the
# development/test split)
# default: 0.3
TRAINING_PERCENT_PARAMETER = 0.3
# alpha passed to the fixed-alpha Ridge model
RIDGE_ALPHA_PARAMETER = 10000.0
# guards the optional `if DEBUG_PARAMETER:` diagnostic sections in later cells
DEBUG_PARAMETER = False

## load and clean the dataset

In [2]:
import os
import time

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import plotly.express as px
from sklearn import set_config
from sklearn.feature_selection import SequentialFeatureSelector, SelectFromModel
from sklearn.linear_model import LinearRegression, Lasso
from sklearn.linear_model import Ridge
from sklearn.metrics import mean_squared_error, mean_absolute_error
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, PolynomialFeatures
from sklearn.utils import shuffle

set_config(display="diagram")


In [3]:
# Location of the vehicles dataset. Set the VEHICLES_CSV_PATH environment
# variable to point at a local copy; when it is unset the original hard-coded
# path is used, so existing runs behave exactly as before.
VEHICLES_CSV_PATH = os.environ.get(
    'VEHICLES_CSV_PATH',
    '/Users/davidstange/Development/github/data/practical_application_II_starter/data/vehicles.csv',
)
auto = pd.read_csv(VEHICLES_CSV_PATH)

In [4]:
# Keep only the rows that have no missing values in any column.
auto = auto.dropna()
if DEBUG_PARAMETER:
    # A bare expression nested inside an `if` is never rendered by the
    # notebook, so the diagnostic has to be printed explicitly.
    print(f'rows remaining after dropna: {len(auto)}')
    print(auto.head())

In [5]:
def concat_dummy(df: pd.DataFrame, feature: str, prefix: str) -> pd.DataFrame:
    """Return `df` with indicator columns for `feature` appended on the right.

    One indicator column is created per distinct value of `df[feature]`; each
    new column is named `prefix` followed by the value. The original columns,
    including `feature` itself, are kept.
    """
    indicator_columns = pd.get_dummies(df[feature])
    indicator_columns = indicator_columns.add_prefix(prefix)
    return pd.concat([df, indicator_columns], axis=1)

# create dummies for manufacturer, condition, cylinders, fuel, title_status, transmission, drive, size, type, paint_color
# Maps each categorical column to the prefix given to its indicator columns.
DUMMY_PREFIXES = {
    'manufacturer': 'dum_manu_',
    'condition': 'dum_cond_',
    'cylinders': 'dum_cyl_',
    'fuel': 'dum_fuel_',
    'title_status': 'dum_title_',
    'transmission': 'dum_trans_',
    'drive': 'dum_drive_',
    'size': 'dum_size_',
    'type': 'dum_type_',
    'paint_color': 'dum_color_',
}

for feature_name, dummy_prefix in DUMMY_PREFIXES.items():
    # Skip features whose indicator columns already exist so that re-running
    # this cell does not append a second, duplicate set of columns.
    if auto.columns.str.startswith(dummy_prefix).any():
        continue
    auto = concat_dummy(auto, feature_name, dummy_prefix)

if DEBUG_PARAMETER:
    # A bare expression nested inside an `if` is never rendered by the
    # notebook, so the diagnostic has to be printed explicitly.
    print(auto.head())

## create training set

In [6]:
# Shuffle the row positions of `auto` with a fixed seed so the split is reproducible.
all_indices = shuffle(range(0, len(auto)), random_state=27)

# HACK: the full recordset takes too long, so only the first
# MAX_ROWS_PARAMETER shuffled positions are kept (when the cap is enabled).
if 0 < MAX_ROWS_PARAMETER < len(all_indices):
    all_indices = all_indices[:MAX_ROWS_PARAMETER]

# The first TRAINING_PERCENT_PARAMETER share of positions trains; the rest is the development set.
split_index = int(TRAINING_PERCENT_PARAMETER * len(all_indices))
training_indices, dev_indices = np.split(all_indices, [split_index])

auto_training, auto_develop = auto.iloc[training_indices], auto.iloc[dev_indices]

# Separate the target column from the predictors for both subsets.
TARGET_FEATURE = 'price'
auto_y_train = auto_training[TARGET_FEATURE]
auto_y_test = auto_develop[TARGET_FEATURE]
auto_X_train = auto_training.drop(columns=[TARGET_FEATURE])
auto_X_test = auto_develop.drop(columns=[TARGET_FEATURE])

if DEBUG_PARAMETER:
    print(f'training dataset: {len(auto_training)}, development dataset: {len(auto_develop)}')

## generate training features

In [7]:
# define the training features
cols = auto.columns
numeric_features = ['year', 'odometer']


def _columns_with_prefix(prefix):
    """Return the names in `cols` that start with `prefix`, as a list."""
    return cols[cols.str.startswith(prefix)].tolist()


# One list of indicator-column names per categorical feature.
manufacturer_features = _columns_with_prefix("dum_manu_")
condition_features = _columns_with_prefix("dum_cond_")
cylinders_features = _columns_with_prefix("dum_cyl_")
fuel_features = _columns_with_prefix("dum_fuel_")
title_status_features = _columns_with_prefix("dum_title_")
transmission_features = _columns_with_prefix("dum_trans_")
drive_features = _columns_with_prefix("dum_drive_")
size_features = _columns_with_prefix("dum_size_")
type_features = _columns_with_prefix("dum_type_")
paint_color_features = _columns_with_prefix("dum_color")

if DEBUG_PARAMETER:
    print(
        f'Total Features:\n'
        f' - manufacturer_features={len(manufacturer_features)}, \n'
        f' - condition_features={len(condition_features)}, \n'
        f' - cylinders_features={len(cylinders_features)}, \n'
        f' - fuel_features={len(fuel_features)}, \n'
        f' - title_status_features={len(title_status_features)}, \n'
        f' - transmission_features={len(transmission_features)}, \n'
        f' - drive_features={len(drive_features)}, \n'
        f' - size_features={len(size_features)}, \n'
        f' - type_features={len(type_features)}, \n'
        f' - paint_color_features={len(paint_color_features)}, '
    )

# The manufacturer, drive and paint colour indicators are collected above but
# are not part of the feature set used for training.
training_features = (
    numeric_features
    + condition_features
    + cylinders_features
    + fuel_features
    + title_status_features
    + transmission_features
    + size_features
    + type_features
)

## simple ridge regression with a fixed alpha hyperparameter

In [9]:
# describe iteration
alpha = RIDGE_ALPHA_PARAMETER

# train
# Timer start; this cell does not report the elapsed time.
start = time.time()
# Degree-3 polynomial expansion -> standardisation -> ridge regression.
pipeline_model = Pipeline([
    ('transform', PolynomialFeatures(degree=3, include_bias=False)),
    ('scale', StandardScaler()),
    ('regression', Ridge(alpha=alpha)),
])
pipeline_model.fit(auto_X_train[training_features], auto_y_train)

# Predict once per split and reuse the result for both metrics.
ridge_train_predictions = pipeline_model.predict(auto_X_train[training_features])
ridge_test_predictions = pipeline_model.predict(auto_X_test[training_features])

# scikit-learn metric signature is (y_true, y_pred).
ridge_train_mae = mean_absolute_error(auto_y_train, ridge_train_predictions)
ridge_train_mse = mean_squared_error(auto_y_train, ridge_train_predictions)
ridge_test_mae = mean_absolute_error(auto_y_test, ridge_test_predictions)
ridge_test_mse = mean_squared_error(auto_y_test, ridge_test_predictions)

# results
print(f'Ridge training with alpha={alpha}')
print(f'value               TRAINING\tTEST\nmean_absolute_error {ridge_train_mae:.1f}\t{ridge_test_mae:.1f}\nmean_squared_error  {ridge_train_mse:.1f}\t{ridge_test_mse:.1f}')


Ridge training with alpha=10000.0
value               TRAINING	TEST
mean_absolute_error 7568.8	8395.0
mean_squared_error  86496748.0	115692944.4


## regression with GridSearchCV

In [None]:
# parameters
# 100 alpha candidates spaced logarithmically between 1e-5 and 1e4.
candidate_params = {'regression__alpha': 10**np.linspace(-5, 4, 100)}

# train
start = time.time()
model = Pipeline([
    ('transform', PolynomialFeatures(degree=3, include_bias=False)),
    ('scale', StandardScaler()),
    ('regression', Ridge()),
])

### pipeline with GridSearchCV
# A single predefined (train, test) split is used instead of k-fold: the index
# arrays are row positions into the frame passed to fit().
# NOTE(review): with GridSearchCV's default refit=True the best pipeline is
# refit on every row passed to fit() (all of `auto`), not only on
# training_indices -- confirm this is intended.
model_finder = GridSearchCV(estimator=model,
                            param_grid=candidate_params,
                            scoring="neg_mean_squared_error",
                            cv=[[training_indices, dev_indices]])
model_finder.fit(auto[training_features], auto[TARGET_FEATURE])

# performance
end = time.time()
# Count the alpha values, not the number of keys in the parameter dict.
candidate_count = len(candidate_params['regression__alpha'])
print('Ridge GridSearchCV')
print(f'Elapsed time: {end-start:.4f}, total complexity={len(training_features) * len(auto_training) * candidate_count}')

# output
print("Best parameters found: ", model_finder.best_params_)
print("Best cross-validation score: ", model_finder.best_score_)
if DEBUG_PARAMETER:
    # A bare expression nested inside an `if` is never rendered by the
    # notebook, so the diagnostic has to be printed explicitly.
    print(pd.DataFrame(model_finder.cv_results_).head())

## lasso

In [9]:
# Timer start; this cell does not report the elapsed time.
start = time.time()

# Degree-3 polynomial expansion -> standardisation -> lasso regression.
scaled_lasso_model = Pipeline(steps=[
    ('transform', PolynomialFeatures(degree=3, include_bias=False)),
    ('scale', StandardScaler()),
    ('lasso', Lasso()),
])

# 100 alpha candidates spaced logarithmically between 1e-4 and 1e4.
parameters_to_try = {'lasso__alpha': 10**np.linspace(-4, 4, 100)}

# A single predefined (train, test) split is used instead of k-fold.
# NOTE(review): with GridSearchCV's default refit=True the best pipeline is
# refit on every row passed to fit() (all of `auto`) -- confirm this is intended.
lasso_model_finder = GridSearchCV(
    estimator=scaled_lasso_model,
    param_grid=parameters_to_try,
    scoring="neg_mean_squared_error",
    cv=[[training_indices, dev_indices]],
)
lasso_model_finder.fit(auto[training_features], auto[TARGET_FEATURE])

# Cell output: coefficients of the lasso step of the best pipeline.
lasso_model_finder.best_estimator_.named_steps['lasso'].coef_

  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = c

array([ 0., -0., -0., ...,  0.,  0., -0.])

In [None]:
# Predict once per split and reuse the result for both metrics.
lasso_train_predictions = lasso_model_finder.predict(auto_X_train[training_features])
lasso_test_predictions = lasso_model_finder.predict(auto_X_test[training_features])

# scikit-learn metric signature is (y_true, y_pred).
lasso_train_mae = mean_absolute_error(auto_y_train, lasso_train_predictions)
lasso_train_mse = mean_squared_error(auto_y_train, lasso_train_predictions)
lasso_test_mae = mean_absolute_error(auto_y_test, lasso_test_predictions)
lasso_test_mse = mean_squared_error(auto_y_test, lasso_test_predictions)

# Row layout: train MAE | train MSE | test MAE | test MSE
if DEBUG_PARAMETER:
    print(f'\n|  | lasso | {lasso_train_mae:.1f} | {lasso_train_mse:.1f} | {lasso_test_mae:.1f} | {lasso_test_mse:.1f} |')

best_model = lasso_model_finder.best_estimator_

# Despite its name, `errors` holds the lasso coefficients: one row, one column
# per polynomial feature produced by the 'transform' step.
errors = pd.DataFrame(
    [best_model.named_steps["lasso"].coef_],
    columns=best_model.named_steps["transform"].get_feature_names_out(),
)
# Cell output: only the features whose coefficient magnitude exceeds 0.00001.
errors[errors.columns[(abs(errors) > 0.00001).any()]]


|  | lasso | 8374.1 | 111339849.7 | 7249.5 | 81741339.2 |


Unnamed: 0,year^2 dum_size_full-size,year dum_cyl_8 cylinders dum_size_full-size,year dum_fuel_diesel dum_size_full-size,odometer dum_cyl_4 cylinders^2,odometer dum_fuel_gas dum_trans_automatic,dum_trans_automatic dum_size_full-size dum_type_truck
0,44.268527,154.196663,1097.631023,-48.84292,-1181.865161,1079.080239


## Visualizations

In [34]:
# Restrict the charts to rows with odometer < 150000 and price < 120000.
# .copy() makes `charts` an independent frame, so adding the `age` column
# below no longer raises SettingWithCopyWarning.
charts = auto[(auto.odometer < 150000) & (auto.price < 120000)].copy()
# Age of the vehicle, measured from 2021.
charts['age'] = 2021 - charts.year
px.scatter(charts, x='price', y='age', color='size', title='Price versus Age with Size')



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



In [35]:
# Scatter of price against odometer reading, coloured by cylinder category.
px.scatter(
    charts,
    x='price',
    y='odometer',
    color='cylinders',
    title='Price versus odometers with cylinders',
)