In [1]:
from sklearn.compose import make_column_selector, make_column_transformer
from sklearn.preprocessing import OneHotEncoder, StandardScaler, PolynomialFeatures, OrdinalEncoder

from sklearn.linear_model import LinearRegression, LogisticRegression, Ridge, Lasso
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score

from sklearn.feature_selection import SelectFromModel
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_score
from sklearn.metrics import ConfusionMatrixDisplay, RocCurveDisplay
from sklearn.metrics import confusion_matrix, roc_curve, auc
from sklearn.metrics import auc as auc_temp

In [2]:
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
import pandas as pd

sns.set_theme(style='darkgrid')

import time

# Export dataFrame's as images
import dataframe_image as dfi

# import my utility methods for this project
import utils_practical_2 as my_utils

# Configure logging
import logging
logging.basicConfig(level=logging.INFO, format='%(levelname)s: %(message)s')

# On to Modeling ...

**This notebook picks up from the Data Investigation (see ```DataInvestigation.ipynb```)**

[Local file](DataInvestigation.ipynb)
[Github](https://github.com/fazeelgm/UCB_ML_AI_PracticalApp_II/blob/main/DataInvestigation.ipynb)

## Data Cleanup

The data cleansing results are as follows:

In [6]:
# vehicles_raw, vehicles_cleansed = my_utils.get_cleansed_data()

In [7]:
# vehicles_cleansed.info()

# On to Modeling ...

While investigating the different features of our dataset during the data investigation, I learned two things:

1. The data is very noisy with extreme outliers - I removed null data and outliers as much as possible
   * 66,180 samples were dropped, preserving 84.50% of the original data
3. To aid in this effort, I researched the used car marketplace to get some idea of pricing, important features that drive price and potential inventory segments

## Initial Hypothesis

Based on Price inspection, a potential hypothesis arose that the used car inventory is _segmented_ based on the following price bands:

![](images/candidate-price-segments.png)

In addition, market research suggested that typical used cars can be categoriezed into price ranges like Budget, Mid, Luxury, etc. that will be based on feature groups. I looked at ```<price, year, condition, odometer>``` combinations and saw that there was clustering behavior as shown by the scatter plots below:

![](images/scatter-price-odo-condition-budget.png)
![](images/scatter-price-odo-year-entry.png)

So, I will now use clustering techniques to see if we can observe natural clustering of features in our sample population.

# Final Modeling

## Data Cleaning

In [12]:
vehicles_raw, vehicles_cleansed = my_utils.get_cleansed_data()

Reading data/vehicles.csv ... Done: (426880, 18)

Cleansing price column ... 
... Removing price outliers using ModZ method ... 
... ModZ: 9450.0, med: 13950.0, const: 0.6745
... Time: 0.15195989608764648
... Removed 5,790 outliers
... Removing cars with price = 0 ...  Removed 32,895 rows
Done: (421090, 19) -> (388195, 19)

DropNA from columns: 
... year: 1,029 rows (0.27% of total): 388,195 -> 387,166
... manufacturer: 16,609 rows (4.28% of total): 388,195 -> 371,586
... fuel: 19,173 rows (4.94% of total): 388,195 -> 369,022
... title_status: 26,730 rows (6.89% of total): 388,195 -> 361,465
... odometer: 28,960 rows (7.46% of total): 388,195 -> 359,235
... transmission: 30,742 rows (7.92% of total): 388,195 -> 357,453
Done: (388195, 19) -> (360700, 19)

Dropping columns: ['mod_zscore', 'id', 'model']
... mod_zscore
... id
... model
Done: (360700, 19) -> (360700, 16)

Data Transformations:
... year float -> int: Done
... odometer float -> int: Done

Category Transformations:
... Conver

In [13]:
vehicles_cleansed.info()

<class 'pandas.core.frame.DataFrame'>
Index: 360700 entries, 27 to 426879
Data columns (total 16 columns):
 #   Column        Non-Null Count   Dtype   
---  ------        --------------   -----   
 0   region        360700 non-null  object  
 1   price         360700 non-null  int64   
 2   year          360700 non-null  int64   
 3   manufacturer  360700 non-null  category
 4   condition     223668 non-null  category
 5   cylinders     213522 non-null  category
 6   fuel          360700 non-null  category
 7   odometer      360700 non-null  int64   
 8   title_status  360700 non-null  category
 9   transmission  360700 non-null  category
 10  VIN           219818 non-null  object  
 11  drive         252582 non-null  category
 12  size          104338 non-null  category
 13  type          282300 non-null  category
 14  paint_color   257706 non-null  category
 15  state         360700 non-null  category
dtypes: category(11), int64(3), object(2)
memory usage: 20.3+ MB


In [14]:
vehicles_cleansed.columns

Index(['region', 'price', 'year', 'manufacturer', 'condition', 'cylinders',
       'fuel', 'odometer', 'title_status', 'transmission', 'VIN', 'drive',
       'size', 'type', 'paint_color', 'state'],
      dtype='object')

In [15]:
# removed cols: 'region', 'manufacturer', 'VIN', 'paint_color', 'state'
drop_cols = ['region', 'manufacturer', 'VIN', 'paint_color', 'state']
features=['year', 'condition', 'cylinders', 'fuel', 'odometer', 'title_status', 
          'transmission', 'drive', 'size', 'type']

In [16]:
# Create a copy of the clean data and remove unwanted columnss
data = vehicles_cleansed.copy()
data.drop(columns=drop_cols, inplace=True)

In [17]:
# remove nulls before splitting data
print('Dropping nulls: {} -> '.format(data.shape), end='')
data.dropna(subset=['condition', 'cylinders', 'drive', 'size', 'type'], axis='index', inplace=True)
print(' {}'.format(data.shape))

Dropping nulls: (360700, 11) ->  (78626, 11)


In [18]:
data.columns

Index(['price', 'year', 'condition', 'cylinders', 'fuel', 'odometer',
       'title_status', 'transmission', 'drive', 'size', 'type'],
      dtype='object')

## Create Train/Test splits

In [20]:
X = data[features]
y = data['price']

In [21]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [22]:
X.info()

<class 'pandas.core.frame.DataFrame'>
Index: 78626 entries, 31 to 426833
Data columns (total 10 columns):
 #   Column        Non-Null Count  Dtype   
---  ------        --------------  -----   
 0   year          78626 non-null  int64   
 1   condition     78626 non-null  category
 2   cylinders     78626 non-null  category
 3   fuel          78626 non-null  category
 4   odometer      78626 non-null  int64   
 5   title_status  78626 non-null  category
 6   transmission  78626 non-null  category
 7   drive         78626 non-null  category
 8   size          78626 non-null  category
 9   type          78626 non-null  category
dtypes: category(8), int64(2)
memory usage: 2.4 MB


## Data Preparation for Modeling

In [24]:
selector = make_column_selector(dtype_include=['object', 'category'])
data.select_dtypes(include=['object', 'category']).columns

Index(['condition', 'cylinders', 'fuel', 'title_status', 'transmission',
       'drive', 'size', 'type'],
      dtype='object')

In [25]:
# Preprocess the data
# One-hot encode the 'condition' categorical features
# Scale numeric features
transformer = make_column_transformer(
    (OneHotEncoder(drop = 'first'), selector),
    remainder= StandardScaler()
)
transformer

## Baseline Regression Models

In [27]:
models = [
    LinearRegression(),
    Ridge(alpha=1.0),
    Lasso(alpha=0.1)
]

# save results for tabulation
results_baseline = []

In [28]:
# Evaluate models
def get_model_metrics_as_results(model_name, y_preds, y_test, score):
    """
    Build standardized results row given the predictions and y_test values

    :param model_name: Model name for labeling the row in the table
    :param y_preds: Predictions of y_test
    :param y_test: Test y
    :param score: Model Score is passed in
    :return: Returns single row of results summary table containing:
    
        [model_name, MAE, MSE, RMSE, R2_Score]
    """

    start_time = time.time()

    logging.debug(f'Working on {model_name}')

    # get metrics
    mae = mean_absolute_error(y_preds, y_test)
    mse = mean_squared_error(y_preds, y_test)
    rmse = np.sqrt(mse)
    r2 = r2_score(y_preds, y_test)

    lapse = time.time() - start_time
    
    logging.debug(f'... {model_name}: MAE: {mae:,.4f}, MSE: {mse:,.4f}, RMSE: {rmse:,.4f}, R2: {r2:,.4f}, Score: {score:,.4f}, time: {lapse:,.4f}')

    return [model_name, mae, mse, rmse, score]

In [29]:
# logging.getLogger().setLevel(logging.DEBUG)

# results_baseline = []

# iterate over the models and build results DF
for model in models:
    clf = Pipeline([
        ('transformer', transformer),
        ('model', model)
    ])
    clf.fit(X_train, y_train)
    y_preds = clf.predict(X_test)

    model_name = clf.get_params()['model'].__class__.__name__
    score = clf.score(X_test, y_test)

    results_baseline.append(get_model_metrics_as_results(model_name, y_preds, y_test, score))

logging.getLogger().setLevel(logging.INFO)

In [30]:
# Generate results table
results_baseline_df = pd.DataFrame(results_baseline, 
                                   columns=['Model','MAE', 'MSE', 'RMSE', 'Score']
                                  ).set_index('Model')


# Export results for README
results_baseline_df_styled = my_utils.df_style_floats(results_baseline_df)
dfi.export(results_baseline_df_styled, 'images/results_baseline_table.png')
results_baseline_df_styled

Unnamed: 0_level_0,MAE,MSE,RMSE,Score
Model,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
LinearRegression,5946.1381,68680449.6276,8287.3669,0.4277
Ridge,5946.0982,68679488.711,8287.3089,0.4277
Lasso,5946.1342,68680502.5796,8287.3701,0.4277


## Model Tuning

In [32]:
models = {
    'Ridge': Ridge(),
    'Lasso': Lasso()
}

param_grid = {
    'Ridge': {'alpha': [0.1, 1.0, 10.0]},
    'Lasso': {'alpha': [0.01, 0.1, 1.0]}    
}

In [33]:
# logging.getLogger().setLevel(logging.DEBUG)

In [34]:
# save results for tabulation
results_tuned = []
best_models = {}
best_params = {}
cv_scores = {}
cv_mses = {}

# Scale the data
X_train_scaled = transformer.fit_transform(X_train)
X_test_scaled = transformer.fit_transform(X_test)

# Run cross validation using GridSearchCV
cv=5
for model in models:
    clf = models[model]
    logging.debug(f'clf: {type(clf)}, params: {param_grid[model]}')
    grid = GridSearchCV(clf, param_grid=param_grid[model], cv=cv, scoring='neg_mean_squared_error')
    grid.fit(X_train_scaled, y_train)

    
    # Save the tuning metrics for this model class
    best_models[model] = grid.best_estimator_
    cv_score = cross_val_score(clf, X_train_scaled, y_train, cv=cv, scoring='neg_mean_squared_error')
    cv_scores[model] = cv_score
    cv_mses[model] = -np.mean(cv_score)
    best_params[model] = grid.best_params_
    
    logging.debug(f'Best alpha: {grid.best_params_}')
    logging.debug(f'CV Score: {cv_score}')
    logging.debug(f'CV MSE: {-np.mean(cv_score)}')
    logging.debug(f'CV Best Score: {-grid.best_score_}')
    logging.debug(f'CV Best RMSE: {np.sqrt(-grid.best_score_)}')

    y_preds = grid.best_estimator_.predict(X_test_scaled)
    score = grid.score(X_test_scaled, y_test)
    r2 = r2_score(y_test, y_preds)
    logging.debug(f'Score: {score}, r2: {r2}')

    results_tuned.append(get_model_metrics_as_results(model, y_preds, y_test, r2))

We add LinearRegression to the results manually as there is no need for cross-validation

In [36]:
# Handle LinearRegression manually
linear_model = LinearRegression()

linear_cv_score = cross_val_score(linear_model, X_train_scaled, y_train, cv=cv, scoring='neg_mean_squared_error')
logging.debug(f'Linear Regression CV MSE: {-np.mean(linear_cv_score)}')

# Train Linear Regression model
linear_model.fit(X_train_scaled, y_train)

# This is the best estimator for LinearRegression as there is no tuning
model='Linear Regression'
best_models[model] = linear_model
cv_scores[model] = linear_cv_score
cv_mses[model] = -np.mean(linear_cv_score)
best_params[model] = ''

# Make predictions
linear_pred = linear_model.predict(X_test_scaled)

results_tuned.append(get_model_metrics_as_results('Linear Regression', linear_pred, y_test, r2_score(y_test, linear_pred)))

In [37]:
logging.getLogger().setLevel(logging.INFO)

In [38]:
# Generate results table
results_tuned_df = pd.DataFrame(results_tuned,
                                columns=['Model - Tuned','MAE', 'MSE', 'RMSE', 'Score']
                               ).set_index('Model - Tuned')

# Add BestParams as a new column
results_tuned_df['CV MSE'] = results_tuned_df.index.map(cv_mses)
results_tuned_df['CV Best Params'] = results_tuned_df.index.map(best_params)

# Export results for README
results_tuned_df_styled = my_utils.df_style_floats(results_tuned_df)
dfi.export(results_tuned_df_styled, 'images/results_tuned_table.png')
results_tuned_df_styled

Unnamed: 0_level_0,MAE,MSE,RMSE,Score,CV MSE,CV Best Params
Model - Tuned,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
Ridge,5962.2476,68590535.2792,8281.9403,0.4285,70600488.0763,{'alpha': 10.0}
Lasso,5962.8228,68595984.7091,8282.2693,0.4284,70589729.0634,{'alpha': 1.0}
Linear Regression,5962.5318,68593107.626,8282.0956,0.4285,70602026.8617,
