In [1]:
# Standard library
import time

# Third-party
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns

# Project-local: the common utility functions created as part of this assignment
import utils_practical_2 as my_utils

# Notebook-wide plot styling
sns.set_theme(style='darkgrid')

# Get the Data

In [3]:
# Load the raw listings together with a cleansed copy via the shared assignment helper.
# Per the log this call prints, the helper reads data/vehicles.csv, removes price
# outliers and zero-price rows, drops rows with NA in selected columns, and drops
# the `mod_zscore`, `id` and `model` columns.
vehicles_raw, vehicles_cleansed = my_utils.get_cleansed_data()

Reading data/vehicles.csv ... Done: (426880, 18)

Cleansing price column ... 
... Removing price outliers using ModZ method ... 
... ModZ: 9450.0, med: 13950.0, const: 0.6745
... Time: 0.17550396919250488
... Removed 5,790 outliers
... Removing cars with price = 0 ...  Removed 32,895 rows
Done: (421090, 19) -> (388195, 19)

DropNA from columns: 
... year: 1,029 rows (0.27% of total): 388,195 -> 387,166
... manufacturer: 16,609 rows (4.28% of total): 388,195 -> 371,586
... fuel: 19,173 rows (4.94% of total): 388,195 -> 369,022
... title_status: 26,730 rows (6.89% of total): 388,195 -> 361,465
... odometer: 28,960 rows (7.46% of total): 388,195 -> 359,235
... transmission: 30,742 rows (7.92% of total): 388,195 -> 357,453
Done: (388195, 19) -> (360700, 19)

Dropping columns: ['mod_zscore', 'id', 'model']
... mod_zscore
... id
... model
Done: (360700, 19) -> (360700, 16)

Returned Raw(426,880x19) and Cleansed(360,700x16) data
Dataset reduced by 66,180 rows (preserved 84.50% of total)


In [13]:
# Inspect column dtypes and the non-null count of each column after cleansing.
vehicles_cleansed.info()

<class 'pandas.core.frame.DataFrame'>
Index: 360700 entries, 27 to 426879
Data columns (total 16 columns):
 #   Column        Non-Null Count   Dtype  
---  ------        --------------   -----  
 0   region        360700 non-null  object 
 1   price         360700 non-null  int64  
 2   year          360700 non-null  float64
 3   manufacturer  360700 non-null  object 
 4   condition     223668 non-null  object 
 5   cylinders     213522 non-null  object 
 6   fuel          360700 non-null  object 
 7   odometer      360700 non-null  float64
 8   title_status  360700 non-null  object 
 9   transmission  360700 non-null  object 
 10  VIN           219818 non-null  object 
 11  drive         252582 non-null  object 
 12  size          104338 non-null  object 
 13  type          282300 non-null  object 
 14  paint_color   257706 non-null  object 
 15  state         360700 non-null  object 
dtypes: float64(2), int64(1), object(13)
memory usage: 46.8+ MB


In [23]:
from sklearn.compose import make_column_selector, make_column_transformer
from sklearn.preprocessing import OneHotEncoder, StandardScaler, PolynomialFeatures
from sklearn.linear_model import LogisticRegression
from sklearn.feature_selection import SelectFromModel
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import ConfusionMatrixDisplay, RocCurveDisplay
from sklearn.metrics import confusion_matrix, roc_curve, auc
from sklearn.metrics import auc as auc_temp

In [25]:
# Column selector that picks every object-dtype column of the frame it is applied to
# (presumably the categorical/text features — confirm against the .info() listing).
selector = make_column_selector(dtype_include=object)

In [29]:
# Preprocessing for clustering:
#   * columns matched by `selector` (object dtype) are one-hot encoded, dropping the
#     first category of each column;
#   * all remaining columns are passed through StandardScaler.
# NOTE(review): the object columns listed by .info() include VIN, so every distinct
# VIN value becomes its own one-hot column — confirm that is intended.
categorical_encoder = OneHotEncoder(drop='first')
transformer = make_column_transformer(
    (categorical_encoder, selector),
    remainder=StandardScaler(),
)
transformer

# Clustering

## DBSCAN

In [133]:
# Build the working sample used by the clustering cells.
#   1. Drop rows that are exact duplicates across *all* columns. No `subset` is given,
#      so despite the variable name this is not a VIN-only de-duplication.
#   2. Draw a reproducible (random_state=42) 20% sample without replacement.
# The shapes are printed after each step; the placeholders are now actually formatted
# (previously the literal '{}' was printed followed by the shape).
print(f'vehicles_cleansed.shape: {vehicles_cleansed.shape}')
vehicles_dedupe_vin = vehicles_cleansed.drop_duplicates(keep='first')
print(f'vehicles_dedupe_vin.shape: {vehicles_dedupe_vin.shape}')
data = vehicles_dedupe_vin.sample(frac=0.2, replace=False, random_state=42)
print(f'data.shape: {data.shape}')

vehicles_cleansed.shape: {} (360700, 16)
vehicles_dedupe_vin.shape: {} (315839, 16)
data.shape: {} (63168, 16)


In [135]:
# Fit the preprocessing transformer on the sample and produce the feature matrix.
# A later cell stores DBSCAN labels in `data['cluster']`; that column is excluded here
# (ignored if absent) so that re-running this cell does not feed the previous labels
# back in as a scaled numeric feature.
data_trans = transformer.fit_transform(data.drop(columns=['cluster'], errors='ignore'))
print(f'data_trans.shape: {data_trans.shape}')

data_trans.shape: {} (63168, 27108)


In [137]:
data_trans

<63168x27108 sparse matrix of type '<class 'numpy.float64'>'
	with 838596 stored elements in Compressed Sparse Row format>

In [139]:
from sklearn import metrics
from sklearn.cluster import DBSCAN

print('Starting DBSCAN')
start_time = time.time()

# Fit DBSCAN on the preprocessed feature matrix and keep the per-sample labels.
db = DBSCAN(eps=1.0, min_samples=10)
db.fit(data_trans)
labels = db.labels_

# Wall-clock duration of the fit.
end_time = time.time()
execution_time = end_time - start_time
print(f'Execution time: {execution_time:.2f} seconds')

# Label -1 is the noise marker: exclude it from the cluster count and tally it separately.
n_clusters_ = len(set(labels) - {-1})
n_noise_ = int((labels == -1).sum())

print(f"Estimated number of clusters: {n_clusters_:d}")
print(f"Estimated number of noise points: {n_noise_:d}")

Starting DBSCAN
Execution time: 171.13 seconds
Estimated number of clusters: 15
Estimated number of noise points: 62965


In [141]:
# Summarise the DBSCAN labelling: the distinct labels, the position of each label's
# first occurrence, and the number of samples carrying each label.
# NOTE: this rebinds `labels` (previously the per-sample label array) to the array of
# distinct labels; use `db.labels_` when the per-sample labels are needed.
labels, indices, counts = np.unique(
    db.labels_,
    return_index=True,
    return_counts=True,
)
(labels, indices, counts)

(array([-1,  0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14]),
 array([   0,   77,  254,  620,  971, 3126, 2971, 4376, 4685, 5837, 6864,
        7861, 3488, 1596, 3171, 4074]),
 array([62965,    14,    12,    19,    10,    24,    12,    14,    14,
           12,    15,    10,    12,    15,    10,    10]))

In [143]:
# Attach each sampled row's DBSCAN label (positional match with the rows that were fitted).
data = data.assign(cluster=db.labels_)

In [145]:
# Number of sampled rows per DBSCAN label, largest group first.
data['cluster'].value_counts()

cluster
-1     62965
 4        24
 2        19
 12       15
 9        15
 0        14
 6        14
 7        14
 1        12
 5        12
 11       12
 8        12
 3        10
 13       10
 14       10
 10       10
Name: count, dtype: int64

In [155]:
# Inspect the rows that DBSCAN assigned to cluster 0.
data.loc[data['cluster'] == 0]

Unnamed: 0,region,price,year,manufacturer,condition,cylinders,fuel,odometer,title_status,transmission,VIN,drive,size,type,paint_color,state,cluster
205946,lansing,6995,2012.0,ford,,,gas,202345.0,clean,automatic,,,,,,mi,0
206682,lansing,7995,2010.0,ford,,,gas,207891.0,clean,automatic,,,,,,mi,0
206299,lansing,7995,2015.0,ford,,,gas,59168.0,clean,automatic,,,,,,mi,0
206664,lansing,15995,2017.0,ford,,,gas,35898.0,clean,automatic,,,,,,mi,0
206296,lansing,9995,2010.0,ford,,,gas,169105.0,clean,automatic,,,,,,mi,0
206043,lansing,7995,2012.0,ford,,,gas,261044.0,clean,automatic,,,,,,mi,0
206535,lansing,14995,2014.0,ford,,,gas,148425.0,clean,automatic,,,,,,mi,0
206630,lansing,4495,2013.0,ford,,,gas,113082.0,clean,automatic,,,,,,mi,0
206662,lansing,6495,2008.0,ford,,,gas,187542.0,clean,automatic,,,,,,mi,0
205827,lansing,6995,2011.0,ford,,,gas,213302.0,clean,automatic,,,,,,mi,0
