In [1]:
import copy
import joblib
import numpy as np
import pandas as pd
from sklearn.preprocessing import LabelEncoder, OneHotEncoder, StandardScaler
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression

#### Pip Installs

- pip install numpy
- pip install pandas
- pip install -U scikit-learn

In [2]:
# converting mileage total to corresponding rank value
# Reference: https://www.subarubr.com/what-is-good-mileage-for-a-used-car/#:~:text=General%20Guidelines:,to%20harsh%20or%20abusive%20conditions.
def mileage_rank(data):
    if data < 0:
        AssertionError('Invalid non-negative values found!')
    if data < 200000:
        return np.ceil(data / 50000)
    return 5.0

In [3]:
# retrieving UAE Used Car Prices & Features (10K+ Listings) dataset
data = pd.read_csv('../data/uae_used_cars_10k.csv')
data.head()

Unnamed: 0,Make,Model,Year,Price,Mileage,Body Type,Cylinders,Transmission,Fuel Type,Color,Location,Description
0,toyota,camry,2016,47819,156500,Sedan,4,Automatic Transmission,Gasoline,Black,Dubai,"2016 toyota camry with Rear camera, Leather se..."
1,kia,sorento,2013,61250,169543,SUV,4,Automatic Transmission,Gasoline,Grey,Abu Dhabi,"2013 kia sorento with Sunroof, Adaptive cruise..."
2,mini,cooper,2023,31861,221583,Soft Top Convertible,4,Automatic Transmission,Gasoline,Grey,Dubai,"2023 mini cooper with Adaptive cruise control,..."
3,nissan,altima,2016,110322,69754,Sedan,4,Automatic Transmission,Gasoline,Red,Dubai,"2016 nissan altima with Rear camera, Adaptive ..."
4,toyota,land-cruiser-76-series,2020,139994,71399,Pick Up Truck,4,Manual Transmission,Gasoline,White,Dubai,2020 toyota land-cruiser-76-series with Adapti...


In [4]:
# basic information (checking for nulls)
# Cylinders attribute includes 105 null values
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10000 entries, 0 to 9999
Data columns (total 12 columns):
 #   Column        Non-Null Count  Dtype 
---  ------        --------------  ----- 
 0   Make          10000 non-null  object
 1   Model         10000 non-null  object
 2   Year          10000 non-null  int64 
 3   Price         10000 non-null  int64 
 4   Mileage       10000 non-null  int64 
 5   Body Type     10000 non-null  object
 6   Cylinders     9895 non-null   object
 7   Transmission  10000 non-null  object
 8   Fuel Type     10000 non-null  object
 9   Color         10000 non-null  object
 10  Location      10000 non-null  object
 11  Description   10000 non-null  object
dtypes: int64(3), object(9)
memory usage: 937.6+ KB


In [5]:
# replacing NaN in Cylinders column to mean value
cylinders_with_nan = pd.to_numeric(data['Cylinders'], errors='coerce')
mean = int(np.floor(cylinders_with_nan.mean()))
data['Cylinders'] = data['Cylinders'].fillna(mean)

In [6]:
# sanity check 
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10000 entries, 0 to 9999
Data columns (total 12 columns):
 #   Column        Non-Null Count  Dtype 
---  ------        --------------  ----- 
 0   Make          10000 non-null  object
 1   Model         10000 non-null  object
 2   Year          10000 non-null  int64 
 3   Price         10000 non-null  int64 
 4   Mileage       10000 non-null  int64 
 5   Body Type     10000 non-null  object
 6   Cylinders     10000 non-null  object
 7   Transmission  10000 non-null  object
 8   Fuel Type     10000 non-null  object
 9   Color         10000 non-null  object
 10  Location      10000 non-null  object
 11  Description   10000 non-null  object
dtypes: int64(3), object(9)
memory usage: 937.6+ KB


In [7]:
# applying rank to mileage intervals (50000 interval until 200000 mileage)
data['Rank'] = data['Mileage'].apply(mileage_rank)
# converting fuel type to numeric associations: Gasoline = 2, Diesel = 1, Other = 0
data['Fuel Type'] = data['Fuel Type'].apply(lambda type: 2.0 if type == 'Gasoline' else (1.0 if type == 'Diesel' else 0.0))
# converting transmission column to binary values
data['Transmission'] = data['Transmission'].apply(lambda type: 1.0 if type == 'Automatic Transmission' else 0.0)
# dropping unneeded Location and Description attributes
data = data.drop(columns=['Location', 'Description', 'Mileage'])
data

Unnamed: 0,Make,Model,Year,Price,Body Type,Cylinders,Transmission,Fuel Type,Color,Rank
0,toyota,camry,2016,47819,Sedan,4,1.0,2.0,Black,4.0
1,kia,sorento,2013,61250,SUV,4,1.0,2.0,Grey,4.0
2,mini,cooper,2023,31861,Soft Top Convertible,4,1.0,2.0,Grey,5.0
3,nissan,altima,2016,110322,Sedan,4,1.0,2.0,Red,2.0
4,toyota,land-cruiser-76-series,2020,139994,Pick Up Truck,4,0.0,2.0,White,2.0
...,...,...,...,...,...,...,...,...,...,...
9995,tesla,model-3,2018,273413,Sedan,6,1.0,0.0,White,2.0
9996,audi,a3,2022,80053,Sedan,4,1.0,2.0,Red,5.0
9997,toyota,prado,2014,183381,SUV,6,1.0,2.0,White,2.0
9998,peugeot,expert,2016,40876,Utility Truck,4,1.0,1.0,White,5.0


In [8]:
columns = data.columns
# visual on unique values for each column
for column in columns:
    print(f"Column {column}: {len(data[column].unique())} unique values")

Column Make: 65 unique values
Column Model: 488 unique values
Column Year: 20 unique values
Column Price: 9848 unique values
Column Body Type: 13 unique values
Column Cylinders: 9 unique values
Column Transmission: 2 unique values
Column Fuel Type: 3 unique values
Column Color: 17 unique values
Column Rank: 5 unique values


In [9]:
# instantiating scaler and encoder
scaler = StandardScaler(with_mean=False)
one_hot_columns_to_convert = ['Body Type', 'Color']
standardize_columns_to_convert = ['Make', 'Model', 'Year']

In [10]:
# encoding select columns into numeric representations
columns_to_scale = list(one_hot_columns_to_convert)
for column in one_hot_columns_to_convert:
    # one hot encoding with pandas get_dummies method
    data = pd.concat([data, pd.get_dummies(data[column], dtype=float)], axis=1)
    columns_to_scale.remove(column)
    
# dropping redundant old columns
data = data.drop(columns=one_hot_columns_to_convert)


In [11]:

columns_to_scale = list(standardize_columns_to_convert)

for column in standardize_columns_to_convert:
    # encoding labels into numeric values before scaling
    data[column] = data[column].astype('category').cat.codes
    columns_to_scale.remove(column)

# using standardized scaler to transform select columns into values between -1 and 1 and centered at mean =
data[standardize_columns_to_convert] = pd.DataFrame(scaler.fit_transform(data[standardize_columns_to_convert]),columns=standardize_columns_to_convert)


In [12]:
# preprocessed set
data

Unnamed: 0,Make,Model,Year,Price,Cylinders,Transmission,Fuel Type,Rank,Coupe,Crossover,...,Grey,Orange,Other Color,Purple,Red,Silver,Tan,Teal,White,Yellow
0,3.375284,0.721931,1.899647,47819,4,1.0,2.0,4.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,1.659975,3.098285,1.381562,61250,4,1.0,2.0,4.0,0.0,0.0,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,2.434631,0.940014,3.108513,31861,4,1.0,2.0,5.0,0.0,0.0,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,2.600628,0.451207,1.899647,110322,4,1.0,2.0,2.0,0.0,0.0,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
4,3.375284,2.030430,2.590428,139994,4,0.0,2.0,2.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9995,3.319951,2.248513,2.245037,273413,6,1.0,0.0,2.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
9996,0.165998,0.376006,2.935818,80053,4,1.0,2.0,5.0,0.0,0.0,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
9997,3.375284,2.556837,1.554257,183381,6,1.0,2.0,2.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
9998,2.766626,1.338580,1.899647,40876,4,1.0,1.0,5.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0


In [13]:
# sending fully preprocessed data and scaler to new files
data.to_csv('../data/preprocessed_data.csv')
joblib.dump(scaler, '../models/preprocessing/standard_scaler.pkl')

['../models/preprocessing/standard_scaler.pkl']