## 3. Classification

In [1]:
# Import libraries
import numpy as np
import pandas as pd
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.metrics import mutual_info_score
from sklearn.feature_extraction import DictVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.metrics import classification_report
from sklearn.linear_model import Ridge
from sklearn.metrics import mean_squared_error
from math import sqrt

## 2.2 Data preparation

In [2]:
file_path = 'https://raw.githubusercontent.com/alexeygrigorev/mlbookcamp-code/master/chapter-02-car-price/data.csv'

In [3]:
df = pd.read_csv(file_path)

In [4]:
# Check dataframe
df.sample(10)

Unnamed: 0,Make,Model,Year,Engine Fuel Type,Engine HP,Engine Cylinders,Transmission Type,Driven_Wheels,Number of Doors,Market Category,Vehicle Size,Vehicle Style,highway MPG,city mpg,Popularity,MSRP
11438,Ford,Windstar,2002,regular unleaded,200.0,6.0,AUTOMATIC,front wheel drive,4.0,,Midsize,Passenger Minivan,21,16,5657,26375
145,BMW,3 Series,2016,premium unleaded (required),320.0,6.0,AUTOMATIC,all wheel drive,4.0,"Luxury,High-Performance",Midsize,Sedan,33,22,3916,47800
4177,Cadillac,Escalade,2017,premium unleaded (recommended),420.0,8.0,AUTOMATIC,four wheel drive,4.0,"Luxury,Performance",Large,4dr SUV,20,15,1624,75995
2088,Mercedes-Benz,C-Class,2017,premium unleaded (required),241.0,4.0,AUTOMATIC,all wheel drive,4.0,Luxury,Midsize,Sedan,31,24,617,43675
3620,Dodge,Durango,2016,regular unleaded,293.0,6.0,AUTOMATIC,rear wheel drive,4.0,"Crossover,Performance",Large,4dr SUV,27,19,1851,30495
1839,Volkswagen,Beetle Convertible,2015,diesel,150.0,4.0,MANUAL,front wheel drive,2.0,Diesel,Compact,Convertible,40,30,873,29095
3909,Honda,Element,2011,regular unleaded,166.0,4.0,AUTOMATIC,all wheel drive,4.0,Crossover,Compact,4dr SUV,24,19,2202,22075
6092,Volkswagen,Jetta SportWagen,2014,diesel,140.0,4.0,AUTOMATED_MANUAL,front wheel drive,4.0,Diesel,Compact,Wagon,39,29,873,29465
2524,Chevrolet,City Express,2016,regular unleaded,131.0,4.0,AUTOMATIC,front wheel drive,4.0,,Compact,Cargo Minivan,26,24,1385,23515
9528,Chevrolet,Silverado 1500,2015,regular unleaded,355.0,8.0,AUTOMATIC,rear wheel drive,4.0,,Large,Extended Cab Pickup,23,16,1385,40100


In [5]:
# Define columns we want to keep
columns_to_keep = [
'Make',
'Model',
'Year',
'Engine HP',
'Engine Cylinders',
'Transmission Type',
'Vehicle Style',
'highway MPG',
'city mpg',
'MSRP'
]

# Keep only required columns for the project
df_filtered = df[columns_to_keep]

In [6]:
# Check missing values
missing_values = df_filtered.isnull().sum()
print(missing_values)

Make                  0
Model                 0
Year                  0
Engine HP            69
Engine Cylinders     30
Transmission Type     0
Vehicle Style         0
highway MPG           0
city mpg              0
MSRP                  0
dtype: int64


In [7]:
df_filtered = df_filtered.fillna(0)

In [8]:
df_filtered.columns = df_filtered.columns.str.replace(' ', '_').str.lower()

In [9]:
df_filtered.rename(columns={'msrp': 'price'}, inplace=True)

## Question 1

In [10]:
# The most frequent observation (mode) for the column transmission_type
df_filtered['transmission_type'].mode().iloc[0]

'AUTOMATIC'

## Question 2

In [11]:
correlation_matrix = df_filtered.corr()
correlation_matrix

  correlation_matrix = df_filtered.corr()


Unnamed: 0,year,engine_hp,engine_cylinders,highway_mpg,city_mpg,price
year,1.0,0.338714,-0.040708,0.25824,0.198171,0.22759
engine_hp,0.338714,1.0,0.774851,-0.415707,-0.424918,0.650095
engine_cylinders,-0.040708,0.774851,1.0,-0.614541,-0.587306,0.526274
highway_mpg,0.25824,-0.415707,-0.614541,1.0,0.886829,-0.160043
city_mpg,0.198171,-0.424918,-0.587306,0.886829,1.0,-0.157676
price,0.22759,0.650095,0.526274,-0.160043,-0.157676,1.0


In [12]:
np.fill_diagonal(correlation_matrix.values, 0)
correlation_matrix.unstack().idxmax()

('highway_mpg', 'city_mpg')

### Make price binary

In [13]:
mean_price = df_filtered['price'].mean()
mean_price

40594.737032063116

In [14]:
df_filtered['above_average'] = np.where(df_filtered['price'] > mean_price, 1, 0)

### Split the data

In [15]:
df_full_train, df_test = train_test_split(df_filtered, test_size=0.2, random_state=42)
df_train, df_val = train_test_split(df_full_train, test_size=0.25, random_state=42)

In [16]:
df_train = df_train.reset_index(drop=True)
df_val = df_val.reset_index(drop=True)
df_test = df_test.reset_index(drop=True)

In [17]:
df_train

Unnamed: 0,make,model,year,engine_hp,engine_cylinders,transmission_type,vehicle_style,highway_mpg,city_mpg,price,above_average
0,Mitsubishi,Endeavor,2011,225.0,6.0,AUTOMATIC,4dr SUV,19,15,33599,0
1,Kia,Borrego,2009,276.0,6.0,AUTOMATIC,4dr SUV,21,17,26245,0
2,Lamborghini,Gallardo,2012,570.0,10.0,MANUAL,Convertible,20,12,248000,1
3,Chevrolet,Colorado,2016,200.0,4.0,AUTOMATIC,Crew Cab Pickup,27,20,24990,0
4,Pontiac,Vibe,2009,158.0,4.0,AUTOMATIC,4dr Hatchback,26,20,20475,0
...,...,...,...,...,...,...,...,...,...,...,...
7143,Toyota,Sienna,2016,266.0,6.0,AUTOMATIC,Passenger Minivan,25,18,37655,0
7144,Chevrolet,HHR,2009,260.0,4.0,MANUAL,Wagon,29,21,25135,0
7145,Hyundai,Veracruz,2012,260.0,6.0,AUTOMATIC,4dr SUV,22,17,28345,0
7146,Mitsubishi,Expo,1993,136.0,4.0,MANUAL,2dr Hatchback,26,19,2000,0


In [18]:
y_train = df_train.above_average.values
y_val = df_val.above_average.values
y_test = df_test.above_average.values

df_train.drop(['above_average', 'price'], axis=1, inplace=True)

df_val.drop(['above_average', 'price'], axis=1, inplace=True)
df_test.drop(['above_average', 'price'], axis=1, inplace=True)


In [19]:
print(f'Training set: {df_train.shape[0]} samples')
print(f'Validation set: {df_val.shape[0]} samples')
print(f'Test set: {df_test.shape[0]} samples')

Training set: 7148 samples
Validation set: 2383 samples
Test set: 2383 samples


## Question 3

In [20]:
categorical = [
    'make',
    'model',
    'vehicle_style',
    'transmission_type'
]

In [21]:
numerical = [
    'year',
    'engine_hp',
    'highway_mpg',
    'city_mpg',
    'engine_cylinders']

In [22]:
def mutual_info_price_score(series):
    return mutual_info_score(series, df_full_train['above_average'])

In [23]:
mi = df_full_train[categorical].apply(mutual_info_price_score)
mi_sorted = mi.sort_values(ascending=False)
mi_sorted

model                0.460994
make                 0.238724
vehicle_style        0.083390
transmission_type    0.020884
dtype: float64

## Question 4

### One-hot encoding

In [24]:
dv = DictVectorizer(sparse=False)

train_dict = df_train[categorical + numerical].to_dict(orient='records')
X_train = dv.fit_transform(train_dict)

val_dict = df_val[categorical + numerical].to_dict(orient='records')
X_val = dv.transform(val_dict)


In [25]:
model = LogisticRegression(solver='liblinear', C=10, max_iter=1000, random_state=42)
# solver='lbfgs' is the default solver in newer version of sklearn
# for older versions, you need to specify it explicitly
model.fit(X_train, y_train)

In [26]:
y_pred = model.predict(X_val)

In [27]:
acc_all = accuracy_score(y_val, y_pred)

acc_all_rounded = round(acc_all, 2)

In [28]:
acc_all_rounded

0.93

In [29]:
df_train.columns

Index(['make', 'model', 'year', 'engine_hp', 'engine_cylinders',
       'transmission_type', 'vehicle_style', 'highway_mpg', 'city_mpg'],
      dtype='object')

## Question 5

In [30]:
features = ['make', 'model', 'year', 'engine_hp', 'engine_cylinders',
       'transmission_type', 'vehicle_style', 'highway_mpg', 'city_mpg']
features

['make',
 'model',
 'year',
 'engine_hp',
 'engine_cylinders',
 'transmission_type',
 'vehicle_style',
 'highway_mpg',
 'city_mpg']

In [31]:
model_params = {'solver': 'liblinear', 'C': 10, 'max_iter': 1000, 'random_state': 42}

acc_diff = {}

for idx, feature in enumerate(features): 
    X_train_subset = np.delete(X_train, idx, axis=1)
    X_val_subset = np.delete(X_val, idx, axis=1)
    
    model = LogisticRegression(**model_params)
    model.fit(X_train_subset, y_train)
    
    y_pred = model.predict(X_val_subset)
    acc_subset = accuracy_score(y_val, y_pred)
    
    acc_diff[feature] = acc_all - acc_subset
    print(f"feature {feature}")
    print(f"accuracy score of the subset {acc_subset}")

least_useful_feature = min(acc_diff, key=acc_diff.get)
print(f"original accuracy {acc_all}")
print(f"least useful feature {least_useful_feature}")

feature make
accuracy score of the subset 0.946286193873269
feature model
accuracy score of the subset 0.946286193873269
feature year
accuracy score of the subset 0.9236256819135543
feature engine_hp
accuracy score of the subset 0.9412505245488879
feature engine_cylinders
accuracy score of the subset 0.9454469156525388
feature transmission_type
accuracy score of the subset 0.9450272765421738
feature vehicle_style
accuracy score of the subset 0.9395719681074276
feature highway_mpg
accuracy score of the subset 0.946286193873269
feature city_mpg
accuracy score of the subset 0.9458665547629039
original accuracy 0.9345362987830466
least useful feature make


## Question 6

In [32]:
df_filtered.drop(['above_average'], axis=1, inplace=True)

In [33]:
df_filtered['price'] =  np.log1p(df_filtered['price'])


In [34]:
df_full_train, df_test = train_test_split(df_filtered, test_size=0.2, random_state=42)
df_train, df_val = train_test_split(df_full_train, test_size=0.25, random_state=42)

In [35]:
df_train = df_train.reset_index(drop=True)
df_val = df_val.reset_index(drop=True)
df_test = df_test.reset_index(drop=True)

In [36]:
y_train = df_train.price.values
y_val = df_val.price.values
y_test = df_test.price.values

df_train.drop(['price'], axis=1, inplace=True)
df_val.drop(['price'], axis=1, inplace=True)
df_test.drop(['price'], axis=1, inplace=True)


In [37]:
print(f'Training set: {df_train.shape[0]} samples')
print(f'Validation set: {df_val.shape[0]} samples')
print(f'Test set: {df_test.shape[0]} samples')

Training set: 7148 samples
Validation set: 2383 samples
Test set: 2383 samples


In [41]:
dv = DictVectorizer(sparse=False)

train_dict = df_train[categorical + numerical].to_dict(orient='records')
X_train = dv.fit_transform(train_dict)

val_dict = df_val[categorical + numerical].to_dict(orient='records')
X_val = dv.transform(val_dict)


In [42]:
alphas = [0, 0.01, 0.1, 1, 10]

In [43]:
best_alpha = None
best_rmse = np.inf

In [None]:
for alpha in alphas:
    model = Ridge(alpha=alpha, solver='sag', random_state=42)
    model.fit(X_train, y_train)
    predictions_val = model.predict(X_val)
    rmse = sqrt(mean_squared_error(y_val, predictions_val))
    
    if rmse < best_rmse:
        best_alpha = alpha
        best_rmse = rmse
        
    print(f'Alpha: {alpha}, RMSE: {round(rmse,7)}')
    
print(f'Best alpha: {best_alpha}, Best RMSE: {round(best_rmse,3)}')



Alpha: 0, RMSE: 0.4867943




Alpha: 0.01, RMSE: 0.4867946




Alpha: 0.1, RMSE: 0.4867967


