In [1]:
# Download the dataset
!wget https://raw.githubusercontent.com/alexeygrigorev/mlbookcamp-code/master/chapter-02-car-price/data.csv

--2023-09-30 15:51:53--  https://raw.githubusercontent.com/alexeygrigorev/mlbookcamp-code/master/chapter-02-car-price/data.csv
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 2606:50c0:8002::154, 2606:50c0:8003::154, 2606:50c0:8000::154, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|2606:50c0:8002::154|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 1475504 (1,4M) [text/plain]
Saving to: ‘data.csv’


2023-09-30 15:51:54 (16,0 MB/s) - ‘data.csv’ saved [1475504/1475504]



In [84]:
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.feature_extraction import DictVectorizer

from sklearn.metrics import mutual_info_score

from sklearn.linear_model import LogisticRegression


In [3]:
car_prices_df = pd.read_csv("data.csv")

In [4]:
car_prices_df.head()

Unnamed: 0,Make,Model,Year,Engine Fuel Type,Engine HP,Engine Cylinders,Transmission Type,Driven_Wheels,Number of Doors,Market Category,Vehicle Size,Vehicle Style,highway MPG,city mpg,Popularity,MSRP
0,BMW,1 Series M,2011,premium unleaded (required),335.0,6.0,MANUAL,rear wheel drive,2.0,"Factory Tuner,Luxury,High-Performance",Compact,Coupe,26,19,3916,46135
1,BMW,1 Series,2011,premium unleaded (required),300.0,6.0,MANUAL,rear wheel drive,2.0,"Luxury,Performance",Compact,Convertible,28,19,3916,40650
2,BMW,1 Series,2011,premium unleaded (required),300.0,6.0,MANUAL,rear wheel drive,2.0,"Luxury,High-Performance",Compact,Coupe,28,20,3916,36350
3,BMW,1 Series,2011,premium unleaded (required),230.0,6.0,MANUAL,rear wheel drive,2.0,"Luxury,Performance",Compact,Coupe,28,18,3916,29450
4,BMW,1 Series,2011,premium unleaded (required),230.0,6.0,MANUAL,rear wheel drive,2.0,Luxury,Compact,Convertible,28,18,3916,34500


In [10]:
# Features used 
cols = ['Make', 'Model', 'Year', 'Engine HP', 'Engine Cylinders', 'Transmission Type', 'Vehicle Style',
        'highway MPG', 'city mpg']

target = ['MSRP']

car_prices_df2 = car_prices_df.copy()
car_prices_df2 = car_prices_df2[cols + target]

In [11]:
# Data preparation
car_prices_df2.columns = car_prices_df2.columns.str.replace(" ", "_").str.lower()

In [15]:
cols = ['make', 'model', 'year', 'engine_hp', 'engine_cylinders',
       'transmission_type', 'vehicle_style', 'highway_mpg', 'city_mpg']

In [12]:
car_prices_df2.isna().sum()

make                  0
model                 0
year                  0
engine_hp            69
engine_cylinders     30
transmission_type     0
vehicle_style         0
highway_mpg           0
city_mpg              0
msrp                  0
dtype: int64

In [16]:
car_prices_df2[cols] = car_prices_df2[cols].fillna(0) 

print(car_prices_df2.isna().sum())

make                 0
model                0
year                 0
engine_hp            0
engine_cylinders     0
transmission_type    0
vehicle_style        0
highway_mpg          0
city_mpg             0
msrp                 0
dtype: int64


In [19]:
# Most frequent observation for the column transmission_type
car_prices_df2["transmission_type"].value_counts()

transmission_type
AUTOMATIC           8266
MANUAL              2935
AUTOMATED_MANUAL     626
DIRECT_DRIVE          68
UNKNOWN               19
Name: count, dtype: int64

In [21]:
car_prices_df2.head(2)

Unnamed: 0,make,model,year,engine_hp,engine_cylinders,transmission_type,vehicle_style,highway_mpg,city_mpg,msrp
0,BMW,1 Series M,2011,335.0,6.0,MANUAL,Coupe,26,19,46135
1,BMW,1 Series,2011,300.0,6.0,MANUAL,Convertible,28,19,40650


In [32]:
corr_matrix = car_prices_df2[["year", "engine_hp", "engine_cylinders", "highway_mpg", "city_mpg", "msrp"]].corr(method="pearson")

corr_matrix_arr = corr_matrix.values
np.fill_diagonal(corr_matrix_arr, -1000)

max_pos = np.where(corr_matrix_arr == corr_matrix_arr.max())
max_pos[0], max_pos[1] # Position of the maximum correlation value

(array([3, 4]), array([4, 3]))

In [36]:
# Features that have the biggest correlation: highway_mpg and city_mpg
corr_matrix.iloc[max_pos[0], max_pos[1]]

Unnamed: 0,city_mpg,highway_mpg
highway_mpg,0.886829,-1000.0
city_mpg,-1000.0,0.886829


In [60]:
# Making price binary
mean_price = car_prices_df2["msrp"].mean()

above_average = car_prices_df2["msrp"].map(lambda value_: 1 if value_ >= mean_price else 0)

car_prices_df3 = car_prices_df2.copy()
car_prices_df3["above_average"] = above_average

del car_prices_df3["msrp"]

In [61]:
car_prices_df3.head()

Unnamed: 0,make,model,year,engine_hp,engine_cylinders,transmission_type,vehicle_style,highway_mpg,city_mpg,above_average
0,BMW,1 Series M,2011,335.0,6.0,MANUAL,Coupe,26,19,1
1,BMW,1 Series,2011,300.0,6.0,MANUAL,Convertible,28,19,1
2,BMW,1 Series,2011,300.0,6.0,MANUAL,Coupe,28,20,0
3,BMW,1 Series,2011,230.0,6.0,MANUAL,Coupe,28,18,0
4,BMW,1 Series,2011,230.0,6.0,MANUAL,Convertible,28,18,0


In [62]:
full_train_df, test_df = train_test_split(car_prices_df3, test_size=0.2, random_state=42)

In [63]:
2/8

0.25

In [64]:
train_df, val_df = train_test_split(full_train_df, test_size=0.25, random_state=42)

In [65]:
test_df.shape, test_df.shape, train_df.shape, val_df.shape

((2383, 10), (2383, 10), (7148, 10), (2383, 10))

In [66]:
y_train = train_df["above_average"]
y_val = val_df["above_average"]

In [67]:
del train_df["above_average"]
del val_df["above_average"]

In [73]:
def calculate_mi(series):
    return mutual_info_score(series, y_train)

In [70]:
train_df.head(3)

Unnamed: 0,make,model,year,engine_hp,engine_cylinders,transmission_type,vehicle_style,highway_mpg,city_mpg
3972,Mitsubishi,Endeavor,2011,225.0,6.0,AUTOMATIC,4dr SUV,19,15
1997,Kia,Borrego,2009,276.0,6.0,AUTOMATIC,4dr SUV,21,17
5216,Lamborghini,Gallardo,2012,570.0,10.0,MANUAL,Convertible,20,12


In [78]:
categorical_cols = ["make", "model", "transmission_type", "vehicle_style"]
numerical_cols = ["year", "engine_hp", "engine_cylinders", "highway_mpg", "city_mpg"]

In [74]:
mi_df = train_df[categorical_cols].apply(calculate_mi)
mi_df = mi_df.sort_values(ascending=False).to_frame(name='MI')

In [76]:
# Lowest mutual information
mi_df.round(2)

Unnamed: 0,MI
model,0.46
make,0.24
vehicle_style,0.08
transmission_type,0.02


In [79]:
# One-hot enconding
train_dict = train_df[categorical_cols + numerical_cols].to_dict(orient='records')

In [80]:
train_dict[0]

{'make': 'Mitsubishi',
 'model': 'Endeavor',
 'transmission_type': 'AUTOMATIC',
 'vehicle_style': '4dr SUV',
 'year': 2011,
 'engine_hp': 225.0,
 'engine_cylinders': 6.0,
 'highway_mpg': 19,
 'city_mpg': 15}

In [81]:
dv = DictVectorizer(sparse=False)
dv.fit(train_dict)


In [82]:
X_train = dv.transform(train_dict)

In [83]:
X_train.shape

(7148, 943)

In [86]:
# Training a Logistic Regression
model = LogisticRegression(solver='liblinear', C=10, max_iter=1000, random_state=42)

model.fit(X_train, y_train)

In [90]:
# Predicting on the val set
val_dict = val_df[categorical_cols + numerical_cols].to_dict(orient='records')
X_val = dv.transform(val_dict)

y_pred = model.predict_proba(X_val)[:, 1]
above_avg = y_pred >= 0.5

print(f"Accuracy: {round((y_val == above_avg).mean(), 2)}")


Accuracy: 0.95


In [105]:
# Finding the least useful feature
all_features = categorical_cols + numerical_cols
# n_features = len(all_features)

acc_dict = {}
# while n_features > 0:
for feature in all_features:
    # features_subset = all_features[:n_features]
    features_subset = [f for f in all_features if f != feature]
    print(f"Features used: {features_subset}. \nFeature excluded: {feature}\n\n")

    train_dict = train_df[features_subset].to_dict(orient='records')
    val_dict = val_df[features_subset].to_dict(orient='records')

    dv = DictVectorizer(sparse=False)
    dv.fit(train_dict)

    X_train = dv.transform(train_dict)
    X_val = dv.transform(val_dict)

    model = LogisticRegression(solver='liblinear', C=10, max_iter=1000, random_state=42)
    model.fit(X_train, y_train)

    y_pred = model.predict_proba(X_val)[:, 1]
    above_avg = y_pred >= 0.5

    currently_acc = (y_val == above_avg).mean()

    # acc_dict.update({str(n_features): (features_subset, currently_acc)})
    acc_dict.update({feature: currently_acc})


    # n_features = n_features - 1

print("Finished")

Features used: ['model', 'transmission_type', 'vehicle_style', 'year', 'engine_hp', 'engine_cylinders', 'highway_mpg', 'city_mpg']. 
Feature excluded: make


Features used: ['make', 'transmission_type', 'vehicle_style', 'year', 'engine_hp', 'engine_cylinders', 'highway_mpg', 'city_mpg']. 
Feature excluded: model


Features used: ['make', 'model', 'vehicle_style', 'year', 'engine_hp', 'engine_cylinders', 'highway_mpg', 'city_mpg']. 
Feature excluded: transmission_type


Features used: ['make', 'model', 'transmission_type', 'year', 'engine_hp', 'engine_cylinders', 'highway_mpg', 'city_mpg']. 
Feature excluded: vehicle_style


Features used: ['make', 'model', 'transmission_type', 'vehicle_style', 'engine_hp', 'engine_cylinders', 'highway_mpg', 'city_mpg']. 
Feature excluded: year


Features used: ['make', 'model', 'transmission_type', 'vehicle_style', 'year', 'engine_cylinders', 'highway_mpg', 'city_mpg']. 
Feature excluded: engine_hp


Features used: ['make', 'model', 'transmission_type'

In [107]:
aux_acc_dict = {}

for feature, acc in acc_dict.items():
    print(f"Acc without the feature {feature}: {acc}")
    diff = 0.95 - acc
    print(f"Accuracy diff: {diff}")
    aux_acc_dict.update({feature: diff})
    print()

Acc without the feature make: 0.9467058329836341
Accuracy diff: 0.0032941670163658676

Acc without the feature model: 0.9194292908099034
Accuracy diff: 0.030570709190096523

Acc without the feature transmission_type: 0.9404112463281578
Accuracy diff: 0.009588753671842198

Acc without the feature vehicle_style: 0.9320184641208561
Accuracy diff: 0.017981535879143862

Acc without the feature year: 0.9471254720939991
Accuracy diff: 0.0028745279060008455

Acc without the feature engine_hp: 0.9227864036928242
Accuracy diff: 0.02721359630717579

Acc without the feature engine_cylinders: 0.9454469156525388
Accuracy diff: 0.004553084347461156

Acc without the feature highway_mpg: 0.9467058329836341
Accuracy diff: 0.0032941670163658676

Acc without the feature city_mpg: 0.9458665547629039
Accuracy diff: 0.004133445237096023



In [108]:
sorted(aux_acc_dict.items(), key=lambda x : x[1])

[('year', 0.0028745279060008455),
 ('make', 0.0032941670163658676),
 ('highway_mpg', 0.0032941670163658676),
 ('city_mpg', 0.004133445237096023),
 ('engine_cylinders', 0.004553084347461156),
 ('transmission_type', 0.009588753671842198),
 ('vehicle_style', 0.017981535879143862),
 ('engine_hp', 0.02721359630717579),
 ('model', 0.030570709190096523)]