## Import Libraries

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt 
import seaborn as sns

from sklearn.model_selection import train_test_split
from sklearn.metrics import mutual_info_score
from sklearn.linear_model import LogisticRegression
from sklearn.linear_model import Ridge
from sklearn.metrics import mean_squared_error
from sklearn.metrics import accuracy_score
from sklearn.feature_extraction import DictVectorizer

%matplotlib inline

### Import Data

In [2]:
df = pd.read_csv('.\datasets\car_price_data.csv')
df.columns = df.columns.str.lower().str.replace(' ', '_')
df.head()

Unnamed: 0,make,model,year,engine_fuel_type,engine_hp,engine_cylinders,transmission_type,driven_wheels,number_of_doors,market_category,vehicle_size,vehicle_style,highway_mpg,city_mpg,popularity,msrp
0,BMW,1 Series M,2011,premium unleaded (required),335.0,6.0,MANUAL,rear wheel drive,2.0,"Factory Tuner,Luxury,High-Performance",Compact,Coupe,26,19,3916,46135
1,BMW,1 Series,2011,premium unleaded (required),300.0,6.0,MANUAL,rear wheel drive,2.0,"Luxury,Performance",Compact,Convertible,28,19,3916,40650
2,BMW,1 Series,2011,premium unleaded (required),300.0,6.0,MANUAL,rear wheel drive,2.0,"Luxury,High-Performance",Compact,Coupe,28,20,3916,36350
3,BMW,1 Series,2011,premium unleaded (required),230.0,6.0,MANUAL,rear wheel drive,2.0,"Luxury,Performance",Compact,Coupe,28,18,3916,29450
4,BMW,1 Series,2011,premium unleaded (required),230.0,6.0,MANUAL,rear wheel drive,2.0,Luxury,Compact,Convertible,28,18,3916,34500


In [3]:
columns_to_use = ['make','model','year','engine_hp','engine_cylinders','transmission_type','vehicle_style','highway_mpg','city_mpg','msrp']

In [4]:
df = df[columns_to_use]

In [5]:
df.rename(columns = { 'msrp' : 'price' }, inplace=True) 

In [6]:
df.head()

Unnamed: 0,make,model,year,engine_hp,engine_cylinders,transmission_type,vehicle_style,highway_mpg,city_mpg,price
0,BMW,1 Series M,2011,335.0,6.0,MANUAL,Coupe,26,19,46135
1,BMW,1 Series,2011,300.0,6.0,MANUAL,Convertible,28,19,40650
2,BMW,1 Series,2011,300.0,6.0,MANUAL,Coupe,28,20,36350
3,BMW,1 Series,2011,230.0,6.0,MANUAL,Coupe,28,18,29450
4,BMW,1 Series,2011,230.0,6.0,MANUAL,Convertible,28,18,34500


In [7]:
#missing values
missing_values = df.isnull().sum()
missing_values

make                  0
model                 0
year                  0
engine_hp            69
engine_cylinders     30
transmission_type     0
vehicle_style         0
highway_mpg           0
city_mpg              0
price                 0
dtype: int64

In [8]:
# fill missing values of selected features with 0 
df = df.fillna(0)

## Question 1

In [9]:
# most frequest observation(mode) for column transmission_tyoe
mode = df.transmission_type.mode()
mode

0    AUTOMATIC
Name: transmission_type, dtype: object

## Question 2

In [10]:
# Create Correlation matrix for the numerical features
numerical = ['year','engine_hp','engine_cylinders','highway_mpg','city_mpg','price']
categorical = ['make','model','transmission_type','vehicle_style']

In [11]:
data_numeric = df[numerical]
#data_numeric = data_numeric(['year','engine_hp','engine_cylinders','highway_mpg','city_mpg','price'])
data_numeric.describe()
data_numeric.corr()

Unnamed: 0,year,engine_hp,engine_cylinders,highway_mpg,city_mpg,price
year,1.0,0.338714,-0.040708,0.25824,0.198171,0.22759
engine_hp,0.338714,1.0,0.774851,-0.415707,-0.424918,0.650095
engine_cylinders,-0.040708,0.774851,1.0,-0.614541,-0.587306,0.526274
highway_mpg,0.25824,-0.415707,-0.614541,1.0,0.886829,-0.160043
city_mpg,0.198171,-0.424918,-0.587306,0.886829,1.0,-0.157676
price,0.22759,0.650095,0.526274,-0.160043,-0.157676,1.0


In [12]:
## convert price to binary
average_price = df.price.mean()
average_price

40594.737032063116

In [13]:
df.price = (df.price > average_price).astype(int)

In [14]:
df.head()

Unnamed: 0,make,model,year,engine_hp,engine_cylinders,transmission_type,vehicle_style,highway_mpg,city_mpg,price
0,BMW,1 Series M,2011,335.0,6.0,MANUAL,Coupe,26,19,1
1,BMW,1 Series,2011,300.0,6.0,MANUAL,Convertible,28,19,1
2,BMW,1 Series,2011,300.0,6.0,MANUAL,Coupe,28,20,0
3,BMW,1 Series,2011,230.0,6.0,MANUAL,Coupe,28,18,0
4,BMW,1 Series,2011,230.0,6.0,MANUAL,Convertible,28,18,0


## Setup Validation Framework

In [15]:
# split the dataset
# Split your data in train/val/test sets with 60%/20%/20% distribution.
# Use Scikit-Learn for that (the train_test_split function) and set the seed to 42.
df_full_train, df_test = train_test_split(df, test_size=0.2, random_state=42)
df_train, df_val = train_test_split(df_full_train, test_size=0.25, random_state=42)

In [16]:
len(df_full_train), len(df_test)

(9531, 2383)

In [17]:
len(df_train), len(df_val), len(df_test)

(7148, 2383, 2383)

In [18]:
df_train = df_train.reset_index(drop=True)
df_val = df_val.reset_index(drop=True)
df_test = df_test.reset_index(drop=True)

In [19]:
y_train = df_train.price.values
y_val = df_val.price.values
y_test = df_test.price.values

In [20]:
del df_train['price']
del df_val['price']
del df_test['price']

In [21]:
df_full_train = df_full_train.reset_index(drop=True)

In [22]:
df_full_train.isnull().sum()

make                 0
model                0
year                 0
engine_hp            0
engine_cylinders     0
transmission_type    0
vehicle_style        0
highway_mpg          0
city_mpg             0
price                0
dtype: int64

In [23]:
df_full_train.price

0       1
1       1
2       0
3       0
4       0
       ..
9526    0
9527    0
9528    0
9529    1
9530    0
Name: price, Length: 9531, dtype: int32

In [24]:
df_full_train.price.value_counts()

price
0    6893
1    2638
Name: count, dtype: int64

In [25]:
df_full_train.price.value_counts(normalize=True)

price
0    0.723219
1    0.276781
Name: proportion, dtype: float64

In [26]:
global_price = round(df_full_train.price.mean(), 2)
global_price

0.28

In [27]:
df_full_train.dtypes

make                  object
model                 object
year                   int64
engine_hp            float64
engine_cylinders     float64
transmission_type     object
vehicle_style         object
highway_mpg            int64
city_mpg               int64
price                  int32
dtype: object

## Question 3

In [28]:
# feature with the lowest mutal information score
def mutual_info_price_score(series):
    return mutual_info_score(series, df_full_train.price)

mi = df_full_train[categorical].apply(mutual_info_price_score)
mi.sort_values(ascending=False)
# mutual_info_score(df_full_train.price, df_full_train.

model                0.460994
make                 0.238724
vehicle_style        0.083390
transmission_type    0.020884
dtype: float64

## Question 4
- accuracy of the model

In [29]:
numerical

['year', 'engine_hp', 'engine_cylinders', 'highway_mpg', 'city_mpg', 'price']

In [30]:
# remove price
num = ['year', 'engine_hp', 'engine_cylinders', 'highway_mpg', 'city_mpg']

In [31]:
# Remember that we have several categorical variables in the dataset. 
# Include them using one-hot encoding.
train_dict = df_train[categorical + num].to_dict(orient='records')
train_dict[0]
dv = DictVectorizer(sparse=False)
dv.fit(train_dict)

In [32]:
X_train = dv.transform(train_dict)

In [33]:
X_train.shape

(7148, 943)

In [34]:
dv.get_feature_names_out()

array(['city_mpg', 'engine_cylinders', 'engine_hp', 'highway_mpg',
       'make=Acura', 'make=Alfa Romeo', 'make=Aston Martin', 'make=Audi',
       'make=BMW', 'make=Bentley', 'make=Bugatti', 'make=Buick',
       'make=Cadillac', 'make=Chevrolet', 'make=Chrysler', 'make=Dodge',
       'make=FIAT', 'make=Ferrari', 'make=Ford', 'make=GMC',
       'make=Genesis', 'make=HUMMER', 'make=Honda', 'make=Hyundai',
       'make=Infiniti', 'make=Kia', 'make=Lamborghini', 'make=Land Rover',
       'make=Lexus', 'make=Lincoln', 'make=Lotus', 'make=Maserati',
       'make=Maybach', 'make=Mazda', 'make=McLaren', 'make=Mercedes-Benz',
       'make=Mitsubishi', 'make=Nissan', 'make=Oldsmobile',
       'make=Plymouth', 'make=Pontiac', 'make=Porsche',
       'make=Rolls-Royce', 'make=Saab', 'make=Scion', 'make=Spyker',
       'make=Subaru', 'make=Suzuki', 'make=Tesla', 'make=Toyota',
       'make=Volkswagen', 'make=Volvo', 'model=1 Series', 'model=100',
       'model=124 Spider', 'model=190-Class', 'model

In [35]:
# Fit the model on the training dataset.
# To make sure the results are reproducible across different versions of Scikit-Learn, fit the model with these parameters:
model = LogisticRegression(solver='liblinear', C=10, max_iter=1000, random_state=42)
model.fit(X_train, y_train)

In [36]:
val_dict = df_val[categorical + num].to_dict(orient='records')
X_val = dv.transform(val_dict)

In [37]:
model.predict_proba(X_val)

array([[9.99154573e-01, 8.45426579e-04],
       [3.65807694e-03, 9.96341923e-01],
       [9.99850183e-01, 1.49816720e-04],
       ...,
       [9.99737589e-01, 2.62411354e-04],
       [1.02990295e-02, 9.89700971e-01],
       [1.23799943e-02, 9.87620006e-01]])

In [38]:
y_pred = model.predict_proba(X_val)[:,1]

In [39]:
y_pred

array([8.45426579e-04, 9.96341923e-01, 1.49816720e-04, ...,
       2.62411354e-04, 9.89700971e-01, 9.87620006e-01])

In [40]:
(y_val == average_price).mean()

0.0

In [41]:
# Calculate the accuracy on the validation dataset and round it to 2 decimal digits.
df_full_train[numerical].corrwith(df_full_train.price)

year                0.318753
engine_hp           0.660670
engine_cylinders    0.453162
highway_mpg        -0.134484
city_mpg           -0.157912
price               1.000000
dtype: float64