In [None]:
#!pip install scikit-learn

In [1]:
!wget https://raw.githubusercontent.com/alexeygrigorev/mlbookcamp-code/master/chapter-02-car-price/data.csv

--2023-10-02 11:15:59--  https://raw.githubusercontent.com/alexeygrigorev/mlbookcamp-code/master/chapter-02-car-price/data.csv
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 2606:50c0:8003::154, 2606:50c0:8000::154, 2606:50c0:8002::154, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|2606:50c0:8003::154|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 1475504 (1.4M) [text/plain]
Saving to: ‘data.csv’


2023-10-02 11:15:59 (5.00 MB/s) - ‘data.csv’ saved [1475504/1475504]



In [163]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

### Read and prep data

In [164]:
df = pd.read_csv('data.csv')

In [165]:
df.head().T

Unnamed: 0,0,1,2,3,4
Make,BMW,BMW,BMW,BMW,BMW
Model,1 Series M,1 Series,1 Series,1 Series,1 Series
Year,2011,2011,2011,2011,2011
Engine Fuel Type,premium unleaded (required),premium unleaded (required),premium unleaded (required),premium unleaded (required),premium unleaded (required)
Engine HP,335.0,300.0,300.0,230.0,230.0
Engine Cylinders,6.0,6.0,6.0,6.0,6.0
Transmission Type,MANUAL,MANUAL,MANUAL,MANUAL,MANUAL
Driven_Wheels,rear wheel drive,rear wheel drive,rear wheel drive,rear wheel drive,rear wheel drive
Number of Doors,2.0,2.0,2.0,2.0,2.0
Market Category,"Factory Tuner,Luxury,High-Performance","Luxury,Performance","Luxury,High-Performance","Luxury,Performance",Luxury


In [166]:
df.columns = df.columns.str.replace(' ', '_').str.lower()

In [167]:
df = df[['make',
         'model',
         'year',
         'engine_hp',
         'engine_cylinders',
         'transmission_type',
         'vehicle_style',
         'highway_mpg',
         'city_mpg',
         'msrp']]

In [168]:
df.isnull().any()

make                 False
model                False
year                 False
engine_hp             True
engine_cylinders      True
transmission_type    False
vehicle_style        False
highway_mpg          False
city_mpg             False
msrp                 False
dtype: bool

In [169]:
df.engine_hp = df.engine_hp.fillna(0)
df.engine_cylinders = df.engine_cylinders.fillna(0)

In [170]:
df = df.rename(columns={'msrp': 'price'})

### Question 1

What is the most frequent observation (mode) for the column `transmission_type`?

- **`AUTOMATIC` <--**
- `MANUAL`
- `AUTOMATED_MANUAL`
- `DIRECT_DRIVE`

In [171]:
df.transmission_type.value_counts(normalize=False)

transmission_type
AUTOMATIC           8266
MANUAL              2935
AUTOMATED_MANUAL     626
DIRECT_DRIVE          68
UNKNOWN               19
Name: count, dtype: int64

### Question 2

Create the [correlation matrix](https://www.google.com/search?q=correlation+matrix) for the numerical features of your dataset. 
In a correlation matrix, you compute the correlation coefficient between every pair of features in the dataset.

What are the two features that have the biggest correlation in this dataset?

- `engine_hp` and `year`
- `engine_hp` and `engine_cylinders`
- `highway_mpg` and `engine_cylinders`
- **`highway_mpg` and `city_mpg` <--**


In [172]:
numerical = ['year',
             'engine_hp',
             'engine_cylinders',
             'highway_mpg',
             'city_mpg',
             'price']

categorical = ['make',
               'model',
               'transmission_type',
               'vehicle_style']

In [173]:
matrix = pd.DataFrame()
for n in numerical:
    col = pd.DataFrame(df[numerical].corrwith(df[n]), columns = [n])
    matrix = pd.concat([matrix, col], axis=1)

In [174]:
print("engine_hp and year", matrix.loc['engine_hp','year'])
print("engine_hp and engine_cylinders", matrix.loc['engine_hp','engine_cylinders'])
print("highway_mpg and engine_cylinders", matrix.loc['highway_mpg','engine_cylinders'])
print("highway_mpg and city_mpg", matrix.loc['highway_mpg','city_mpg'])

engine_hp and year 0.3387141847624468
engine_hp and engine_cylinders 0.7748509807813194
highway_mpg and engine_cylinders -0.6145414173953327
highway_mpg and city_mpg 0.8868294962591357


### Make `price` binary

* Now we need to turn the `price` variable from numeric into a binary format.
* Let's create a variable `above_average` which is `1` if the `price` is above its mean value and `0` otherwise.


In [175]:
mean_price = df['price'].mean()

In [176]:
mean_price

40594.737032063116

In [177]:
## Create a new column 'above_average' based on the condition
df['above_average'] = df['price'] > mean_price

## Convert True/False to 1/0
df['above_average'] = df['above_average'].astype(int)

In [178]:
df.head()

Unnamed: 0,make,model,year,engine_hp,engine_cylinders,transmission_type,vehicle_style,highway_mpg,city_mpg,price,above_average
0,BMW,1 Series M,2011,335.0,6.0,MANUAL,Coupe,26,19,46135,1
1,BMW,1 Series,2011,300.0,6.0,MANUAL,Convertible,28,19,40650,1
2,BMW,1 Series,2011,300.0,6.0,MANUAL,Coupe,28,20,36350,0
3,BMW,1 Series,2011,230.0,6.0,MANUAL,Coupe,28,18,29450,0
4,BMW,1 Series,2011,230.0,6.0,MANUAL,Convertible,28,18,34500,0


### Split the data

* Split your data in train/val/test sets with 60%/20%/20% distribution.
* Use Scikit-Learn for that (the `train_test_split` function) and set the seed to `42`.
* Make sure that the target value (`price`) is not in your dataframe.

In [179]:
from sklearn.model_selection import train_test_split
df_full_train, df_test = train_test_split(df, test_size=0.2, random_state=42)
df_train, df_val = train_test_split(df_full_train, test_size=0.25, random_state=42)
len(df_train), len(df_test), len(df_val)

(7148, 2383, 2383)

In [180]:
df_train = df_train.reset_index(drop=True)
df_test = df_test.reset_index(drop=True)
df_val = df_val.reset_index(drop=True)

In [181]:
## Assign Y variable as 'above_average' for each set - do not use 'price' as target variable
y_train = df_train.above_average.values
y_test = df_test.above_average.values
y_val = df_val.above_average.values

In [182]:
del df_train['price']
del df_test['price']
del df_val['price']

In [183]:
y_test

array([0, 0, 0, ..., 0, 0, 1])

### Question 3

* Calculate the mutual information score between `above_average` and other categorical variables in our dataset. 
  Use the training set only.
* Round the scores to 2 decimals using `round(score, 2)`.

Which of these variables has the lowest mutual information score?
  
- `make`
- `model`
- **`transmission_type` <--**
- `vehicle_style`


In [184]:
from sklearn.metrics import mutual_info_score

In [185]:
categorical

['make', 'model', 'transmission_type', 'vehicle_style']

In [186]:
for c in categorical:
    print(c, round(mutual_info_score(df_full_train.above_average, df_full_train[c]), 2))

make 0.24
model 0.46
transmission_type 0.02
vehicle_style 0.08


### Question 4

* Now let's train a logistic regression.
* Remember that we have several categorical variables in the dataset. Include them using one-hot encoding.
* Fit the model on the training dataset.
    - To make sure the results are reproducible across different versions of Scikit-Learn, fit the model with these parameters:
    - `model = LogisticRegression(solver='liblinear', C=10, max_iter=1000, random_state=42)`
* Calculate the accuracy on the validation dataset and round it to 2 decimal digits.

What accuracy did you get?

- 0.60
- 0.72
- 0.84
- **0.95 <--**

In [187]:
from sklearn.feature_extraction import DictVectorizer

In [188]:
numerical = ['year',
             'engine_hp',
             'engine_cylinders',
             'highway_mpg',
             'city_mpg']

categorical = ['make',
               'model',
               'transmission_type',
               'vehicle_style']

In [189]:
dv = DictVectorizer(sparse=False) #Initializing
train_dict = df_train[categorical + numerical].to_dict(orient='records') # Turn categorical and numerical variables into a dictionary 
X_train = dv.fit_transform(train_dict) # Teach the vectorizer what values are present

val_dict = df_val[categorical + numerical].to_dict(orient='records')
X_val = dv.transform(val_dict)

In [190]:
from sklearn.linear_model import LogisticRegression

In [191]:
model = LogisticRegression(solver='liblinear', C=10, max_iter=1000, random_state=42)
model.fit(X_train, y_train) # Fitting the model 

In [192]:
y_pred = model.predict_proba(X_val)[:, 1] # Returns the probability of price being > average
y_price_above_mean = (y_pred >= 0.5) # 0.5 threshold returns binary predictions

In [193]:
df_pred = pd.DataFrame()
df_pred['probability'] = y_pred # "Soft predictions" - probability that the price is above mean
df_pred['prediction'] = y_price_above_mean.astype(int)
df_pred['actual'] = y_val
df_pred['correct'] = df_pred.prediction == df_pred.actual

accuracy = df_pred.correct.mean().round(2) # Accuracy

In [194]:
df_pred['correct'] = df_pred.prediction == df_pred.actual

In [195]:
df_pred.head()

Unnamed: 0,probability,prediction,actual,correct
0,0.008735,0,0,True
1,0.990333,1,1,True
2,0.001575,0,0,True
3,0.403872,0,0,True
4,0.003179,0,0,True


In [196]:
accuracy = df_pred.correct.mean().round(2) # Accuracy

In [197]:
accuracy

0.93

### Question 5 

* Let's find the least useful feature using the *feature elimination* technique.
* Train a model with all these features (using the same parameters as in Q4).
* Now exclude each feature from this set and train a model without it. Record the accuracy for each model.
* For each feature, calculate the difference between the original accuracy and the accuracy without the feature. 

Which of following feature has the smallest difference?

- `year`
- **`engine_hp` <--**
- `transmission_type`
- `city_mpg`

> **Note**: the difference doesn't have to be positive

In [198]:
orig_score = accuracy
features = categorical + numerical

for f in features:
    dv = DictVectorizer(sparse=False) #Initializing
    sub = features.copy()
    sub.remove(f)

    train_dict = df_train[sub].to_dict(orient='records') # Turn categorical and numerical variables into a dictionary 
    X_train = dv.fit_transform(train_dict) # Teach the vectorizer what values are present

    model = LogisticRegression(solver='liblinear', C=10, max_iter=1000, random_state=42)
    model.fit(X_train, y_train) # Fitting the model 
    
    val_dict = df_val[sub].to_dict(orient='records')
    X_val = dv.transform(val_dict)

    y_pred = model.predict_proba(X_val)[:, 1] # Returns the probability of price being > average
    y_price_above_mean = (y_pred >= 0.5) # 0.5 threshold returns binary predictions

    df_pred = pd.DataFrame()
    df_pred['probability'] = y_pred # "Soft predictions" - probability that the price is above mean
    df_pred['prediction'] = y_price_above_mean.astype(int)
    df_pred['actual'] = y_val
    df_pred['correct'] = df_pred.prediction == df_pred.actual

    subset_score = df_pred.correct.mean() # Accuracy
    print(f, orig_score - subset_score, subset_score)

make -0.015027276542173729 0.9450272765421738
model 0.005954678976080596 0.9240453210239195
transmission_type -0.015866554762903884 0.9458665547629039
vehicle_style -0.0028577423415862002 0.9328577423415862
year -0.017545111204364194 0.9475451112043642
engine_hp 7.973143096939861e-05 0.9299202685690307
engine_cylinders -0.015866554762903884 0.9458665547629039
highway_mpg -0.01670583298363404 0.9467058329836341
city_mpg -0.015866554762903884 0.9458665547629039


### Question 6

* For this question, we'll see how to use a linear regression model from Scikit-Learn.
* We'll need to use the original column `price`. Apply the logarithmic transformation to this column.
* Fit the Ridge regression model on the training data with a solver `'sag'`. Set the seed to `42`.
* This model also has a parameter `alpha`. Let's try the following values: `[0, 0.01, 0.1, 1, 10]`.
* Round your RMSE scores to 3 decimal digits.

Which of these alphas leads to the best RMSE on the validation set?

In [199]:
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import Ridge
from sklearn.model_selection import train_test_split

In [200]:
df['log_price'] = np.log(df['price'])

In [201]:
df_full_train, df_test = train_test_split(df, test_size=0.2, random_state=42)
df_train, df_val = train_test_split(df_full_train, test_size=0.25, random_state=42)

df_train = df_train.reset_index(drop=True)
df_test = df_test.reset_index(drop=True)
df_val = df_val.reset_index(drop=True)

## Assign Y variable as 'above_average' for each set - do not use 'price' as target variable
y_train = df_train.log_price.values
y_test = df_test.log_price.values
y_val = df_val.log_price.values

len(df_train), len(df_test), len(df_val)

(7148, 2383, 2383)

In [202]:
numerical = ['year',
             'engine_hp',
             'engine_cylinders',
             'highway_mpg',
             'city_mpg']

categorical = ['make',
               'model',
               'transmission_type',
               'vehicle_style']

In [203]:
# RMSE 
def rmse(y, y_pred):
    error = y - y_pred
    se = error ** 2
    mse = se.mean()
    return np.sqrt(mse)

In [204]:
dv = DictVectorizer(sparse=False) #Initializing
train_dict = df_train[categorical + numerical].to_dict(orient='records') # Turn categorical and numerical variables into a dictionary 
X_train = dv.fit_transform(train_dict) # Teach the vectorizer what values are present

val_dict = df_val[categorical + numerical].to_dict(orient='records')
X_val = dv.transform(val_dict)

In [162]:
for a in [0.0, 0.01, 0.1, 1, 10]:
    model = Ridge(solver='sag', max_iter=1000, random_state=42, alpha=a)
    model.fit(X_train, y_train) # Fitting the model 
    y_pred = model.predict(X_val)
    score = round(rmse(y_val, y_pred), 3)
    print(a, score)
    



0.0 0.487




0.01 0.487




0.1 0.487




1 0.487
10 0.487


