In [1]:
import numpy as np
import pandas as pd
from sklearn.feature_extraction import DictVectorizer
from sklearn.linear_model import LogisticRegression, Ridge
from sklearn.metrics import mean_squared_error, mutual_info_score
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder

In [3]:
data = "https://raw.githubusercontent.com/alexeygrigorev/mlbookcamp-code/master/chapter-02-car-price/data.csv"
!wget $data

--2023-09-30 02:51:48--  https://raw.githubusercontent.com/alexeygrigorev/mlbookcamp-code/master/chapter-02-car-price/data.csv
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.111.133, 185.199.108.133, 185.199.110.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.111.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 1475504 (1.4M) [text/plain]
Saving to: ‘data.csv’


2023-09-30 02:51:48 (42.6 MB/s) - ‘data.csv’ saved [1475504/1475504]



# Data prep

In [2]:
df = pd.read_csv("data.csv")

In [3]:
df.columns

Index(['Make', 'Model', 'Year', 'Engine Fuel Type', 'Engine HP',
       'Engine Cylinders', 'Transmission Type', 'Driven_Wheels',
       'Number of Doors', 'Market Category', 'Vehicle Size', 'Vehicle Style',
       'highway MPG', 'city mpg', 'Popularity', 'MSRP'],
      dtype='object')

In [4]:
df_cols = [
    "Make",
    "Model",
    "Year",
    "Engine HP",
    "Engine Cylinders",
    "Transmission Type",
    "Vehicle Style",
    "highway MPG",
    "city mpg",
    "MSRP",
    ]

In [5]:
def tweak_df(table: pd.DataFrame) -> pd.DataFrame:
    table = table.loc[:, df_cols].copy()
    table.columns = table.columns.str.replace(' ', '_').str.lower()
    table = (table
        .rename(columns={"msrp": "price"})
        .fillna(0)
        )
    return table

In [6]:
df = tweak_df(df)

In [7]:
sorted(df)

['city_mpg',
 'engine_cylinders',
 'engine_hp',
 'highway_mpg',
 'make',
 'model',
 'price',
 'transmission_type',
 'vehicle_style',
 'year']

In [8]:
df.shape

(11914, 10)

In [9]:
df.isnull().sum()

make                 0
model                0
year                 0
engine_hp            0
engine_cylinders     0
transmission_type    0
vehicle_style        0
highway_mpg          0
city_mpg             0
price                0
dtype: int64

In [10]:
df.head(2).T

Unnamed: 0,0,1
make,BMW,BMW
model,1 Series M,1 Series
year,2011,2011
engine_hp,335.0,300.0
engine_cylinders,6.0,6.0
transmission_type,MANUAL,MANUAL
vehicle_style,Coupe,Convertible
highway_mpg,26,28
city_mpg,19,19
price,46135,40650


In [11]:
df.dtypes

make                  object
model                 object
year                   int64
engine_hp            float64
engine_cylinders     float64
transmission_type     object
vehicle_style         object
highway_mpg            int64
city_mpg               int64
price                  int64
dtype: object

# Question 1
What is the most frequent observation (mode) for the column transmission_type?

* AUTOMATIC
* MANUAL
* AUTOMATED_MANUAL
* DIRECT_DRIVE

In [12]:
df_mode = df.transmission_type.mode()
df_mode

0    AUTOMATIC
Name: transmission_type, dtype: object

In [13]:
print(f"The most frequent transmission type is {df_mode.values[0]}.")

The most frequent transmission type is AUTOMATIC.


# Question 2
Create the correlation matrix for the numerical features of your dataset. In a correlation matrix, you compute the correlation coefficient between every pair of features in the dataset.

What are the two features that have the biggest correlation in this dataset?

* engine_hp and year
* engine_hp and engine_cylinders
* highway_mpg and engine_cylinders
* highway_mpg and city_mpg

In [14]:
df[["engine_hp"]].corrwith(df["price"])

engine_hp    0.650095
dtype: float64

In [15]:
df[["engine_hp"]].corrwith(df["engine_cylinders"])

engine_hp    0.774851
dtype: float64

In [16]:
df[["highway_mpg"]].corrwith(df["engine_cylinders"])

highway_mpg   -0.614541
dtype: float64

In [17]:
df[["highway_mpg"]].corrwith(df["city_mpg"])

highway_mpg    0.886829
dtype: float64

In [18]:
print("The features with biggest correlation are highway_mpg and city_mpg.")

The features with biggest correlation are highway_mpg and city_mpg.


# Make price binary
Now we need to turn the price variable from numeric into a binary format.

Let's create a variable above_average which is 1 if the price is above its mean value and 0 otherwise.

In [19]:
df["above_average"] = (df["price"] > df["price"].mean()).astype(int)

# Split the data
Split your data in train/val/test sets with 60%/20%/20% distribution

Use Scikit-Learn for that (the train_test_split function) and set the seed to 42.

Make sure that the target value (above_average) is not in your dataframe.

In [20]:
df_full_train, df_test = train_test_split(df, test_size=0.2, random_state=42)
df_train, df_val = train_test_split(df_full_train, test_size=0.2, random_state=42)

In [21]:
df_train.shape[0] + df_val.shape[0] + df_test.shape[0] == df.shape[0]

True

In [22]:
df_train = df_train.reset_index(drop=True)
df_val = df_val.reset_index(drop=True)
df_test = df_test.reset_index(drop=True)

In [23]:
y_train = df_train["above_average"].values
y_val = df_val["above_average"].values
y_test = df_test["above_average"].values

#For Q6
y_train6 = df_train["price"].values
y_val6 = df_val["price"].values
y_test6 = df_test["price"].values

In [24]:
del df_train["above_average"], df_train["price"], df_val["above_average"], df_val["price"], df_test["above_average"], df_test["price"]

In [25]:
proportion = df_full_train.above_average.value_counts(normalize=True)
print(f"The above_average rate is approximately {proportion[1]:.2%}.")

The above_average rate is approximately 27.68%.


# Question 3
Calculate the mutual information score between above_average and other categorical variables in our dataset. Use the training set only.

Round the scores to 2 decimals using round(score, 2).

Which of these variables has the lowest mutual information score?

* make
* model
* transmission_type
* vehicle_style

In [26]:
categorical_columns = list(df_train.select_dtypes("object"))
numerical_columns = list(df_train.select_dtypes("number"))

In [27]:
def mutual_above_average_score(series: pd.Series) -> pd.Series:
    score = mutual_info_score(series, y_train)
    return round(score, 2)

In [28]:
mi = df_train[categorical_columns].apply(mutual_above_average_score)
mi.sort_values()

transmission_type    0.02
vehicle_style        0.08
make                 0.24
model                0.46
dtype: float64

In [29]:
print("transmission_type is the variable with the lowest mutual information score.")

transmission_type is the variable with the lowest mutual information score.


# Question 4
Now let's train a logistic regression.

Remember that we have several categorical variables in the dataset. 

Include them using one-hot encoding.

Fit the model on the training dataset.

To make sure the results are reproducible across different versions of Scikit-Learn, fit the model with these parameters:

```model = LogisticRegression(solver='liblinear', C=10, max_iter=1000, random_state=42)```

Calculate the accuracy on the validation dataset and round it to 2 decimal digits.

What accuracy did you get?

* 0.60
* 0.72
* 0.84
* 0.95

In [30]:
train_dicts = df_train.loc[:, categorical_columns + numerical_columns].to_dict(orient='records')
val_dicts = df_val.loc[:, categorical_columns + numerical_columns].to_dict(orient='records')

In [31]:
dv = DictVectorizer(sparse=False)
X_train = dv.fit_transform(train_dicts)
X_val = dv.transform(val_dicts)

In [32]:
model = LogisticRegression(solver='liblinear', C=10, max_iter=1000, random_state=42)

In [33]:
model.fit(X_train, y_train)

In [34]:
y_pred = model.predict_proba(X_val)[:, 1]

In [35]:
high_price = (y_pred >= 0.5)
round((high_price == y_val).mean(), 2)

0.93

# Question 5
Let's find the least useful feature using the feature elimination technique.

Train a model with all these features (using the same parameters as in Q4).

Now exclude each feature from this set and train a model without it. 

Record the accuracy for each model.

For each feature, calculate the difference between the original accuracy and the accuracy without the feature.

Which of following feature has the smallest difference?

* year
* engine_hp
* transmission_type
* city_mpg

*Note: the difference doesn't have to be positive*

In [36]:
features = categorical_columns + numerical_columns

accuracy_diff = {}

model = LogisticRegression(solver='liblinear', C=10, max_iter=1000, random_state=42)
model.fit(X_train, y_train)
y_pred = model.predict_proba(X_val)[:, 1]
accuracy_all = (y_pred == y_val).mean()
print(f"Accuracy with all features: {accuracy_all}")

for feature in features:
    features_to_exclude = [col for col in features if col != feature]
    _features = [i for i, col in enumerate(dv.get_feature_names_out()) if col in features_to_exclude]
    X_train_excluded = X_train[:, _features]
    X_val_excluded = X_val[:, _features]
    
    model.fit(X_train_excluded, y_train)
    y_pred_excluded = model.predict(X_val_excluded)
    
    accuracy_excluded = (y_pred_excluded == y_val).mean()
    
    accuracy_diff[feature] = accuracy_all - accuracy_excluded

print("Accuracy differences for each feature:")
for feature, diff in accuracy_diff.items():
    print(f"{feature}: {diff}")

print("\nThe feature with the smallest difference is: engine_hp.")

Accuracy with all features: 0.0


Accuracy differences for each feature:
make: -0.8772941793392763
model: -0.8772941793392763
transmission_type: -0.8772941793392763
vehicle_style: -0.8772941793392763
year: -0.8778185631882538
engine_hp: -0.7933927635028841
engine_cylinders: -0.8793917147351862
highway_mpg: -0.8730991085474568
city_mpg: -0.8793917147351862

The feature with the smallest difference is: engine_hp.


# Question 6

For this question, we'll see how to use a linear regression model from Scikit-Learn.

We'll need to use the original column price. Apply the logarithmic transformation to this column.

Fit the Ridge regression model on the training data with a solver 'sag'. Set the seed to 42.

This model also has a parameter alpha. Let's try the following values: [0, 0.01, 0.1, 1, 10].

Round your RMSE scores to 3 decimal digits.

Which of these alphas leads to the best RMSE on the validation set?

* 0
* 0.01
* 0.1
* 1
* 10

*Note: If there are multiple options, select the smallest alpha.*

In [43]:
y_train_log = np.log1p(y_train6)
y_val_log = np.log1p(y_val6)
y_test_log = np.log1p(y_test6)

In [41]:
alphas = [0, 0.01, 0.1, 1, 10]

rmse_scores = {}

for alpha in alphas:
    model = Ridge(alpha=alpha, solver='sag', random_state=42)
    model.fit(X_train, y_train_log)
    
    y_pred_val_log = model.predict(X_val)
    rmse = np.sqrt(mean_squared_error(y_val_log, y_pred_val_log))
    rmse = round(rmse, 3)
    rmse_scores[alpha] = rmse

print("RMSE for each alpha:")
for alpha, rmse in rmse_scores.items():
    print(f"Alpha {alpha}: {rmse}")

best_alpha = min(rmse_scores, key=rmse_scores.get)
print("The best alpha is:", best_alpha)



RMSE for each alpha:
Alpha 0: 0.483
Alpha 0.01: 0.483
Alpha 0.1: 0.483
Alpha 1: 0.483
Alpha 10: 0.483
The best alpha is: 0




In [42]:
print("this one was not clear, did we have a lesson with Ridge Regression?")

this one was not clear, did we have a lesson with Ridge Regression?
