In [1]:
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.feature_extraction import DictVectorizer

from sklearn.metrics import mean_squared_error

from sklearn.linear_model import Ridge


In [2]:
# Load the dataset from disk.
car_prices_df = pd.read_csv("data.csv")

# Raw (as-in-file) names of the feature columns and of the prediction target.
cols = [
    'Make', 'Model', 'Year', 'Engine HP', 'Engine Cylinders',
    'Transmission Type', 'Vehicle Style', 'highway MPG', 'city mpg',
]
target = ['MSRP']

# Work on an independent frame restricted to the columns of interest, then
# normalise the headers to snake_case (spaces -> underscores, lower-cased),
# e.g. 'Engine HP' -> 'engine_hp'.
car_prices_df2 = car_prices_df[cols + target].copy()
car_prices_df2.columns = car_prices_df2.columns.str.replace(" ", "_").str.lower()

# From here on `cols` holds the normalised feature names (same rule as above).
cols = [name.replace(" ", "_").lower() for name in cols]

# Missing feature values are replaced by 0; the target column is not touched.
car_prices_df2[cols] = car_prices_df2[cols].fillna(0)

In [3]:
# Two-stage split: hold out 20% of the rows for testing, then take 25% of the
# remaining 80% (i.e. roughly 20% of the total) as the validation set.
# Both stages share one seed so the partition is reproducible.
split_seed = 42

full_train_df, test_df = train_test_split(
    car_prices_df2,
    test_size=0.2,
    random_state=split_seed,
)
train_df, val_df = train_test_split(
    full_train_df,
    test_size=0.25,
    random_state=split_seed,
)

In [4]:
# Move the target out of each split: `pop` returns the column and removes it
# from the frame in one step. NOTE: this mutates train_df / val_df, so the
# cell cannot be re-run without first re-running the split above.
y_train = train_df.pop("msrp")
y_val = val_df.pop("msrp")

# Feature columns grouped by how the vectoriser treats them.
categorical_cols = [
    "make",
    "model",
    "transmission_type",
    "vehicle_style",
]
numerical_cols = [
    "year",
    "engine_hp",
    "engine_cylinders",
    "highway_mpg",
    "city_mpg",
]

# One-hot encoding
# Convert each row into a {column: value} record, the input format expected
# by DictVectorizer.
feature_cols = categorical_cols + numerical_cols
train_dict = train_df[feature_cols].to_dict(orient='records')
val_dict = val_df[feature_cols].to_dict(orient='records')

# The feature mapping is learned from the training records only; the
# validation records are encoded with that same, already-fitted mapping.
dv = DictVectorizer(sparse=False)
X_train = dv.fit_transform(train_dict)
X_val = dv.transform(val_dict)

# Model log(1 + price) instead of the raw price; the validation RMSE computed
# later from y_val_log is therefore measured on this log scale as well.
y_train_log, y_val_log = np.log1p(y_train), np.log1p(y_val)

In [5]:
# Training Ridge regression models, one per regularisation strength (alpha)
for alpha in (0, 0.01, 0.1, 1, 10):
    # Fit a Ridge model with this regularisation strength on the
    # log-transformed training target (fit() returns the estimator itself).
    model = Ridge(alpha=alpha, solver="sag", random_state=42).fit(X_train, y_train_log)

    # Score on the validation split; both arguments are on the log1p scale.
    y_pred = model.predict(X_val)
    rmse = np.sqrt(mean_squared_error(y_val_log, y_pred))

    # One result line followed by a blank separator line.
    print(f"{alpha}: rmse = {rmse}", end="\n\n")

    