In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import OneHotEncoder
from sklearn.metrics import accuracy_score

%matplotlib inline

In [2]:
data = 'https://raw.githubusercontent.com/alexeygrigorev/mlbookcamp-code/refs/heads/master/chapter-02-car-price/data.csv'

In [3]:
import os
if not os.path.exists('data.csv'):
    !wget $data
!ls

WA_Fn-UseC_-Telco-Customer-Churn.csv  churn-prediction.ipynb
car_price_new.ipynb		      data.csv


In [4]:
df = pd.read_csv('./data.csv')
df.head()

Unnamed: 0,Make,Model,Year,Engine Fuel Type,Engine HP,Engine Cylinders,Transmission Type,Driven_Wheels,Number of Doors,Market Category,Vehicle Size,Vehicle Style,highway MPG,city mpg,Popularity,MSRP
0,BMW,1 Series M,2011,premium unleaded (required),335.0,6.0,MANUAL,rear wheel drive,2.0,"Factory Tuner,Luxury,High-Performance",Compact,Coupe,26,19,3916,46135
1,BMW,1 Series,2011,premium unleaded (required),300.0,6.0,MANUAL,rear wheel drive,2.0,"Luxury,Performance",Compact,Convertible,28,19,3916,40650
2,BMW,1 Series,2011,premium unleaded (required),300.0,6.0,MANUAL,rear wheel drive,2.0,"Luxury,High-Performance",Compact,Coupe,28,20,3916,36350
3,BMW,1 Series,2011,premium unleaded (required),230.0,6.0,MANUAL,rear wheel drive,2.0,"Luxury,Performance",Compact,Coupe,28,18,3916,29450
4,BMW,1 Series,2011,premium unleaded (required),230.0,6.0,MANUAL,rear wheel drive,2.0,Luxury,Compact,Convertible,28,18,3916,34500


In [5]:
df.dtypes

Make                  object
Model                 object
Year                   int64
Engine Fuel Type      object
Engine HP            float64
Engine Cylinders     float64
Transmission Type     object
Driven_Wheels         object
Number of Doors      float64
Market Category       object
Vehicle Size          object
Vehicle Style         object
highway MPG            int64
city mpg               int64
Popularity             int64
MSRP                   int64
dtype: object

In [6]:
df.columns = df.columns.str.lower().str.replace(' ', '_')
categorical = list(df.select_dtypes(include=['object']).columns)
categorical

['make',
 'model',
 'engine_fuel_type',
 'transmission_type',
 'driven_wheels',
 'market_category',
 'vehicle_size',
 'vehicle_style']

In [7]:
numerical = list(df.select_dtypes(include=['int64', 'float64']).columns)[:-1]
numerical

['year',
 'engine_hp',
 'engine_cylinders',
 'number_of_doors',
 'highway_mpg',
 'city_mpg',
 'popularity']

In [8]:
for col in categorical:
    df[col] = df[col].str.lower().str.replace(' ', '_')

In [9]:
df.head()

Unnamed: 0,make,model,year,engine_fuel_type,engine_hp,engine_cylinders,transmission_type,driven_wheels,number_of_doors,market_category,vehicle_size,vehicle_style,highway_mpg,city_mpg,popularity,msrp
0,bmw,1_series_m,2011,premium_unleaded_(required),335.0,6.0,manual,rear_wheel_drive,2.0,"factory_tuner,luxury,high-performance",compact,coupe,26,19,3916,46135
1,bmw,1_series,2011,premium_unleaded_(required),300.0,6.0,manual,rear_wheel_drive,2.0,"luxury,performance",compact,convertible,28,19,3916,40650
2,bmw,1_series,2011,premium_unleaded_(required),300.0,6.0,manual,rear_wheel_drive,2.0,"luxury,high-performance",compact,coupe,28,20,3916,36350
3,bmw,1_series,2011,premium_unleaded_(required),230.0,6.0,manual,rear_wheel_drive,2.0,"luxury,performance",compact,coupe,28,18,3916,29450
4,bmw,1_series,2011,premium_unleaded_(required),230.0,6.0,manual,rear_wheel_drive,2.0,luxury,compact,convertible,28,18,3916,34500


In [10]:
from sklearn.model_selection import train_test_split

df_full_train, df_test = train_test_split(df, test_size=0.2, random_state=1)

In [11]:
df_train, df_val = train_test_split(df_full_train, test_size=0.25, random_state=1)
len(df_train),len(df_val), len(df_test)

(7148, 2383, 2383)

In [12]:
df_train = df_train.reset_index(drop=True)
df_test = df_test.reset_index(drop=True)
df_val = df_val.reset_index(drop=True)

y_train = np.log1p(df_train.msrp.values)
y_test = np.log1p(df_test.msrp.values)
y_val = np.log1p(df_val.msrp.values)

In [13]:
scaler = StandardScaler()
# StandardScaler?
# MinMaxScaler?

In [14]:
X_train_num = scaler.fit_transform(df_train[numerical].fillna(0))
X_train_num

array([[ 0.74687366,  0.28819104,  0.20885012, ...,  0.15277794,
         0.80060489,  0.31997739],
       [ 0.48514933,  0.46801719,  0.20885012, ...,  0.36874881,
        -0.18643089, -0.12582841],
       [ 0.74687366,  0.09937357, -0.89452064, ...,  0.0447925 ,
        -0.07676024, -0.97465369],
       ...,
       [ 0.87773583, -1.06050513, -0.89452064, ...,  1.12464685,
         1.3489581 ,  0.4379848 ],
       [-2.5246805 , -0.77278328, -0.34283526, ..., -0.71110554,
        -0.40577217,  1.06114678],
       [-0.9543345 , -0.07146127,  0.20885012, ..., -0.92707641,
        -0.62511345, -0.95050013]])

In [15]:
ohe = OneHotEncoder(sparse_output=False, handle_unknown='ignore')
# OneHotEncoder?

In [16]:
X_train_cat = ohe.fit_transform(df_train[categorical].fillna('unknown'))
X_train_cat

array([[0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 1., 0.],
       [0., 0., 0., ..., 0., 1., 0.],
       ...,
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 1., 0.],
       [0., 0., 0., ..., 0., 0., 0.]])

In [17]:
X_train = np.column_stack([X_train_cat, X_train_num])

In [18]:
X_val_num = scaler.transform(df_val[numerical].fillna(0))
X_val_cat = ohe.transform(df_val[categorical].fillna('unknown'))
X_val = np.column_stack([X_val_num, X_val_cat])

In [19]:
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import Ridge

In [28]:
def rmse(y, y_pred):
    se = (y - y_pred) ** 2
    mse = se.mean()
    return np.sqrt(mse)

In [21]:
model = LinearRegression()
model.fit(X_train, y_train)

0,1,2
,fit_intercept,True
,copy_X,True
,tol,1e-06
,n_jobs,
,positive,False


In [22]:
display(model.coef_)
display(model.intercept_)

array([-0.16046413,  0.30627226,  0.22958128, ...,  0.00156877,
       -0.0493167 , -0.12822999])

np.float64(10.55178645517414)

In [23]:
y_pred = model.predict(X_val)
y_pred

array([13.62165868, 10.76516996,  8.03515428, ...,  7.35770665,
       12.24120875, 11.48151605])

In [24]:
rmse(y_val, y_pred)

np.float64(2.1603901566820767)

In [29]:
for r in [0.0000001, 0.00001, 0.0001, 0.001, 0.01, 0.1, 0, 1, 10]:
    model_reg = Ridge(alpha=r, random_state=1)
    model_reg.fit(X_train, y_train)
 
    y_pred = model_reg.predict(X_val)
 
    score = rmse(y_val, y_pred)
     
    print("reg parameter: ",r, "bias term: ",model_reg.intercept_, "rmse: ",score)

reg parameter:  1e-07 bias term:  10.551798313003891 rmse:  2.1603890133692256
reg parameter:  1e-05 bias term:  10.551789250706173 rmse:  2.160162806334615
reg parameter:  0.0001 bias term:  10.55181411948901 rmse:  2.1581263689695884
reg parameter:  0.001 bias term:  10.552138787253604 rmse:  2.1386896438262037
reg parameter:  0.01 bias term:  10.558949291459674 rmse:  2.006909309562738
reg parameter:  0.1 bias term:  10.610267920919583 rmse:  1.7205150609907158
reg parameter:  0 bias term:  2898548206313.519 rmse:  41142572269011.73
reg parameter:  1 bias term:  10.664258374787366 rmse:  1.5006036861388872
reg parameter:  10 bias term:  10.54085848645176 rmse:  1.3351128017668996


In [30]:
y_val

array([10.67570004,  8.72842609, 10.50495794, ..., 10.05406041,
       11.5337178 , 10.19283074])

In [31]:
y_pred

array([10.41238873, 10.89251626,  9.87258761, ..., 10.53592152,
       11.12281093, 11.10343224])