In [48]:
import pandas as pd
import numpy as np

import pickle
from sklearn.metrics import r2_score

# Source location of the automobile data set. The file carries no header row,
# so the column names are supplied here in file order.
DATA_URL = "https://archive.ics.uci.edu/ml/machine-learning-databases/autos/imports-85.data"
COLUMN_NAMES = [
    "symboling", "normalized-losses", "make", "fuel-type", "aspiration",
    "num-of-doors", "body-style", "drive-wheels", "engine-location",
    "wheel-base", "length", "width", "height", "curb-weight",
    "engine-type", "num-of-cylinders", "engine-size", "fuel-system",
    "bore", "stroke", "compression-ratio", "horsepower", "peak-rpm",
    "city-mpg", "highway-mpg", "price",
]

# Read the raw table; every row of the file is data (header=None).
df = pd.read_csv(DATA_URL, header=None, names=COLUMN_NAMES)

In [49]:
# Show the freshly loaded frame as this cell's output for a quick visual check.
df

Unnamed: 0,symboling,normalized-losses,make,fuel-type,aspiration,num-of-doors,body-style,drive-wheels,engine-location,wheel-base,...,engine-size,fuel-system,bore,stroke,compression-ratio,horsepower,peak-rpm,city-mpg,highway-mpg,price
0,3,?,alfa-romero,gas,std,two,convertible,rwd,front,88.6,...,130,mpfi,3.47,2.68,9.0,111,5000,21,27,13495
1,3,?,alfa-romero,gas,std,two,convertible,rwd,front,88.6,...,130,mpfi,3.47,2.68,9.0,111,5000,21,27,16500
2,1,?,alfa-romero,gas,std,two,hatchback,rwd,front,94.5,...,152,mpfi,2.68,3.47,9.0,154,5000,19,26,16500
3,2,164,audi,gas,std,four,sedan,fwd,front,99.8,...,109,mpfi,3.19,3.40,10.0,102,5500,24,30,13950
4,2,164,audi,gas,std,four,sedan,4wd,front,99.4,...,136,mpfi,3.19,3.40,8.0,115,5500,18,22,17450
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
200,-1,95,volvo,gas,std,four,sedan,rwd,front,109.1,...,141,mpfi,3.78,3.15,9.5,114,5400,23,28,16845
201,-1,95,volvo,gas,turbo,four,sedan,rwd,front,109.1,...,141,mpfi,3.78,3.15,8.7,160,5300,19,25,19045
202,-1,95,volvo,gas,std,four,sedan,rwd,front,109.1,...,173,mpfi,3.58,2.87,8.8,134,5500,18,23,21485
203,-1,95,volvo,diesel,turbo,four,sedan,rwd,front,109.1,...,145,idi,3.01,3.40,23.0,106,4800,26,27,22470


In [50]:
# Clean data
# The raw file marks missing entries with "?"; turn them into real NaNs so
# pandas (and the later dropna) can recognise them.
df = df.replace("?", np.nan)

# Columns that held "?" may have been parsed as strings. Convert them to real
# numbers here instead of relying on downstream estimators/metrics to coerce
# strings implicitly. pd.to_numeric leaves already-numeric columns unchanged
# and raises on any unexpected non-numeric token.
numeric_columns = [
    "normalized-losses",
    "bore",
    "stroke",
    "horsepower",
    "peak-rpm",
    "price",
]
df[numeric_columns] = df[numeric_columns].apply(pd.to_numeric)

# Convert categorical variables to numerical
# (spelled-out counts -> numbers). Wrapping in pd.to_numeric guarantees a
# numeric dtype regardless of whether `replace` downcasts the object column.
door_counts = {"four": 4, "two": 2}
cylinder_counts = {
    "four": 4,
    "six": 6,
    "five": 5,
    "eight": 8,
    "two": 2,
    "twelve": 12,
    "three": 3,
}
df["num-of-doors"] = pd.to_numeric(df["num-of-doors"].replace(door_counts))
df["num-of-cylinders"] = pd.to_numeric(
    df["num-of-cylinders"].replace(cylinder_counts)
)

In [51]:
from sklearn.preprocessing import OneHotEncoder

# One-hot encode the nominal categorical columns and swap them for their
# indicator columns in `df`.
encoder = OneHotEncoder(handle_unknown="ignore")
encoded_columns = [
    "fuel-type",
    "aspiration",
    "body-style",
    "drive-wheels",
    "engine-location",
    "engine-type",
    "fuel-system",
]
encoded_values = encoder.fit_transform(df[encoded_columns]).toarray()

# `get_feature_names` was removed from scikit-learn in favour of
# `get_feature_names_out`; prefer the new method and fall back only on
# older installations that do not have it yet.
if hasattr(encoder, "get_feature_names_out"):
    encoded_names = encoder.get_feature_names_out(encoded_columns)
else:
    encoded_names = encoder.get_feature_names(encoded_columns)

# Reuse df's index so the column-wise concat aligns row-for-row even if the
# frame no longer has a default 0..n-1 index.
df_encoded = pd.DataFrame(encoded_values, columns=encoded_names, index=df.index)

# Non-mutating drop: build the new frame in one step instead of editing `df`
# in place.
df = pd.concat([df.drop(columns=encoded_columns), df_encoded], axis=1)



In [52]:
# Discard every row that still has a NaN in any column.
df = df.dropna(axis=0, how="any")

In [53]:
from sklearn.model_selection import train_test_split

# Target is the price; features are everything else except the `make` column.
y = df["price"]
X = df.drop(columns=["price", "make"])

# Hold out 30% of the rows for evaluation, with a fixed seed for the shuffle.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=42,
)

In [54]:
from sklearn.preprocessing import StandardScaler

# Learn the scaling statistics from the training split only, then apply the
# same fitted transform to both splits.
scaler = StandardScaler().fit(X_train)
X_train = scaler.transform(X_train)
X_test = scaler.transform(X_test)

In [55]:
from sklearn.svm import SVR

# Support-vector regression: fit on the scaled training split and report R^2
# on the held-out split.
params = dict(C=10, gamma="scale", kernel="linear")

svr = SVR(**params).fit(X_train, y_train)
y_pred_svr = svr.predict(X_test)
svr_r2 = r2_score(y_test, y_pred_svr)
print("R-squared score for SVR: ", svr_r2)

R-squared score for SVR:  0.7023101516976613


In [56]:
from sklearn.neural_network import MLPRegressor

# Multi-layer perceptron regressor: fit on the scaled training split and
# report R^2 on the held-out split.
params = {
    "activation": "identity",
    "alpha": 0.001,
    "hidden_layer_sizes": (100,),
    "solver": "lbfgs",
    # Weight initialisation is random; pin the seed so the fitted model and
    # the reported score are reproducible from run to run, like the seeded
    # train/test split and random forest in this notebook.
    "random_state": 42,
}

nn = MLPRegressor(**params)
nn.fit(X_train, y_train)
y_pred_nn = nn.predict(X_test)
print("R-squared score for Neural network: ", r2_score(y_test, y_pred_nn))

R-squared score for Neural network:  0.7345643452890137


In [98]:
from sklearn.ensemble import RandomForestRegressor

# Random forest regressor: fit on the scaled training split and report R^2 on
# the held-out split. The seed keeps the forest reproducible.
params = dict(
    max_depth=None,
    min_samples_leaf=1,
    min_samples_split=2,
    n_estimators=100,
    random_state=256,
)

rf = RandomForestRegressor(**params).fit(X_train, y_train)
y_pred_rf = rf.predict(X_test)
rf_r2 = r2_score(y_test, y_pred_rf)

print("R-squared score for Random Forest: ", rf_r2)

R-squared score for Random Forest:  0.8812884411684906


In [58]:
# List the feature names of X (the frame built before scaling) as this cell's output.
X.columns

Index(['symboling', 'normalized-losses', 'num-of-doors', 'wheel-base',
       'length', 'width', 'height', 'curb-weight', 'num-of-cylinders',
       'engine-size', 'bore', 'stroke', 'compression-ratio', 'horsepower',
       'peak-rpm', 'city-mpg', 'highway-mpg', 'fuel-type_diesel',
       'fuel-type_gas', 'aspiration_std', 'aspiration_turbo',
       'body-style_convertible', 'body-style_hardtop', 'body-style_hatchback',
       'body-style_sedan', 'body-style_wagon', 'drive-wheels_4wd',
       'drive-wheels_fwd', 'drive-wheels_rwd', 'engine-location_front',
       'engine-location_rear', 'engine-type_dohc', 'engine-type_dohcv',
       'engine-type_l', 'engine-type_ohc', 'engine-type_ohcf',
       'engine-type_ohcv', 'engine-type_rotor', 'fuel-system_1bbl',
       'fuel-system_2bbl', 'fuel-system_4bbl', 'fuel-system_idi',
       'fuel-system_mfi', 'fuel-system_mpfi', 'fuel-system_spdi',
       'fuel-system_spfi'],
      dtype='object')