### Import libraries and load the data

In [16]:
import pandas as pd
import numpy as np

In [17]:
# Load the raw dataset from the CSV file that sits next to this notebook.
data = pd.read_csv(filepath_or_buffer='data.csv')

In [18]:
# Preview the first five rows to check that the file was parsed as expected.
data.head(n=5)

Unnamed: 0,Make,Model,Year,Engine Fuel Type,Engine HP,Engine Cylinders,Transmission Type,Driven_Wheels,Number of Doors,Market Category,Vehicle Size,Vehicle Style,highway MPG,city mpg,Popularity,MSRP
0,BMW,1 Series M,2011,premium unleaded (required),335.0,6.0,MANUAL,rear wheel drive,2.0,"Factory Tuner,Luxury,High-Performance",Compact,Coupe,26,19,3916,46135
1,BMW,1 Series,2011,premium unleaded (required),300.0,6.0,MANUAL,rear wheel drive,2.0,"Luxury,Performance",Compact,Convertible,28,19,3916,40650
2,BMW,1 Series,2011,premium unleaded (required),300.0,6.0,MANUAL,rear wheel drive,2.0,"Luxury,High-Performance",Compact,Coupe,28,20,3916,36350
3,BMW,1 Series,2011,premium unleaded (required),230.0,6.0,MANUAL,rear wheel drive,2.0,"Luxury,Performance",Compact,Coupe,28,18,3916,29450
4,BMW,1 Series,2011,premium unleaded (required),230.0,6.0,MANUAL,rear wheel drive,2.0,Luxury,Compact,Convertible,28,18,3916,34500


### Clean dataframe

In [19]:
# Normalise column labels to snake_case: lower-case, spaces become underscores.
data.columns = [label.lower().replace(' ', '_') for label in data.columns]

In [20]:
# Apply the same snake_case normalisation to the values of every text
# (object-dtype) column so that category labels are consistent.
str_cols = data.columns[data.dtypes == 'object'].tolist()
for col in str_cols:
    # Lower-casing and replacing spaces are independent character-level
    # operations, so their order does not affect the result.
    data[col] = data[col].str.replace(' ', '_').str.lower()

In [21]:
# Inspect the distinct (normalised) fuel-type labels, including missing values.
data.engine_fuel_type.unique()

array(['premium_unleaded_(required)', 'regular_unleaded',
       'premium_unleaded_(recommended)', 'flex-fuel_(unleaded/e85)',
       'diesel', 'electric',
       'flex-fuel_(premium_unleaded_recommended/e85)', 'natural_gas',
       'flex-fuel_(premium_unleaded_required/e85)',
       'flex-fuel_(unleaded/natural_gas)', nan], dtype=object)

In [22]:
# `msrp` is the prediction target (assigned to `y` in a later cell), so rows
# without it are dropped. Reassigning instead of using `inplace=True` avoids
# mutating the frame behind the reader's back and keeps the step chainable;
# `data` ends up holding the same filtered rows.
data = data.dropna(axis=0, subset=['msrp'])

# Count the remaining missing values per column; these are handled later by
# the imputers in the preprocessing pipeline.
data.isnull().sum(axis=0)

make                    0
model                   0
year                    0
engine_fuel_type        3
engine_hp              69
engine_cylinders       30
transmission_type       0
driven_wheels           0
number_of_doors         6
market_category      3742
vehicle_size            0
vehicle_style           0
highway_mpg             0
city_mpg                0
popularity              0
msrp                    0
dtype: int64

In [23]:
# Separate the feature matrix from the target column (`msrp`).
X = data.drop(columns=['msrp'])
y = data['msrp']

### Split the dataset into training and validation sets

In [24]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for validation; the fixed seed makes the split
# reproducible across runs.
X_train_full, X_valid_full, y_train, y_valid = train_test_split(
    X,
    y,
    train_size=0.8,
    random_state=0,
)

In [25]:
# Categorical features: text columns with at most 10 distinct values in the
# training split, so that one-hot encoding stays small.
str_cols = [
    name
    for name, kind in X_train_full.dtypes.items()
    if kind == 'object' and X_train_full[name].nunique() <= 10
]

# Numerical features: every int64 / float64 column.
n_cols = [
    name
    for name, kind in X_train_full.dtypes.items()
    if kind in ('int64', 'float64')
]

# Final feature list, categorical columns first.
cols = [*str_cols, *n_cols]

# Work on copies restricted to the selected features so the full splits stay
# untouched.
X_train = X_train_full.loc[:, cols].copy()
X_valid = X_valid_full.loc[:, cols].copy()

### Preprocessing steps

In [26]:
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder

In [27]:
# Numerical features: fill missing values with the column mean.
numerical_transformer = SimpleImputer(strategy='mean')

# Categorical features: fill missing values with the most frequent label,
# then one-hot encode. `handle_unknown='ignore'` keeps prediction from failing
# on labels that were not present when the encoder was fitted.
categorical_transformer = Pipeline([
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(handle_unknown='ignore')),
])

# Route each group of columns to its own transformer.
preprocessor = ColumnTransformer([
    ('num', numerical_transformer, n_cols),
    ('cat', categorical_transformer, str_cols),
])

### The model

In [28]:
from sklearn.ensemble import RandomForestRegressor

# Random forest of 100 trees; the fixed seed keeps training reproducible.
model = RandomForestRegressor(random_state=0, n_estimators=100)

In [29]:
from sklearn.metrics import mean_absolute_error

# Chain preprocessing and the regressor so that imputation and encoding are
# fitted on the training data only and re-applied automatically at predict time.
clf = Pipeline([
    ('preprocessor', preprocessor),
    ('model', model),
])

# Fit the whole pipeline; the fitted pipeline is the cell's displayed output.
clf.fit(X=X_train, y=y_train)

Pipeline(steps=[('preprocessor',
                 ColumnTransformer(transformers=[('num', SimpleImputer(),
                                                  ['year', 'engine_hp',
                                                   'engine_cylinders',
                                                   'number_of_doors',
                                                   'highway_mpg', 'city_mpg',
                                                   'popularity']),
                                                 ('cat',
                                                  Pipeline(steps=[('imputer',
                                                                   SimpleImputer(strategy='most_frequent')),
                                                                  ('onehot',
                                                                   OneHotEncoder(handle_unknown='ignore'))]),
                                                  ['engine_fuel_type',
                                 

In [30]:
# Predict on the held-out validation rows.
preds = clf.predict(X_valid)

# Report the mean absolute error between the true and predicted `msrp` values.
print('MAE: ', mean_absolute_error(y_true=y_valid, y_pred=preds))

MAE:  3679.4486441891618
