In [1]:
import pandas as pd
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder

#modelling
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split, GridSearchCV

#Setup random seed
import numpy as np 
# Seed NumPy's global random generator so later stochastic steps can be repeated.
RANDOM_SEED = 42
np.random.seed(RANDOM_SEED)

# Load the car-sales data and preview the first rows.
DATA_PATH = "car-sales.csv"
df = pd.read_csv(DATA_PATH)
df.head()


Unnamed: 0,Make,Colour,Odometer (KM),Doors,Price
0,Honda,White,35431.0,4.0,15323.0
1,BMW,Blue,192714.0,5.0,19943.0
2,Honda,White,84714.0,4.0,28343.0
3,Toyota,White,154365.0,4.0,13434.0
4,Nissan,Blue,181577.0,3.0,14043.0


In [3]:
# (rows, columns) of the freshly loaded frame, before any rows are removed.
df.shape

(1000, 5)

In [2]:
# Count the missing entries in each column of the raw data.
df.isna().sum()

Make             49
Colour           50
Odometer (KM)    50
Doors            50
Price            50
dtype: int64

In [6]:
# Preview only (nothing is modified here): the shape the frame would have
# if every row with a missing Price were discarded.
has_price = df["Price"].notna()
df.loc[has_price].shape

(950, 5)

In [7]:
# Price is the value the model is trained to predict, so rows without it
# cannot be used; keep only the rows where Price is present.
# Reassign instead of using inplace=True so the frame is not mutated behind
# the back of cells that already displayed it.
df = df.dropna(subset=["Price"])

In [8]:
# Re-check missing values per column: Price should now be 0, while the
# feature columns still contain gaps that the pipeline imputes later.
df.isna().sum()

Make             47
Colour           46
Odometer (KM)    48
Doors            47
Price             0
dtype: int64

In [11]:
# Feature groups and the transformer pipeline applied to each group.

# Text-valued columns: fill gaps with the literal label "missing", then
# one-hot encode. Categories unseen during fit are ignored at transform time.
categorical_features = ["Make", "Colour"]

categorical_imputer = SimpleImputer(strategy="constant", fill_value="missing")
categorical_encoder = OneHotEncoder(handle_unknown="ignore")

categorical_transformer = Pipeline(
    steps=[
        ("imputer", categorical_imputer),
        ("onehot", categorical_encoder),
    ]
)

In [12]:
# Doors: replace missing values with the constant 4.
door_feature = ["Doors"]
door_imputer = SimpleImputer(strategy="constant", fill_value=4)
door_transformer = Pipeline(steps=[("imputer", door_imputer)])

# Odometer: replace missing values with the column mean.
# NOTE: the name `numeric_tranformer` (sic) is kept as-is because the
# ColumnTransformer cell refers to it by this spelling.
numeric_features = ["Odometer (KM)"]
odometer_imputer = SimpleImputer(strategy="mean")
numeric_tranformer = Pipeline(steps=[("imputer", odometer_imputer)])


In [13]:
# Combine the per-group pipelines: each entry is (name, transformer, columns),
# so every feature group has its missing values filled (and the categorical
# group is one-hot encoded) before reaching the model.
column_steps = [
    ("cat", categorical_transformer, categorical_features),
    ("door", door_transformer, door_feature),
    ("num", numeric_tranformer, numeric_features),
]

preprocessor = ColumnTransformer(transformers=column_steps)


In [34]:
# Build the end-to-end pipeline: preprocessing followed by a random forest.
# random_state is pinned on both the estimator and the split (same value as
# the global NumPy seed set at the top of the notebook) so the score does not
# depend on the state of the global RNG, i.e. on which cells ran before this
# one or how many times it has been re-run.
model = Pipeline(steps=[
    ("preprocessor", preprocessor),
    ("model", RandomForestRegressor(random_state=42))])

# Price is the target; every other column is a feature.
X = df.drop(columns="Price")
y = df["Price"]

# Hold out 20% of the rows for evaluation.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42)

# Fit on the training split, then score on the held-out split
# (for a regressor, score() reports R^2).
model.fit(X_train, y_train)
model.score(X_test, y_test)



0.3219386708693074