## 7. Put it all together

In [2]:
data=pd.read_csv("car-sales-extended-missing-data.csv")
data

Unnamed: 0,Make,Colour,Odometer (KM),Doors,Price
0,Honda,White,35431.0,4.0,15323.0
1,BMW,Blue,192714.0,5.0,19943.0
2,Honda,White,84714.0,4.0,28343.0
3,Toyota,White,154365.0,4.0,13434.0
4,Nissan,Blue,181577.0,3.0,14043.0
...,...,...,...,...,...
995,Toyota,Black,35820.0,4.0,32042.0
996,,White,155144.0,3.0,5716.0
997,Nissan,Blue,66604.0,4.0,31570.0
998,Honda,White,215883.0,4.0,4001.0


In [3]:
data.dtypes

Make              object
Colour            object
Odometer (KM)    float64
Doors            float64
Price            float64
dtype: object

In [4]:
data.isna().sum()

Make             49
Colour           50
Odometer (KM)    50
Doors            50
Price            50
dtype: int64

Steps we want to do (All in one cell)
1. Fill missing data
2. convert data to numbers
3. build a model on the data

In [5]:
import numpy as np
import pandas as pd

# Getting data ready
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder


#Modeling 
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split, GridSearchCV

# Setup random seed
import numpy as np
np.random.seed(42)

#Import Data and drop rows with missing labels

data=pd.read_csv("car-sales-extended-missing-data.csv")
data.dropna(subset=["Price"], inplace=True)

# Define differrent features and transformer pipeline
categorical_feature=["Make","Colour"]
categorical_transformer=Pipeline(steps=[
    ("imputer",SimpleImputer(strategy="constant",fill_value="missing")),
    ("onehot",OneHotEncoder(handle_unknown="ignore"))])

door_feature=["Doors"]
door_transformer=Pipeline(steps=[
    ("imputer",SimpleImputer(strategy="constant",fill_value=4)),
])

numeric_features=["Odometer (KM)"]
nuneric_transformer=Pipeline(steps=[
    ("imputer",SimpleImputer(strategy="mean"))
    ])


##setup preprocessing steps (fill missing values, then convert to number)

preprocessor=ColumnTransformer(
                transformers=[
                    ("cat",categorical_transformer,categorical_feature),
                    ("doors",door_transformer,door_feature),
                    ("num",nuneric_transformer,numeric_features),
                ])

# Creating a preprocessing and modelling pipeline
model=Pipeline(steps=[("preprocessor",preprocessor),
                      ("mode",RandomForestRegressor())
                      ])

# Split data

x=data.drop("Price",axis=1)
y=data["Price"]

x_train,x_test,y_train,y_test=train_test_split(x,y,test_size=0.2)

#Fit and score the model
model.fit(x_train,y_train)
model.score(x_test,y_test)


0.22188417408787875

In [6]:
preprocessor

# It is also possible to use 'GridsearchCV' with out Pipeline

In [10]:
# Use gridsearchCV with our regression Pipeline
from sklearn.model_selection import GridSearchCV
pipe_grid={
    "preprocessor__num__imputer__strategy":["mean","median"],
    "model__n_estimators":[100,1000],
    "model__max_depth":[None,5],
    "model__max_feature":["auto"],
    "model__min_samples_split":[2,4]

}

gs_model=GridSearchCV(model,pipe_grid,cv=5,verbose=2)
gs_model.fit(x_test,y_test)

Fitting 5 folds for each of 16 candidates, totalling 80 fits


ValueError: Invalid parameter 'model' for estimator Pipeline(steps=[('preprocessor',
                 ColumnTransformer(transformers=[('cat',
                                                  Pipeline(steps=[('imputer',
                                                                   SimpleImputer(fill_value='missing',
                                                                                 strategy='constant')),
                                                                  ('onehot',
                                                                   OneHotEncoder(handle_unknown='ignore'))]),
                                                  ['Make', 'Colour']),
                                                 ('doors',
                                                  Pipeline(steps=[('imputer',
                                                                   SimpleImputer(fill_value=4,
                                                                                 strategy='constant'))]),
                                                  ['Doors']),
                                                 ('num',
                                                  Pipeline(steps=[('imputer',
                                                                   SimpleImputer())]),
                                                  ['Odometer (KM)'])])),
                ('mode', RandomForestRegressor())]). Valid parameters are: ['memory', 'steps', 'verbose'].