FINAL MACHINE LEARNING PIPELINE


This pipeline features:
- open-source classes
- an in-house package class
- only the selected features
- scoring of new data

# Reproducibility: Setting the seed

To ensure reproducibility between runs of the same notebook, and also between the research and production environments, it is extremely important that we **set the seed** for every step that involves some element of randomness.

In [1]:
#data manipulation library
import pandas as pd
import numpy as np 
import matplotlib.pyplot as plt 

#for saving the pipeline
import joblib

#packages from scikit learn
from sklearn.ensemble import AdaBoostRegressor as abr
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import MinMaxScaler, Binarizer


#packages from feature-engine
from feature_engine.imputation import (
    AddMissingIndicator,
    MeanMedianImputer,
    CategoricalImputer,
)

from feature_engine.encoding import (
    RareLabelEncoder,
    OrdinalEncoder,
)

from feature_engine.transformation import (
    YeoJohnsonTransformer,
)

from feature_engine.selection import DropFeatures
from feature_engine.wrappers import SklearnTransformerWrapper


import preprocessors as pp

In [2]:
# load the raw dataset from the CSV file in the working directory
df = pd.read_csv('data.csv')

# report the number of rows and columns
print(df.shape)

# preview the first rows of the data (rendered as the cell output)
df.head()

(11914, 16)


Unnamed: 0,Make,Model,Year,Engine Fuel Type,Engine HP,Engine Cylinders,Transmission Type,Driven_Wheels,Number of Doors,Market Category,Vehicle Size,Vehicle Style,highway MPG,city mpg,Popularity,MSRP
0,BMW,1 Series M,2011,premium unleaded (required),335.0,6.0,MANUAL,rear wheel drive,2.0,"Factory Tuner,Luxury,High-Performance",Compact,Coupe,26,19,3916,46135
1,BMW,1 Series,2011,premium unleaded (required),300.0,6.0,MANUAL,rear wheel drive,2.0,"Luxury,Performance",Compact,Convertible,28,19,3916,40650
2,BMW,1 Series,2011,premium unleaded (required),300.0,6.0,MANUAL,rear wheel drive,2.0,"Luxury,High-Performance",Compact,Coupe,28,20,3916,36350
3,BMW,1 Series,2011,premium unleaded (required),230.0,6.0,MANUAL,rear wheel drive,2.0,"Luxury,Performance",Compact,Coupe,28,18,3916,29450
4,BMW,1 Series,2011,premium unleaded (required),230.0,6.0,MANUAL,rear wheel drive,2.0,Luxury,Compact,Convertible,28,18,3916,34500


In [16]:
# NOTE(review): this cell carries execution count 16 while its neighbours are 2 and 3,
# i.e. it was run out of order. The captured output shows the lower-cased, underscored
# column names and the object dtype of number_of_doors / engine_cylinders, both of which
# are only produced by later cells - re-run top to bottom before trusting it.
df.dtypes

make                  object
model                 object
year                   int64
engine_fuel_type      object
engine_hp            float64
engine_cylinders      object
transmission_type     object
driven_wheels         object
number_of_doors       object
market_category       object
vehicle_size          object
vehicle_style         object
highway_mpg            int64
city_mpg               int64
popularity             int64
msrp                   int64
dtype: object

In [3]:
# normalise the column titles: lower-case first, then swap every space for an underscore
lowered_titles = df.columns.str.lower()
df.columns = lowered_titles.str.replace(' ', '_')

In [4]:
# Collect the names of all object-typed columns; these are the categorical
# variables that get cast / cleaned in the following steps.
cat_var = list(filter(lambda name: df[name].dtype == 'object', df.columns))

In [5]:
# normalise the values of every categorical column: lower-case, spaces -> underscores

for col in cat_var:
    try:
        if df[col].dtype != 'object':
            # non-string column: nothing to normalise, leave it untouched
            continue
        lowered_values = df[col].str.lower()
        df[col] = lowered_values.str.replace(' ', '_')
    except TypeError:
        print('wrong data type')

In [6]:
# replace na or nan with 0 in the number_of_doors variable
df['number_of_doors'] = df['number_of_doors'].fillna(0)


# number_of_doors and engine_cylinders are conceptually categorical, so round them to
# whole numbers and store them as object.
# The rounding is done element-wise on the whole column. The previous row loop assigned
# the *scalar* round(column[i]) to the entire column, which overwrote every row with the
# first row's value.
# pd.to_numeric keeps the cell re-runnable once the columns have already been cast to object.
for col in ['number_of_doors', 'engine_cylinders']:
    df[col] = pd.to_numeric(df[col]).round().astype('object')

# add the new categorical variables to the list cat_var (without duplicating them on re-run)
cat_var = cat_var + [
    name for name in ['number_of_doors', 'engine_cylinders'] if name not in cat_var
]

# Separate data into train and test

In [7]:
# predictive features: every column except the target
predictors = df.drop('msrp', axis=1)
# target: the price column
target = df['msrp']

# hold out 30% of the rows for testing; the fixed random_state keeps the split reproducible
x_train, x_test, y_train, y_test = train_test_split(
    predictors,
    target,
    test_size=.3,
    random_state=1,
)

x_train.shape, x_test.shape

((8339, 15), (3575, 15))

# Target

In [8]:
# model the target on the log(1 + x) scale, for both the train and the test split
y_train, y_test = np.log1p(y_train), np.log1p(y_test)

# Configuration

In [9]:

# --- selected variables fed to the pipeline ---
# 'year' is kept because it is needed for computing the car age
FEATURES = ['engine_fuel_type', 'engine_hp', 'year']

# --- imputation ---
CATEGORICAL_VARS_WITH_NA_FREQUENT = 'engine_fuel_type'
NUMERICAL_VARS_WITH_NA = 'engine_hp'

# --- temporal handling: variable used to derive the car age, and the one dropped afterwards ---
TEMPORAL_VAR = 'year'
DROPPED_VARS = 'year'

# --- transformation / encoding ---
# NOTE(review): NUMERICAL_YEO_VARS is not referenced by the pipeline cells shown in this notebook
NUMERICAL_YEO_VARS = ['highway_mpg', 'city_mpg']
CATEGORICAL_VARS = 'engine_fuel_type'

In [10]:
# keep only the selected features in both splits
x_train, x_test = (split[FEATURES] for split in (x_train, x_test))

x_train.shape, x_test.shape

((8339, 3), (3575, 3))

In [11]:
# inspect the dtypes of the selected features before they enter the pipeline
x_train.dtypes

engine_fuel_type     object
engine_hp           float64
year                  int64
dtype: object

# PIPELINE

In [13]:
msrp_pipeline = Pipeline(steps=[
    # ---- imputation ----
    # fill missing categories with the most frequent one
    (
        'frequent_imputation',
        CategoricalImputer(
            imputation_method='frequent',
            variables=CATEGORICAL_VARS_WITH_NA_FREQUENT,
        ),
    ),
    # flag missing numerical values before they are imputed
    (
        'missing_indicator',
        AddMissingIndicator(variables=NUMERICAL_VARS_WITH_NA),
    ),
    # replace missing numerical values with the median
    (
        'median_imputation',
        MeanMedianImputer(
            imputation_method='median',
            variables=NUMERICAL_VARS_WITH_NA,
        ),
    ),

    # ---- temporal variable: in-house transformer deriving the car age ----
    (
        'elapsed_time',
        pp.CarAge(variables=TEMPORAL_VAR),
    ),
    (
        'drop_features',
        DropFeatures(features_to_drop=DROPPED_VARS),
    ),

    # ---- categorical encoding ----
    # group infrequent labels (below 1% of observations) together
    (
        'rare_label_encoder',
        RareLabelEncoder(
            tol=0.01,
            n_categories=1,
            variables=CATEGORICAL_VARS,
        ),
    ),
    # map categories to integers ordered by the target mean
    (
        'categorical_encoder',
        OrdinalEncoder(
            encoding_method='ordered',
            variables=CATEGORICAL_VARS,
        ),
    ),

    # ---- scaling of all features ----
    (
        'scalar',
        MinMaxScaler(),
    ),

    # ---- final estimator ----
    (
        'abr',
        abr(n_estimators=100, random_state=1),
    ),
])

In [14]:
# train the pipeline: fits every preprocessing step and the final AdaBoost regressor
# on the training split (y_train is on the log1p scale at this point)
msrp_pipeline.fit(x_train, y_train)

Pipeline(steps=[('frequent_imputation',
                 CategoricalImputer(imputation_method='frequent',
                                    variables='engine_fuel_type')),
                ('missing_indicator',
                 AddMissingIndicator(variables='engine_hp')),
                ('median_imputation', MeanMedianImputer(variables='engine_hp')),
                ('elapsed_time', CarAge(variables='year')),
                ('drop_features', DropFeatures(features_to_drop='year')),
                ('rare_label_encoder',
                 RareLabelEncoder(n_categories=1, tol=0.01,
                                  variables='engine_fuel_type')),
                ('categorical_encoder',
                 OrdinalEncoder(variables='engine_fuel_type')),
                ('scalar', MinMaxScaler()),
                ('abr', AdaBoostRegressor(n_estimators=100, random_state=1))])

In [15]:
# evaluate the model:
# ====================

def _report_metrics(label, y_true_log, pred_log):
    """Print mse, rmse and r2 for one data split, on the original price scale.

    Parameters
    ----------
    label : str
        Prefix of the printed lines, e.g. 'train' or 'test'.
    y_true_log : array-like
        True target values on the np.log1p scale.
    pred_log : array-like
        Pipeline predictions on the same np.log1p scale.
    """
    # np.expm1 is the exact inverse of the np.log1p applied to the target;
    # np.exp would leave every back-transformed price off by one.
    y_true = np.expm1(y_true_log)
    y_pred = np.expm1(pred_log)

    mse = mean_squared_error(y_true, y_pred)
    print('{} mse: {}'.format(label, int(mse)))
    # take the root explicitly: the `squared=False` keyword is deprecated in
    # newer scikit-learn releases
    print('{} rmse: {}'.format(label, int(np.sqrt(mse))))
    print('{} r2: {}'.format(label, r2_score(y_true, y_pred)))
    print()


# make predictions for train set and determine mse, rmse and r2
pred = msrp_pipeline.predict(x_train)
_report_metrics('train', y_train, pred)

# make predictions for test set and determine mse, rmse and r2
pred = msrp_pipeline.predict(x_test)
_report_metrics('test', y_test, pred)

# reference point for the error magnitudes (this is the median, not the mean)
print('Median car price: ', int(np.expm1(y_train).median()))

train mse: 1170387686
train rmse: 34210
train r2: 0.6772432502215824

test mse: 1255953782
test rmse: 35439
test r2: 0.6493202628055048

Average car price:  29905
