FEATURE ENGINEERING USING BOTH OPEN SOURCE AND DEVELOPED IN-HOUSE SOFTWARE CLASSES 

In [1]:
#Importing our data manipulation packages
import pandas as pd
import numpy as np 
import matplotlib.pyplot as plt


#for saving the pipeline 
import joblib

#useful packages from scikit learn
from sklearn import ensemble
import xgboost as xgb
from sklearn.feature_selection import SelectFromModel
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import MinMaxScaler, Binarizer

#from feature engine
from feature_engine.imputation import (
    AddMissingIndicator,
    MeanMedianImputer,
    CategoricalImputer,
)

from feature_engine.encoding import (
    RareLabelEncoder,
    OrdinalEncoder,
)

from feature_engine.transformation import (
    YeoJohnsonTransformer,
)

from feature_engine.selection import DropFeatures
from feature_engine.wrappers import SklearnTransformerWrapper


import preprocessors as pp


In [2]:
#loading dataset
df = pd.read_csv('data.csv')

#retrieve number of rows and colums
print(df.shape)

#visualizing sample shot of our data
df.head()

(11914, 16)


Unnamed: 0,Make,Model,Year,Engine Fuel Type,Engine HP,Engine Cylinders,Transmission Type,Driven_Wheels,Number of Doors,Market Category,Vehicle Size,Vehicle Style,highway MPG,city mpg,Popularity,MSRP
0,BMW,1 Series M,2011,premium unleaded (required),335.0,6.0,MANUAL,rear wheel drive,2.0,"Factory Tuner,Luxury,High-Performance",Compact,Coupe,26,19,3916,46135
1,BMW,1 Series,2011,premium unleaded (required),300.0,6.0,MANUAL,rear wheel drive,2.0,"Luxury,Performance",Compact,Convertible,28,19,3916,40650
2,BMW,1 Series,2011,premium unleaded (required),300.0,6.0,MANUAL,rear wheel drive,2.0,"Luxury,High-Performance",Compact,Coupe,28,20,3916,36350
3,BMW,1 Series,2011,premium unleaded (required),230.0,6.0,MANUAL,rear wheel drive,2.0,"Luxury,Performance",Compact,Coupe,28,18,3916,29450
4,BMW,1 Series,2011,premium unleaded (required),230.0,6.0,MANUAL,rear wheel drive,2.0,Luxury,Compact,Convertible,28,18,3916,34500


In [3]:
#remove the spaces and format the case of each column titles
df.columns = df.columns.str.lower().str.replace(' ', '_')

In [4]:
#Let's create our cat vars which will enable us to cast some features to categorical type subsequently
cat_var = [var for var in df.columns if df[var].dtype == 'object']

In [5]:
#remove the spaces and format the case of each column values

for col in cat_var:
    try:
        if df[col].dtype == 'object':
            df[col] = df[col].str.lower().str.replace(' ', '_')
        else:
            df[col] = df[col]
    except TypeError:
        print('wrong data type')
       

In [6]:
#replace na or nan with 0 in the number_of_doors variable
df['number_of_doors'] = df['number_of_doors'].fillna(0)


#number_of_doors is in actual sense supposed to be a cat var, so we first round the decimal to whole number.
#also we round the values of engine_cylinder to whole number
for i in df.index:
    df['number_of_doors'] = round(df['number_of_doors'][i])
    df['engine_cylinders'] = round(df['engine_cylinders'][i])

#update the data type for number of doors, because in reality it should be a categorical variable and not numerical
df['number_of_doors'] = df['number_of_doors'].astype('object')
df['engine_cylinders'] = df['engine_cylinders'].astype('object')

#add the new categorical variable to the list cat_var
cat_var = cat_var + ['number_of_doors', 'engine_cylinders']

SEPARATE DATA INTO TRAIN AND TEST

In [7]:
x_train, x_test, y_train, y_test = train_test_split(df.drop('msrp', axis= 1), #predictive features
                                                    df['msrp'], # target
                                                    test_size=.3,
                                                    random_state= 1,
                                                    )

x_train.shape, x_test.shape

((8339, 15), (3575, 15))

In [20]:
#this bit of code block is used in our feature selection python module. 
x_train.to_csv('x_train.csv', index_label=False)
x_test.to_csv('x_test.csv', index_label=False)

In [21]:
#this bit of code block is used in our feature selection python module
y_train.to_csv('y_train.csv', index_label=False)
y_test.to_csv('y_test.csv', index_label=False)

TARGET

In [8]:
y_train = np.log1p(y_train)
y_test = np.log1p(y_test)

CONFIGURATION

In [9]:

CATEGORICAL_VARS_WITH_NA_MISSING = 'market_category'
CATEGORICAL_VARS_WITH_NA_FREQUENT = 'engine_fuel_type'
NUMERICAL_VARS_WITH_NA = 'engine_hp'
TEMPORAL_VAR =  'year'
DROPPED_VARS = ['year', 'model']
NUMERICAL_YEO_VARS = ['highway_mpg','city_mpg', 'popularity','engine_hp']
CATEGORICAL_VARS =['make',
 'engine_fuel_type',
 'transmission_type',
 'driven_wheels',
 'market_category',
 'vehicle_size',
 'vehicle_style',
 'number_of_doors',
 'engine_cylinders']


In [10]:
x_train['year'].dtype

dtype('int64')

PIPELINE CONSTRUCTION

In [11]:
msrp_pipeline = Pipeline([
    # ==== IMPUTATION ====

    ('missing_imputation', CategoricalImputer(imputation_method='missing', variables= CATEGORICAL_VARS_WITH_NA_MISSING)),

    ('frequent_imputation', CategoricalImputer(imputation_method='frequent', variables=CATEGORICAL_VARS_WITH_NA_FREQUENT)),
    
    ('missing_indicator', AddMissingIndicator(variables=NUMERICAL_VARS_WITH_NA)),

    ('median_imputation', MeanMedianImputer(imputation_method='median', variables=NUMERICAL_VARS_WITH_NA)),

    # ==== TEMPORAL VARIABLE - CREATING NEW CAR AGE VAR ====
    ('elapsed_time', pp.CarAge(variables=TEMPORAL_VAR)),

    ('drop_features', DropFeatures(features_to_drop=DROPPED_VARS)),

    # ==== VARIABLE TRANSORMATION
    ('yeojohnson', YeoJohnsonTransformer(variables=NUMERICAL_YEO_VARS)),

    # === CATEGORIAL ENCODING ====
    ('rare_label_encoder', RareLabelEncoder(tol=0.01, n_categories=1, variables=CATEGORICAL_VARS)),

    #encoding the categorical and discrete variables using the target mean
    ('categorical_encoder', OrdinalEncoder(encoding_method='ordered', variables=CATEGORICAL_VARS)),
])

In [12]:
#train the pipeline
msrp_pipeline.fit(x_train, y_train)



Pipeline(steps=[('missing_imputation',
                 CategoricalImputer(variables='market_category')),
                ('frequent_imputation',
                 CategoricalImputer(imputation_method='frequent',
                                    variables='engine_fuel_type')),
                ('missing_indicator',
                 AddMissingIndicator(variables='engine_hp')),
                ('median_imputation', MeanMedianImputer(variables='engine_hp')),
                ('elapsed_time', CarAge(variables=...
                                  variables=['make', 'engine_fuel_type',
                                             'transmission_type',
                                             'driven_wheels', 'market_category',
                                             'vehicle_size', 'vehicle_style',
                                             'number_of_doors',
                                             'engine_cylinders'])),
                ('categorical_encoder',
               

In [13]:
x_train = msrp_pipeline.transform(x_train)
x_test = msrp_pipeline.transform(x_test)

In [14]:
# check absence of na in the train set
[var for var in x_train.columns if x_train[var].isnull().sum() > 0]

[]

In [16]:
# check absence of na in the test set
[var for var in x_test.columns if x_test[var].isnull().sum() > 0]

[]

In [17]:
# the parameters are learnt and stored in each step
# of the pipeline

msrp_pipeline.named_steps['frequent_imputation'].imputer_dict_

{'engine_fuel_type': 'regular_unleaded'}

In [18]:
x_train.head()

Unnamed: 0,make,engine_fuel_type,engine_hp,engine_cylinders,transmission_type,driven_wheels,number_of_doors,market_category,vehicle_size,vehicle_style,highway_mpg,city_mpg,popularity,engine_hp_na,age
10660,8,0,4.406687,0,2,0,0,1,0,2,1.453285,1.113687,28.519991,0,1
4140,25,0,4.887654,0,2,2,0,11,2,11,1.422067,1.110025,20.134481,0,6
11414,10,2,4.45238,0,2,0,0,1,2,7,1.415625,1.091493,17.557344,0,15
5119,23,5,4.879183,0,0,2,0,14,1,9,1.433747,1.09683,10.437919,0,4
2639,11,0,4.528372,0,0,1,0,1,2,4,1.393279,1.078885,19.232728,0,19


This wraps up our feature engineering pipeline steps