FEATURE ENGINEERING USING OPEN SOURCE LIBRARIES

In [1]:
#data manipulation libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

#for saving our pipeline
import joblib

#for data preparation and modeling
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler, Binarizer

#from feature-engine
from feature_engine.imputation import (
    AddMissingIndicator,
    MeanMedianImputer,
    CategoricalImputer,
)

from feature_engine.encoding import (
    RareLabelEncoder,
    OrdinalEncoder,
)

from feature_engine.transformation import (
    YeoJohnsonTransformer,
)

from feature_engine.selection import DropFeatures
from feature_engine.wrappers import SklearnTransformerWrapper

In [2]:
#read the raw dataset from data.csv in the working directory
df = pd.read_csv(filepath_or_buffer='data.csv')

#report (rows, columns) before previewing the table
print(df.shape)

#preview the first five records
df.head(n=5)

(11914, 16)


Unnamed: 0,Make,Model,Year,Engine Fuel Type,Engine HP,Engine Cylinders,Transmission Type,Driven_Wheels,Number of Doors,Market Category,Vehicle Size,Vehicle Style,highway MPG,city mpg,Popularity,MSRP
0,BMW,1 Series M,2011,premium unleaded (required),335.0,6.0,MANUAL,rear wheel drive,2.0,"Factory Tuner,Luxury,High-Performance",Compact,Coupe,26,19,3916,46135
1,BMW,1 Series,2011,premium unleaded (required),300.0,6.0,MANUAL,rear wheel drive,2.0,"Luxury,Performance",Compact,Convertible,28,19,3916,40650
2,BMW,1 Series,2011,premium unleaded (required),300.0,6.0,MANUAL,rear wheel drive,2.0,"Luxury,High-Performance",Compact,Coupe,28,20,3916,36350
3,BMW,1 Series,2011,premium unleaded (required),230.0,6.0,MANUAL,rear wheel drive,2.0,"Luxury,Performance",Compact,Coupe,28,18,3916,29450
4,BMW,1 Series,2011,premium unleaded (required),230.0,6.0,MANUAL,rear wheel drive,2.0,Luxury,Compact,Convertible,28,18,3916,34500


In [3]:
#normalise every column title: lower-case it and swap spaces for underscores
df.columns = [title.lower().replace(' ', '_') for title in df.columns]

In [4]:
#categorical variables: the columns pandas stored with the generic 'object' dtype
cat_var = [name for name, dtype in df.dtypes.items() if dtype == 'object']

In [5]:
#name of the year column; it is kept out of the numerical variable list built from df.columns
year_var = 'year'

In [6]:
#numerical variables: every column that is not categorical, not the year column and not the target (msrp)
#names are matched exactly through a set; testing `var not in year_var` against a string
#would be a substring check and could silently drop unrelated columns
non_numeric = set(cat_var) | {year_var, 'msrp'}
num_vars = [var for var in df.columns if var not in non_numeric]

In [7]:
#normalise the values of every categorical column: lower-case them and swap spaces for underscores

for col in cat_var:
    column = df[col]
    try:
        #only object-typed columns are rewritten; any other dtype is written back unchanged
        if column.dtype == 'object':
            column = column.str.lower().str.replace(' ', '_')
        df[col] = column
    except TypeError:
        print('wrong data type')
       

In [8]:
#number_of_doors and engine_cylinders are treated as categorical labels from here on
#NOTE(review): num_vars was built before this cell and still lists both columns --
#confirm whether later steps should drop them from num_vars
new_cat_vars = ['number_of_doors', 'engine_cylinders']

#replace na or nan with 0 in the number_of_doors variable, then round every row to a whole number
#the rounding is vectorised so each row keeps its own value; assigning a rounded scalar to the
#column inside a row loop would overwrite the whole column with a single number
#pd.to_numeric keeps the cell re-runnable once the column has already been cast to object
df['number_of_doors'] = (
    pd.to_numeric(df['number_of_doors'])
    .fillna(0)
    .round()
    .astype(int)
)

#update the data type of both columns, because in reality they are categorical and not numerical
#missing engine_cylinders values are left untouched by this cast
df['number_of_doors'] = df['number_of_doors'].astype('object')
df['engine_cylinders'] = df['engine_cylinders'].astype('object')

#add the new categorical variables to the list cat_var
#names already present are skipped so that re-running the cell does not create duplicates
cat_var = cat_var + [var for var in new_cat_vars if var not in cat_var]

Separate data into train and test data

In [9]:
#hold out 30% of the rows for testing; the fixed random_state keeps the split reproducible
x_train, x_test, y_train, y_test = train_test_split(
    df.drop(columns='msrp'),  #predictive features: everything except the target
    df['msrp'],               #target
    test_size=0.3,
    random_state=1,
)

#show the size of each feature split
(x_train.shape, x_test.shape)

((8339, 15), (3575, 15))

TARGET

In [10]:
#replace both target splits with log1p(target), i.e. log(1 + target)
#the names are overwritten, so running this cell twice applies the transform twice
y_train, y_test = (np.log1p(split) for split in (y_train, y_test))

HANDLING MISSING VALUES

In [25]:
#share of missing values that decides how a categorical variable is imputed:
#above it the gaps get the string "missing", at or below it the most frequent category
MISSING_LABEL_THRESHOLD = 0.1

#variables of the training set that contain at least one missing value
var_with_na = [var for var in x_train.columns if x_train[var].isnull().any()]

#categorical missing variables
cat_var_na = [var for var in cat_var if var in var_with_na]
print('categorical variable with missing data: ',cat_var_na)

#numerical missing variables
num_var_na = [var for var in var_with_na if var not in cat_var_na ]
print('numerical variable with missing data: ',num_var_na)

#fraction of missing values per categorical variable, computed once and reused below
na_share = {var: x_train[var].isnull().mean() for var in cat_var_na}

#variables to impute with the string missing
with_string_missing = [var for var in cat_var_na if na_share[var] > MISSING_LABEL_THRESHOLD]
print('categorical variables to have their missing data replaced with missing: ', with_string_missing)

#variables to impute with the most frequent category
#the comparison is inclusive so that a share exactly equal to the threshold is not left out of both lists
with_freq_category = [var for var in cat_var_na if na_share[var] <= MISSING_LABEL_THRESHOLD]
print('categorical variables to have their missing data replaced with freq var', with_freq_category)

categorical variable with missing data:  ['engine_fuel_type', 'market_category', 'engine_cylinders']
numerical variable with missing data:  ['engine_hp']
categorical variables to have their missing data replaced with missing:  ['market_category']
categorical variables to have their missing data replaced with freq var ['engine_fuel_type', 'engine_cylinders']
