In [1]:
# Standard imports
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline

## Filling missing data with scikit-learn: SimpleImputer
#### scikit-learn provides a class called SimpleImputer which allows us to do a similar thing
SimpleImputer transforms data by filling missing values according to a given strategy (for example a constant value or the column mean)

Let's reimport the car sales data so it has missing values again and we can fill them with scikit-learn.

In [2]:
# Load a fresh copy of the raw CSV so the missing values are present again,
# then count the missing entries in each column.
car_sales_missing = pd.read_csv("../datos/car-sales-extended-missing-data.csv")
car_sales_missing.isna().sum(axis=0)

Make             49
Colour           50
Odometer (KM)    50
Doors            50
Price            50
dtype: int64

In [3]:
# Drop every row whose "Price" (the target) is missing. Reassigning the result
# instead of using inplace=True keeps the data lineage explicit and avoids
# mutating the frame that the previous cell displayed.
car_sales_missing = car_sales_missing.dropna(subset=["Price"])

# Splitting the data into training and test sets
from sklearn.model_selection import train_test_split

# Split into features (X) and target (y)
X = car_sales_missing.drop(columns="Price")
y = car_sales_missing["Price"]

# Hold out 20% of the rows for testing. The seed is passed directly to
# train_test_split so the split does not depend on the global NumPy RNG state
# (i.e. on which cells happened to run before this one).
X_train, X_test, y_train, y_test = train_test_split(X,
                                                    y,
                                                    test_size=0.2,
                                                    random_state=42)

#### Note: We split the data into train & test sets first so that missing values are filled on each set separately:
This matters most for the odometer: if we fill the missing values with the mean of ALL the data, without first splitting it into 'train' and 'test', we let the 'test' data influence the 'train' set (or vice versa). This is data leakage, and it produces overly optimistic evaluation results.

In [4]:
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer

# Column groups: each group is imputed with its own strategy
categorical_features = ["Make", "Colour"]
door_feature = ["Doors"]
numerical_feature = ["Odometer (KM)"]

# One imputer per group:
#   - text columns -> the constant string "missing"
#   - door count   -> the constant 4
#   - odometer     -> the column mean (learned when the imputer is fitted)
cat_imputer = SimpleImputer(strategy="constant", fill_value="missing")
door_imputer = SimpleImputer(strategy="constant", fill_value=4)
num_imputer = SimpleImputer(strategy="mean")

# Each ColumnTransformer entry is a (name, transformer, columns) triple:
# which imputer fills which columns, and under what label
imputation_steps = [
    ("cat_imputer", cat_imputer, categorical_features),
    ("door_imputer", door_imputer, door_feature),
    ("num_imputer", num_imputer, numerical_feature),
]
imputer = ColumnTransformer(transformers=imputation_steps)

# Nothing has been fitted yet: the imputer is only DEFINED here, not applied

In [5]:
# APPLYING THE IMPUTER
# The imputer learns its fill values (e.g. the odometer mean) from the training
# data only; those same learned values are then used to fill both sets.
# This keeps information from the test set out of the preprocessing step.

# Learn from train, then fill train and test with what was learned
imputer.fit(X_train)
filled_X_train = imputer.transform(X_train)
filled_X_test = imputer.transform(X_test)

In [8]:
# Sanity check: the imputed training data should contain no missing values.
# Wrap the imputer output in a DataFrame first so pandas can count NaNs per
# column; the column order follows the order of the imputer's transformers.
imputed_columns = ["Make", "Colour", "Doors", "Odometer (KM)"]
filled_X_train_pd = pd.DataFrame(data=filled_X_train, columns=imputed_columns)

filled_X_train_pd.isna().sum()

Make             0
Colour           0
Doors            0
Odometer (KM)    0
dtype: int64

In [9]:
# Sanity check: the imputed test data should contain no missing values.
# Wrap the imputer output in a DataFrame first so pandas can count NaNs per
# column; the column order follows the order of the imputer's transformers.
imputed_columns = ["Make", "Colour", "Doors", "Odometer (KM)"]
filled_X_test_pd = pd.DataFrame(data=filled_X_test, columns=imputed_columns)

filled_X_test_pd.isna().sum()

Make             0
Colour           0
Doors            0
Odometer (KM)    0
dtype: int64

In [10]:
# Wrap the imputed arrays back into DataFrames.
#
# The encoding step selects its columns by name ("Make", "Colour", "Doors"),
# and selecting by name requires labelled columns, i.e. a DataFrame rather than
# a bare array. (OneHotEncoder itself also accepts plain arrays.)
#
# In the "sickit-1" notebook we worked directly with pandas DataFrames, so this
# conversion was not needed there.
#
# The original row index is carried over so the filled features stay aligned
# with y_train / y_test by label, not only by position.
filled_columns = ["Make", "Colour", "Doors", "Odometer (KM)"]

car_sales_filled_train = pd.DataFrame(filled_X_train,
                                      columns=filled_columns,
                                      index=X_train.index)

car_sales_filled_test = pd.DataFrame(filled_X_test,
                                     columns=filled_columns,
                                     index=X_test.index)

In [11]:
# ENCODING CATEGORIES (Make, Colour and Doors): FROM TEXT TO NUMBERS

from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer

# Columns to one-hot encode; "Doors" is encoded as a category as well.
# NOTE: this rebinds `categorical_features`, which the imputation cell defined
# without "Doors" - re-run that cell before re-creating the imputer.
categorical_features = ["Make", "Colour", "Doors"]

# handle_unknown="ignore": the encoder learns its categories from the training
# set only, so a value that appears solely in the test set would otherwise make
# transform() raise. With "ignore" such a value is encoded as all zeros.
one_hot = OneHotEncoder(handle_unknown="ignore")
transformer = ColumnTransformer([("one_hot",
                                  one_hot,
                                  categorical_features)],
                                remainder="passthrough")  # other columns pass through unchanged

# Fit the encoding on train only, then apply that same encoding to test
transformed_X_train = transformer.fit_transform(car_sales_filled_train)
transformed_X_test = transformer.transform(car_sales_filled_test)

# Check transformed and filled X_train
transformed_X_train.toarray()

array([[0.00000e+00, 1.00000e+00, 0.00000e+00, ..., 1.00000e+00,
        0.00000e+00, 7.19340e+04],
       [0.00000e+00, 0.00000e+00, 0.00000e+00, ..., 1.00000e+00,
        0.00000e+00, 1.62665e+05],
       [0.00000e+00, 1.00000e+00, 0.00000e+00, ..., 1.00000e+00,
        0.00000e+00, 4.28440e+04],
       ...,
       [0.00000e+00, 0.00000e+00, 0.00000e+00, ..., 1.00000e+00,
        0.00000e+00, 1.96225e+05],
       [0.00000e+00, 1.00000e+00, 0.00000e+00, ..., 1.00000e+00,
        0.00000e+00, 1.33117e+05],
       [0.00000e+00, 1.00000e+00, 0.00000e+00, ..., 1.00000e+00,
        0.00000e+00, 1.50582e+05]])

In [12]:
# With X imputed and one-hot encoded, check that a model can be trained on it
from sklearn.ensemble import RandomForestRegressor

# Seed NumPy's global RNG before building the forest
np.random.seed(42)

# Build and train the model in one step (fit() returns the fitted estimator).
# Use the transformed (filled + one-hot encoded) features, not the raw X_train
model = RandomForestRegressor().fit(transformed_X_train, y_train)

# Score the fitted model on the held-out test set
model.score(transformed_X_test, y_test)

0.21229043336119102