In [1]:
# Standard imports
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline

In [2]:
# Import car-sales-extended.csv
car_sales = pd.read_csv("../datos/car-sales-extended.csv")
car_sales

Unnamed: 0,Make,Colour,Odometer (KM),Doors,Price
0,Honda,White,35431,4,15323
1,BMW,Blue,192714,5,19943
2,Honda,White,84714,4,28343
3,Toyota,White,154365,4,13434
4,Nissan,Blue,181577,3,14043
...,...,...,...,...,...
995,Toyota,Black,35820,4,32042
996,Nissan,White,155144,3,5716
997,Nissan,Blue,66604,4,31570
998,Honda,White,215883,4,4001


In [3]:
# Split into X & y and train/test
X = car_sales.drop("Price", axis=1)
y = car_sales["Price"]

### Turn the categories (Make and Colour) into numbers

In [4]:
# Turn the categories (Make and Colour) into numbers
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer

categorical_features = [ "Make", "Colour", "Doors" ] # se tratan las puertas como categorías, carros de 3, 4 ó 5 puertas
one_hot = OneHotEncoder()
transformer = ColumnTransformer([("one_hot", 
                                 one_hot, 
                                 categorical_features)],
                                 remainder="passthrough") # para las columnas no seleccionadas siga derecho y tome los datos tal cual
transformed_X = transformer.fit_transform( X )
transformed_X
pd.DataFrame( transformed_X ).head() # for a better visualisation

# bwm, honda, nissan, toyota, black, blue, green, red, white, 3doors, 4doors, 5doors, odometer 

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12
0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,35431.0
1,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,192714.0
2,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,84714.0
3,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,154365.0
4,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,181577.0


In [17]:
# DUMMY WAY - to nicely see the process of encoding (with labels and it a dataframe straightway)
# First: Have to convert doors to object for dummies to work on it...
car_sales["Doors"].astype(object)
# then the dummie process
dummies = pd.get_dummies(car_sales[["Make", "Colour", "Doors"]])
dummies.head()

Unnamed: 0,Doors,Make_BMW,Make_Honda,Make_Nissan,Make_Toyota,Colour_Black,Colour_Blue,Colour_Green,Colour_Red,Colour_White
0,4,0,1,0,0,0,0,0,0,1
1,5,1,0,0,0,0,1,0,0,0
2,4,0,1,0,0,0,0,0,0,1
3,4,0,0,0,1,0,0,0,0,1
4,3,0,0,1,0,0,1,0,0,0


In [6]:
# Split into X & y and train/test
X_categorical = transformed_X # the categorical-encoded matrix
y = car_sales["Price"]

# Splitting the data into training and test sets
from sklearn.model_selection import train_test_split

# important to use "random_state" in order to have reproducable training sets
X_train, X_test, y_train, y_test = train_test_split( X_categorical, y, test_size=0.2, random_state=42 )

# Try to predict with random forest on price column (doesn't work)
from sklearn.ensemble import RandomForestRegressor

model = RandomForestRegressor()
model.fit( X_train, y_train )
model.score( X_test, y_test )

0.32343612912490627

# Missing data

### options:
### 1. fill the values with some data (imputation)
### 2. remove the samples with missing data

In [7]:
# Import car sales dataframe with missing values
car_sales_missing = pd.read_csv("../datos/car-sales-extended-missing-data.csv")
car_sales_missing

Unnamed: 0,Make,Colour,Odometer (KM),Doors,Price
0,Honda,White,35431.0,4.0,15323.0
1,BMW,Blue,192714.0,5.0,19943.0
2,Honda,White,84714.0,4.0,28343.0
3,Toyota,White,154365.0,4.0,13434.0
4,Nissan,Blue,181577.0,3.0,14043.0
...,...,...,...,...,...
995,Toyota,Black,35820.0,4.0,32042.0
996,,White,155144.0,3.0,5716.0
997,Nissan,Blue,66604.0,4.0,31570.0
998,Honda,White,215883.0,4.0,4001.0


In [8]:
# counting missing values
car_sales_missing.isna().sum()

Make             49
Colour           50
Odometer (KM)    50
Doors            50
Price            50
dtype: int64

### Imputation (filling missing data) with pandas

In [9]:
# In order to do the regression we need to do something with missing values

# Fill the "Make" column
car_sales_missing["Make"].fillna("missing", inplace=True)

# Fill the "Colour" column
car_sales_missing["Colour"].fillna("missing", inplace=True)

# Fill the "Odometer (KM)" column with the mean value
car_sales_missing["Odometer (KM)"].fillna(car_sales_missing["Odometer (KM)"].mean(), inplace=True)

# Fill the "Doors" column => with 4 doors 'by default'
car_sales_missing["Doors"].fillna(4, inplace=True)

# Remove rows with missing "Price" labels
car_sales_missing.dropna(inplace=True)

# Check out if no missing values now
car_sales_missing.isna().sum()

Make             0
Colour           0
Odometer (KM)    0
Doors            0
Price            0
dtype: int64

In [11]:
car_sales_missing

Unnamed: 0,Make,Colour,Odometer (KM),Doors,Price
0,Honda,White,35431.0,4.0,15323.0
1,BMW,Blue,192714.0,5.0,19943.0
2,Honda,White,84714.0,4.0,28343.0
3,Toyota,White,154365.0,4.0,13434.0
4,Nissan,Blue,181577.0,3.0,14043.0
...,...,...,...,...,...
995,Toyota,Black,35820.0,4.0,32042.0
996,missing,White,155144.0,3.0,5716.0
997,Nissan,Blue,66604.0,4.0,31570.0
998,Honda,White,215883.0,4.0,4001.0


In [26]:
# Split into X & y and train/test
X_missing = car_sales_missing.drop("Price", axis=1)
y_missing = car_sales_missing["Price"]

# Encode the categories (Make, Colour and doors) 
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer

categorical_features_missing = ["Make", "Colour", "Doors"] # se tratan las puertas como categorías, carros de 3, 4 ó 5 puertas
one_hot = OneHotEncoder()
transformer = ColumnTransformer([("one_hot", 
                                 one_hot, 
                                 categorical_features_missing)],
                                 remainder="passthrough") # para las columnas no seleccionadas siga derecho y tome los datos tal cual
transformed_X_missing = transformer.fit_transform( X_missing )

# pd.DataFrame( transformed_X_missing ).head() # for a better visualisation

# Splitting the data into training and test sets
from sklearn.model_selection import train_test_split

# defining the seudo-random seed in 42
np.random.seed(42)
X_train_missing, X_test_missing, y_train_missing, y_test_missing = train_test_split( transformed_X_missing, y_missing, test_size=0.2, random_state=42 )

# Try to predict with random forest on price column (doesn't work)
from sklearn.ensemble import RandomForestRegressor

model = RandomForestRegressor()
model.fit(X_train_missing, y_train_missing)
model.score(X_test_missing, y_test_missing)

0.20784310291416586

In [27]:
# transformed_X_missing # sparse matrix: when most of values are zeros
pd.DataFrame( transformed_X_missing.toarray() ) # for a better visualisation. => However, 
# when the transformed_X_missing is a sparse matrix it shows in its compressed form
# so, whith this ".toarray()" it uncompresses the matrix

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14
0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,35431.0
1,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,192714.0
2,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,84714.0
3,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,154365.0
4,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,181577.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
945,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,35820.0
946,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,155144.0
947,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,66604.0
948,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,215883.0


In [16]:
# DUMMY WAY - to nicely see the process of encoding (with labels and it a dataframe straightway)
# First: Have to convert doors to object for dummies to work on it...
car_sales_missing["Doors"].astype(object)
# then the dummie process
dummies = pd.get_dummies(car_sales_missing[["Make", "Colour", "Doors"]])
dummies.head()

Unnamed: 0,Doors,Make_BMW,Make_Honda,Make_Nissan,Make_Toyota,Make_missing,Colour_Black,Colour_Blue,Colour_Green,Colour_Red,Colour_White,Colour_missing
0,4.0,0,1,0,0,0,0,0,0,0,1,0
1,5.0,1,0,0,0,0,0,1,0,0,0,0
2,4.0,0,1,0,0,0,0,0,0,0,1,0
3,4.0,0,0,0,1,0,0,0,0,0,1,0
4,3.0,0,0,1,0,0,0,1,0,0,0,0
