# Bulldozer dataset

This notebook uses a dataset with a large number of columns (>30) and proposes an entire processing pipeline for it.

After that, it focuses on two subjects: dataset distribution/randomness and feature selection.

In [1]:
import os
import json
import random
from pathlib import Path
from zipfile import ZipFile

In [2]:
import kaggle
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

from sklearn.model_selection import train_test_split
from sklearn.utils import resample
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor

In [3]:
def display_all(df):
    """Display ``df`` with pandas' row and column display limits raised to 1000.

    The limits are only changed for the duration of the call; pandas restores
    the previous option values when the context manager exits.
    """
    widened_limits = ("display.max_rows", 1000, "display.max_columns", 1000)
    with pd.option_context(*widened_limits):
        display(df)

In [4]:
# Load the dataset from the feather file at tmp/feather (path is relative to
# the notebook's working directory).
df = pd.read_feather('tmp/feather')

In [5]:
df.head().T

Unnamed: 0,0,1,2,3,4
SalesID,1139246,1139248,1139249,1139251,1139253
SalePrice,11.09741,10.950807,9.21034,10.558414,9.305651
MachineID,999089,117657,434808,1026470,1057373
ModelID,3157,77,7009,332,17311
datasource,121,121,121,121,121
auctioneerID,3.0,3.0,3.0,3.0,3.0
YearMade,2004,1996,2001,2001,2007
MachineHoursCurrentMeter,68.0,4640.0,2838.0,3486.0,722.0
UsageBand,1.0,1.0,0.0,0.0,2.0
fiModelDesc,949.0,1724.0,330.0,3673.0,4207.0


---

## Crossvalidation and model training

In [6]:
import sklearn.metrics as metrics

def regression_results(y_true, y_pred):
    """Print a summary of regression metrics for a set of predictions.

    Parameters
    ----------
    y_true : array-like
        Ground-truth target values.
    y_pred : array-like
        Predicted target values, aligned with ``y_true``.

    Returns
    -------
    None
        The metrics are printed (rounded to 4 decimals); nothing is returned.

    Notes
    -----
    ``metrics.mean_squared_log_error`` does not accept negative values, so
    negative targets or predictions make this function raise ``ValueError``.
    """
    # Compute every metric first so a failure happens before anything is printed.
    explained_variance = metrics.explained_variance_score(y_true, y_pred)
    mae = metrics.mean_absolute_error(y_true, y_pred)
    mse = metrics.mean_squared_error(y_true, y_pred)
    msle = metrics.mean_squared_log_error(y_true, y_pred)
    r2 = metrics.r2_score(y_true, y_pred)

    print('explained_variance: ', round(explained_variance, 4))
    print('mean_squared_log_error: ', round(msle, 4))
    print('r2: ', round(r2, 4))
    print('MAE: ', round(mae, 4))
    print('MSE: ', round(mse, 4))
    # RMSE is derived from the MSE rather than computed by a separate call.
    print('RMSE: ', round(np.sqrt(mse), 4))

### Let's begin

### Model fitting with the log of the dependent variable

In [7]:
# Separate the target column from the predictors.
y = df['SalePrice'].values
X = df.drop(columns='SalePrice')

In [8]:
# Hold out 20% of the rows as a test set. train_test_split shuffles the rows
# by default; random_state=0 makes that shuffle repeatable.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)

In [9]:
X_train.shape, type(X_train), y_train.shape, type(y_train)

((320900, 36), pandas.core.frame.DataFrame, (320900,), numpy.ndarray)

In [10]:
# Keep the target 1-D: scikit-learn estimators expect y of shape (n_samples,)
# and emit a DataConversionWarning when handed a column vector.
y_train = y_train.ravel()

In [11]:
# Keep the target 1-D so it has the same (n_samples,) shape as the model's
# predictions; mixing (n, 1) and (n,) arrays broadcasts to (n, n) in arithmetic.
y_test = y_test.ravel()

In [12]:
# Seed the forest so that re-running the notebook reproduces the same scores;
# n_jobs=-1 trains the trees on all available cores.
m = RandomForestRegressor(n_jobs=-1, random_state=0)

In [13]:
%time m.fit(X_train, y_train)



CPU times: user 8min 51s, sys: 5.65 s, total: 8min 56s
Wall time: 1min 14s


RandomForestRegressor(n_jobs=-1)

In [14]:
m.score(X_train, y_train)

0.9863907146232807

In [15]:
preds = m.predict(X_test)

In [16]:
regression_results(y_test, preds)

explained_variance:  0.9027
mean_squared_log_error:  0.0004
r2:  0.9027
MAE:  0.1536
MSE:  0.0469
RMSE:  0.2166


The baseline results are actually not bad, even though there is some overfitting (R² of 0.986 on the training set vs. 0.903 on the test set)!

---

### Log + shuffling

In [17]:
# Rebuild the predictors and the target from the untouched dataframe.
y = df['SalePrice'].values
X = df.drop(columns='SalePrice')

In [18]:
# Same 80/20 hold-out as before, but with random_state=1: train_test_split
# shuffles by default, so a different seed yields a different random split.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=1)

In [19]:
X_train.shape, type(X_train), y_train.shape, type(y_train)

((320900, 36), pandas.core.frame.DataFrame, (320900,), numpy.ndarray)

In [20]:
# Keep the target 1-D: scikit-learn estimators expect y of shape (n_samples,)
# and emit a DataConversionWarning when handed a column vector.
y_train = y_train.ravel()

In [21]:
# Keep the target 1-D so it has the same (n_samples,) shape as the model's
# predictions; mixing (n, 1) and (n,) arrays broadcasts to (n, n) in arithmetic.
y_test = y_test.ravel()

In [22]:
# Seed the forest so that re-running the notebook reproduces the same scores;
# n_jobs=-1 trains the trees on all available cores.
m = RandomForestRegressor(n_jobs=-1, random_state=0)

In [23]:
%time m.fit(X_train, y_train)



CPU times: user 8min 53s, sys: 3.46 s, total: 8min 57s
Wall time: 1min 14s


RandomForestRegressor(n_jobs=-1)

In [None]:
m.score(X_train, y_train)

In [None]:
preds = m.predict(X_test)

In [None]:
regression_results(y_test, preds)

It's only a little bit better. Maybe we should try to work on a representative part of the dataset, or select the most meaningful features?

---

### Feature selection

Let's try to select the most useful features with an automated approach.

In [None]:
X.shape

In [None]:
X_train.shape

In [None]:
# Size of the bootstrap sample: 1% of the training rows (rounded down).
minimal_df_size = len(X_train) // 100
minimal_df_size

In [None]:
# Draw a bootstrap sample of minimal_df_size rows from the training features
# (resample draws with replacement by default); random_state=1 fixes the draw.
X_train_boot = resample(X_train, n_samples=minimal_df_size, random_state=1)
X_train_boot.shape

In [None]:
# Bootstrap the targets with the same n_samples and random_state as the
# features so that the two samples stay row-aligned.
# NOTE(review): this relies on both calls drawing identical indices; passing
# both arrays to a single resample(X_train, y_train, ...) call would make the
# pairing explicit.
y_train_boot = resample(y_train, n_samples=minimal_df_size, random_state=1)
y_train_boot.shape

In [None]:
# from sklearn.feature_selection import SequentialFeatureSelector

# #Selecting the Best important features according to RandomForestRegressor
# sfs_selector = SequentialFeatureSelector(estimator=RandomForestRegressor(), 
#                                         n_features_to_select=10,
#                                         cv=4,
#                                         direction ='backward')

# sfs_selector.fit(X_train_boot, y_train_boot)
# X.columns[sfs_selector.get_support()]

We tried combining bootstrapping with the feature-selection method above, but it is not practical.

Even on a tiny bootstrap sample (1% of the training set, i.e. about 0.8% of the original data), the computation takes too long (more than 10 minutes on an 8-core CPU).