# Random Forests Regression

In [100]:
%load_ext autoreload
%autoreload 2

%matplotlib inline

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [101]:
from fastai.imports import *
from fastai.structured import *

from pandas_summary import DataFrameSummary
from sklearn.ensemble import RandomForestRegressor, RandomForestClassifier
from IPython.display import display

from sklearn import metrics

In [102]:
PATH = "data/bulldozers/"
!ls {PATH}

Test.csv  Train.csv  Valid.csv


In [103]:
df_raw = pd.read_csv(f'{PATH}Train.csv', low_memory=False, parse_dates=["saledate"])

In [104]:
def display_all(df):
    with pd.option_context("display.max_rows", 1000, "display.max_columns", 1000): 
        display(df)

In [105]:
display_all(df_raw.isnull().sum())

SalesID                          0
SalePrice                        0
MachineID                        0
ModelID                          0
datasource                       0
auctioneerID                 20136
YearMade                         0
MachineHoursCurrentMeter    258360
UsageBand                   331486
saledate                         0
fiModelDesc                      0
fiBaseModel                      0
fiSecondaryDesc             137191
fiModelSeries               344217
fiModelDescriptor           329206
ProductSize                 210775
fiProductClassDesc               0
state                            0
ProductGroup                     0
ProductGroupDesc                 0
Drive_System                296764
Enclosure                      325
Forks                       209048
Pad_Type                    321991
Ride_Control                252519
Stick                       321991
Transmission                217895
Turbocharged                321991
Blade_Extension     

In [106]:
df_raw.SalePrice = np.log(df_raw.SalePrice)

In [107]:
#feature engineering for date
add_datepart(df_raw, 'saledate')

In [112]:
# convert string to categorical variables
train_cats(df_raw)

In [114]:
# convert categorical variable to code+1, fill in missing values, split dependent/independent variables
df, y, nas = proc_df(df_raw, 'SalePrice')

In [115]:
# Time series, don't randomly pick the samples
def split_vals(a,n): return a[:n].copy(), a[n:].copy()

n_valid = 12000  # same as Kaggle's test set size
n_trn = len(df)-n_valid
raw_train, raw_valid = split_vals(df_raw, n_trn)
X_train, X_valid = split_vals(df, n_trn)
y_train, y_valid = split_vals(y, n_trn)

X_train.shape, y_train.shape, X_valid.shape

((389125, 66), (389125,), (12000, 66))

In [116]:
def rmse(x,y): return math.sqrt(((x-y)**2).mean())

def print_score(m):
    res = [rmse(m.predict(X_train), y_train), rmse(m.predict(X_valid), y_valid),
                m.score(X_train, y_train), m.score(X_valid, y_valid)]
    if hasattr(m, 'oob_score_'): res.append(m.oob_score_)
    print(res)

In [55]:
#Speed thing up for exploration
# df_trn, y_trn, nas = proc_df(df_raw, 'SalePrice', subset=30000, na_dict=nas)
# X_train, _ = split_vals(df_trn, 20000)
# y_train, _ = split_vals(y_trn, 20000)

In [56]:
#base model -- 10 estimators
m = RandomForestRegressor(n_jobs=-1)
%time m.fit(X_train, y_train)
print_score(m)
#overfit terribly  -- try to generalize it

CPU times: user 59.4 s, sys: 196 ms, total: 59.6 s
Wall time: 11.1 s
[0.09043667848978602, 0.24873231568979093, 0.9829067740705186, 0.8895126899445241]


In [117]:
m = RandomForestRegressor(n_estimators=40, n_jobs=-1, oob_score=True)
m.fit(X_train, y_train)
print_score(m)

[0.07837132803750145, 0.2379051601661563, 0.9871634227599747, 0.8989222192661487, 0.9081704882944486]


In [58]:
set_rf_samples(20000)

In [59]:
m = RandomForestRegressor(n_estimators=40, n_jobs=-1, oob_score=True)
m.fit(X_train, y_train)
print_score(m)

[0.2270312732786261, 0.2615302154581393, 0.8922775565755625, 0.8778504933691862, 0.8807137812496173]


In [60]:
reset_rf_samples()

In [118]:
m = RandomForestRegressor(n_estimators=40, min_samples_leaf=3, max_features=0.5, n_jobs=-1, oob_score=True)
m.fit(X_train, y_train)
print_score(m)

[0.11917035464124978, 0.22806651256678734, 0.9703194947510612, 0.9071095605721724, 0.9116906498232545]


## Hyperparameters you can tune
1. estimator: 40, 
2. min_samples_leaf:  {1, 3, 5, 10, 25}
3. max_features: {0.5, sqrt, None} 
4. Bootstrap: True  -- should leave it on
5. set_rf_samples() -- boostrap but with limited samples

Turn on oob_score but it isn't as good as a separated validation set. The oob_score usually is better the validation set score ( since it's used in training by different trees.) 
For Time seriess, since oob is randomly choosen, it may not be a good indicator in time seriese.

## Test data set

In [153]:
df_rawtest = pd.read_csv(f'{PATH}Test.csv', low_memory=False, parse_dates=["saledate"])

In [154]:
add_datepart(df_rawtest, 'saledate')

In [155]:
# Apply categorical variables to test dataset
apply_cats(df_rawtest,df_raw)

In [174]:
df_rawtest['SalePrice']=0
df_rawtest['auctioneerID_na']=False
df_test, _, nas_test = proc_df(df_rawtest,'SalePrice')

In [173]:
## find missing columns
print(df_test.columns.isin(df.columns))
print(df.columns.isin(df_test.columns))
df.columns[64]

[ True  True  True  True  True  True  True  True  True  True  True  True  True  True  True  True  True  True
  True  True  True  True  True  True  True  True  True  True  True  True  True  True  True  True  True  True
  True  True  True  True  True  True  True  True  True  True  True  True  True  True  True  True  True  True
  True  True  True  True  True  True  True  True  True  True  True]
[ True  True  True  True  True  True  True  True  True  True  True  True  True  True  True  True  True  True
  True  True  True  True  True  True  True  True  True  True  True  True  True  True  True  True  True  True
  True  True  True  True  True  True  True  True  True  True  True  True  True  True  True  True  True  True
  True  True  True  True  True  True  True  True  True  True False  True]


'auctioneerID_na'

In [147]:
preds=m.predict(df_test)

In [150]:
preds

array([ 9.60345, 10.27393, 10.82605, ...,  9.36915,  9.51157,  9.97099])