In [None]:
# Reload edited modules automatically (autoreload mode 2) so code changes are
# picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2

# Render matplotlib figures inline in the notebook output.
%matplotlib inline

In [None]:
import os

from IPython.display import display
import numpy as np
import pandas as pd
from pandas_summary import DataFrameSummary
from sklearn.ensemble import RandomForestRegressor, RandomForestClassifier

from fastai.imports import *
from fastai.structured import *

In [None]:
# Root directory of the input data; every file path below is built from it.
PATH = '../input'
# List the directory to confirm which files are available.
!ls {PATH}

In [None]:
# Read in the training dataset.
# 'saledate' is parsed as a date column so it can be expanded into date parts later.
df_raw = pd.read_csv(f'{PATH}/train/Train.csv', low_memory=False, parse_dates=['saledate'])

In [None]:
# Work in log space: the target becomes log(SalePrice), so an RMSE computed on it
# is the RMSLE (root mean squared log error) between actual and predicted price.
# NOTE: this cell is not idempotent — re-running it takes the log a second time.
df_raw.SalePrice = np.log(df_raw.SalePrice)

In [None]:
# Break date into multiple columns.
# Called for its effect on df_raw; the return value is not used.
add_datepart(df_raw, 'saledate')

In [None]:
# Convert from strings to Pandas categories.
# Called for its effect on df_raw; the return value is not used.
train_cats(df_raw)

In [None]:
# Correct order of Ordinal categorical variables.
# Print the categories before and after so the reordering is visible.
print(df_raw.UsageBand.cat.categories)

# Assign the result back instead of passing `inplace=True`: the `inplace` argument of
# `set_categories` is deprecated and later removed in pandas, while plain assignment
# behaves the same on old and new versions.
df_raw.UsageBand = df_raw.UsageBand.cat.set_categories(['High', 'Medium', 'Low'], ordered=True)

print(df_raw.UsageBand.cat.categories)

In [None]:
# Convert categories into category codes.
# The column is replaced by its integer codes, so the ordering set above is what
# determines which code each level gets.
# NOTE: not idempotent — after this cell UsageBand is no longer categorical, so
# re-running it (or the cell above) will fail on the `.cat` accessor.
df_raw.UsageBand = df_raw.UsageBand.cat.codes

In [None]:
# Process a DataFrame for training a machine learning algorithm.
# Returns the feature frame (`df`), the 'SalePrice' target values (`y`) and `nas`,
# which is handed back to proc_df as `na_dict` in a later cell.
df, y, nas = proc_df(df_raw, 'SalePrice')

In [None]:
# Let's train a RandomForest.
# The model is fitted and scored on the very same rows, so the displayed value is a
# training score (R^2 for scikit-learn regressors), not an estimate of how well the
# model generalises.
m = RandomForestRegressor(n_jobs=-1)
m.fit(df, y)
m.score(df, y)

# Split into Training and Validation Datasets

In [None]:
def split_items(x, n):
    """Split `x` at position `n` and return independent copies of both parts.

    Returns a tuple `(x[:n], x[n:])`, each produced with `.copy()` so that changes
    to either part do not affect `x`.
    """
    head = x[:n].copy()
    tail = x[n:].copy()
    return head, tail

In [None]:
# Hold out the last 12000 rows for validation; everything before them is training data.
n_validation = 12000
n_train = len(df) - n_validation

# Features and target are cut at the same position so their rows stay aligned.
(X_train, X_validation), (y_train, y_validation) = (
    split_items(df, n_train),
    split_items(y, n_train),
)

# Display the shapes of the four pieces.
X_train.shape, y_train.shape, X_validation.shape, y_validation.shape

In [None]:
def rmse(x, y):
    """Return the root mean squared error between `x` and `y`.

    The operands must support element-wise subtraction, `** 2` and `.mean()`
    (for example NumPy arrays).
    """
    squared_error = (x - y) ** 2
    return math.sqrt(squared_error.mean())

def print_score(m):
    """Print training/validation RMSE and score for the fitted model `m`.

    Reads the notebook-level `X_train`, `y_train`, `X_validation` and `y_validation`.
    The printed list holds, in order: training RMSE, validation RMSE, training score,
    validation score and — only when `m` has an `oob_score_` attribute — that value.
    """
    train_rmse = rmse(m.predict(X_train), y_train)
    validation_rmse = rmse(m.predict(X_validation), y_validation)
    train_score = m.score(X_train, y_train)
    validation_score = m.score(X_validation, y_validation)

    result = [train_rmse, validation_rmse, train_score, validation_score]
    if hasattr(m, 'oob_score_'):
        result += [m.oob_score_]

    print(result)

In [None]:
# Same default forest, but fitted on the training split only, so print_score can
# compare training metrics with metrics on the held-out validation rows.
m = RandomForestRegressor(n_jobs=-1)
%time m.fit(X_train, y_train)
print_score(m)

# Speeding Things Up!

Work with a smaller subset of the data so that iterative experiments run quickly.

In [None]:
# Re-process the raw data, asking proc_df for a 30000-row subset and passing the
# `nas` returned by the earlier full proc_df call as `na_dict`.
# NOTE(review): how `subset` picks its rows is decided inside fastai's proc_df — confirm there.
df_subset, y_subset, nas = proc_df(df_raw, 'SalePrice', subset=30000, na_dict=nas)

# NOTE: We are not touching the validation dataset to make effective comparisons between models.
# Only the first 20000 processed rows become the new training set; the rest is discarded.
X_train, _ = split_items(df_subset, 20000)
y_train, _ = split_items(y_subset, 20000)

In [None]:
# Default forest again, now fitted on the 20000-row training subset; %time reports
# how long the fit takes.
m = RandomForestRegressor(n_jobs=-1)
%time m.fit(X_train, y_train)
print_score(m)

# What does a tree look like?

In [None]:
# A "forest" of exactly one tree (n_estimators=1), limited to depth 3 and with
# bootstrap sampling switched off, so the single tree is small enough to draw.
m = RandomForestRegressor(n_jobs=-1, n_estimators=1, bootstrap=False, max_depth=3)
%time m.fit(X_train, y_train)
print_score(m)

In [None]:
# Drawing the first and only tree in the RandomForest.
# NOTE(review): df_subset is passed alongside the tree — presumably draw_tree uses it
# for the feature names; confirm in fastai.structured.
draw_tree(m.estimators_[0], df_subset, precision=3)

In [None]:
# Creating a bigger tree.
# Still one tree without bootstrap, but max_depth is no longer passed, so the depth
# is not capped at 3 as in the previous model.
m = RandomForestRegressor(n_estimators=1, n_jobs=-1, bootstrap=False)
%time m.fit(X_train, y_train)
print_score(m)

# Bagging

Using multiple trees!

In [None]:
# Back to a forest with the default number of trees (n_estimators is not passed);
# its individual trees are inspected in the next cell.
m = RandomForestRegressor(n_jobs=-1)
%time m.fit(X_train, y_train)
print_score(m)

In [None]:
# One row of predictions per tree in the forest, one column per validation sample.
predictions = np.stack([tree.predict(X_validation) for tree in m.estimators_])

# Compare what each tree says about the first validation sample with their average
# and with the true value.
print(f'Predictions for first sample from all trees: {predictions[:, 0]}')
print(f'Mean of predictions for first sample: {predictions[:, 0].mean()}')
print(f'Actual value: {y_validation[0]}')

In [None]:
# Validation R^2 as more and more trees are averaged: point i uses the mean
# prediction of the first i+1 trees.
# Iterate over every fitted tree (one row of `predictions` each) instead of a
# hard-coded 10: the default `n_estimators` depends on the scikit-learn version, so a
# fixed range can silently plot only part of the forest.
plt.plot([
    metrics.r2_score(
        y_validation,
        np.mean(predictions[:i+1], axis=0)
    )
    for i in range(len(predictions))
]);

In [None]:
# Same forest with 20 trees, to see how the scores change with the number of trees.
m = RandomForestRegressor(n_jobs=-1, n_estimators=20)
%time m.fit(X_train, y_train)
print_score(m)

In [None]:
# Same forest with 40 trees.
m = RandomForestRegressor(n_jobs=-1, n_estimators=40)
%time m.fit(X_train, y_train)
print_score(m)

In [None]:
# Same forest with 80 trees.
m = RandomForestRegressor(n_jobs=-1, n_estimators=80)
%time m.fit(X_train, y_train)
print_score(m)

In [None]:
# Same forest with 160 trees.
m = RandomForestRegressor(n_jobs=-1, n_estimators=160)
%time m.fit(X_train, y_train)
print_score(m)

# Out-of-bag Score

In [None]:
# oob_score=True asks the forest for an out-of-bag score; print_score appends
# `m.oob_score_` to its output whenever the model has that attribute.
m = RandomForestRegressor(n_jobs=-1, n_estimators=40, oob_score=True)
%time m.fit(X_train, y_train)
print_score(m)

# Subsampling Using all of the Dataset

In [None]:
# Process the whole raw dataset again (no `subset` argument this time). Note that the
# names `df_subset` / `y_subset` are reused even though they now hold all rows, and
# that `nas` is rebound to the value returned here.
df_subset, y_subset, nas = proc_df(df_raw, 'SalePrice')

# Rebuild the train/validation split at the same `n_train` cut as before. Unlike the
# subset cell earlier, this cell DOES reassign X_validation and y_validation.
X_train, X_validation = split_items(df_subset, n_train)
y_train, y_validation = split_items(y_subset, n_train)

In [None]:
# Instead of a subset, create a random sample out of the entire dataset.
# NOTE(review): presumably this makes each tree train on its own random sample of
# 20000 rows and stays in effect until reset_rf_samples() is called — confirm against
# fastai.structured.set_rf_samples.
set_rf_samples(20000)

In [None]:
# 40 trees with the OOB score, fitted on the full training split after
# set_rf_samples(20000) was called.
m = RandomForestRegressor(n_jobs=-1, n_estimators=40, oob_score=True)
%time m.fit(X_train, y_train)
print_score(m)

In [None]:
# Same setup with 80 trees.
m = RandomForestRegressor(n_jobs=-1, n_estimators=80, oob_score=True)
%time m.fit(X_train, y_train)
print_score(m)

In [None]:
# Undo the earlier set_rf_samples(20000) call before fitting the remaining models.
# NOTE(review): presumably this restores scikit-learn's default per-tree sampling —
# confirm against fastai.structured.reset_rf_samples.
reset_rf_samples()

In [None]:
# Requiring some minimum number of rows in every leaf node.
# Here the minimum is 3 (min_samples_leaf=3); the other settings match the earlier
# 40-tree OOB model.
m = RandomForestRegressor(n_jobs=-1, n_estimators=40, oob_score=True, min_samples_leaf=3)
%time m.fit(X_train, y_train)
print_score(m)

In [None]:
# Consider only a random half of the columns (max_features=0.5) at each split,
# on top of the min_samples_leaf=3 setting from the previous model.
m = RandomForestRegressor(n_jobs=-1, n_estimators=40, oob_score=True, min_samples_leaf=3, max_features=0.5)
%time m.fit(X_train, y_train)
print_score(m)

# Test Dataset

Get predictions from the trained model on the test dataset for submitting to Kaggle.

In [None]:
# Load the test set, parsing 'saledate' as a date just like the training file.
test_df = pd.read_csv(f'{PATH}/Test.csv', low_memory=False, parse_dates=['saledate'])

In [None]:
# Prepare dataset for machine learning, mirroring the training pipeline.
add_datepart(test_df, 'saledate')

# Reuse the categories learned on the training data instead of calling
# train_cats(test_df): building categories from the test set alone can assign
# different integer codes to the same level, which silently corrupts the features.
apply_cats(test_df, df_raw)

# df_raw.UsageBand has already been replaced by integer codes, so it is no longer a
# categorical column that apply_cats can copy from; give the test column the same
# explicit ordering by hand. The result is assigned back rather than using the
# deprecated `inplace=True`.
test_df.UsageBand = (
    test_df.UsageBand.astype('category')
    .cat.set_categories(['High', 'Medium', 'Low'], ordered=True)
    .cat.codes
)

# Pass the training `nas` as `na_dict`, as the earlier subset cell does, so the test
# frame is processed against the same missing-value bookkeeping as the training data.
# The other two return values are discarded so the notebook-level `y` and `nas` from
# training are not overwritten.
test_df, _, _ = proc_df(test_df, na_dict=nas)

In [None]:
# Columns the model was trained on that the processed test set does not have.
missing_columns = set(df.columns) - set(test_df.columns)
print(missing_columns)

# A `<column>_na` indicator can be absent from the test frame when it was only created
# for the training data; nothing is flagged as missing for it here, so add it as
# all-False. Any other missing column is left alone and makes the reindexing below
# fail loudly instead of being papered over.
for column in sorted(missing_columns):
    if column.endswith('_na'):
        test_df[column] = False

# Put the columns in exactly the training order. Appending a column places it last,
# which is not where it sits in the training frame; the model consumes features by
# position (and newer scikit-learn versions complain about mismatched feature names),
# so the order must match.
test_df = test_df[df.columns]

print(set(df.columns) - set(test_df.columns))

In [None]:
# Get predictions on processed test dataset.
# `m` is whichever model was fitted last above. It was trained on log(SalePrice), so
# these predictions are on the log scale as well.
predictions = m.predict(test_df)

In [None]:
submission = pd.DataFrame({'SalesID': test_df.SalesID, 'SalePrice': predictions})
submission.to_csv('submission.csv', index=False)