# **Blue Book for Bulldozers - Model Bulding**

In [32]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

%matplotlib inline
# plt.style.use('fivethirtyeight')
pd.set_option('display.max_columns', 80)
pd.set_option('display.max_rows', 100)
pd.set_option('display.float', "{:.4f}".format)

## Upload Train and Test Data

In [33]:
train = pd.read_csv('data/TrainAndValid.csv', low_memory=False)
test = pd.read_csv('data/test.csv', low_memory=False)

## Convert `saledate` into `datetime` dtype

In [35]:
train['saledate'] = pd.to_datetime(train.saledate)
test['saledate'] = pd.to_datetime(test.saledate)

train['SaleYear'] = train.saledate.dt.year
test['SaleYear'] = test.saledate.dt.year

## Set `saledate` as index, and sort data based on it

In [36]:
train.set_index('saledate', inplace=True)
train.sort_index(inplace=True)
train.index.name = None

test.set_index('saledate', inplace=True)
test.sort_index(inplace=True)
test.index.name = None

## Change the dtype object into category

In [37]:
# This will convert object type into category
for label, content in train.items():
    if pd.api.types.is_string_dtype(content):
        train[label] = content.astype("category").cat.as_ordered()
        
for label, content in test.items():
    if pd.api.types.is_string_dtype(content):
        test[label] = content.astype("category").cat.as_ordered()

## Filling numeric missing values

In [38]:
for label, content in train.items():
    if pd.api.types.is_numeric_dtype(content) and pd.isnull(content).sum():
        # Add a column that shows if the data was missing
        train[label + '_missing'] = pd.isnull(content)
        train[label].fillna(content.median(), inplace=True)
        
for label, content in test.items():
    if pd.api.types.is_numeric_dtype(content) and pd.isnull(content).sum():
        # Add a column that shows if the data was missing
        test[label + '_missing'] = pd.isnull(content)
        test[label].fillna(content.median(), inplace=True)

## Filling categorical missing values

In [39]:
for label, content in train.items():
    if not pd.api.types.is_numeric_dtype(content):
        train[label + '_missing'] = pd.isnull(content)
        train[label] = pd.Categorical(content).codes + 1

for label, content in test.items():
    if not pd.api.types.is_numeric_dtype(content):
        test[label + '_missing'] = pd.isnull(content)
        test[label] = pd.Categorical(content).codes + 1

In [40]:
print(train.isna().sum().sum())
print(test.isna().sum().sum())

0
0


## Model Building

In [41]:
from sklearn.metrics import mean_squared_log_error, mean_absolute_error

def evaluate(model, X, y):
    y_pred = model.predict(X)
    rmsle = np.sqrt(mean_squared_log_error(y, y_pred))
    mae = mean_absolute_error(y, y_pred)
    print("===============EVALUATION SCORE=================")
    print(f"ROOT MEAN SQUARE LOG ERROR: {rmsle:.4f}")
    print(f"MEAN ABSOLUTE ERROR: {mae:.4f}")

In [42]:
from sklearn.ensemble import RandomForestRegressor

X = train.drop('SalePrice', axis=1)
y = train.SalePrice

rf_model = RandomForestRegressor(n_estimators=100, n_jobs=-1)

rf_model.fit(X, y)

RandomForestRegressor(bootstrap=True, ccp_alpha=0.0, criterion='mse',
                      max_depth=None, max_features='auto', max_leaf_nodes=None,
                      max_samples=None, min_impurity_decrease=0.0,
                      min_impurity_split=None, min_samples_leaf=1,
                      min_samples_split=2, min_weight_fraction_leaf=0.0,
                      n_estimators=100, n_jobs=-1, oob_score=False,
                      random_state=None, verbose=0, warm_start=False)

In [43]:
evaluate(rf_model, X, y)

ROOT MEAN SQUARE LOG ERROR: 0.0856
MEAN ABSOLUTE ERROR: 1644.2569


## Train Test Split

In [44]:
train_data = train[train.SaleYear < 2012]
validation_data = train[train.SaleYear >= 2012]

print(train_data.shape)
print(validation_data.shape)

(401125, 99)
(11573, 99)


In [46]:
X_train = train_data.drop('SalePrice', axis=1)
y_train = train_data.SalePrice
X_validation = validation_data.drop('SalePrice', axis=1)
y_validation = validation_data.SalePrice

In [47]:
rf_model = RandomForestRegressor(n_estimators=100, oob_score=True)
rf_model.fit(X_train, y_train)

RandomForestRegressor(bootstrap=True, ccp_alpha=0.0, criterion='mse',
                      max_depth=None, max_features='auto', max_leaf_nodes=None,
                      max_samples=None, min_impurity_decrease=0.0,
                      min_impurity_split=None, min_samples_leaf=1,
                      min_samples_split=2, min_weight_fraction_leaf=0.0,
                      n_estimators=100, n_jobs=None, oob_score=True,
                      random_state=None, verbose=0, warm_start=False)

In [49]:
evaluate(rf_model, X_train, y_train)
evaluate(rf_model, X_validation, y_validation)

ROOT MEAN SQUARE LOG ERROR: 0.0857
MEAN ABSOLUTE ERROR: 1639.2209
ROOT MEAN SQUARE LOG ERROR: 0.2501
MEAN ABSOLUTE ERROR: 6056.7503


In [50]:
y_pred = rf_model.predict(test)

ValueError: Number of features of the model must match the input. Model n_features is 98 and input n_features is 97 

In [51]:
set(train.columns).difference(set(test.columns))

{'SalePrice', 'auctioneerID_missing'}

In [52]:
test['auctioneerID_missing'] = False

In [53]:
y_pred = rf_model.predict(test)