## Import Libraries

In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn import model_selection
from sklearn import metrics
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import cross_val_score

%matplotlib inline
plt.rcParams['figure.figsize'] = (16, 8)
pd.set_option('display.max_columns', 100)

## Read the data

In [2]:
train = './data/train.csv'
test = './data/test.csv'

df = pd.read_csv(train)
df_test = pd.read_csv(test)
df.sample(n=5)

test_id = df_test.Id

## Data manipulation

In [3]:
df['Garage Type'] = df['Garage Type'].fillna('None')
df_test['Garage Type'] = df_test['Garage Type'].fillna('None')

df['Garage Cars'] = df['Garage Cars'].fillna(0)
df_test['Garage Cars'] = df_test['Garage Cars'].fillna(0)

In [4]:
missing_data = df.isnull().sum()[df.isnull().sum() > 0]
missing_data = list(missing_data.index)
missing_data

df.drop(missing_data, axis=1, inplace=True)
df_test.drop(missing_data, axis=1, inplace=True)
print(df.shape)
print(df_test.shape)

(2051, 57)
(879, 56)


## Feature engineering

In [5]:
garage_dummies = pd.get_dummies(df['Garage Type'], prefix='Garage')
garage_dummies_test = pd.get_dummies(df_test['Garage Type'], prefix='Garage')

In [6]:
df['MS Zoning'] = df['MS Zoning'].str.replace('R.', 'R', regex=True)
df_test['MS Zoning'] = df_test['MS Zoning'].str.replace('R.', 'R', regex=True)
# All residentials as a single variable

zoning_dummies = pd.get_dummies(df['MS Zoning'], prefix='Zoning')
zoning_dummies_test = pd.get_dummies(df_test['MS Zoning'], prefix='Zoning')

zoning_dummies = zoning_dummies[['Zoning_R']]
zoning_dummies_test = zoning_dummies_test[['Zoning_R']]

In [7]:
h_style_dummies = pd.get_dummies(df['House Style'], prefix='Style')
h_style_dummies_test = pd.get_dummies(df_test['House Style'], prefix='Style')

h_style_dummies = h_style_dummies[['Style_1Story', 'Style_2Story']] # Just the 2 most common
h_style_dummies_test = h_style_dummies_test[['Style_1Story', 'Style_2Story']] # Just the 2 most common

In [8]:
sale_type_dummies = pd.get_dummies(df['Sale Type'], prefix='Sale')
sale_type_dummies_test = pd.get_dummies(df_test['Sale Type'], prefix='Sale')

sale_type_dummies = sale_type_dummies[['Sale_New', 'Sale_WD ']] # Looking for new or WD
sale_type_dummies_test = sale_type_dummies_test[['Sale_New', 'Sale_WD ']] # Looking for new or WD

In [9]:
dummies = pd.concat([garage_dummies, zoning_dummies, h_style_dummies, sale_type_dummies], axis=1)
dummies_test = pd.concat([garage_dummies_test, zoning_dummies_test, h_style_dummies_test, sale_type_dummies_test], axis=1)

## Feature and predictor selection

In [10]:
feature_cols = ['Overall Qual', 'Year Built', 'Gr Liv Area', 'Full Bath', 'Garage Cars', 'Lot Area']
X = df[feature_cols]
X = pd.concat([X, dummies], axis = 1)
print(X.shape)

X_df_test = df_test[feature_cols]
X_df_test = pd.concat([X_df_test, dummies_test], axis = 1)
print(X_df_test.shape)

(2051, 18)
(879, 18)


In [11]:
y = df['SalePrice']

## Train test split

In [12]:
X_train, X_test, y_train, y_test = train_test_split(X,y, random_state = 99)

## Model fitting

In [20]:
rfr = RandomForestRegressor(oob_score=True)
rfr.fit(X_train, y_train)

  warn("Some inputs do not have OOB scores. "


RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=None,
           max_features='auto', max_leaf_nodes=None,
           min_impurity_decrease=0.0, min_impurity_split=None,
           min_samples_leaf=1, min_samples_split=2,
           min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=None,
           oob_score=True, random_state=None, verbose=0, warm_start=False)

## Model prediction

In [21]:
y_pred = rfr.predict(X_test)

## Cross Validation

In [22]:
kf = model_selection.KFold(n_splits=5, shuffle=True)

print(np.mean(-cross_val_score(rfr, X, y, cv=kf, scoring='neg_mean_squared_error')))
print(np.mean(cross_val_score(rfr, X, y, cv=kf)))

  warn("Some inputs do not have OOB scores. "
  warn("Some inputs do not have OOB scores. "
  warn("Some inputs do not have OOB scores. "
  warn("Some inputs do not have OOB scores. "
  warn("Some inputs do not have OOB scores. "
  warn("Some inputs do not have OOB scores. "
  warn("Some inputs do not have OOB scores. "
  warn("Some inputs do not have OOB scores. "


995843715.2916504
0.8502645070887098


  warn("Some inputs do not have OOB scores. "
  warn("Some inputs do not have OOB scores. "


## Predict with test data

In [23]:
y_pred = rfr.predict(X_df_test)

prediction = pd.concat([test_id, pd.Series(y_pred)], axis = 1)
prediction.columns = ['Id', 'SalePrice']

prediction.sample(5)

Unnamed: 0,Id,SalePrice
237,2647,104285.0
404,438,279630.0
718,574,251430.0
60,1103,296100.0
726,2251,179900.0


In [24]:
prediction.to_csv("Emilio_Try4_RandomForestOOB.csv", index=False)