# Using Random Forest Model to Predict House Prices in Ames, Iowa
### In this notebook, we are through the basics of using the unsupervised random forest algorithm on the Ames Housing Dataset. 


### Load libraries

In [18]:
import pandas as pd
import psutil
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
import math
from scipy.stats import norm, skew
from sklearn.preprocessing import StandardScaler, RobustScaler
from scipy import stats
from math import ceil
from math import sqrt
from sklearn.metrics import mean_squared_error, make_scorer, mean_absolute_error, mean_squared_log_error
from sklearn.model_selection import KFold, cross_val_score, StratifiedKFold
from sklearn.model_selection import GridSearchCV
import statistics
import sklearn.model_selection as ms
from scipy.stats import boxcox
from scipy.special import boxcox1p
from scipy.stats import boxcox_normmax
from sklearn.linear_model import ElasticNetCV, LassoCV, Lasso, ElasticNet
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.svm import SVR
from sklearn.pipeline import make_pipeline
from sklearn.model_selection import train_test_split
from sklearn import linear_model
import warnings
warnings.filterwarnings('ignore')
%matplotlib inline

### Import the train and test dataset

In [2]:

train = pd.read_csv('train6.csv')
test = pd.read_csv('test6.csv')

### Assigning our Variables 

In [3]:
x = pd.DataFrame(train.iloc[:,:-1])
x = pd.DataFrame(train.iloc[:,1:])
x = x.drop(['ylogSalePrice'], axis = 1)
y = (train.iloc[:,-1])


### Train Test Split 

In [4]:

x_train, x_test, y_train, y_test = train_test_split(x,y,test_size= 0.33, random_state = 66)

### Cross Validation  

In [5]:

kf = KFold(n_splits=10, shuffle=True, random_state=99)

### Error Metrics

In [6]:

def rmsle(y, y_pred):
    return np.sqrt(mean_squared_error(y, y_pred))

def cv_rmse(model, x = x):
    rmse = np.sqrt(-cross_val_score(model, x_train, y_train, scoring = 'neg_mean_squared_error', cv = kf))
    return (rmse)


### Train the Model 

In [7]:
rforest = RandomForestRegressor()
rforest.set_params(random_state = 30)

RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=None,
                      max_features='auto', max_leaf_nodes=None,
                      min_impurity_decrease=0.0, min_impurity_split=None,
                      min_samples_leaf=1, min_samples_split=2,
                      min_weight_fraction_leaf=0.0, n_estimators='warn',
                      n_jobs=None, oob_score=False, random_state=30, verbose=0,
                      warm_start=False)

In [8]:
rforest.fit(x,y)

RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=None,
                      max_features='auto', max_leaf_nodes=None,
                      min_impurity_decrease=0.0, min_impurity_split=None,
                      min_samples_leaf=1, min_samples_split=2,
                      min_weight_fraction_leaf=0.0, n_estimators=10,
                      n_jobs=None, oob_score=False, random_state=30, verbose=0,
                      warm_start=False)

The R-Squared for both the test and train. 

In [9]:
print('The train set R^2 is : %5f' % rforest.score(x_train, y_train))
print('The train set R^2 is : %5f' % rforest.score(x_test, y_test))


The train set R^2 is : 0.979981
The train set R^2 is : 0.967143


### Predict test data  

In [12]:
# Find the prediction 
rforest_pred = rforest.predict(x_test)

Calculating Errors
Given that we are interested in how far away our average prediction is from the actual value, we take the absolute value.

In [21]:


def printErrors(Yt, predMod):
    """
    1. Yt = the trained predicted values
    2. predMod = the prediction from the fitted model
    It thens prints the errors for the model ie MAE, MSE, RMSLE
    """
    
    errors = abs(np.expm1(predMod) - np.expm1(Yt))
    print('Errors')
    print("-" * 50)
    print('Mean Absolute Error (MAE): $', round(np.mean(errors), 2))
    print("-" * 50)
    print('Mean Squared Error (MSE):', mean_squared_error(Yt, predMod))
    print("-" * 50)
    print('Root Mean Square Error (RMSE):', np.sqrt(mean_squared_error(Yt, predMod)))
    print("-" * 50)
    print('Root Mean Square Log Error (RMSLE):', np.sqrt(mean_squared_log_error(Yt, predMod)))
    print("-" * 50)
    mape = 100 * (errors / np.expm1(Yt))
    print('Mean Absolute Percent Error (MAPE):', round(np.mean(mape), 2), '%.')
    print("-" * 50)
    accuracy = 100 - np.mean(mape)
    print('Accuracy')
    print("-" * 50)
    print('Accuracy:', round( accuracy,2),'%.' )

printErrors(y_test, rforest_pred)

Mean Absolute Error (MAE): $ 6915.75
Mean Squared Error (MSE): 0.0046791561051761415
Root Mean Square Error (RMSE): 0.06840435735518711
Root Mean Square Log Error (RMSLE): 0.0054715443822674365
Mean Absolute Percent Error (MAPE): 4.37 %.
Accuracy: 95.63 %.


### Hyperparameter Tuning : Grid Search 

In [22]:

param_grid ={'max_depth': [20, 25, 30],
             'max_features': ['auto','sqrt','log2'],
             'min_samples_split':[2,3,4],
             'min_samples_leaf':[1, 3, 5],
             'n_estimators': [500, 750, 1000, 1250, 1500]}



In [None]:
grid_search_rforest = ms.GridSearchCV(rforest, param_grid, scoring='neg_mean_squared_error', 
                                     cv= kf, n_jobs=-1, return_train_score = True)

%time grid_search_rforest.fit(x, y)



In [None]:
grid_search_rforest.best_params_

R - Squared

In [None]:
print("The train set R^2 is: %.5f" % grid_rforest.score(X_train, y_train))
print("The test set R^2 is is: %.5f" % grid_rforest.score(X_test, y_test))


In [None]:
printErrors(y_test, grid_rforest.predict)

In [None]:
score = rmse_cv(grid_rforest)
print("\nRandom Forest score: {:.4f} ({:.4f})\n".format(score.mean(), score.std()))


In [None]:
y_train_rForest = grid_rforest.predict(X_train)
y_test_rForest = grid_rforest.predict(X_test)

# Plot predictions
plt.figure(figsize=(12,8))
plt.scatter(np.expm1(y_train_rForest), np.expm1(y_train), c='black', marker="o", s=15, label = "Training data")
plt.scatter(np.expm1(y_test_rForest), np.expm1(y_test), c='orange', marker='o', s=15, label = "Validation data")
plt.title("Random Forest", fontsize = 20)
plt.xlabel("Predicted Prices", fontsize = 16)
plt.ylabel("Actual Prices", fontsize = 16)
plt.xlim(0, 800000)
plt.ylim(0, 800000)
plt.legend(loc = "upper left")
plt.plot([0, 800000], [0, 800000], c = "grey")
plt.show()


In [None]:
feats_rf = list(rforest.feature_importances_)

# List of tuples with variable and importance
feats_rf_score = [(feature, round(importance, 5)) for feature, importance in zip(X_train.columns, feats_rf)]

# Sort the feature importances by most important first
sorted_feats_rf_score = sorted(feats_rf_score, key = lambda x: x[1], reverse = True )

# Print out the feature and importances 
[print('Variable: {:20} Importance: {}'.format(*pair)) for pair in sorted_feats_rf_score]



In [None]:
rf_feature_importances_top20 = sorted_feats_rf_score[:20]
featureNames, featureScores = zip(*list(rf_feature_importances_top20))

plt.barh(range(len(featureScores)), featureScores, tick_label=featureNames)
plt.gca().invert_yaxis()
plt.title('feature importance')
plt.ylabel('Features')
plt.xlabel('Importance Score')
plt.title('Feature Importances')



In [None]:
feature_importance = 100.0 * (grid_rforest.feature_importances_ / grid_rforest.feature_importances_.max())
important_features = X_train.columns[feature_importance >= 0]
unimportant_features = X_train.columns[feature_importance < 0]


In [None]:
X_train_reduced = X_train.drop(unimportant_features, axis=1)
X_test_reduced = X_test.drop(unimportant_features, axis=1)


In [None]:
# GridSearchCV for multiple hyperparameters:
rForest_feats = RandomForestRegressor()

param_grid ={'max_depth': [10, 20, 30, 40, 50],
             'max_features': ['auto','sqrt','log2'],
             'min_samples_split':[2,3,4],
             'n_estimators': [100, 300, 500, 800]}

rForest_feats.set_params(random_state=42)

grid_search_rForest2 = GridSearchCV(rForest_feats, param_grid, scoring= 'neg_mean_squared_error',
                           cv= kf, n_jobs = -1, return_train_score=True, verbose = 1)
grid_search_rForest2.fit(X_train_reduced, y_train)



In [None]:
grid_search_rForest2.best_params_


In [None]:
# Use the forest's predict method on the test data
predictions_tuned_rForest2 = grid_search_rForest2.best_estimator_.predict(X_test_reduced)



In [None]:
printErrors(y_test, predictions_tuned_rForest2)

In [None]:
test_rForest = grid_rforest.predict(test)
test_predictions_tuned_rForest2 = grid_search_rForest2.best_estimator_.predict(test)


In [None]:
Ids = np.arange(1461, 2920, 1)


In [None]:
test_rForest = np.exp(test_rForest) - 1

In [None]:
pred = pd.DataFrame({'Id': Ids,'SalePrice': test_rForest})

In [None]:
pred.to_csv('predication1.csv', index = False)