<a href="https://colab.research.google.com/github/dheerajrathee/Bike-Share-Data-Udacity/blob/master/RandomForest_IADS2019.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [0]:
# from google.colab import drive
# # drive.mount('/gdrive')
# %cd /gdrive/My Drive/IADS-2019-Tree-Codes

# **Contents**

### 1. Implementation of Random Forest for Classification
### 2. Implementation of Random Forest for Regression






# 1. Classification


## Dataset for Classification




> **Dataset:**  [Iris dataset](https://scikit-learn.org/stable/datasets/index.html#iris-dataset).



*   **Number of Instances:** 
    *   150 (50 in each of three classes)
*   **Number of Attributes:**
    *   4 numeric, predictive attributes and the class

*   **Attribute Information:**
    *   sepal length in cm
    *   sepal width in cm
    *   petal length in cm
    *   petal width in cm

*   **Classes:**
    *   Setosa (0)
    *   Versicolour (1)
    *   Virginica (2)
    






In [0]:
# Add libraries
from sklearn import datasets  # DATA
from sklearn.model_selection import train_test_split # to Split Train-Test data
from sklearn import ensemble # To get Decision Tree 
from sklearn import metrics # To generate evaluation metrices
from sklearn.model_selection import cross_val_predict
from sklearn.model_selection import cross_val_score


from sklearn.tree import export_graphviz # exporting the tree structure as dot file
from pydotplus.graphviz import graph_from_dot_data # export png image from dot file
from IPython.display import Image, SVG # Show the image within colab notebook
from graphviz import Source
import matplotlib.pyplot as plt

import pandas as pd # for basic data manipulations 
import numpy as np



### 1. Load Data

In [0]:
# Load the Iris dataset that ships with scikit-learn and bind it to `iris`.
iris = datasets.load_iris()
# Last expression of the cell: show the attribute names available on the
# returned object (later cells read .data, .target and .feature_names).
dir(iris)

### 2. Explore Data


In [0]:
# Report the container type of the features and the labels first,
# then the dimensions of each, in the same order.
for part in (iris.data, iris.target):
    print(type(part))

for part in (iris.data, iris.target):
    print(part.shape)

### 3. Create a pandas DataFrame and do data manipulations

In [0]:
# Wrap the raw feature matrix in a DataFrame so every column carries
# its measurement name, then preview the first rows.
dfCls = pd.DataFrame(data=iris.data, columns=iris.feature_names)
dfCls.head()

In [0]:
# Attach the class labels as a 'target' column of the pandas DataFrame.
# `assign` yields a new frame, so re-running this cell gives the same result.
dfCls = dfCls.assign(target=iris.target)
dfCls.head()

### 4. Split the data for Training and Testing

In [0]:
# Hold out 20% of the rows for testing. The split is stratified on the class
# label and seeded so that it is reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    dfCls.drop(columns=['target']),
    iris.target,
    test_size=0.2,
    random_state=0,
    stratify=iris.target,
)
for split in (X_train, X_test):
    print(split.shape)

### 5. Initialise a Random Forest Classifier

In [0]:
# Random forest of 100 unpruned trees using Gini impurity, seeded for
# reproducibility. Every argument is spelled out for teaching purposes.
#
# Two arguments differ from the historical form of this call so that the cell
# runs on old and current scikit-learn releases alike, without changing the model:
#   * max_features='sqrt' is what 'auto' meant for classifiers; 'auto' was
#     deprecated and subsequently removed.
#   * min_impurity_split is no longer passed; it was only ever set to None here
#     and the parameter has been removed from scikit-learn.
rfClassifier = ensemble.RandomForestClassifier(
    n_estimators=100,
    criterion='gini',
    max_depth=None,
    min_samples_split=2,
    min_samples_leaf=1,
    min_weight_fraction_leaf=0.0,
    max_features='sqrt',
    max_leaf_nodes=None,
    min_impurity_decrease=0.0,
    bootstrap=True,
    oob_score=False,
    n_jobs=None,
    random_state=2,
    verbose=0,
    warm_start=False,
    class_weight=None,
)



> ***Let's dig into*** **[ensemble.RandomForestClassifier](https://scikit-learn.org/stable/modules/generated/sklearn.ensemble.RandomForestClassifier.html)**



### 6. Model Evaluation on Train data

In [0]:
# 10-fold cross-validation restricted to the training split:
#   - out-of-fold predictions feed the confusion matrix and the report
#   - per-fold scores are averaged into a single headline number
species = ['Setosa', 'Versicolor', 'Virginica']

CV_predicted = cross_val_predict(rfClassifier, X_train, y_train, cv=10)
CV_score = cross_val_score(rfClassifier, X_train, y_train, cv=10)

cv_confusion = metrics.confusion_matrix(y_train, CV_predicted)
cv_report = metrics.classification_report(y_train, CV_predicted, target_names=species)

print("Cross validation Score on train data: ", CV_score.mean())
print("\n")

print("Confusion matrix on CV predictions (train data)")
print(cv_confusion)
print("\n")

print("Classification report CV predictions (train data)")
print(cv_report)


### 7. Let's fit the RF model on Training data and perform prediction with the Test data 

In [0]:
# Train the forest on the full training split and keep the value returned by
# fit() as `rfClassMdl`.
# NOTE(review): scikit-learn estimators conventionally return themselves from
# fit(), so rfClassMdl and rfClassifier presumably name the same object — confirm.
rfClassMdl = rfClassifier.fit(X_train,y_train)

# Predicted class labels for the held-out test rows; evaluated in the next section.
y_predicted = rfClassMdl.predict(X_test)

### 8. Model Evaluation on Test Data

In [0]:
# Evaluate the fitted forest on the held-out test split: overall score,
# confusion matrix and per-class report. Everything is computed first and
# printed afterwards.
species = ['Setosa', 'Versicolor', 'Virginica']

mdl_score = rfClassMdl.score(X_test, y_test)
test_confusion = metrics.confusion_matrix(y_test, y_predicted)
test_report = metrics.classification_report(y_test, y_predicted, target_names=species)

print("Model Score on test data:", mdl_score)
print("\n")

print("Confusion matrix (test data)")
print(test_confusion)
print("\n")

print("Classification report (test data)")
print(test_report)

### 9. Plot the trained RF structure for visualisation

In [0]:
# Find the member of the forest with the fewest nodes; it is the easiest one
# to draw. When several trees share the minimum, the earliest one is chosen.
node_counts = [member.tree_.node_count for member in rfClassMdl.estimators_]
size = min(node_counts)
index = node_counts.index(size)
print(f'The smallest tree has {size} nodes! with {index} index!')

In [0]:
# Render the smallest tree of the forest as an inline SVG.
# The column names are passed as a plain list of strings rather than a pandas
# Index: a list is accepted by every scikit-learn release.
# NOTE(review): some releases' parameter validation is believed to reject a
# pandas Index here — confirm against the pinned scikit-learn version.
dot_source = export_graphviz(
    rfClassMdl.estimators_[index],
    out_file=None,  # return the DOT text instead of writing a file
    feature_names=list(X_train.columns),
    class_names=['Setosa', 'Versicolor', 'Virginica'],
    filled=True,
)
graph = Source(dot_source)
SVG(graph.pipe(format='svg'))

In [0]:
# Score the single smallest tree on the test split, for comparison with the
# score of the whole forest.
smallest_tree = rfClassMdl.estimators_[index]

# The bare feature array is passed instead of the DataFrame so that recent
# scikit-learn versions do not warn about feature names the tree was
# "fitted without". The score itself is unchanged.
# NOTE(review): assumes the forest fits its member trees on plain arrays — confirm.
smallest_tree.score(X_test.values, y_test)

In [0]:
from collections import OrderedDict
from sklearn.datasets import make_classification

# Synthetic binary classification problem: 500 samples, 25 features of which
# 15 are informative, one cluster per class.
X, y = make_classification(
    n_samples=500,
    n_features=25,
    n_informative=15,
    n_clusters_per_class=1,
    random_state=18,
)

# One forest per `max_features` strategy. warm_start=True lets each forest be
# refitted with a larger `n_estimators` in the loop below, and oob_score=True
# exposes the out-of-bag score after each fit.
clfs_cluster = [
    (
        f"RandomForestClassifier, max_features={strategy!r}",
        ensemble.RandomForestClassifier(
            n_estimators=100,
            warm_start=True,
            oob_score=True,
            max_features=strategy,
            random_state=18,
        ),
    )
    for strategy in ("sqrt", "log2", None)
]

# Range of `n_estimators` values to explore (both ends inclusive).
min_estimators = 20
max_estimators = 180

# Map each classifier label to its list of (<n_estimators>, <OOB error rate>) pairs.
error_rate = OrderedDict()
for label, clf in clfs_cluster:
    curve = []
    for n_trees in range(min_estimators, max_estimators + 1):
        clf.set_params(n_estimators=n_trees)
        clf.fit(X, y)
        # OOB error is the complement of the OOB score for this forest size.
        curve.append((n_trees, 1 - clf.oob_score_))
    error_rate[label] = curve

# Draw "OOB error rate" against "n_estimators", one line per strategy.
plt.figure(figsize=(14, 8))
for label, curve in error_rate.items():
    tree_counts, oob_errors = zip(*curve)
    plt.plot(tree_counts, oob_errors, label=label)

plt.xlim(min_estimators, max_estimators)
plt.xlabel("n_estimators")
plt.ylabel("OOB error rate")
plt.legend(loc="upper right")
plt.show()

# 2. Regression 

## Dataset for Regression



> **Dataset:**  [Boston house-price](https://scikit-learn.org/stable/datasets/index.html#boston-dataset)



*   **Number of Instances:** 
    *   	506
*   **Number of Attributes:**
    *   13 numeric/categorical predictive. Median Value (attribute 14) is usually the target.

*   **Attribute Information:**
    *   CRIM per capita crime rate by town
    *   ZN proportion of residential land zoned for lots over 25,000 sq.ft.
    *   INDUS proportion of non-retail business acres per town
    *   CHAS Charles River dummy variable (= 1 if tract bounds river; 0 otherwise)
    *   NOX nitric oxides concentration (parts per 10 million)
    *   RM average number of rooms per dwelling
    *   AGE proportion of owner-occupied units built prior to 1940
    *   DIS weighted distances to five Boston employment centres
    *   RAD index of accessibility to radial highways
    *   TAX full-value property-tax rate per \$10,000
    *   PTRATIO pupil-teacher ratio by town
    *   B 1000(Bk - 0.63)^2 where Bk is the proportion of blacks by town
    *   LSTAT \% lower status of the population
    *   MEDV Median value of owner-occupied homes in \$1000’s








### 1. Load Data

In [0]:
# Load the Boston house-price data and bind it to `boston`. Later cells read
# .data, .target and .feature_names.
#
# `datasets.load_boston` was removed in scikit-learn 1.2. On such versions the
# attribute lookup fails, so the same object is rebuilt from the original
# source files instead (this fallback needs network access).
try:
    boston = datasets.load_boston()
except (ImportError, AttributeError):
    from sklearn.utils import Bunch

    # Recipe published in scikit-learn's removal notice: each record in the
    # raw file spans two physical lines, which are stitched back together.
    raw_df = pd.read_csv(
        "http://lib.stat.cmu.edu/datasets/boston",
        sep=r"\s+",
        skiprows=22,
        header=None,
    )
    boston = Bunch(
        data=np.hstack([raw_df.values[::2, :], raw_df.values[1::2, :2]]),
        target=raw_df.values[1::2, 2],
        # Kept as an ndarray because a later cell indexes it with an index array.
        feature_names=np.array([
            'CRIM', 'ZN', 'INDUS', 'CHAS', 'NOX', 'RM', 'AGE',
            'DIS', 'RAD', 'TAX', 'PTRATIO', 'B', 'LSTAT',
        ]),
    )

# Last expression of the cell: list what the loaded object exposes.
dir(boston)

### 2. Explore Data

In [0]:
# print type and shape of data
print(type(boston.data))
print(type(boston.target))

print(boston.data.shape)
print(boston.target.shape)

### 3. Create a pandas DataFrame and do data manipulations

In [0]:
# Wrap the raw feature matrix in a DataFrame so every column carries
# its attribute name, then preview the first rows.
dfReg = pd.DataFrame(data=boston.data, columns=boston.feature_names)
dfReg.head()

In [0]:
# Attach the regression target as a 'target' column of the pandas DataFrame.
# `assign` yields a new frame, so re-running this cell gives the same result.
dfReg = dfReg.assign(target=boston.target)
dfReg.head()

### 4. Split the data for Training and Testing

In [0]:
# Hold out 10% of the rows for testing; the split is seeded so it is reproducible.
# Note that this rebinds the X_train/X_test/y_train/y_test names used in the
# classification section.
X_train, X_test, y_train, y_test = train_test_split(
    dfReg.drop(columns=['target']),
    boston.target,
    test_size=0.1,
    random_state=0,
)
for split in (X_train, X_test):
    print(split.shape)

### 5. Initialise a Random Forest Regressor

In [0]:
# Random forest of 200 unpruned regression trees, seeded for reproducibility.
#
# Three arguments differ from the historical form of this call so that the cell
# runs on old and current scikit-learn releases alike, without changing the model:
#   * criterion is left at its default (mean squared error). The spelling 'mse'
#     was removed and its replacement 'squared_error' is unknown to old releases.
#   * max_features=None (consider every feature at each split) is what 'auto'
#     meant for regressors; 'auto' was deprecated and subsequently removed.
#   * min_impurity_split is no longer passed; it was only ever set to None here
#     and the parameter has been removed from scikit-learn.
rfRegressor = ensemble.RandomForestRegressor(
    n_estimators=200,
    max_depth=None,
    min_samples_split=2,
    min_samples_leaf=1,
    min_weight_fraction_leaf=0.0,
    max_features=None,
    max_leaf_nodes=None,
    min_impurity_decrease=0.0,
    bootstrap=True,
    oob_score=False,
    n_jobs=None,
    random_state=2,
    verbose=0,
    warm_start=False,
)



> ***Let's dig into*** **[ensemble.RandomForestRegressor](https://scikit-learn.org/stable/modules/generated/sklearn.ensemble.RandomForestRegressor.html)**






### 6. Model Performance on Train data

In [0]:
# 10-fold cross-validation on the training split: out-of-fold predictions
# (plotted in the next cell) and the mean of the per-fold scores.
CV_predicted = cross_val_predict(rfRegressor, X_train, y_train, cv=10)

fold_scores = cross_val_score(rfRegressor, X_train, y_train, cv=10)
r2_score_cv = fold_scores.mean()
print("Crossvalidation Coefficient of determination on training set :", r2_score_cv)

In [0]:
# Scatter of measured targets against their cross-validated predictions for
# the training split. The dashed red diagonal marks a perfect prediction.
fig, ax = plt.subplots(figsize=(12, 8), dpi=80, facecolor='w', edgecolor='k')
ax.scatter(y_train, CV_predicted, edgecolors=(0, 0, 0))

diagonal = [y_train.min(), y_train.max()]
ax.plot(diagonal, diagonal, 'r--', lw=4)

ax.set_xlabel('Measured')
ax.set_ylabel('Predicted')
ax.set_title('Measured vs Predicted Values for Training Data (10x CV)')
plt.show()

### 7.  Let's fit the RF model on Training data and perform prediction with the Test data 

In [0]:
# Train the regressor on the full training split and keep the value returned
# by fit() as `rfRegressor_model`.
# NOTE(review): scikit-learn estimators conventionally return themselves from
# fit(), so both names presumably refer to the same object — confirm.
rfRegressor_model = rfRegressor.fit(X_train,y_train)

# Predicted target values for the held-out test rows; evaluated and plotted below.
y_predicted = rfRegressor_model.predict(X_test)

### 8. Model Performance on Test data

In [0]:
# Held-out performance of the fitted regressor: the model's own score
# (reported here as the coefficient of determination) and the mean squared
# error of its predictions.
r2_score = rfRegressor_model.score(X_test, y_test)
mse_score = metrics.mean_squared_error(y_test, y_predicted)

print("Coefficient of determination on test set: ", r2_score)
print("\nMean Squared Error on test set :", mse_score)

### 9. Visualisation

In [0]:
# Scatter of measured targets against predictions for the test split.
# The dashed red diagonal marks a perfect prediction.
fig, ax = plt.subplots(figsize=(12, 8), dpi=80, facecolor='w', edgecolor='k')
ax.scatter(y_test, y_predicted, edgecolors=(0, 0, 0))

diagonal = [y_test.min(), y_test.max()]
ax.plot(diagonal, diagonal, 'r--', lw=4)

ax.set_xlabel('Measured')
ax.set_ylabel('Predicted')
ax.set_title('Measured vs Predicted Values for Test Data')
plt.show()



In [0]:
# Sample-by-sample comparison of the forest's predictions with the true
# targets on the test split.
# The legend and x-axis label name what is actually drawn: predictions of the
# random forest regressor, for test samples.
plt.figure(num=None, figsize=(25, 4), dpi=80, facecolor='w', edgecolor='k')
plt.plot(y_predicted, 'gd-', label='RandomForestRegressor')
plt.plot(y_test, 'r*-', label='Actual values')
# The x position is only the sample's order in the test split, so the ticks
# carry no information and are hidden.
plt.tick_params(axis='x', which='both', bottom=False, top=False,
                labelbottom=False)
plt.ylabel('Target values')
plt.xlabel('Test samples')
plt.legend(loc="best")
plt.title('Comparison of individual sample predictions')
plt.show()

In [0]:
# Horizontal bar chart of the forest's feature importances, expressed as a
# percentage of the most important feature and sorted so the largest bar
# ends up at the top.
raw_importance = rfRegressor_model.feature_importances_
feature_importance = 100.0 * (raw_importance / raw_importance.max())

sorted_idx = np.argsort(feature_importance)  # ascending order of importance
pos = np.arange(X_train.shape[1])            # one bar slot per feature

plt.figure(figsize=(10, 8))
plt.barh(pos, feature_importance[sorted_idx], align='center')
plt.yticks(pos, boston.feature_names[sorted_idx])
plt.xlabel('Relative Importance')
plt.title('Variable Importance')
plt.show()