# Supervised learning - Python - random forest

In [1]:
# random foreste is part of the "ensemble learning"
# ensemble learning : take the same algo multiple times to make it more powerful

# step 1 : pick at random K data points from the training set
# step 2 : build decision tree associated to these K data points
# step 3 : choose the number Ntree of trees we want to build and repeat steps 1 and 2
# step 4 : for a new data point, make each one of our Ntree trees predict the value of Y for the data point in question and
#           assign the new data point to the average across all of the predicted Y values
# => eg: we predict like 500 predictions and we take their average. This improves the accuracy because of averages
# "ensemble" suffers less from a particular bad result since the result is part of a group of other results that will help dealing with it

In [2]:
import pandas as pd

dataset = pd.read_csv('data/position_salaries.csv')
dataset = dataset.sample(frac=1).reset_index(drop=True)
dataset.head()

Unnamed: 0,Position,Level,Salary
0,Senior Partner,8,300000
1,Partner,7,200000
2,C-level,9,500000
3,Junior Consultant,2,50000
4,Senior Consultant,3,60000


In [3]:
X = dataset.iloc[:, 1:2].values
X

array([[ 8],
       [ 7],
       [ 9],
       [ 2],
       [ 3],
       [ 4],
       [ 1],
       [ 6],
       [10],
       [ 5]])

In [4]:
y = dataset.iloc[:, 2].values
y

array([ 300000,  200000,  500000,   50000,   60000,   80000,   45000,
        150000, 1000000,  110000])

In [5]:
from sklearn.ensemble import RandomForestRegressor

# n_estimators : number of trees in the forest
regressor = RandomForestRegressor(n_estimators=300)
regressor.fit(X, y)

# there is a clear accuracy difference between 10, 100 and 300 trees
# what's the fallback of having a high number of trees, computing power with huge dataset ?

RandomForestRegressor(bootstrap=True, ccp_alpha=0.0, criterion='mse',
                      max_depth=None, max_features='auto', max_leaf_nodes=None,
                      max_samples=None, min_impurity_decrease=0.0,
                      min_impurity_split=None, min_samples_leaf=1,
                      min_samples_split=2, min_weight_fraction_leaf=0.0,
                      n_estimators=300, n_jobs=None, oob_score=False,
                      random_state=None, verbose=0, warm_start=False)

In [6]:
data = np.array([6.5])

y_pred = regressor.predict(data.reshape(-1, 1))
y_pred

NameError: name 'np' is not defined

In [None]:
%matplotlib inline

import numpy as np

X_grid = np.arange(min(X), max(X), 0.01)
X_grid = X_grid.reshape(len(X_grid), 1)

import matplotlib.pyplot as plt

plt.scatter(X, y, color='red')
plt.plot(X_grid, regressor.predict(X_grid), color='blue')
plt.title('Random forest regression - Salary level')
plt.xlabel('Position level')
plt.ylabel('Salary')
plt.show()

# we have more intervals (splits) because we have more trees that produce more average
# we cannot add an infinite number of trees and expect an infinite number of intervals, at some point they the averages converges
# for example, 10 trees give the same visual results as 100 trees