<a href="https://colab.research.google.com/github/eckoecho/CodingDojo/blob/Model/Random_Forests_in_Python.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [24]:
import matplotlib.pyplot as plt
import pandas as pd
from sklearn.model_selection import train_test_split
# Import random forest Regressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import r2_score

In [25]:
# Location of the California housing CSV (a /content/drive path, so presumably
# Google Drive has to be mounted before this cell runs -- confirm)
fpath = "/content/drive/MyDrive/CodingDojo/02-MachineLearning/Week06/Data/cali_housing.csv"
# Read the file into a DataFrame and preview the first five rows
df = pd.read_csv(filepath_or_buffer=fpath)
df.head(n=5)

Unnamed: 0,MedInc,HouseAge,AveRooms,AveBedrms,Population,AveOccup,Latitude,Longitude,MedHouseVal
0,8.3252,41.0,6.984127,1.02381,322.0,2.555556,37.88,-122.23,4.526
1,8.3014,21.0,6.238137,0.97188,2401.0,2.109842,37.86,-122.22,3.585
2,7.2574,52.0,8.288136,1.073446,496.0,2.80226,37.85,-122.24,3.521
3,5.6431,52.0,5.817352,1.073059,558.0,2.547945,37.85,-122.25,3.413
4,3.8462,52.0,6.281853,1.081081,565.0,2.181467,37.85,-122.25,3.422


In [26]:
# Target vector: the median house value column
y = df["MedHouseVal"]
# Feature matrix: every remaining column
X = df.drop(columns=["MedHouseVal"])
# Hold out a test split; the fixed seed keeps the partition repeatable
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    random_state=42,
)
# Preview the training features
X_train.head(n=5)

Unnamed: 0,MedInc,HouseAge,AveRooms,AveBedrms,Population,AveOccup,Latitude,Longitude
8158,4.2143,37.0,5.288235,0.973529,860.0,2.529412,33.81,-118.12
18368,5.3468,42.0,6.364322,1.08794,957.0,2.404523,37.16,-121.98
19197,3.9191,36.0,6.110063,1.059748,711.0,2.235849,38.45,-122.69
3746,6.3703,32.0,6.0,0.990196,1159.0,2.272549,34.16,-118.41
13073,2.3684,17.0,4.795858,1.035503,706.0,2.088757,38.57,-121.33


# Instantiate Model

In [27]:
# Baseline random forest regressor: every hyperparameter is left at its
# default, and only the seed is pinned so repeated runs build the same forest.
rf = RandomForestRegressor(random_state=42)

# Fit the Data

In [28]:
# Train the baseline forest using only the training split
rf.fit(X=X_train, y=y_train)

# Predict the Values for y

In [29]:
# Predictions on the rows the forest was trained on (used for the training score)
train_preds = rf.predict(X=X_train)
# Predictions on the held-out rows (used for the test score)
test_preds = rf.predict(X=X_test)

# Evaluate Model Performance

In [30]:
# Score the baseline forest with R^2 on the training split and on the held-out split
rf_train_tf = r2_score(y_train, train_preds)
rf_test_tf = r2_score(y_test, test_preds)
# Print results. The built-in round() is used instead of the .round() method
# because the method only exists on NumPy scalars; round() also accepts a plain
# Python float, so this cell works whichever type r2_score hands back.
print(f"The training R2 is: {round(rf_train_tf, 3)}.")
print(f"The test R2 is: {round(rf_test_tf, 3)}.")

The training R2 is: 0.973.
The test R2 is: 0.808.


# Tune the Model

In [31]:
# List every hyperparameter of the fitted baseline forest together with its
# current value, to pick candidates for tuning (e.g. max_depth, n_estimators).
rf.get_params()

{'bootstrap': True,
 'ccp_alpha': 0.0,
 'criterion': 'squared_error',
 'max_depth': None,
 'max_features': 1.0,
 'max_leaf_nodes': None,
 'max_samples': None,
 'min_impurity_decrease': 0.0,
 'min_samples_leaf': 1,
 'min_samples_split': 2,
 'min_weight_fraction_leaf': 0.0,
 'n_estimators': 100,
 'n_jobs': None,
 'oob_score': False,
 'random_state': 42,
 'verbose': 0,
 'warm_start': False}

## Tuning the max_depth

In [34]:
# Instantiate a model whose trees are limited to a max_depth of 9
rf_9 = RandomForestRegressor(max_depth=9, random_state=42)
# Fit the model on the training data
rf_9.fit(X_train, y_train)
# Obtain predictions on the training and test data
train_preds = rf_9.predict(X_train)
test_preds = rf_9.predict(X_test)
# Calculate the r2 score for the training and test data
rf_9_train_score = r2_score(y_train, train_preds)
rf_9_test_score = r2_score(y_test, test_preds)
# Print results. The built-in round() is used instead of the .round() method
# because the method only exists on NumPy scalars; round() also accepts a plain
# Python float, so this cell works whichever type r2_score hands back.
print(f' The training r2 is: {round(rf_9_train_score, 3)}.')
print(f' The testing r2 is: {round(rf_9_test_score, 3)}.')

 The training r2 is: 0.842.
 The testing r2 is: 0.766.


# Getting Depth from RandomForestRegressor

In [35]:
# Collect the depth reached by every individual tree in the baseline forest
est_depths = [tree.get_depth() for tree in rf.estimators_]
# The deepest tree gives the upper end of the depth range worth tuning over
max(est_depths)

42

In [None]:
# Sweep max_depth and record train/test R^2 for each value.
# NOTE: this fits one full forest per depth, so the cell is slow to run.

# Depths to try: 1 up to (but not including) the deepest tree of the
# unrestricted baseline forest
depths = range(1, max(est_depths))
# One row per depth and one column per score. The two column names must be
# separate list items; float dtype keeps the frame numeric for sorting/plotting.
scores = pd.DataFrame(index=depths, columns=["Train Score", "Test Score"], dtype=float)
# Try the different values for depth
for depth in depths:
    model = RandomForestRegressor(max_depth=depth, random_state=42)
    # Fit the model on training data
    model.fit(X_train, y_train)
    # Make predictions on training and test data
    train_preds = model.predict(X_train)
    test_preds = model.predict(X_test)
    # Obtain scores
    train_score = r2_score(y_train, train_preds)
    test_score = r2_score(y_test, test_preds)
    # Store each score under its column name (not under the prediction arrays)
    scores.loc[depth, "Train Score"] = train_score
    scores.loc[depth, "Test Score"] = test_score
# View the first 5 rows of the score dataframe (outside the loop so it is
# the cell's displayed output)
scores.head()

## Plot the Scores

In [None]:
# Plot the test-set R^2 against max_depth (the index of the scores frame)
plt.plot(scores["Test Score"])
# Label the figure so it can be read on its own
plt.xlabel("max_depth")
plt.ylabel("Test R2")
plt.title("Test R2 by max_depth")
# Render the figure without echoing the Line2D/Text repr as cell output
plt.show()

## Sort Score Values

In [None]:
# Sort by the best score on testing data
sorted_scores = scores.sort_values(by="Test Score", ascending=False)
sorted_scores.head()

# Tuning n_estimators

In [None]:
# Sweep n_estimators and record train/test R^2 for each forest size.
# NOTE: this fits one full forest per value, so the cell is slow to run.

# Choose values for n_estimators to try
n_ests = [50, 100, 150, 200, 250, 300, 350, 400, 450, 500, 550]
# One row per forest size; float dtype keeps the frame numeric for sorting/plotting
scores2 = pd.DataFrame(index=n_ests, columns=["Train Score", "Test Score"], dtype=float)
# Test the different values for n_estimators
for n in n_ests:
    # max_depth=29 is presumably the best depth found by the depth sweep --
    # TODO confirm against sorted_scores
    model = RandomForestRegressor(max_depth=29, n_estimators=n, random_state=42)
    # Fit the model on training data
    model.fit(X_train, y_train)
    # Make predictions on training and testing data
    train_preds = model.predict(X_train)
    test_preds = model.predict(X_test)
    # Obtain scores and add to the scores2 df; the test score must compare
    # y_test with the test-set predictions
    scores2.loc[n, "Train Score"] = r2_score(y_train, train_preds)
    scores2.loc[n, "Test Score"] = r2_score(y_test, test_preds)
# List the first 5 results (outside the loop so it is the cell's displayed output)
scores2.head()

## Plot the Results

In [None]:
# Plot the test-set R^2 from the n_estimators sweep (scores2, not the depth
# sweep's scores frame) so the x-axis matches the n_estimators ticks
plt.plot(scores2["Test Score"])
plt.xticks(n_ests)
# Label the figure so it can be read on its own
plt.xlabel("n_estimators")
plt.ylabel("Test R2")
plt.title("Test R2 by n_estimators")
# Render the figure without echoing the tick/Text repr as cell output
plt.show()

## Sort the Results 

In [None]:
# Rank the forest sizes from highest to lowest score on the testing data
sorted_scores2 = scores2.sort_values("Test Score", ascending=False)
# Show the five best-performing n_estimators values
sorted_scores2.head(n=5)