Import packages and data
Use Iris data as an example

In [None]:
from sklearn.datasets import load_iris
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, accuracy_score

# Fetch the bundled Iris dataset and separate its data array (X) from its target array (y)
iris = load_iris()
X, y = iris.data, iris.target


Split the data into training and testing sets

In [None]:
# Hold out 20% of the samples for testing; the remaining 80% are used for training.
# The fixed random_state makes the split reproducible between runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)


Initialize the Random Forest model

In [None]:
# Create (but do not yet train) a forest of 100 trees; the seed keeps results repeatable
rf_clf = RandomForestClassifier(
    random_state=42,
    n_estimators=100,
)




Training

In [None]:
# Fit the forest on the training portion only (the test set stays unseen)
rf_clf.fit(X=X_train, y=y_train)


Predict (Testing)

In [None]:
# Ask the trained forest for class predictions on the held-out test samples
y_pred = rf_clf.predict(X=X_test)


Validation (Evaluation) of the model

In [None]:
# Compare the predictions with the true test labels:
# overall accuracy first, then the per-class precision / recall / F1 report
test_accuracy = accuracy_score(y_test, y_pred)
report = classification_report(y_test, y_pred)
print("Accuracy:", test_accuracy)
print(report)


Now, let's look at which Random Forest settings you can experiment with.

What is covered in the lecture?

1. Number of trees in the forest
2. Depth of each tree
3. How many samples in each tree
4. How many features in each tree
5. Node splitting method and number of samples in a node

There are more parameters that you can change; feel free to explore them on your own.


In [None]:
# Hyperparameter playground: exactly one `rf_clf = ...` line should be active at a time.
# To try another configuration, uncomment it and comment out the currently active line.
# The model is re-trained at the bottom of this cell, so the cells that follow always
# work with a fitted forest.

# Use 200 individual trees
#rf_clf = RandomForestClassifier(n_estimators=200, random_state=42)

# Limit the maximum depth of each tree to 3
#rf_clf = RandomForestClassifier(n_estimators=100, max_depth=3, random_state=42)

# Split criterion: default = "gini"; "entropy" selects information gain
#rf_clf = RandomForestClassifier(n_estimators=100, criterion="entropy", random_state=42)

# Minimum number of samples a node needs before it may be split, default = 2
#rf_clf = RandomForestClassifier(n_estimators=100, min_samples_split=5, random_state=42)

# min_samples_leaf is similar: minimum number of samples required in a leaf,
# used to avoid overfitting, default = 1
#rf_clf = RandomForestClassifier(n_estimators=100, min_samples_leaf=5, random_state=42)

# max_leaf_nodes is similar: caps the number of leaves per tree to avoid overfitting,
# default = no limit
#rf_clf = RandomForestClassifier(n_estimators=100, max_leaf_nodes=5, random_state=42)

# max_features: number of features considered when looking for the best split,
# default = sqrt(total features)
#rf_clf = RandomForestClassifier(n_estimators=100, max_features='log2', random_state=42)

# max_samples: number of samples drawn to train each tree,
# default = same as the original data set size
#rf_clf = RandomForestClassifier(n_estimators=100, max_samples=100, random_state=42)

# Warm start means you are NOT starting from nothing, but REUSING the PREVIOUS result:
# if you raise n_estimators and call fit() again, the already-trained trees are kept and
# only the additional trees are trained (same outcome as asking for more trees at first).
# Note: warm_start must be the boolean True, not the string 'True'.
#rf_clf = RandomForestClassifier(n_estimators=100, warm_start=True, random_state=42)

# n_jobs is the number of CPU cores used to run the job in parallel.
# n_jobs=-1 means use all cores; the default is None.
rf_clf = RandomForestClassifier(n_estimators=100, n_jobs=-1, random_state=42)

# The classifier created above is brand new and therefore unfitted. Train it here so that
# the following cells (which read estimators_ and feature_importances_) do not fail when
# the notebook is run from top to bottom, and show how this configuration performs.
rf_clf.fit(X_train, y_train)
print("Test accuracy:", rf_clf.score(X_test, y_test))


Draw the trees

In [None]:
import matplotlib.pyplot as plt
from sklearn.tree import plot_tree

# A fitted forest keeps its individual decision trees in `estimators_`;
# take the first one as a representative example
tree = rf_clf.estimators_[0]

# Draw that single tree on a wide canvas, labelling nodes with the Iris
# feature names and colouring them by class
fig, ax = plt.subplots(figsize=(20, 10))
plot_tree(
    tree,
    feature_names=iris.feature_names,
    class_names=iris.target_names.tolist(),
    filled=True,
    ax=ax,
)
plt.show()


Which features are important?

In [None]:
import matplotlib.pyplot as plt
import numpy as np

# Importance score of every input feature, as reported by the fitted forest
feature_importances = rf_clf.feature_importances_
feature_names = iris.feature_names

# Indices that order the features from most to least important
sorted_idx = np.argsort(feature_importances)[::-1]

# One bar per feature, in descending order of importance
bar_positions = range(X.shape[1])
fig, ax = plt.subplots(figsize=(10, 6))
ax.bar(bar_positions, feature_importances[sorted_idx], align='center')
ax.set_xticks(bar_positions)
ax.set_xticklabels(np.array(feature_names)[sorted_idx], rotation=45)
ax.set_title('Feature Importance in Random Forest')
plt.show()
