## Supplement 6: Decision Trees and Random Forest

In [1]:
%matplotlib inline
import numpy as np
import pandas as pd

from sklearn.ensemble import RandomForestClassifier
from sklearn import tree
from scipy.stats import mode


### 6.3 Programming Task: Song popularity prediction using Random Forest
The goal of this task is to train a random forest model that predicts song popularity using the datasets already provided in task 4.3.
 

In [12]:
import pandas as pd

# Load the train and test splits from their CSV files.
train_df, test_df = (
    pd.read_csv(csv_name) for csv_name in ("train-songs.csv", "test-songs.csv")
)

# The "popular" column is the label; every other column is a feature.
train_X, train_y = train_df.drop(columns=["popular"]), train_df["popular"]
test_X, test_y = test_df.drop(columns=["popular"]), test_df["popular"]

   i\. Implement a function that draws a bootstrap sample of size N from the train dataset, where N can be specified by the user.




In [13]:
import numpy as np

def generate_bootstrap(train_X, train_y, N):
    """Draw a bootstrap sample of N rows (with replacement) from the training data.

    Parameters
    ----------
    train_X : object supporting ``len`` and ``.iloc`` positional indexing
        Feature rows to sample from.
    train_y : object supporting ``.iloc`` positional indexing
        Labels aligned row-by-row with ``train_X``.
    N : int
        Number of rows to draw.

    Returns
    -------
    tuple
        ``(X_bootstrap, y_bootstrap)`` selected with the same row positions,
        so features and labels stay aligned.
    """
    n_rows = len(train_X)

    # One shared set of positions keeps each feature row paired with its label.
    drawn_positions = np.random.randint(0, n_rows, size=N)

    return train_X.iloc[drawn_positions], train_y.iloc[drawn_positions]

   ii\. Complete the implementation of the random forest algorithm. For this task you may use the DecisionTreeClassifier from the scikit-learn library. The other parts of the random forest algorithm must be implemented using only Scipy/Numpy.

In [34]:
from sklearn import tree

class RandomForest:
    """Ensemble of scikit-learn decision trees combined by majority vote.

    Every tree is fitted on its own bootstrap sample drawn with
    ``generate_bootstrap``; ``predict`` returns, for each row, the class
    predicted by the largest number of trees.
    """

    def __init__(self, n_trees, max_features, max_samples, min_node_size, max_depth):
        """
        n_trees      : Number of decision trees in the ensemble.
        max_features : Number (or range) of features to consider when looking for splits.
                       (Passed directly to DecisionTreeClassifier.)
        max_samples  : Size of each bootstrap sample. A float is treated as a
                       fraction of the training set, ``None`` means "as many
                       rows as the training set", anything else is used as an
                       absolute row count.
        min_node_size: Passed as min_samples_leaf to DecisionTreeClassifier.
        max_depth    : Maximum depth of each tree.
        """
        self.n_trees = n_trees
        self.max_features = max_features
        self.max_samples = max_samples
        self.min_node_size = min_node_size
        self.max_depth = max_depth
        self.trees = []

    def _bootstrap_size(self, n_rows):
        """Translate ``max_samples`` into an absolute number of rows."""
        if self.max_samples is None:
            return n_rows
        if isinstance(self.max_samples, float):
            # Fractional value: scale by the size of the training set.
            return int(n_rows * self.max_samples)
        return self.max_samples

    def train(self, train_X, train_y):
        """Fit ``n_trees`` decision trees, each on a fresh bootstrap sample.

        Any previously fitted trees are discarded, so calling ``train`` again
        re-fits the forest instead of growing the old ensemble.
        """
        self.trees = []
        n_samples = self._bootstrap_size(len(train_X))

        for _ in range(self.n_trees):
            X_boot, y_boot = generate_bootstrap(train_X, train_y, n_samples)

            # Weak learner: a single decision tree with the forest's settings.
            estimator = tree.DecisionTreeClassifier(
                max_features=self.max_features,
                max_depth=self.max_depth,
                min_samples_leaf=self.min_node_size
            )
            estimator.fit(X_boot, y_boot)
            self.trees.append(estimator)

    def predict(self, test_X):
        """Predict a class for every row of ``test_X`` by majority vote.

        Returns
        -------
        numpy.ndarray
            One predicted label per row. When several classes are tied, the
            smallest label wins (``np.unique`` sorts the labels and
            ``np.argmax`` picks the first maximum).

        Raises
        ------
        RuntimeError
            If the forest holds no trained trees.
        """
        if not self.trees:
            raise RuntimeError(
                "RandomForest.predict() was called before train(): the ensemble has no trees."
            )

        # Stack per-tree predictions; shape: (n_trees, n_samples)
        votes = np.array([estimator.predict(test_X) for estimator in self.trees])

        final_preds = []
        # Iterating over the transpose yields the votes for one sample at a time.
        for sample_votes in votes.T:
            labels, counts = np.unique(sample_votes, return_counts=True)
            final_preds.append(labels[np.argmax(counts)])

        return np.array(final_preds)

iii\. Train the model on the dataset from train-songs.csv using the parameters given below.

| Parameter | Value |
|-----------|-------|
| Number of trees | 100 |
| Maximum features per tree | 2 |
| Bootstrap sample size | 20000 |
| Minimum node size | 1 |
| Maximum tree depth | 10 |


Note: The bootstrap sample size is the same as the training dataset size in this task.


In [35]:
# Build the forest with the hyperparameters from the task table:
# 100 trees, 2 features per split, bootstrap samples of 20000 rows,
# leaves of at least 1 sample, and trees at most 10 levels deep.
random_forest_model = RandomForest(
    n_trees=100, max_features=2, max_samples=20000, min_node_size=1, max_depth=10
)

# Fit every tree of the ensemble on the training split.
random_forest_model.train(train_X, train_y)




   iv\. Calculate the accuracy of the model using the test dataset and compare your results with the
RandomForestClassifier from the scikit-learn library using the following parameters.

In [36]:
import numpy as np

def train_and_evaluate_rf(train_X, train_y, test_X, test_y):
    """
    Train the custom RandomForest and report its accuracy on the test split.

    The forest is built with the task's hyperparameters (100 trees, 2 features
    per split, bootstrap samples of 20000 rows, minimum node size 1, maximum
    depth 10). The accuracy itself is computed with plain NumPy rather than a
    scikit-learn metric helper.

    Parameters
    ----------
    train_X, train_y : training features and labels.
    test_X, test_y   : held-out features and labels.

    Returns
    -------
    float
        Fraction of test rows whose predicted label equals the true label.
        The same value is also printed.

    Raises
    ------
    ValueError
        If the predictions and the true labels do not have the same shape.
    """
    # Instantiate the RandomForest with desired hyperparameters
    rf_custom = RandomForest(
        n_trees=100,
        max_features=2,
        max_samples=20000,
        min_node_size=1,
        max_depth=10
    )

    # Train the model
    rf_custom.train(train_X, train_y)

    # Predict on the test set
    y_pred_custom = rf_custom.predict(test_X)
    # If predict() hands back a tuple, its first item is the label array.
    # Unwrap it so the comparison below is element-wise on a 1-D array
    # instead of relying on broadcasting against a stacked pair.
    if isinstance(y_pred_custom, tuple):
        y_pred_custom = y_pred_custom[0]
    y_pred_custom = np.asarray(y_pred_custom)

    # Works for a pandas Series as well as for a plain array of labels
    y_true = np.asarray(test_y)

    if y_true.shape != y_pred_custom.shape:
        raise ValueError(
            f"Shape mismatch: labels {y_true.shape} vs predictions {y_pred_custom.shape}"
        )

    # Calculate accuracy using basic NumPy operations
    accuracy_custom = float(np.mean(y_true == y_pred_custom))

    print(f"Custom RandomForest Accuracy: {accuracy_custom:.4f}")
    return accuracy_custom



# Trailing semicolon: the accuracy is already printed, so suppress the cell's Out[] echo.
train_and_evaluate_rf(train_X, train_y, test_X, test_y);

Custom RandomForest Accuracy (no scikit-learn): 0.8045


In [None]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score


# --- Reference model: scikit-learn's RandomForestClassifier -----------------
rf_sklearn = RandomForestClassifier(
    n_estimators=100,
    max_features=2,
    max_depth=10,
    min_samples_leaf=1,
    bootstrap=True,
    random_state=42
)
rf_sklearn.fit(train_X, train_y)
y_pred_sklearn = rf_sklearn.predict(test_X)

accuracy_sklearn = accuracy_score(test_y, y_pred_sklearn)
print(f"Scikit-learn RandomForestClassifier Accuracy: {accuracy_sklearn:.4f}")

# --- Custom model: same hyperparameters as the reference ---------------------
rf_custom = RandomForest(
    n_trees=100,
    max_features=2,
    max_samples=20000,
    min_node_size=1,
    max_depth=10
)
rf_custom.train(train_X, train_y)

y_pred_custom = rf_custom.predict(test_X)
# If predict() hands back a tuple, its first item is the label array.
# Unwrap it so the accuracy is an element-wise comparison of two 1-D arrays
# rather than a broadcast against a stacked pair.
if isinstance(y_pred_custom, tuple):
    y_pred_custom = y_pred_custom[0]
y_pred_custom = np.asarray(y_pred_custom)

accuracy_custom = np.mean(test_y.values == y_pred_custom)
print(f"Custom RandomForest Accuracy: {accuracy_custom:.4f}")

# --- Verdict -----------------------------------------------------------------
if accuracy_sklearn > accuracy_custom:
    print("Scikit-learn's implementation outperforms the custom RandomForest.")
elif accuracy_sklearn < accuracy_custom:
    print("The custom RandomForest outperforms scikit-learn's implementation.")
else:
    print("Both implementations have the same accuracy.")

Scikit-learn RandomForestClassifier Accuracy: 0.8070
Custom RandomForest Accuracy: 0.8045
Scikit-learn's implementation outperforms the custom RandomForest.
