# Project 1 – Decision Trees and Random Forests

In [57]:
# Reload all modules without having to restart the kernel
# Useful for development if you have edited any of the external code files.
%load_ext autoreload
%autoreload 2

# Imports
from itertools import product

import numpy as np
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
from sklearn.model_selection import KFold
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier

# My implementations
from decision_tree import DecisionTree
from random_forest import RandomForest

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


## Dataset

Do data loading, exploration and preprocessing as you see fit.

Here is some code to load the dataset to get you started.

In [58]:
# Read the CSV into a structured array; names=True takes the field names from the header row.
data = np.genfromtxt(
    "wine_dataset_small.csv",
    delimiter=",",
    dtype=float,
    names=True,
)

# Every column except the last is a feature; the last column is the target.
*feature_names, target_name = data.dtype.names

# One 1-D array per feature, stacked as rows and transposed to (n_samples, n_features).
feature_columns = [data[name] for name in feature_names]
X = np.array(feature_columns).T
y = data[target_name].astype(int)

# Quick sanity check of what was loaded.
for summary_line in (
    f"Feature columns names: {feature_names}",
    f"Target column name: {target_name}",
    f"X shape: {X.shape}",
    f"y shape: {y.shape}",
):
    print(summary_line)

Feature columns names: ['citric_acid', 'residual_sugar', 'pH', 'sulphates', 'alcohol']
Target column name: type
X shape: (500, 5)
y shape: (500,)


In [59]:
# Which hyperparameters should you tune?

""" I think the max_depth, n_estimators(how many trees), criterion, max_features """

# Which values should you test for each hyperparameter?

"""max_depth: 1 to 15?,    n_estimators: 1 to 200,   criterion: entropy or gini,  max_features: sqrt or log2 or None"""

# Which model selection method should you use (e.g., hold-out validation, k-fold cross-validation)?

"""k-fold cross-validation: This is ideal because it splits the dataset into k subsets (e.g., k=5), trains the model on k-1 folds, and tests on the remaining fold. The process is repeated k times with different folds. This helps to reduce the risk of overfitting to a particular subset of data.

Why: It ensures that your model isn't overly tuned to any specific training or testing split and provides a more generalized measure of performance."""

# Which performance measure should you use for model selection (e.g., accuracy, F1-score)?

"""Accuracy is fine because I think there won't be a class imbalance; if there were, we should use F1-score. Try both."""

# How do you ensure that your model selection process is fair and unbiased?

"""Follow best practices in data splitting, hyperparameter tuning, and evaluation: avoid data leakage, avoid overfitting, and ensure class balance."""

# How can you ensure reproducibility of your results?

"""Set Random Seeds. Create a requirements.txt. Document Hyperparameters. Use Version Control. """


'Set Random Seeds. Create a requirements.txt. Document Hyperparameters. Use Version Control. '

In [60]:
# One seed drives NumPy's global RNG, the hold-out split and the fold assignment.
seed = 0
np.random.seed(seed)

# Hold out 30% of the samples as the final test set; the remaining 70% is
# used for model selection (training + validation).
X_train_val, X_test, y_train_val, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    shuffle=True,
    random_state=seed,
)

# Shuffled 5-fold splitter applied to the train/validation portion.
kf = KFold(
    n_splits=5,
    shuffle=True,
    random_state=seed,
)

In [61]:
# Grid search for the custom RandomForest.
# Each hyperparameter combination is scored by its mean validation accuracy
# over the folds produced by `kf` on the train/validation portion of the data.

max_depth_params = [3, 10, 15, 20, 25, None]
n_estimators = [2, 4, 10, 20, 50, 100]
criterion = ["gini", "entropy"]
max_features = ["sqrt", "log2", None]

# Layout: [mean CV accuracy, n_estimators, max_depth, criterion, max_features].
# The score starts at -inf (a plain number, not a nested list) so the comparison
# below is scalar-vs-scalar and the first combination evaluated always fills
# every slot that the test cell reads.
best_accuracy = [-np.inf, None, None, None, None]

# product() walks the grid in the same order as nested loops would
# (max_depth outermost, max_features innermost).
for maxdp, n_est, crit, mf in product(max_depth_params, n_estimators, criterion, max_features):

    fold_accuracies = []
    for train_index, val_index in kf.split(X_train_val):
        X_training, X_val = X_train_val[train_index], X_train_val[val_index]
        y_training, y_val = y_train_val[train_index], y_train_val[val_index]

        # Build a fresh model for every fold so no fitted state can carry over
        # from a previous fold (whose training data contains this fold's
        # validation samples).
        rf = RandomForest(n_estimators=n_est, max_depth=maxdp, criterion=crit, max_features=mf)
        rf.fit(X_training, y_training)

        y_pred = rf.predict(X_val)
        fold_accuracies.append(accuracy_score(y_val, y_pred))

    # Average accuracy across all folds
    average_accuracy = np.mean(fold_accuracies)

    # Strict ">" keeps the earliest combination when scores tie.
    if average_accuracy > best_accuracy[0]:
        best_accuracy = [average_accuracy, n_est, maxdp, crit, mf]

print(best_accuracy)

print("\n")
print(f"The best average accuracy is {best_accuracy[0]} with N_estimators: {best_accuracy[1]}. Max_depth: {best_accuracy[2]}. Criterion: {best_accuracy[3]}. Max_features: {best_accuracy[4]}.")
print("\n")







[np.float64(0.9628571428571429), 100, 20, 'entropy', 'sqrt']


The best accuracy is 0.9628571428571429 with N_estimators: 100. Max_depth: 20. Criterion: entropy. Max_features: sqrt.




In [106]:
# Refit the custom forest on the whole train/validation portion with the
# selected hyperparameters, then score it on the held-out test set.
#
# NOTE(review): `best_accuracy` is also reassigned by the sklearn grid-search
# cell. Run this cell directly after the custom RandomForest search, otherwise
# it picks up the hyperparameters selected for sklearn's model.
selected_params = {
    "n_estimators": best_accuracy[1],
    "max_depth": best_accuracy[2],
    "criterion": best_accuracy[3],
    "max_features": best_accuracy[4],
}

rf = RandomForest(**selected_params)
rf.fit(X_train_val, y_train_val)

# Accuracy on the data the model was fitted on (train + validation samples).
train_val_accuracy = accuracy_score(y_train_val, rf.predict(X_train_val))
print(f"Training and validation accuracy: {train_val_accuracy}")

# Accuracy on the samples held out by train_test_split.
test_accuracy = accuracy_score(y_test, rf.predict(X_test))
print(f"Test accuracy: {test_accuracy}")


Training and validation accuracy: 0.94
Test accuracy: 0.8666666666666667


In [72]:
# Grid search for sklearn's RandomForestClassifier.
# Each hyperparameter combination is scored by its mean validation accuracy
# over the folds produced by `kf` on the train/validation portion of the data.

max_depth_params = [3, 10, 15, 20, 25, None]
n_estimators = [2, 4, 10, 20, 50, 100]
criterion = ["gini", "entropy"]
max_features = ["sqrt", "log2", None]

# Layout: [mean CV accuracy, n_estimators, max_depth, criterion, max_features].
# The score starts at -inf (a plain number, not a nested list) so the comparison
# below is scalar-vs-scalar and the first combination evaluated always fills
# every slot that the test cell reads.
best_accuracy = [-np.inf, None, None, None, None]

# product() walks the grid in the same order as nested loops would
# (max_depth outermost, max_features innermost).
for maxdp, n_est, crit, mf in product(max_depth_params, n_estimators, criterion, max_features):

    fold_accuracies = []
    for train_index, val_index in kf.split(X_train_val):
        X_training, X_val = X_train_val[train_index], X_train_val[val_index]
        y_training, y_val = y_train_val[train_index], y_train_val[val_index]

        # Fresh estimator per fold. random_state=seed (the notebook-level seed)
        # makes every fit repeatable regardless of how often or in which order
        # the cells are executed.
        rf = RandomForestClassifier(
            n_estimators=n_est,
            max_depth=maxdp,
            criterion=crit,
            max_features=mf,
            random_state=seed,
        )
        rf.fit(X_training, y_training)

        y_pred = rf.predict(X_val)
        fold_accuracies.append(accuracy_score(y_val, y_pred))

    # Average accuracy across all folds
    average_accuracy = np.mean(fold_accuracies)

    # Strict ">" keeps the earliest combination when scores tie.
    if average_accuracy > best_accuracy[0]:
        best_accuracy = [average_accuracy, n_est, maxdp, crit, mf]

print(best_accuracy)

print("\n")
print(f"The best average accuracy for sklearn is {best_accuracy[0]} with N_estimators: {best_accuracy[1]}. Max_depth: {best_accuracy[2]}. Criterion: {best_accuracy[3]}. Max_features: {best_accuracy[4]}.")
print("\n")

[np.float64(0.8914285714285715), 50, 25, 'gini', 'log2']


The best average accuracy is 0.8914285714285715 with N_estimators: 50. Max_depth: 25. Criterion: gini. Max_features: log2.




In [101]:
# Refit sklearn's RandomForestClassifier on the whole train/validation portion
# with the selected hyperparameters, then score it on the held-out test set.
#
# NOTE(review): `best_accuracy` is also assigned by the custom RandomForest
# grid-search cell. Run this cell directly after the sklearn search so it uses
# the hyperparameters selected for this model.
rf = RandomForestClassifier(
    n_estimators=best_accuracy[1],
    max_depth=best_accuracy[2],
    criterion=best_accuracy[3],
    max_features=best_accuracy[4],
    # Fixed seed (defined with the data split) so the reported accuracies are
    # the same on every run instead of varying with the forest's randomness.
    random_state=seed,
)

rf.fit(X_train_val, y_train_val)

# First figure: accuracy on the data the model was fitted on (train + validation samples).
# Second figure: accuracy on the samples held out by train_test_split.
print(f"Training and validation accuracy: {accuracy_score(y_train_val, rf.predict(X_train_val))}")
print(f"Test accuracy: {accuracy_score(y_test, rf.predict(X_test))}")

Training and validation accuracy: 1.0
Test accuracy: 0.9
