# Project 1 – Decision Trees and Random Forests

In [61]:
# Reload all modules without having to restart the kernel
# Useful for development if you have edited any of the external code files.
%load_ext autoreload
%autoreload 2

# Imports
import numpy as np
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import KFold
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split

# My implementations
from decision_tree import DecisionTree
from random_forest import RandomForest

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


## Dataset

Do data loading, exploration and preprocessing as you see fit.

Here is some code to load the dataset to get you started.

In [62]:
# Read the CSV as a structured array; the header row provides the field names.
data = np.genfromtxt("wine_dataset_small.csv", delimiter=",", dtype=float, names=True)

# Every column but the last is a feature; the last column holds the label.
*feature_names, target_name = data.dtype.names

# Stack the feature columns row-wise, then transpose to (n_samples, n_features).
X = np.transpose(np.array([data[name] for name in feature_names]))
# Labels are parsed as floats by genfromtxt; cast them to integer class ids.
y = data[target_name].astype(int)

# Quick summary of what was loaded.
print(
    f"Feature columns names: {feature_names}",
    f"Target column name: {target_name}",
    f"X shape: {X.shape}",
    f"y shape: {y.shape}",
    sep="\n",
)

Feature columns names: ['citric_acid', 'residual_sugar', 'pH', 'sulphates', 'alcohol']
Target column name: type
X shape: (500, 5)
y shape: (500,)


In [63]:
# Which hyperparameters should you tune?

""" I think the max_depth, n_estimators(how many trees), criterion, max_features """

# Which values should you test for each hyperparameter?

"""max_depth: 1 to 15?,    n_estimators: 1 to 200,   criterion: entropy or gini,  max_features: sqrt or log2 or None"""

# Which model selection method should you use (e.g., hold-out validation, k-fold cross-validation)?

"""k-fold cross-validation: This is ideal because it splits the dataset into k subsets (e.g., k=5), trains the model on k-1 folds, and tests on the remaining fold. The process is repeated k times with different folds. This helps to reduce the risk of overfitting to a particular subset of data.

Why: It ensures that your model isn't overly tuned to any specific training or testing split and provides a more generalized measure of performance."""

# Which performance measure should you use for model selection (e.g., accuracy, F1-score)?

"""Accuracy is fine becuase i think that there wont be class imbalance in the classes, if it were we should use F1-score. Try both"""

# How do you ensure that your model selection process is fair and unbiased?

"""follow best practices in data splitting, hyperparameter tuning, and evaluation. Avoid Data Leakage. Avoid Overfitting. Ensure Class Balance """

# How can you ensure reproducibility of your results?

"""Set Random Seeds. Create a requirements.txt. Document Hyperparameters. Use Version Control. """


'Set Random Seeds. Create a requirements.txt. Document Hyperparameters. Use Version Control. '

In [76]:
# Model selection: choose n_estimators for the custom RandomForest by
# 5-fold cross-validation on the train/validation portion of the data.

# Fix the seed so the hold-out split and the fold assignment are repeatable.
seed = 0
np.random.seed(seed)

# Hold out 30% of the samples as a test set; it is not used anywhere in the
# search below.
X_train_val, X_test, y_train_val, y_test = train_test_split(
    X, y, test_size=0.3, random_state=seed, shuffle=True
)

kf = KFold(n_splits=5, shuffle=True, random_state=seed)

# Candidate values. Only n_estimators is searched in this cell;
# max_depth_params is not used here and is kept only in case another cell
# refers to it.
max_depth_params = [3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, None]
n_estimators = [2, 4, 8, 10, 20, 50, 100, 500]

# One [mean CV accuracy, n_estimators] row per candidate, in candidate order.
accuracies = []

for n_trees in n_estimators:
    fold_accuracies = []
    for train_index, val_index in kf.split(X_train_val):
        X_training, X_val = X_train_val[train_index], X_train_val[val_index]
        y_training, y_val = y_train_val[train_index], y_train_val[val_index]

        # Build a fresh model for every fold so that no fitted state from a
        # previous fold can carry over into this one.
        rf = RandomForest(
            n_estimators=n_trees, max_depth=10, criterion="gini", max_features="sqrt"
        )
        rf.fit(X_training, y_training)

        y_pred = rf.predict(X_val)
        fold_accuracies.append(accuracy_score(y_val, y_pred))

    # Average accuracy across all folds for this candidate.
    average_accuracy = np.mean(fold_accuracies)
    accuracies.append([average_accuracy, n_trees])
    print(f"Average accuracy: {average_accuracy} for n_estimators: {n_trees}")

accuracies = np.array(accuracies)
# argmax returns the first maximum, so ties go to the smaller forest.
best_acc = np.argmax(accuracies[:, 0])
best_accuracy = accuracies[best_acc, 0]
# Read the winner from the candidate list rather than from the float array,
# so it stays an int (20, not 20.0).
best_n_estimators = n_estimators[best_acc]

print("\n")
print(f"The best accuracy is {best_accuracy} with n_estimators at {best_n_estimators}")
print("\n")




Average accuracy: 0.9142857142857143 for max_depth: 2
Average accuracy: 0.9257142857142858 for max_depth: 4
Average accuracy: 0.9342857142857144 for max_depth: 8
Average accuracy: 0.9342857142857144 for max_depth: 10
Average accuracy: 0.9457142857142857 for max_depth: 20
Average accuracy: 0.9342857142857144 for max_depth: 50
Average accuracy: 0.9428571428571428 for max_depth: 100
Average accuracy: 0.9428571428571428 for max_depth: 500


The best accuracy is 0.9457142857142857 with max_depth at 20.0




In [65]:
# Compare to sklearn
# NOTE(review): this whole cell is disabled (every line is commented out).
# It would not work as written if uncommented:
#   - the RandomForestClassifier below is constructed but never assigned, so
#     `rf.fit` / `rf.predict` would use whatever `rf` is already bound to in
#     the kernel rather than the sklearn model;
#   - `X_train` / `y_train` are not the names produced by the train/test
#     split cell, which creates `X_train_val` / `y_train_val`.
# RandomForestClassifier(n_estimators=10, max_depth=5, criterion="entropy", max_features="sqrt")

# fold_accuracies = []
# for train_index, val_index in kf.split(X_train):
    
#     X_training, X_val = X_train[train_index], X_train[val_index]
#     y_training, y_val = y_train[train_index], y_train[val_index]
    
#     rf.fit(X_training, y_training)
    
    
#     y_pred = rf.predict(X_val)
    
#     accuracy = accuracy_score(y_val, y_pred)
#     fold_accuracies.append(accuracy)
    
#     print(f"Fold accuracy: {accuracy}")

# # Average accuracy across all folds
# average_accuracy = np.mean(fold_accuracies)
# print(f"Average accuracy: {average_accuracy}")