**Decision Tree Classifier**

Library imports

In [1]:
import os
import sys
from pathlib import Path
import matplotlib.pyplot as plt
from sklearn.tree import DecisionTreeClassifier

sys.path.insert(0, str(Path.cwd().parent))

Data Imports

In [4]:
from utils.utils import save_experiment, train_and_evaluate_decision_tree, grid_search
from configs.config_local import DATASET_PATH, ITW_DATASET_PATH, FEATURES_DIR

Training / Testing Validation features:
Using mean aggregated, N_MFCC = 20, N_FTT = 128, HOP_LENGTH = 256, N_MELS = 128

In [5]:
train_path = os.path.join(FEATURES_DIR, "training_features_mean_20_128_256_128.parquet")
val_path = os.path.join(FEATURES_DIR, "validation_features_mean_20_128_256_128.parquet")
test_path = os.path.join(FEATURES_DIR, "testing_features_mean_20_128_256_128.parquet")

Training with default hyperparameters

In [4]:
clf, metrics, dt_params, feature_names, metadata_extra = train_and_evaluate_decision_tree(
    train_path=train_path,
    val_path=val_path,
    test_path=None,
)
print(metadata_extra)
print(metrics)

{'train_samples': 53864, 'val_samples': 10797}
{'accuracy': 0.9623969621191072, 'precision': 0.9624331701550768, 'recall': 0.9623973735478988, 'f1': 0.9623962495603613, 'roc_auc': 0.9800186674339881}


In [5]:
clf, metrics, dt_params, feature_names, metadata_extra = train_and_evaluate_decision_tree(
    train_path=train_path,
    val_path=test_path,
    test_path=None,
)
print(metadata_extra)
print(metrics)

{'train_samples': 53864, 'val_samples': 4634}
{'accuracy': 0.8970651704790678, 'precision': 0.8972262590694, 'recall': 0.8973915701271786, 'f1': 0.8970611390103391, 'roc_auc': 0.9139537765949516}


Training with default hyperparameters using Entropy

In [6]:
clf, metrics, dt_params, feature_names, metadata_extra = train_and_evaluate_decision_tree(
    train_path=train_path,
    val_path=val_path,
    test_path=None,
    criterion="entropy"
)
print(metadata_extra)
print(metrics)

{'train_samples': 53864, 'val_samples': 10797}
{'accuracy': 0.9656386033157358, 'precision': 0.9656390394489773, 'recall': 0.9656385601302122, 'f1': 0.9656385927044688, 'roc_auc': 0.9883596862207614}


Training with weighted fake class

In [27]:

params = {
            "max_depth": 11,
            "min_samples_split": 5,
            "min_samples_leaf": 2,
            "max_features": None,
            "random_state": 42,
            "class_weight": {0: 1, 1: 5}
        }

clf, metrics, dt_params, feature_names, metadata_extra = train_and_evaluate_decision_tree(
    train_path=train_path,
    val_path=None,
    test_path=test_path,
    criterion="gini",
    dt_params=params
)
print(metadata_extra)
print(metrics)

{'train_samples': 53864, 'test_samples': 4634}
{'accuracy': 0.9512300388433319, 'precision': 0.9535362348208523, 'recall': 0.9504933205111001, 'f1': 0.9510876794179715, 'roc_auc': 0.9584143109540636}


**Validate using the ITW Dataset**

In [10]:

itw_val_path = os.path.join(ITW_DATASET_PATH, "normalized_features", "itw_features_20_128_256_128_trimmed_loudness_normalized.parquet")

In [8]:
clf, metrics, dt_params, feature_names, metadata_extra = train_and_evaluate_decision_tree(
    train_path=train_path,
    val_path=itw_val_path,
    test_path=None,
    criterion="gini"
)
print(metadata_extra)
print(metrics)

{'train_samples': 53864, 'val_samples': 31526}
{'accuracy': 0.6302734251094335, 'precision': 0.5850650309759771, 'recall': 0.557001562364462, 'f1': 0.545858275611627, 'roc_auc': 0.5750994588912883}


Validation using weighted fake class

In [20]:

params = {
            "max_depth": 5,
            "min_samples_split": 5,
            "min_samples_leaf": 2,
            "max_features": None,
            "random_state": 42,
            "class_weight": {0: 1, 1: 5}
        }

clf, metrics, dt_params, feature_names, metadata_extra = train_and_evaluate_decision_tree(
    train_path=train_path,
    val_path=itw_val_path,
    test_path=None,
    criterion="gini",
    dt_params=params
)
print(metadata_extra)
print(metrics)

{'train_samples': 53864, 'val_samples': 31526}
{'accuracy': 0.6147306984711032, 'precision': 0.5917353206416675, 'recall': 0.59335470705618, 'f1': 0.592342918932419, 'roc_auc': 0.6428275253503631}


**Hyperparameter Grid Search**

In [None]:
params = {
    "max_depth": [x for x in range(5, 20)],
    "min_samples_split": [2, 5, 10],
    "min_samples_leaf": [1, 2, 4],
    "max_features": [None, "sqrt", "log2"],
    "class_weight": [{0: 1, 1: 5}, None],
    "criterion": ["gini", "entropy"]
}

model = DecisionTreeClassifier(random_state=42)

clf, metrics, dt_params, feature_names, metadata_extra = grid_search(
    train_path=train_path,
    val_path=val_path,
    test_path=None,
    params=params
)