In [1]:
import pandas as pd
from dotenv import load_dotenv
import os
from sklearn.model_selection import train_test_split
from sklearn.preprocessing	import OneHotEncoder
from datetime import datetime
import pickle
import optuna
from sklearn.model_selection import cross_val_score
from sklearn.metrics import accuracy_score, make_scorer
from sklearn.tree import DecisionTreeClassifier

load_dotenv()
os.chdir(os.getenv("ROOT_DIR"))

from src import utils

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# --- PARAMS --- #
prep_id = 'prep_04'


In [3]:
# --- LOAD --- #
x_train, y_train, x_test, y_test = utils.get_prep_df(prep_id)
print(x_train.shape, y_train.shape, x_test.shape, y_test.shape)

(712, 14) (712, 1) (179, 14) (179, 1)


In [4]:
############################################
### DECISION-TREE-GDOC
### sklearn.tree.DecisionTreeClassifier
### sklearn v.1.3.2.
############################################

### Main ref ----------------------------------------------------------------
# - https://scikit-learn.org/stable/modules/tree.html#tree

### Main def --------------------------------
# Decision Trees (DTs) are a non-parametric supervised learning method 
# used for classification and regression. The goal is to create a model 
# that predicts the value of a target variable by learning simple decision 
# rules inferred from the data features. A tree can be seen as a 
# piecewise constant approximation.

### Main advantages --------------------------------
# - Simple to understand and to interpret. Trees can be visualized. White box model.
# - Fast: The cost of using the tree (i.e., predicting data) is logarithmic 
# in the number of data points used to train the tree.

### Main disadvantages --------------------------------
# - Decision-tree learners can create over-complex trees that do not 
# generalize the data well (overfitting).
# - Decision trees can be unstable because small variations in the 
# data might result in a completely different tree being generated. 
# This problem is mitigated by using decision trees within an ensemble.
# - The problem of learning an optimal decision tree is known to be NP-complete 
# under several aspects of optimality and even for simple concepts. 
# Consequently, practical decision-tree learning algorithms are based on 
# heuristic algorithms such as the greedy algorithm where locally optimal 
# decisions are made at each node. Such algorithms cannot guarantee to 
# return the globally optimal decision tree. This can be mitigated by 
# training multiple trees in an ensemble learner, where the features 
# and samples are randomly sampled with replacement.
# - Decision tree learners create biased trees if some classes dominate. 
# It is therefore recommended to BALANCE the dataset prior 
# to fitting with the decision tree.

### Implementation Highlights -------------------------------------------------
# - DecisionTreeClassifier is a class capable of performing multi-class 
# classification on a dataset
# - the scikit-learn implementation does not support categorical variables.

### Signature ----------------------------------------------------------------
# ref: https://scikit-learn.org/stable/modules/generated/sklearn.tree.DecisionTreeClassifier.html#sklearn.tree.DecisionTreeClassifier

#DecisionTreeClassifier(
# criterion='gini', # metric to define que quality of a split.
# splitter='best', # The best possible split is used or we use a random split?
# max_depth=None, #!key to control overfitting
# min_samples_split=2, 
# min_samples_leaf=1, #!key to create asymmetric trees. 
# min_weight_fraction_leaf=0.0, 
# max_features=None, # how many features consider when lookint for the best split. See doc.
# random_state=None, 
# max_leaf_nodes=None, 
# min_impurity_decrease=0.0, 
# class_weight=None, #!  dict, list of dict or “balanced”, default=None
# ccp_alpha=0.0
#)

#DecisionTreeClassifier(
# criterion='gini',
# splitter='best', 
# max_depth=None, 
# min_samples_split=2, 
# min_samples_leaf=1, 
# min_weight_fraction_leaf=0.0, 
# max_features=None, 
# random_state=None, 
# max_leaf_nodes=None, 
# min_impurity_decrease=0.0, 
# class_weight=None, 
# ccp_alpha=0.0
#)

### Hyperparams Comments  --------------------------------
# - The max_depth hyperparameter controls the overall complexity of the tree. 
# This parameter is adequate under the assumption that a tree is built symmetrically. 
# However, there is no reason why a tree should be symmetrical. Indeed, optimal generalization 
# performance could be reached by growing some of the branches deeper than some others.

# The hyperparameters min_samples_leaf, min_samples_split, 
# max_leaf_nodes, or min_impurity_decrease allow growing asymmetric trees and apply 
# a constraint at the leaves or nodes level. 
# ref. https://inria.github.io/scikit-learn-mooc/python_scripts/trees_hyperparameters.html

In [5]:
# --- TUNNING ---
# --- Objective function: define the optimization metrics ---
# (where, for what model, what metric)

def objective(trial, x, y):
	
				# define search space
				params = {
								'criterion': trial.suggest_categorical('criterion', ['gini', 'entropy']),
								'splitter': trial.suggest_categorical('splitter', ['best', 'random']),
								'max_depth': trial.suggest_int('max_depth', 1, 10),
								'min_samples_split': trial.suggest_int('min_samples_split', 2, 32),
								'min_samples_leaf': trial.suggest_int('min_samples_leaf', 1, 32),
								'random_state': 0
				}

				# define model
				clf	= DecisionTreeClassifier(**params)

				# Eval strategy
				# It gives me the list of k scores from cv
				cv_scores = cross_val_score(clf, x, y, cv=3, scoring='accuracy')

				obj = cv_scores.mean()
				
				# return score for each trial
				return obj

w_func = lambda trial: objective(trial, x=x_train, y=y_train) # just a wrapper	to pass x and y

# --- study: define searching strategy (How) ---

# sampler=TPESampler(): Optuna uses the Tree-structured Parzen Estimator (TPE) 
# sampler, which is a Bayesian optimization method that efficiently searches 
# through the hyperparameter space.

# pruner=MedianPruner(): Optuna can prune unpromising trials early, based on the median 
# value of intermediate results. This speeds up the optimization process by discarding 
# poor-performing hyperparameter combinations early.

study=optuna.create_study(
	sampler= optuna.samplers.TPESampler(),
	pruner=optuna.pruners.MedianPruner(),
	direction='maximize'
	)

study.set_metric_names(["accuracy"])


# -- Start optimizing ---
study.optimize(
	func=w_func, 
	n_trials=100,
	timeout=None, # max time in seconds
	n_jobs=-1 # max job  in parallel. -1 = all cpus
	)

[I 2024-10-23 06:08:40,112] A new study created in memory with name: no-name-95201e18-fdbd-442d-999c-f85bf5774972
  study.set_metric_names(["accuracy"])
[I 2024-10-23 06:08:40,291] Trial 2 finished with value: {'accuracy': 0.7711000011819075} and parameters: {'criterion': 'gini', 'splitter': 'random', 'max_depth': 9, 'min_samples_split': 17, 'min_samples_leaf': 3}. Best is trial 2 with value: 0.7711000011819075.
[I 2024-10-23 06:08:40,303] Trial 10 finished with value: {'accuracy': 0.7767022420782658} and parameters: {'criterion': 'gini', 'splitter': 'random', 'max_depth': 9, 'min_samples_split': 27, 'min_samples_leaf': 23}. Best is trial 10 with value: 0.7767022420782658.
[I 2024-10-23 06:08:40,354] Trial 1 finished with value: {'accuracy': 0.7724946518691865} and parameters: {'criterion': 'gini', 'splitter': 'random', 'max_depth': 7, 'min_samples_split': 31, 'min_samples_leaf': 14}. Best is trial 10 with value: 0.7767022420782658.
[I 2024-10-23 06:08:40,387] Trial 6 finished with val

In [6]:
# save tunning results
utils.tunning_results(study, os.getenv("ARTIFACTS_PATH"))

Results saved in artifacts/model_04/tunning


{'best_params': {'criterion': 'entropy',
  'splitter': 'best',
  'max_depth': 4,
  'min_samples_split': 9,
  'min_samples_leaf': 10},
 'best_value': 0.8047961800754058,
 'metric_names': ['accuracy']}

In [7]:
# --- TRAIN ---
x_full = pd.concat([x_train, x_test])
y_full = pd.concat([y_train, y_test])

best_params = study.best_params
clf = DecisionTreeClassifier(**best_params)
model = clf.fit(x_full, y_full)

# --- save as pickle ---
artifact_path = os.path.join(os.getenv("ARTIFACTS_PATH"), utils.get_nb_name(), 'model')
os.makedirs(artifact_path, exist_ok=True)

with open(os.path.join(artifact_path, 'model.pkl'), 'wb') as f:
    pickle.dump(model, f)

print(f"Model saved at: {os.path.join(artifact_path, 'model.pkl')}")
print(f'Timestamp: {datetime.now().strftime("%Y-%m-%d %H:%M:%S")}')


Model saved at: artifacts/model_04/model/model.pkl
Timestamp: 2024-10-23 06:08:58
