# Decision Tree Classifier

In [1]:
# Put the parent directory of the notebook's working directory (the project
# root) at the front of the Python import path, so project-local packages can
# be imported by the cells below.
import sys
import os

_PROJECT_ROOT = os.path.abspath('..')
# Guard against duplicates: re-running this cell must not keep prepending the
# same entry to sys.path.
if _PROJECT_ROOT not in sys.path:
    sys.path.insert(0, _PROJECT_ROOT)

In [2]:
# Name of the disease column handed to the preprocessing pipeline
DISEASE_COLNAME: str = "DISEASE"
# Directory where the datasets are stored, relative to the notebook's working
# directory (the recorded pipeline log shows one sub-directory per disease)
DATASET_PATH: str = "../../dataset/first_disease_sel/"

## Executing the pipeline

In [3]:
from analysis.preprocess import PreprocessPipeline

# Configure the preprocessing pipeline with the dataset location and the name
# of the disease column defined in the configuration cell.
pipeline = PreprocessPipeline(datasets_path=DATASET_PATH, disease_col_name=DISEASE_COLNAME)

# Run the pipeline. Later cells read pipeline.train_x / pipeline.train_y and
# pipeline.test_x / pipeline.test_y from this object.
pipeline.execute_pipeline()

INFO:root:Starting pipeline
INFO:root:Loading datasets
INFO:root:Inspecting directory ../../dataset/first_disease_sel/GS
INFO:root:Setting disease as GS
INFO:root:Loading file ../../dataset/first_disease_sel/GS/eGSE117146m.csv
INFO:root:Inspecting directory ../../dataset/first_disease_sel/MCM
INFO:root:Setting disease as MCM
INFO:root:Loading file ../../dataset/first_disease_sel/MCM/MCM_GSE149607.csv
INFO:root:Inspecting directory ../../dataset/first_disease_sel/NALD
INFO:root:Setting disease as NALD
INFO:root:Loading file ../../dataset/first_disease_sel/NALD/eGSE85804m.csv
INFO:root:Loading file ../../dataset/first_disease_sel/NALD/eGSE34308m.csv
INFO:root:Loading file ../../dataset/first_disease_sel/NALD/eGSE117647m.csv
INFO:root:Inspecting directory ../../dataset/first_disease_sel/A1A
INFO:root:Setting disease as A1A
INFO:root:Loading file ../../dataset/first_disease_sel/A1A/A1A_deficiency_GSE109516_pbmc.csv
INFO:root:Inspecting directory ../../dataset/first_disease_sel/DIABETE
INFO

# Building the Decision Tree Model

## Grid Search

In [4]:
from analysis.classifiers.dt import build_parameters

# Search for the decision-tree hyper-parameters on the training split exposed
# by the pipeline; the result is passed to build_model as `best_params`.
# NOTE(review): the recorded output of this cell reports "480 fits failed out
# of a total of 2400", with the traceback ending in sklearn's check_scalar --
# presumably one value in the parameter grid is out of range. The grid is
# defined inside analysis.classifiers.dt; verify it there.
hyper_params = build_parameters(train_x=pipeline.train_x, train_y=pipeline.train_y)

INFO:root:Executing Grid Search for Decision Tree
480 fits failed out of a total of 2400.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
480 fits failed with the following error:
Traceback (most recent call last):
  File "/home/federicosilvestri/Projects/Vedrai/chl-project/.venv/lib/python3.9/site-packages/sklearn/model_selection/_validation.py", line 686, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "/home/federicosilvestri/Projects/Vedrai/chl-project/.venv/lib/python3.9/site-packages/sklearn/tree/_classes.py", line 969, in fit
    super().fit(
  File "/home/federicosilvestri/Projects/Vedrai/chl-project/.venv/lib/python3.9/site-packages/sklearn/tree/_classes.py", line 247, in fit
    check_scalar(
  File "

INFO:root:Report of GridSearch
INFO:root:     rank_test_accuracy  mean_test_accuracy  mean_train_accuracy  \
196                   1            0.742599             0.965718   
57                    2            0.722542             0.894658   
199                   3            0.722429             0.933947   
230                   4            0.722203             0.907183   
38                    5            0.719209             0.944815   
..                  ...                 ...                  ...   
81                  476                 NaN                  NaN   
80                  477                 NaN                  NaN   
160                 478                 NaN                  NaN   
85                  479                 NaN                  NaN   
0                   480                 NaN                  NaN   

     std_test_accuracy  std_train_accuracy  mean_fit_time  \
196           0.040948            0.007202       0.059422   
57            0.0483

## Building the model

In [5]:
from analysis.classifiers.dt import build_model

# Build the decision tree from the training split and the hyper-parameters
# returned by build_parameters.
decision_tree = build_model(
    train_x=pipeline.train_x, train_y=pipeline.train_y, best_params=hyper_params
)

# Bare last expression so the notebook displays the resulting model.
decision_tree

## Evaluating the model

In [8]:
from analysis.classifiers.dt import evaluate_model

In [9]:
# Evaluate the model on the test split exposed by the pipeline.
evaluation = evaluate_model(
    decision_tree=decision_tree, test_x=pipeline.test_x, test_y=pipeline.test_y
)

# print() is kept on purpose: the recorded output is a multi-line text table
# (per-class precision / recall / f1-score / support), which presumably is a
# plain string and would be shown as an escaped repr by a bare expression.
print(evaluation)

              precision    recall  f1-score   support

         A1A       0.94      0.97      0.95        32
     DIABETE       0.71      0.77      0.74        44
          GS       0.12      0.14      0.13         7
         MCM       0.75      0.43      0.55         7
        NALD       0.29      0.20      0.24        10

    accuracy                           0.71       100
   macro avg       0.56      0.50      0.52       100
weighted avg       0.70      0.71      0.70       100



In [10]:
from sklearn import tree

In [13]:
# Dump the tree in Graphviz DOT format to "tree.dot" in the current working
# directory, with filled (coloured) nodes.
tree.export_graphviz(
    decision_tree,
    filled=True,
    out_file="tree.dot",
)