# Decision Tree Classifier Experiments on Dermatology Dataset

In this section, we will:
- Load the prepared train/test splits from the data preparation notebook
- Train a Decision Tree Classifier (using information gain/entropy) for each split
- Visualize each resulting tree using Graphviz
- Evaluate each model on its test set using classification metrics and confusion matrices

This workflow enables a comprehensive comparison of model performance across different train/test proportions.

In [None]:
# Imports
from sklearn.tree import DecisionTreeClassifier, export_graphviz
from sklearn.metrics import classification_report, confusion_matrix
import matplotlib.pyplot as plt
import seaborn as sns
import graphviz

In [None]:
# Load prepared data splits
%run 03_additional_dataset_preparation.ipynb


try:
    _ = feature_train_40_60
except NameError:
    raise RuntimeError("Please run the data preparation notebook first to define the train/test split variables in memory.")

splits = [
    ("40/60", feature_train_40_60, feature_test_40_60, label_train_40_60, label_test_40_60),
    ("60/40", feature_train_60_40, feature_test_60_40, label_train_60_40, label_test_60_40),
    ("80/20", feature_train_80_20, feature_test_80_20, label_train_80_20, label_test_80_20),
    ("90/10", feature_train_90_10, feature_test_90_10, label_train_90_10, label_test_90_10),
]

In [None]:
# Train, visualize, and evaluate Decision Tree for each split
for ratio, X_train, X_test, y_train, y_test in splits:
    print(f"\n=== Decision Tree for {ratio} split ===")
    # criterion='entropy' selects splits by information gain; random_state is pinned
    # so that re-running the cell rebuilds the same tree.
    clf = DecisionTreeClassifier(criterion='entropy', random_state=42)
    clf.fit(X_train, y_train)

    # clf.classes_ holds the sorted labels seen during fit. Using it as the single
    # source of class ordering keeps the tree rendering, the confusion matrix rows and
    # columns, and the heatmap tick labels aligned with each other.
    class_labels = [str(cls) for cls in clf.classes_]

    # Visualize the tree using Graphviz
    dot_data = export_graphviz(
        clf, out_file=None,
        # Plain list rather than a pandas Index (assumes X_train is a DataFrame,
        # as the original code did).
        feature_names=list(X_train.columns),
        class_names=class_labels,
        filled=True, rounded=True, special_characters=True,
    )
    graph = graphviz.Source(dot_data)
    display(graph)

    # Predict and evaluate
    y_pred = clf.predict(X_test)
    print(f"\nClassification Report for ({ratio} split):")
    print(classification_report(y_test, y_pred))

    # Compute the matrix once and pin its axes to the trained classes. Without
    # `labels`, the axes are inferred from the union of y_test and y_pred, which can
    # differ from the classes present in y_test alone and would mislabel the heatmap.
    cm = confusion_matrix(y_test, y_pred, labels=clf.classes_)
    print(f"Crude Confusion matrix for ({ratio} split):")
    print(cm)

    # Colorful Confusion Matrix Visualization
    fig, ax = plt.subplots(figsize=(6, 5))
    sns.heatmap(cm, annot=True, fmt='d', cmap='Blues', cbar=True, ax=ax,
                xticklabels=class_labels, yticklabels=class_labels,
                linewidths=2, linecolor='black', square=True)
    ax.set_xlabel('Predicted label', fontsize=12)
    ax.set_ylabel('True label', fontsize=12)
    ax.set_title(f'Heatmap Confusion matrix ({ratio} split)', fontsize=14)
    # The heatmap's QuadMesh is the first collection on the axes; its colorbar is
    # retrieved from there to attach a label.
    cbar = ax.collections[0].colorbar
    cbar.set_label('Number of samples', rotation=270, labelpad=15)
    plt.show()