# Phase 3: Classification with Decision Trees (Inspired by Lab Guide)

In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier, plot_tree
from sklearn.metrics import accuracy_score, confusion_matrix, ConfusionMatrixDisplay
import matplotlib.pyplot as plt
import joblib

In [None]:
# Read the dataset into a DataFrame (Iris dataset assumed).
csv_path = "iris.csv"  # Replace with actual file if different
data = pd.read_csv(csv_path)

In [None]:
# Split the frame into features and label by position:
#   X -> every column except the last (for the standard Iris CSV these are
#        the 4 measurements; NOTE(review): an extra leading Id column, if
#        present, would be included as a feature - confirm the file layout)
#   y -> the last column, used as the class label
X, y = data.iloc[:, :-1], data.iloc[:, -1]

In [None]:
# Define function to evaluate model
def evaluate_model(X_train, X_test, y_train, y_test, criterion):
    """Fit a decision tree with the given criterion and report test results.

    Trains a ``DecisionTreeClassifier`` (``random_state=42``) on the training
    data, prints the accuracy on the test data, and shows a confusion-matrix
    plot titled with the criterion.

    Args:
        X_train: Training features.
        X_test: Test features.
        y_train: Training labels.
        y_test: Test labels.
        criterion: Split-quality criterion forwarded to the classifier
            (this notebook uses "gini" and "entropy").

    Returns:
        A ``(clf, acc)`` tuple: the fitted classifier and its accuracy on
        the test set.
    """
    clf = DecisionTreeClassifier(criterion=criterion, random_state=42)
    clf.fit(X_train, y_train)
    y_pred = clf.predict(X_test)
    acc = accuracy_score(y_test, y_pred)
    # Pin rows/columns to the classifier's class order. Without `labels`,
    # the matrix only covers classes that occur in y_test/y_pred, so a small
    # test split missing a class would no longer line up with the
    # `display_labels` passed below.
    cm = confusion_matrix(y_test, y_pred, labels=clf.classes_)
    print(f"Criterion: {criterion}, Accuracy: {acc:.4f}")
    ConfusionMatrixDisplay(cm, display_labels=clf.classes_).plot()
    plt.title(f"Confusion Matrix ({criterion})")
    plt.show()
    return clf, acc

In [None]:
# Compare both split criteria across three train/test proportions.
# Each entry is (train fraction, test fraction).
splits = [(0.9, 0.1), (0.8, 0.2), (0.7, 0.3)]
results = []

for train_size, test_size in splits:
    header = f"\n--- Train/Test Split: {int(train_size*100)}/{int(test_size*100)} ---"
    print(header)
    # Fixed random_state so every run (and both criteria) see the same split.
    X_train, X_test, y_train, y_test = train_test_split(
        X,
        y,
        train_size=train_size,
        test_size=test_size,
        random_state=42,
    )
    for criterion in ("gini", "entropy"):
        clf, acc = evaluate_model(X_train, X_test, y_train, y_test, criterion)
        # One summary row per (split, criterion) combination.
        row = {
            "Train Size": train_size,
            "Test Size": test_size,
            "Criterion": criterion,
            "Accuracy": acc,
        }
        results.append(row)

In [None]:
# Collect the per-run rows into a table and print it under a heading.
results_df = pd.DataFrame(results)
print("\nSummary of Results:", results_df, sep="\n")

In [None]:
# Final model for saving/loading: entropy criterion, trained on a 70/30 split.
# Same random_state as the experiment loop, so this reproduces its 70/30 split.
final_split = train_test_split(
    X, y, train_size=0.7, test_size=0.3, random_state=42
)
X_train, X_test, y_train, y_test = final_split
final_model = DecisionTreeClassifier(criterion="entropy", random_state=42)
final_model.fit(X_train, y_train)

In [None]:
# Persist the fitted final model to disk with joblib.
joblib.dump(value=final_model, filename="decision_tree_model.pkl")

In [None]:
# Load the model and predict a new sample
# NOTE: joblib.load unpickles the file, which can execute arbitrary code;
# only load model files that you created or otherwise trust.
loaded_model = joblib.load("decision_tree_model.pkl")
# Build the sample as a DataFrame with the training column names. The model
# was fitted on a DataFrame, so predicting on a bare nested list makes
# scikit-learn warn that X does not have valid feature names.
# The four values assume the same feature order as the columns of X.
new_sample = pd.DataFrame([[5.1, 3.5, 1.4, 0.2]], columns=X.columns)  # Example sample
prediction = loaded_model.predict(new_sample)
print("\nPredicted class for new sample:", prediction[0])

In [None]:
# Visualize the decision tree
plt.figure(figsize=(12, 8))
# plot_tree expects plain lists of strings for the names. Passing a pandas
# Index / numpy array is rejected by parameter validation in some
# scikit-learn releases, and non-string class labels (e.g. integer-encoded
# targets) cannot be rendered into the node text, so convert both explicitly.
plot_tree(
    final_model,
    feature_names=[str(column) for column in X.columns],
    class_names=[str(label) for label in final_model.classes_],
    filled=True,
)
plt.title("Final Decision Tree (Entropy, 70/30)")
plt.show()