# Phase 3: Classification with Decision Trees (Inspired by Lab Guide)

In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier, plot_tree
from sklearn.metrics import accuracy_score, confusion_matrix, ConfusionMatrixDisplay
import matplotlib.pyplot as plt
import joblib

In [None]:
# Read the cleaned dataset from disk, then print its first five rows
# as a quick sanity check that the file was parsed as expected.
df = pd.read_csv(filepath_or_buffer='cleaned_data (1).csv')
print(df.head(n=5))

In [None]:
# Split the frame into features (x) and target (y).
# The target is excluded by name rather than by position ("everything but the
# last column"), so the label can never leak into the features if the column
# order of the CSV changes. When the target is the last column this yields
# exactly the same feature list as the positional slice did.
target_column = 'Accident_Severity'
fn = [col for col in df.columns if col != target_column]
x = df[fn]
y = df[target_column]

In [None]:

def evaluate_model(X_train, X_test, y_train, y_test, criterion):
    """Fit a decision tree with the given criterion and report test performance.

    Trains a ``DecisionTreeClassifier`` (``random_state=42``) on the training
    data, prints the accuracy on the test data, and shows a confusion-matrix
    plot titled with the criterion.

    Args:
        X_train: Training features.
        X_test: Test features.
        y_train: Training labels.
        y_test: Test labels.
        criterion: Split-quality criterion forwarded to the classifier
            (this notebook calls it with "gini" and "entropy").

    Returns:
        A ``(clf, acc)`` tuple: the fitted classifier and its test accuracy.
    """
    clf = DecisionTreeClassifier(criterion=criterion, random_state=42)
    clf.fit(X_train, y_train)
    y_pred = clf.predict(X_test)
    acc = accuracy_score(y_test, y_pred)
    # Pin the matrix rows/columns to the classifier's classes. Without
    # ``labels`` the matrix only covers classes that occur in y_test/y_pred,
    # so a class missing from this test split would make the matrix smaller
    # than ``display_labels`` and break (or mislabel) the plot.
    cm = confusion_matrix(y_test, y_pred, labels=clf.classes_)
    print(f"Criterion: {criterion}, Accuracy: {acc:.4f}")
    ConfusionMatrixDisplay(cm, display_labels=clf.classes_).plot()
    plt.title(f"Confusion Matrix ({criterion})")
    plt.show()
    return clf, acc

In [None]:
# Evaluate both split criteria on three train/test proportions and collect
# one result row (sizes, criterion, accuracy) per combination.
splits = [(0.9, 0.1), (0.8, 0.2), (0.7, 0.3)]
results = []
for train_size, test_size in splits:
    print(f"\n--- Train/Test Split: {int(train_size*100)}/{int(test_size*100)} ---")
    # Fixed random_state keeps each split reproducible between runs.
    X_train, X_test, y_train, y_test = train_test_split(
        x,
        y,
        train_size=train_size,
        test_size=test_size,
        random_state=42,
    )
    for criterion in ("gini", "entropy"):
        clf, acc = evaluate_model(X_train, X_test, y_train, y_test, criterion)
        results.append(
            {
                "Train Size": train_size,
                "Test Size": test_size,
                "Criterion": criterion,
                "Accuracy": acc,
            }
        )


In [None]:
# Tabulate the collected result rows and print them under a heading.
results_df = pd.DataFrame(data=results)
print("\nSummary of Results:", results_df, sep="\n")


In [None]:
 # Final model used for saving/loading: entropy criterion, fitted on a 70/30 split.
X_train, X_test, y_train, y_test = train_test_split(
    x,
    y,
    train_size=0.7,
    test_size=0.3,
    random_state=42,
)
final_model = DecisionTreeClassifier(random_state=42, criterion="entropy")
# Kept as a bare expression so the notebook cell still echoes the fitted estimator.
final_model.fit(X_train, y_train)

In [None]:
# Save the model
joblib.dump(final_model, "decision_tree_model.pkl")

import pandas as pd

# The sample passed to predict() must have the same columns, in the same
# order, as the data the model was trained on.
feature_names = X_train.columns

# Placeholder sample: a few leading values, zero-padded (or truncated) to the
# actual number of training features so the row always matches the columns.
# A fixed-length row would raise a ValueError for any other feature count.
# NOTE(review): these leading values look like generic demo numbers rather
# than a record from this dataset; replace with a realistic row to get a
# meaningful prediction.
sample_values = [5.1, 3.5, 1.4, 0.2]
n_features = len(feature_names)
sample_row = (sample_values + [0] * n_features)[:n_features]
new_sample = pd.DataFrame([sample_row], columns=feature_names)

# Reload the persisted model and predict with it to confirm the round trip works.
loaded_model = joblib.load("decision_tree_model.pkl")
prediction = loaded_model.predict(new_sample)
print("Predicted class for new sample:", prediction[0])


In [None]:
from sklearn.tree import plot_tree
import matplotlib.pyplot as plt

# Convert the fitted model's class labels to strings for use as class names
# in the plot.
class_names_str = list(map(str, final_model.classes_))

# Draw the fitted final tree with colour-filled nodes, labelled with the
# feature column names and the class names.
plt.figure(figsize=(12, 8))
plot_tree(
    final_model,
    filled=True,
    class_names=class_names_str,
    feature_names=x.columns.tolist(),
)
plt.title("Final Decision Tree (Entropy, 70/30)")
plt.show()



In [None]:
from sklearn.tree import DecisionTreeClassifier, plot_tree
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
import matplotlib.pyplot as plt

# Try different test sizes and both criteria: for every combination, train a
# tree, print its test accuracy, and plot the fitted tree.
split_sizes = [0.3, 0.2, 0.1]
criteria = ['gini', 'entropy']

# Feature names as a plain list of strings, matching how the final-tree plot
# passes them to plot_tree.
feature_name_list = list(x.columns.astype(str))

for split in split_sizes:
    # The split depends only on the test fraction (random_state is fixed), so
    # compute it once per fraction instead of once per criterion.
    X_train, X_test, y_train, y_test = train_test_split(
        x, y, test_size=split, random_state=42
    )

    for criterion in criteria:
        print(f"\n🔹 Criterion: {criterion.upper()}, Split: {int((1-split)*100)}/{int(split*100)}")

        # Train the model
        model = DecisionTreeClassifier(criterion=criterion, random_state=42)
        model.fit(X_train, y_train)

        # Evaluate
        y_pred = model.predict(X_test)
        acc = accuracy_score(y_test, y_pred)
        print(f"Accuracy: {acc:.2f}")

        # Plot the tree
        plt.figure(figsize=(13, 2))
        plot_tree(
            model,
            feature_names=feature_name_list,
            class_names=[str(cls) for cls in model.classes_],
            filled=True
        )
        plt.title(f"Decision Tree ({criterion.capitalize()}) — Split {int((1-split)*100)}/{int(split*100)}")
        plt.show()
