## 📌 Model Training & Data Analysis Script
This Colab script:
- Loads and preprocesses a dataset (human or animal dataset)
- Trains multiple machine learning models
- Evaluates models and selects the best one
- Generates various performance metrics, visualizations, and feature importance scores
- Provides downloadable trained models

In [None]:
# Install necessary libraries (if not already installed)
!pip install pandas scikit-learn matplotlib seaborn joblib catboost --quiet
# Import required libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import scipy.stats as stats
import itertools
import joblib
import os

from google.colab import files
from sklearn.model_selection import train_test_split, cross_val_score, StratifiedKFold
from sklearn.preprocessing import LabelEncoder, StandardScaler, PolynomialFeatures
from sklearn.ensemble import RandomForestClassifier, ExtraTreesClassifier, GradientBoostingClassifier
from sklearn.inspection import partial_dependence, PartialDependenceDisplay
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.svm import SVC
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
from catboost import CatBoostClassifier


# Define dataset type: "humans" or "animals".
# The value is used in confusion-matrix plot titles and as a prefix for the
# saved model / label-encoder / scaler filenames and the output folder name.
dataset_type = "animals"

In [None]:
# ================================
# 📌 Main Functions
# ================================

# Function to plot and save confusion matrix
def plot_confusion_matrix(y_true, y_pred, labels, model_name, dataset_type, dataset_split):
    """Render a confusion-matrix heatmap to a PNG file and return the file's name.

    Args:
        y_true: Ground-truth labels.
        y_pred: Predicted labels, aligned with ``y_true``.
        labels: Tick labels drawn on both heatmap axes.
        model_name: Model identifier used in the plot title and the filename.
        dataset_type: Dataset identifier used in the plot title and the filename.
        dataset_split: Split identifier (e.g. "Train" or "Test") used in the
            plot title and the filename.

    Returns:
        The PNG filename (relative to the current working directory).
    """
    matrix = confusion_matrix(y_true, y_pred)

    # Work on an explicit figure/axes pair instead of pyplot's implicit state.
    fig, ax = plt.subplots(figsize=(6, 5))
    sns.heatmap(
        matrix,
        annot=True,
        fmt='d',
        cmap="Blues",
        xticklabels=labels,
        yticklabels=labels,
        ax=ax,
    )
    ax.set_xlabel("Predicted")
    ax.set_ylabel("Actual")
    ax.set_title(f"Confusion Matrix - {model_name} ({dataset_type}) [{dataset_split}]")

    # Only the filename is handed back; the image itself lives on disk.
    img_filename = f"{model_name}_{dataset_type}_confusion_matrix_{dataset_split}.png"
    fig.savefig(img_filename)
    plt.close(fig)  # Prevent extra display in notebooks

    return img_filename


def process_and_train_model(filename, use_cross_validation=True, cv_folds=5, model_choices=None):
    """
    Load a CSV dataset, train the selected classifiers, and pick the best one.

    The "Class" column is label-encoded and used as the target; every other
    column is used as a feature. The data is split 80/20 into train/test sets,
    the features are standardized with a scaler fitted on the training split
    only, and each selected model is fitted and scored on both splits. The
    model with the highest test accuracy is saved to disk together with the
    label encoder and the scaler; the filenames are prefixed with the
    module-level ``dataset_type``.

    Args:
        filename: Path of the CSV file to load. It must contain a "Class" column.
        use_cross_validation: When True, each model's accuracy is additionally
            estimated with stratified k-fold cross-validation on the (scaled)
            training split.
        cv_folds: Requested number of cross-validation folds. The number is
            capped at the size of the smallest training class; if fewer than
            two folds are possible, cross-validation is skipped.
        model_choices: A model name or an iterable of model names to train.
            ``None`` trains every available model.

    Returns:
        A tuple ``(df, X, y, X_test, y_test, label_encoder, best_model, results)``.
        ``df`` is the loaded frame with the target column encoded in place and
        ``results`` maps each trained model name to a dict with the keys
        "model", "train_accuracy", "test_accuracy", "classification_report",
        "confusion_matrix_train", "confusion_matrix_test" (PNG filenames),
        "misclassified_samples", "cv_scores", "cv_mean_accuracy" and
        "cv_std_accuracy" (the three CV entries are ``None`` when
        cross-validation did not run).

    Raises:
        KeyError: If ``model_choices`` names a model that is not available.
        ValueError: If ``model_choices`` is empty.
    """

    # Load the dataset
    df = pd.read_csv(filename)
    print("Loaded")
    print(df.head())

    # Encode target variable
    target_column = "Class"
    label_encoder = LabelEncoder()
    df[target_column] = label_encoder.fit_transform(df[target_column])

    # Prepare feature set and target
    X = df.drop(columns=[target_column])
    y = df[target_column]

    # Split into training and testing sets (80% train, 20% test)
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

    # Standardize features (fit on the training split only to avoid leakage)
    scaler = StandardScaler()
    X_train_scaled = scaler.fit_transform(X_train)
    X_test_scaled = scaler.transform(X_test)

    # Define models
    models = {
        "RandomForest": RandomForestClassifier(n_estimators=100, random_state=42),
        "XGBoost": XGBClassifier(n_estimators=100, use_label_encoder=False, eval_metric='mlogloss', random_state=42),
        "LightGBM": LGBMClassifier(n_estimators=100, random_state=42, verbose=-1),
        "CatBoost": CatBoostClassifier(iterations=100, verbose=0, random_state=42),
        "GradientBoosting": GradientBoostingClassifier(n_estimators=100, random_state=42, verbose=0),
        "MLP": MLPClassifier(hidden_layer_sizes=(100, 50), max_iter=300, activation='relu', solver='adam', random_state=42, verbose=0),
        "ExtraTrees": ExtraTreesClassifier(n_estimators=100, random_state=42),
        "KNN": KNeighborsClassifier(n_neighbors=5),
        "NaiveBayes": GaussianNB(),
        "SVM": SVC(kernel='rbf', probability=True, random_state=42)
    }

    # Resolve and validate the model selection before any (slow) training starts
    if model_choices is None:
        model_choices = list(models)  # Default: train all models
    elif isinstance(model_choices, str):
        model_choices = [model_choices]  # Accept a single model name
    else:
        model_choices = list(model_choices)

    unknown_models = [name for name in model_choices if name not in models]
    if unknown_models:
        raise KeyError(f"Unknown model name(s): {unknown_models}. Available models: {list(models)}")
    if not model_choices:
        raise ValueError("model_choices is empty; select at least one model to train.")

    # Build the cross-validation splitter once; it is shared by all models.
    cv_splitter = None
    if use_cross_validation:
        # Stratified folds need at least `n_splits` samples of every class,
        # so cap the fold count at the smallest class in the training split.
        smallest_class_size = int(y_train.value_counts().min())
        n_splits = min(cv_folds, smallest_class_size)
        if n_splits >= 2:
            cv_splitter = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=42)
        else:
            print("Skipping cross-validation: fewer than 2 folds are possible for this dataset.")

    results = {}

    # Train and evaluate each selected model
    for model_name in model_choices:
        print(f"\nTraining {model_name}...")
        model = models[model_name]

        # cross_val_score fits clones of the estimator, so `model` itself is
        # still unfitted afterwards and is trained on the full training split below.
        cv_scores = None
        if cv_splitter is not None:
            cv_scores = cross_val_score(model, X_train_scaled, y_train, cv=cv_splitter, scoring="accuracy")
            print(f"{model_name} CV Accuracy ({cv_splitter.get_n_splits()}-fold): {cv_scores.mean():.4f} +/- {cv_scores.std():.4f}")

        model.fit(X_train_scaled, y_train)

        # Predictions
        y_train_pred = model.predict(X_train_scaled)
        y_test_pred = model.predict(X_test_scaled)

        # Compute accuracy
        train_accuracy = accuracy_score(y_train, y_train_pred)
        test_accuracy = accuracy_score(y_test, y_test_pred)
        print(f"{model_name} Train Accuracy: {train_accuracy:.4f} | Test Accuracy: {test_accuracy:.4f}")

        # Plot the confusion matrices and keep the PNG filenames
        train_cm_image = plot_confusion_matrix(y_train, y_train_pred, label_encoder.classes_, model_name, dataset_type, "Train")
        test_cm_image = plot_confusion_matrix(y_test, y_test_pred, label_encoder.classes_, model_name, dataset_type, "Test")

        # Find misclassified examples (unscaled features plus decoded labels)
        misclassified = X_test.copy()
        misclassified["Actual Class"] = label_encoder.inverse_transform(y_test)
        misclassified["Predicted Class"] = label_encoder.inverse_transform(y_test_pred)
        misclassified = misclassified[misclassified["Actual Class"] != misclassified["Predicted Class"]]

        # Store results
        results[model_name] = {
            "model": model,
            "train_accuracy": train_accuracy,
            "test_accuracy": test_accuracy,
            "classification_report": classification_report(y_test, y_test_pred, target_names=label_encoder.classes_, output_dict=True),
            "confusion_matrix_train": train_cm_image,  # Train confusion matrix (PNG filename)
            "confusion_matrix_test": test_cm_image,    # Test confusion matrix (PNG filename)
            "misclassified_samples": misclassified,    # Misclassified test examples
            "cv_scores": cv_scores,
            "cv_mean_accuracy": None if cv_scores is None else float(cv_scores.mean()),
            "cv_std_accuracy": None if cv_scores is None else float(cv_scores.std()),
        }

    # Select best model based on test accuracy
    best_model_name = max(results, key=lambda m: results[m]['test_accuracy'])
    best_model = results[best_model_name]["model"]
    print(f"\nBest model: {best_model_name} with Test Accuracy: {results[best_model_name]['test_accuracy']:.4f}")

    # Save the best model and preprocessing tools
    joblib.dump(best_model, f"{dataset_type}_{best_model_name}_classifier.pkl")
    joblib.dump(label_encoder, f"{dataset_type}_label_encoder.pkl")
    joblib.dump(scaler, f"{dataset_type}_scaler.pkl")

    print(f"Best model ({best_model_name}) and preprocessing tools saved.")

    return df, X, y, X_test, y_test, label_encoder, best_model, results


def save_all_models(results, dataset_type="dataset"):
    """
    Save every trained model and its evaluation artifacts to a folder tree.

    A directory ``<dataset_type>_models`` is created with one sub-folder per
    model. Each sub-folder receives the pickled model, the train/test confusion
    matrices, the misclassified samples (CSV) and the classification report
    (JSON). A plain-text ``model_summary.txt`` is written at the top level.

    Args:
        results: Mapping of model name to a dict with the keys "model",
            "train_accuracy", "test_accuracy", "classification_report",
            "confusion_matrix_train", "confusion_matrix_test" and
            "misclassified_samples". A confusion-matrix entry may be either a
            path to an image file (it is copied into the model folder) or an
            array (it is stored with ``np.save``).
        dataset_type: Prefix of the output directory name and label used in
            the summary header.
    """
    import json
    import shutil

    # Define the base directory
    save_dir = f"{dataset_type}_models"
    os.makedirs(save_dir, exist_ok=True)

    # Create a summary file
    summary_filename = os.path.join(save_dir, "model_summary.txt")

    with open(summary_filename, "w") as summary_file:
        summary_file.write(f"Model Training Summary for {dataset_type.capitalize()} Dataset\n")
        summary_file.write("=" * 50 + "\n")

        for model_name, model_data in results.items():
            model_folder = os.path.join(save_dir, model_name)
            os.makedirs(model_folder, exist_ok=True)

            # Save the model
            model_filename = os.path.join(model_folder, f"{model_name}_classifier.pkl")
            joblib.dump(model_data["model"], model_filename)

            # Save the confusion matrices. The training step stores PNG
            # filenames, so those are copied as images; calling np.save on a
            # path string would only persist the string, not the matrix.
            all_matrices_saved = True
            for split in ("train", "test"):
                artifact = model_data[f"confusion_matrix_{split}"]
                if isinstance(artifact, (str, os.PathLike)):
                    if os.path.isfile(artifact):
                        shutil.copy2(artifact, os.path.join(model_folder, os.path.basename(artifact)))
                    else:
                        all_matrices_saved = False
                        print(f"Warning: confusion matrix image '{artifact}' not found; skipped.")
                else:
                    matrix_filename = os.path.join(model_folder, f"{model_name}_confusion_matrix_{split}.npy")
                    np.save(matrix_filename, artifact)

            # Save misclassified samples
            misclassified_filename = os.path.join(model_folder, f"{model_name}_misclassified_samples.csv")
            model_data["misclassified_samples"].to_csv(misclassified_filename, index=False)

            # Save classification report
            classification_report_filename = os.path.join(model_folder, f"{model_name}_classification_report.json")
            with open(classification_report_filename, "w") as report_file:
                json.dump(model_data["classification_report"], report_file, indent=4)

            # Write model summary
            summary_file.write(f"Model: {model_name}\n")
            summary_file.write(f"Train Accuracy: {model_data['train_accuracy']:.4f}\n")
            summary_file.write(f"Test Accuracy: {model_data['test_accuracy']:.4f}\n")
            summary_file.write(f"Confusion Matrices saved: {'Yes' if all_matrices_saved else 'No'}\n")
            summary_file.write(f"Misclassified Samples saved: {misclassified_filename}\n")
            summary_file.write(f"Classification Report saved: {classification_report_filename}\n")
            summary_file.write("-" * 50 + "\n")

    print(f" All models and details saved successfully in '{save_dir}' folder.")

## Load the dataset file and train the models

In [None]:
# Prompt user to upload CSV file
print("Please upload the CSV file containing the dataset (Human or Animal).")
uploaded = files.upload()

# Fail with a clear message when nothing was uploaded (e.g. the dialog was
# dismissed) instead of an opaque IndexError on the lookup below.
if not uploaded:
    raise ValueError("No file was uploaded; re-run this cell and select a CSV file.")

# The upload result is keyed by filename; use the first uploaded file
filename = next(iter(uploaded))

# Process the dataset and train the models
df, X, y, X_test, y_test, label_encoder, best_model, results = process_and_train_model(filename)

# Persist every trained model and its evaluation artifacts
save_all_models(results, dataset_type)

## Show misclassified examples
Available model options: "RandomForest", "XGBoost", "LightGBM", "CatBoost", "GradientBoosting", "MLP", "ExtraTrees", "KNN", "NaiveBayes", "SVM"

In [None]:
selected_model = "CatBoost"  # CHANGE THIS TO THE MODEL FROM MODEL OPTIONS

# Pull everything for the chosen model from a single lookup
selected_results = results[selected_model]
cm_train = selected_results["confusion_matrix_train"]
cm_test = selected_results["confusion_matrix_test"]
misclassified_samples = selected_results["misclassified_samples"]

# Columns that hold labels rather than features
label_columns = ['Actual Class', 'Predicted Class']

# Print one line per misclassified test sample
for index, row in misclassified_samples.iterrows():
    feature_values = row.drop(label_columns).to_dict()
    print(f"Index: {index} | Actual: {row['Actual Class']} | Predicted: {row['Predicted Class']} | Features: {feature_values}")

In [None]:
# ===========================
# Advanced Dataset Statistics
# ===========================

# Start from describe() (transposed: one row per column), then append the
# median, standard deviation, skewness and kurtosis as extra columns.
statistics_df = df.describe().T
extra_statistics = {
    "median": df.median(numeric_only=True),
    "std_dev": df.std(numeric_only=True),
    "skewness": df.skew(numeric_only=True),
    "kurtosis": df.kurtosis(numeric_only=True),
}
for statistic_name, statistic_values in extra_statistics.items():
    statistics_df[statistic_name] = statistic_values

# Display dataset statistics
print("\n **Advanced Dataset Statistics:**")
print(statistics_df)

# ===========================
# Feature Correlation Matrix
# ===========================

# Pairwise correlation of all numeric columns
correlation_matrix = df.corr(numeric_only=True)

# Draw the heatmap on an explicit figure/axes pair
corr_fig, corr_ax = plt.subplots(figsize=(10, 6))
sns.heatmap(
    correlation_matrix,
    annot=True,
    cmap="coolwarm",
    fmt=".2f",
    linewidths=0.5,
    ax=corr_ax,
)
corr_ax.set_title("Feature Correlation Heatmap")
plt.show()

# ===========================
# Feature Correlation Matrix per Class
# ===========================

# Encoded class labels, in order of first appearance in the data
unique_classes = df["Class"].unique()
print(unique_classes)

# One heatmap panel per class, laid out side by side
num_classes = len(unique_classes)
fig, axes = plt.subplots(1, num_classes, figsize=(6 * num_classes, 5))

# With a single class plt.subplots returns a bare Axes; wrap it so it can be iterated
if num_classes == 1:
    axes = [axes]

# Pair every class with its own panel
for class_axis, class_label in zip(axes, unique_classes):
    # Rows of this class only, without the target column
    belongs_to_class = df["Class"] == class_label
    class_df = df.loc[belongs_to_class].drop(columns=["Class"])
    print(class_df.shape[0])

    # Correlation between the features within this class
    correlation_matrix = class_df.corr(numeric_only=True)

    sns.heatmap(
        correlation_matrix,
        annot=True,
        cmap="coolwarm",
        fmt=".2f",
        linewidths=0.5,
        ax=class_axis,
    )
    # Decode the numeric label back to the original class name for the title
    class_name = label_encoder.inverse_transform([class_label])[0]
    class_axis.set_title(f"Feature Correlation - {class_name}")

plt.tight_layout()
plt.show()
# ===========================
# Class Distribution Analysis
# ===========================

# Share of each class, expressed as a percentage of all samples
class_distribution = y.value_counts(normalize=True).mul(100)

# Display class distribution statistics
print("\n **Class Distribution (% of total):**")
print(class_distribution)

# Bar chart of the absolute class counts
dist_fig, dist_ax = plt.subplots(figsize=(6, 4))
sns.countplot(x=y, palette="viridis", ax=dist_ax)
# Replace the encoded tick values with the original class names
dist_ax.set_xticks(range(len(label_encoder.classes_)))
dist_ax.set_xticklabels(label_encoder.classes_)
dist_ax.set_title("Class Distribution in Dataset")
dist_ax.set_xlabel("Class")
dist_ax.set_ylabel("Count")
plt.show()

# ===========================
# Chi-Square Test for Feature Independence
# ===========================

chi2_results = {}
for feature in X.columns:
    # Contingency table of feature values against the encoded class
    test_outcome = stats.chi2_contingency(pd.crosstab(df[feature], y))
    # The first two entries of the outcome are the statistic and its p-value
    chi2_results[feature] = {"Chi2": test_outcome[0], "p-value": test_outcome[1]}

# One row per feature for easier reading
chi2_df = pd.DataFrame(chi2_results).T

# Display chi-square test results
print("\n **Chi-Square Test Results (Feature-Class Independence):**")
print(chi2_df)


In [None]:
# Feature importance scores of the trained RandomForest model
model = results["RandomForest"]["model"]
feature_importances = model.feature_importances_

# Pair each feature name with its score, most important first
feature_importance_df = (
    pd.DataFrame({"Feature": X.columns, "Importance": feature_importances})
    .sort_values(by="Importance", ascending=False)
)

# Horizontal bar chart of the ranked importances
importance_fig, importance_ax = plt.subplots(figsize=(8, 5))
sns.barplot(
    x="Importance",
    y="Feature",
    data=feature_importance_df,
    palette="viridis",
    ax=importance_ax,
)
importance_ax.set_title(f"Feature Importance in {dataset_type.capitalize()} Classification")
importance_ax.set_xlabel("Importance Score")
importance_ax.set_ylabel("Feature")
plt.show()


In [None]:

# Reuse the scaler that process_and_train_model fitted on the training split
# and saved to disk. The model was trained on data scaled by that scaler, so
# fitting a fresh scaler on all of X here would feed it differently scaled inputs.
scaler = joblib.load(f"{dataset_type}_scaler.pkl")

# Scaled copies as DataFrames so features can be selected by column name.
# NOTE: X_train_scaled holds the full feature matrix X (not only the training
# split); the name is kept for compatibility with the original cell.
X_train_scaled = pd.DataFrame(scaler.transform(X), columns=X.columns)
X_test_scaled = pd.DataFrame(scaler.transform(X_test), columns=X.columns)

# Select two interacting features for PDP
if "Gardening" in X.columns and "Detective Stories" in X.columns:
    interaction_features = [("Gardening", "Detective Stories")]
elif "Online Hours" in X.columns and "Coffee" in X.columns:
    interaction_features = [("Online Hours", "Coffee")]
else:
    interaction_features = []

# Get all class labels
class_labels = label_encoder.classes_

if not interaction_features:
    # Neither known feature pair exists in this dataset, so there is nothing to plot
    print("No known feature pair found in the dataset; skipping partial dependence plots.")
else:
    # One panel per class; squeeze=False keeps `axes` indexable for any class count
    n_classes = len(class_labels)
    fig, axes = plt.subplots(1, n_classes, figsize=(6 * n_classes, 6), squeeze=False)
    axes = axes.ravel()

    # `model` is the estimator selected in the feature-importance cell
    for target_class, class_label in enumerate(class_labels):
        PartialDependenceDisplay.from_estimator(
            model, X_test_scaled, features=interaction_features, target=target_class, ax=axes[target_class]
        )
        axes[target_class].set_title(f"2D Partial Dependence Plot for {class_label}")

    plt.tight_layout()
    plt.show()


In [None]:
import matplotlib.pyplot as plt
import numpy as np
from itertools import combinations

# Every unordered pair of feature columns gets its own scatter panel
feature_combinations = list(combinations(X.columns, 2))
num_combinations = len(feature_combinations)

# Grid layout: fixed column count, as many rows as needed (ceiling division)
num_cols = 3  # Adjust the number of columns per row as needed
num_rows = -(-num_combinations // num_cols)

# Assign each class a colour, cycling through the palette if there are more classes than colours
unique_classes = np.unique(y)
colors = ["red", "blue", "green", "purple", "orange", "brown"]
class_colors = dict(zip(unique_classes, itertools.cycle(colors)))

# Half-width of the uniform jitter added to separate overlapping points
jitter_strength = 0.3  # Adjust as needed

# Figure size grows with the grid
plt.figure(figsize=(num_cols * 5, num_rows * 5))

for panel_number, (feat1, feat2) in enumerate(feature_combinations, start=1):
    plt.subplot(num_rows, num_cols, panel_number)

    # Draw the classes one at a time so each gets its own colour and legend entry
    for class_label in unique_classes:
        mask = y == class_label
        n_points = int(mask.sum())

        # Independent jitter per axis (x is drawn first, then y)
        x_noise = np.random.uniform(-jitter_strength, jitter_strength, size=n_points)
        y_noise = np.random.uniform(-jitter_strength, jitter_strength, size=n_points)
        x_jittered = X.loc[mask, feat1] + x_noise
        y_jittered = X.loc[mask, feat2] + y_noise

        plt.scatter(
            x_jittered,
            y_jittered,
            color=class_colors[class_label],
            label=label_encoder.inverse_transform([class_label])[0],
            alpha=0.5,
            edgecolors="k",
        )

    plt.xlabel(feat1)
    plt.ylabel(feat2)
    plt.title(f"{feat1} vs {feat2} Classification")
    plt.legend()

plt.tight_layout()  # Optimize subplot spacing
plt.show()

