In [None]:
import matplotlib.pyplot as plt
import numpy as np
import seaborn as sns
import torch
import torch.nn as nn
from sklearn.metrics import confusion_matrix, classification_report, precision_recall_fscore_support
from torch.utils.data import DataLoader
from torchvision import datasets, models, transforms
from tqdm import tqdm

In [None]:

# Preprocessing applied to every test image: resize to 128x128, convert to a
# tensor, then normalise each RGB channel with the fixed mean/std below.
# NOTE(review): these values look like the usual ImageNet channel statistics —
# confirm they match the preprocessing used at training time.
transform_standard = transforms.Compose(
    [
        transforms.Resize((128, 128)),
        transforms.ToTensor(),
        transforms.Normalize(
            mean=[0.485, 0.456, 0.406],
            std=[0.229, 0.224, 0.225],
        ),
    ]
)

# ImageFolder derives the class list from the sub-folder names under the root.
test_dataset = datasets.ImageFolder(root="test_dataset", transform=transform_standard)

# Evaluation loader: no shuffling, so predictions keep the dataset order.
test_loader = DataLoader(
    dataset=test_dataset,
    batch_size=16,
    shuffle=False,
    num_workers=4,
    pin_memory=True,
)

class_names = test_dataset.classes

# Short summary of what was loaded.
print(
    "Test dataset loaded successfully!",
    f"Number of test samples: {len(test_dataset)}",
    f"Number of classes: {len(class_names)}",
    f"Class names: {class_names}",
    sep="\n",
)

In [2]:
class CNN_model(nn.Module):
    """Three-stage convolutional classifier followed by a small MLP head.

    Each convolutional stage is Conv(3x3, padding 1) -> BatchNorm -> ReLU ->
    MaxPool(2x2), so the spatial size is halved three times. The first linear
    layer expects 16*16*64 features, i.e. a 128x128 input image.

    Args:
        in_channel: Number of channels of the input images.
        num_classes: Size of the output layer. Defaults to 9, the value that
            was previously hard-coded, so existing checkpoints keep loading.
    """

    def __init__(self, in_channel, num_classes=9):
        super().__init__()

        # Convolution blocks.
        # The layers are unpacked into one flat Sequential so parameter names
        # (feature_extractor.0, .1, .4, ...) stay identical to the original
        # layout and previously saved state dicts remain compatible.
        self.feature_extractor = nn.Sequential(
            *self._conv_block(in_channel, 16),
            *self._conv_block(16, 32),
            *self._conv_block(32, 64),
        )

        # NN block
        self.fully_connected = nn.Sequential(
            nn.Linear(16 * 16 * 64, 512),     # 16384 -> 512
            nn.BatchNorm1d(512),
            nn.ReLU(inplace=True),
            nn.Dropout(0.5),
            nn.Linear(512, 64),               # Another layer
            nn.BatchNorm1d(64),
            nn.ReLU(inplace=True),
            nn.Dropout(0.4),
            nn.Linear(64, num_classes),       # Output layer (one logit per class)
        )

    @staticmethod
    def _conv_block(in_ch, out_ch):
        """Return the layers of one Conv -> BatchNorm -> ReLU -> MaxPool stage."""
        return [
            nn.Conv2d(in_ch, out_ch, kernel_size=3, stride=1, padding=1),
            nn.BatchNorm2d(out_ch),
            nn.ReLU(inplace=True),
            nn.MaxPool2d(kernel_size=2, stride=2),
        ]

    def forward(self, x):
        """Return the raw class logits for a batch of images.

        Args:
            x: Image batch; the linear head requires the feature map to
                flatten to 16*16*64 values per sample.

        Returns:
            Tensor with one row of logits per input sample.
        """
        features = self.feature_extractor(x)
        features = torch.flatten(features, 1)  # keep the batch dimension
        return self.fully_connected(features)

In [None]:
def load_trained_model(model, ckpt_path, device):
    """
    Restore saved weights into ``model`` and switch it to evaluation mode.

    The checkpoint may be either a dictionary holding the weights under the
    "model_state_dict" key, or the state dict itself.

    Args:
        model: Model instance whose architecture matches the checkpoint
        ckpt_path: Path to the checkpoint file
        device: Device the checkpoint and the model are mapped to (cuda/cpu)

    Returns:
        model: The same model, moved to ``device`` and set to eval mode
    """
    print(f"Loading checkpoint from: {ckpt_path}")
    # NOTE: torch.load unpickles the file — only load checkpoints you trust.
    checkpoint = torch.load(ckpt_path, map_location=device)

    # Fall back to the checkpoint object itself when there is no wrapper key.
    weights = checkpoint.get("model_state_dict", checkpoint)

    model.load_state_dict(weights)
    model = model.to(device)
    model.eval()

    print(f"Model loaded successfully on {device}")

    # Optional bookkeeping entries, reported only when the checkpoint has them.
    if isinstance(checkpoint, dict):
        extras = (
            # The stored epoch is shown incremented by one.
            ('epoch', lambda value: f"Checkpoint epoch: {value + 1}"),
            ('best_val_loss', lambda value: f"Best validation loss: {value:.4f}"),
        )
        for key, render in extras:
            if key in checkpoint:
                print(render(checkpoint[key]))

    return model

In [None]:
# Use the GPU when CUDA is available; otherwise fall back to the CPU.
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")


In [None]:
# Build the custom CNN for 3-channel images and place it on the chosen device.
model = CNN_model(in_channel=3)
model = model.to(device)

# Restore the weights stored in "adm_best_model.pth"; the result is in eval mode.
model_adm = load_trained_model(model=model, ckpt_path="adm_best_model.pth", device=device)


In [None]:


# Rebuild the ResNet-50 architecture so it matches the saved checkpoint: the
# final fully connected layer is replaced by one sized for this dataset.
num_classes = len(class_names)

# No pretrained weights are requested because every parameter is overwritten by
# the checkpoint loaded below. Calling resnet50() without arguments avoids the
# deprecated `pretrained=` keyword while keeping the same "no weights" default.
model = models.resnet50()
model.fc = nn.Linear(model.fc.in_features, num_classes)

model_resnet = load_trained_model(model, "resnet_best_model.pth", device)


## Evaluate Model on Test Set

In [None]:
def evaluate_model(model, test_loader, test_dataset, device):
    """
    Run the model over the test loader and collect predictions and metrics.

    Loss and accuracy are averaged over the number of samples the loader
    actually yielded, so they stay correct when the loader uses a sampler or
    drops the last batch and therefore covers fewer items than the dataset.

    Args:
        model: Trained model
        test_loader: DataLoader yielding (images, labels) batches
        test_dataset: Test dataset object (kept for interface compatibility;
            the sample count is now taken from the batches themselves)
        device: Device to run evaluation on

    Returns:
        all_predictions: Array of predicted label indices
        all_labels: Array of ground truth label indices
        test_loss: Average cross-entropy loss per sample
        test_accuracy: Fraction of correctly classified samples

    Raises:
        ValueError: If the loader yields no samples.
    """
    model.eval()

    loss_fn = nn.CrossEntropyLoss()
    test_loss = 0.0
    test_correct = 0
    samples_seen = 0

    all_predictions = []
    all_labels = []

    print("\nEvaluating model on test set...")

    with torch.no_grad():
        for images, labels in tqdm(test_loader, desc="Testing"):
            images = images.to(device, non_blocking=True)
            labels = labels.to(device, non_blocking=True)
            batch_size = images.size(0)

            # Forward pass
            outputs = model(images)
            loss = loss_fn(outputs, labels)

            # The loss is a batch mean; weight it by the batch size so the
            # final average is per sample even if the last batch is smaller.
            test_loss += loss.item() * batch_size
            samples_seen += batch_size

            # Predicted class = index of the largest logit
            predicted = torch.argmax(outputs, dim=1)
            test_correct += (predicted == labels).sum().item()

            # Store predictions and labels on the CPU for the metric functions
            all_predictions.extend(predicted.cpu().numpy())
            all_labels.extend(labels.cpu().numpy())

    if samples_seen == 0:
        raise ValueError("test_loader yielded no samples; cannot compute test metrics")

    # Average over the samples that were actually evaluated
    test_loss /= samples_seen
    test_accuracy = test_correct / samples_seen

    return np.array(all_predictions), np.array(all_labels), test_loss, test_accuracy




## Calculate and Display Metrics

In [None]:
def calculate_metrics(predictions, labels, class_names):
    """
    Calculate precision, recall, F1-score and support for each class.

    The label indices 0..len(class_names)-1 are passed explicitly to
    scikit-learn so the per-class arrays always have one entry per class name,
    even when a class occurs in neither ``labels`` nor ``predictions``.

    Args:
        predictions: Predicted label indices
        labels: Ground truth label indices
        class_names: List of class names, ordered by label index

    Returns:
        metrics_dict: Dictionary with 'per_class', 'macro_avg' and
            'weighted_avg' entries holding precision/recall/f1_score
            (and 'support' for the per-class entry)
    """
    # One label index per class name keeps the arrays aligned with class_names
    label_ids = list(range(len(class_names)))

    # Calculate precision, recall, f1-score, support per class
    precision, recall, f1, support = precision_recall_fscore_support(
        labels, predictions, labels=label_ids, average=None, zero_division=0
    )

    # Calculate macro and weighted averages
    precision_macro, recall_macro, f1_macro, _ = precision_recall_fscore_support(
        labels, predictions, labels=label_ids, average='macro', zero_division=0
    )
    precision_weighted, recall_weighted, f1_weighted, _ = precision_recall_fscore_support(
        labels, predictions, labels=label_ids, average='weighted', zero_division=0
    )

    # Create metrics dictionary
    metrics_dict = {
        'per_class': {
            'precision': precision,
            'recall': recall,
            'f1_score': f1,
            'support': support
        },
        'macro_avg': {
            'precision': precision_macro,
            'recall': recall_macro,
            'f1_score': f1_macro
        },
        'weighted_avg': {
            'precision': precision_weighted,
            'recall': recall_weighted,
            'f1_score': f1_weighted
        }
    }

    # Print detailed classification report
    print("\n" + "="*70)
    print("CLASSIFICATION REPORT")
    print("="*70)
    print(classification_report(
        labels, predictions, labels=label_ids, target_names=class_names, zero_division=0
    ))

    # Print per-class metrics in a formatted table
    print("\n" + "="*70)
    print("PER-CLASS METRICS")
    print("="*70)
    print(f"{'Class':<15} {'Precision':<12} {'Recall':<12} {'F1-Score':<12} {'Support':<10}")
    print("-"*70)

    for i, class_name in enumerate(class_names):
        print(f"{class_name:<15} {precision[i]:<12.4f} {recall[i]:<12.4f} "
              f"{f1[i]:<12.4f} {support[i]:<10}")

    print("-"*70)
    # The averaged rows show the total number of samples in the support column
    print(f"{'Macro Avg':<15} {precision_macro:<12.4f} {recall_macro:<12.4f} "
          f"{f1_macro:<12.4f} {len(labels):<10}")
    print(f"{'Weighted Avg':<15} {precision_weighted:<12.4f} {recall_weighted:<12.4f} "
          f"{f1_weighted:<12.4f} {len(labels):<10}")
    print("="*70)

    return metrics_dict


## Plot Confusion Matrix

In [None]:
def plot_confusion_matrix(predictions, labels, class_names, save_path=None):
    """
    Plot the confusion matrix as absolute counts and as row percentages.

    The matrix is built for label indices 0..len(class_names)-1 so its size
    always matches the tick labels, and rows without any true samples are
    shown as 0% instead of NaN.

    Args:
        predictions: Predicted label indices
        labels: Ground truth label indices
        class_names: List of class names, ordered by label index
        save_path: Path to save the plot (optional)

    Returns:
        cm: Confusion matrix of counts (rows = true label, columns = predicted)
    """
    # Calculate confusion matrix with one row/column per class name
    cm = confusion_matrix(labels, predictions, labels=np.arange(len(class_names)))

    # Calculate row percentages; a zero row total is replaced by 1 so that an
    # all-zero row yields 0% rather than a division-by-zero NaN
    row_totals = cm.sum(axis=1)[:, np.newaxis]
    safe_totals = np.where(row_totals == 0, 1, row_totals)
    cm_percent = cm.astype('float') / safe_totals * 100

    # Create figure with two subplots
    fig, axes = plt.subplots(1, 2, figsize=(20, 8))

    # Plot 1: Absolute counts
    sns.heatmap(cm, annot=True, fmt='d', cmap='Blues',
                xticklabels=class_names, yticklabels=class_names,
                cbar_kws={'label': 'Count'}, ax=axes[0])
    axes[0].set_title('Confusion Matrix (Counts)', fontsize=14, fontweight='bold')
    axes[0].set_ylabel('True Label', fontsize=12)
    axes[0].set_xlabel('Predicted Label', fontsize=12)

    # Plot 2: Percentages of each true class
    sns.heatmap(cm_percent, annot=True, fmt='.1f', cmap='Greens',
                xticklabels=class_names, yticklabels=class_names,
                cbar_kws={'label': 'Percentage (%)'}, ax=axes[1])
    axes[1].set_title('Confusion Matrix (Percentages)', fontsize=14, fontweight='bold')
    axes[1].set_ylabel('True Label', fontsize=12)
    axes[1].set_xlabel('Predicted Label', fontsize=12)

    plt.tight_layout()

    # Save before showing the figure
    if save_path:
        plt.savefig(save_path, dpi=300, bbox_inches='tight')
        print(f"\nConfusion matrix saved to: {save_path}")

    plt.show()

    return cm


## Main Evaluation Function

In [None]:
def full_evaluation(ckpt_path, test_data_path, device, batch_size=32,
                    save_dir="evaluation_results", model=None):
    """
    Complete evaluation pipeline: load data, load weights, evaluate, report.

    Args:
        ckpt_path: Path to model checkpoint
        test_data_path: Path to test dataset folder (one sub-folder per class)
        device: Device to run evaluation on
        batch_size: Batch size for testing
        save_dir: Directory to save evaluation results
        model: Optional model instance to load the checkpoint into. When
            omitted, a ``CNN_model(in_channel=3)`` is created.

    Returns:
        results: Dictionary containing test loss/accuracy, predictions,
            labels, confusion matrix, detailed metrics and class names
    """
    import os
    os.makedirs(save_dir, exist_ok=True)

    print("="*70)
    print("MODEL EVALUATION PIPELINE")
    print("="*70)

    # 1. Load test data from the requested folder with the requested batch
    #    size, reusing the notebook's standard preprocessing.
    eval_dataset = datasets.ImageFolder(root=test_data_path, transform=transform_standard)
    eval_loader = DataLoader(
        eval_dataset,
        batch_size=batch_size,
        shuffle=False,
        num_workers=4,
        pin_memory=True
    )
    eval_class_names = eval_dataset.classes

    # 2. Load trained model (load_trained_model needs an architecture to fill)
    if model is None:
        model = CNN_model(in_channel=3)
    model = load_trained_model(model, ckpt_path, device)

    # 3. Evaluate model
    predictions, labels, test_loss, test_accuracy = evaluate_model(
        model, eval_loader, eval_dataset, device
    )

    # 4. Print basic metrics
    print("\n" + "="*70)
    print("TEST RESULTS")
    print("="*70)
    print(f"Test Loss:     {test_loss:.4f}")
    print(f"Test Accuracy: {test_accuracy:.4f} ({test_accuracy*100:.2f}%)")
    print("="*70)

    # 5. Calculate detailed metrics
    metrics_dict = calculate_metrics(predictions, labels, eval_class_names)

    # 6. Plot confusion matrix
    cm_save_path = os.path.join(save_dir, "confusion_matrix.png")
    cm = plot_confusion_matrix(predictions, labels, eval_class_names, save_path=cm_save_path)

    # 7. Compile results
    results = {
        'test_loss': test_loss,
        'test_accuracy': test_accuracy,
        'predictions': predictions,
        'labels': labels,
        'confusion_matrix': cm,
        'metrics': metrics_dict,
        'class_names': eval_class_names
    }

    # 8. Save results to file
    results_file = os.path.join(save_dir, "evaluation_results.txt")
    per_class = metrics_dict['per_class']
    with open(results_file, 'w', encoding='utf-8') as f:
        f.write("="*70 + "\n")
        f.write("MODEL EVALUATION RESULTS\n")
        f.write("="*70 + "\n\n")
        f.write(f"Test Loss: {test_loss:.4f}\n")
        f.write(f"Test Accuracy: {test_accuracy:.4f} ({test_accuracy*100:.2f}%)\n\n")
        f.write("Per-Class Metrics:\n")
        f.write("-"*70 + "\n")
        f.write(f"{'Class':<15} {'Precision':<12} {'Recall':<12} {'F1-Score':<12} {'Support':<10}\n")
        f.write("-"*70 + "\n")
        for i, class_name in enumerate(eval_class_names):
            p = per_class['precision'][i]
            r = per_class['recall'][i]
            f1 = per_class['f1_score'][i]
            s = per_class['support'][i]
            f.write(f"{class_name:<15} {p:<12.4f} {r:<12.4f} {f1:<12.4f} {s:<10}\n")

    print(f"\nEvaluation results saved to: {results_file}")

    return results

In [None]:
# Use the GPU when CUDA is available; otherwise fall back to the CPU.
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
print(f"Using device: {device}")

# Checkpoint and test-data locations — adjust both for your environment.
ckpt_path = "models/best_model_Adam.pth"
test_data_path = "/kaggle/input/realwaste/realwaste-main/RealWaste"

# Run the complete evaluation pipeline and keep everything it returns.
results = full_evaluation(
    ckpt_path,
    test_data_path,
    device,
    batch_size=32,
    save_dir="evaluation_results",
)

# Headline number taken from the returned results dictionary.
print(f"\nFinal Test Accuracy: {results['test_accuracy']*100:.2f}%")
