# Evaluation

### Library import

In [None]:
from super_gradients.common.object_names import Models
from super_gradients.training import models

In [None]:
import os
import re
import pandas as pd
import numpy as np
import seaborn as sns
import torch
import torch.nn.functional as F
from torchvision import transforms, datasets
from torch.utils.data import DataLoader
from torchvision.transforms import Compose
from sklearn.metrics import confusion_matrix, accuracy_score, classification_report, precision_recall_curve, average_precision_score, PrecisionRecallDisplay, roc_curve, auc
from sklearn.preprocessing import label_binarize
from itertools import cycle
from matplotlib import pyplot as plt

In [None]:
# Run on the GPU when one is available, otherwise fall back to the CPU
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"
device

### Paths organization

In [None]:
# Project layout, resolved from the directory the notebook is executed in
pwd_notebook = os.path.abspath('')  # notebook directory
root_path = os.path.dirname(os.path.dirname(pwd_notebook))  # project root: two levels above the notebook

data_dir = os.path.join(root_path, 'data')  # dataset root

# One sub-directory per split
train_dir, test_dir, valid_dir = (
    os.path.join(data_dir, split_name) for split_name in ('train', 'test', 'valid')
)

# Show the resolved split directories, one per line
print(train_dir, test_dir, valid_dir, sep='\n')

In [None]:
# Get name from experiments
def extraction(path):
    """Extract the run name and experiment name from a checkpoint path.

    The path must contain a segment of the form
    ``<exp_name>/RUN_<8 digits>_<6 digits>_<digits>``. Forward slashes and
    backslashes are both accepted as separators, so Windows-style paths are
    handled as well as POSIX ones.

    Args:
        path: Path string to inspect (typically the checkpoint path).

    Returns:
        A ``(run_name, exp_name)`` tuple, or ``(None, None)`` when the path
        contains no such segment.
    """
    # group(1): directory right before the run folder, group(2): the run folder itself
    match = re.search(r'([^/\\]+)[/\\](RUN_\d{8}_\d{6}_\d+)', path)
    if match is None:
        return None, None
    exp_name, run_name = match.groups()
    return run_name, exp_name

In [None]:
# --- Evaluation settings: edit these before running the notebook ---
checkpoint_path = ''  # Add absolute path of the checkpoint
n_classes = 4  # Add number of classes
name_pretrained_weights = 'imagenet'  # Add name of pre-trained weights
size_image = (224, 224)  # Add size of image for resize

# Run and experiment names are derived from the checkpoint location
run_name, exp_name = extraction(checkpoint_path)

In [None]:
# Get parameters from the experiment log file

def parameters(exps_path):
    """Read the main training settings from an experiment log file.

    The file is scanned as plain text for the JSON-style keys ``"optimizer"``,
    ``"initial_lr"``, ``"max_epochs"`` and ``"device_type"``; the first
    occurrence of each key wins.

    Args:
        exps_path: Path of the experiment log file to read.

    Returns:
        A dict with the keys ``optimizer``, ``initial_lr``, ``max_epochs`` and
        ``device_type``. Every value is the matched text (a string), or
        ``None`` when the key is not found in the file.

    Raises:
        OSError: If the file cannot be opened.
    """
    with open(exps_path, 'r') as file:
        content = file.read()

    patterns = {
        "optimizer": r'"optimizer":\s*"([^"]+)"',
        # Accept plain decimals and scientific notation (e.g. 1e-05); matching
        # only digits and dots would cut "1e-05" down to "1".
        "initial_lr": r'"initial_lr":\s*([-+]?(?:\d+\.?\d*|\.\d+)(?:[eE][-+]?\d+)?)',
        "max_epochs": r'"max_epochs":\s*(\d+)',
        "device_type": r'"device_type":\s*"([^"]+)"',
    }

    found = {}
    for key, pattern in patterns.items():
        match = re.search(pattern, content)
        found[key] = match.group(1) if match else None
    return found
    

### Load model

In [None]:
# Instantiate the architecture through the super_gradients model factory.
# NOTE(review): both pretrained_weights and checkpoint_path are passed here; confirm in the
# super_gradients `models.get` documentation that this combination is supported.
model = models.get(
    model_name=Models.RESNET50,  # Add model name
    num_classes=n_classes,
    checkpoint_num_classes=n_classes,
    pretrained_weights=name_pretrained_weights,
    checkpoint_path=checkpoint_path,
)
# Move the network to the selected device (the call's result is the cell output)
model.to(device)

### Load Weights

In [None]:
# Load the .pth file. map_location remaps the stored tensors onto the selected device,
# so a checkpoint saved on a GPU can still be opened on a CPU-only machine.
checkpoint = torch.load(checkpoint_path, map_location=device)

# The weights may sit under 'state_dict', under 'net', or be the checkpoint object itself
if 'state_dict' in checkpoint:
    model.load_state_dict(checkpoint['state_dict'])
elif 'net' in checkpoint:
    model.load_state_dict(checkpoint['net'])
else:
    # The .pth file directly contains the model state
    model.load_state_dict(checkpoint)

model.to(device)

### Prepare test data

In [None]:
# Preprocessing applied to every test image.
# Use the same transformations that were used for training.
test_transform = Compose([transforms.Resize(size_image), transforms.ToTensor()])

# Test dataset: one sub-folder of test_dir per class
test_data = datasets.ImageFolder(transform=test_transform, root=test_dir)

### Load Test DataLoader

In [None]:
# Batch the test dataset built in the previous cell (it is not rebuilt here, which would only
# scan the image folder a second time). shuffle=False keeps the dataset order.
test_dataloader = DataLoader(dataset=test_data,
                             batch_size=16, # Adjust as needed
                             num_workers=2, # Adjust based on your available resources
                             shuffle=False)

### Inference

In [None]:
# Choose one test sample by position and fetch it with its ground-truth label
index = 9  # Change this to the desired image index
image, true_label = test_data[index]

# Single forward pass in evaluation mode, without tracking gradients
model.eval()
with torch.no_grad():
    image = image.unsqueeze(0)  # the model expects a batch, so add a batch dimension
    output = model(image.to(device))
    probabilities = F.softmax(output, dim=1)  # logits -> probabilities
    predicted_label = torch.max(output, 1)[1]  # index of the highest logit
    predicted_score = probabilities[0][predicted_label.item()].item()  # probability of that class

# Translate class indices into class names
true_class = test_data.classes[true_label]
predicted_class = test_data.classes[predicted_label.item()]

# Display the image (channels moved last for plotting) with the prediction summary as title
plt.imshow(image.squeeze().permute(1, 2, 0))
plt.title(f"Predicted: {predicted_class} \n Score: {predicted_score*100:.2f}% \nTrue: {true_class}")
plt.axis('off')
plt.show()

### Show Df inference

In [None]:
pd.set_option('display.max_colwidth', 1000)

# Build one row per test image: true class, predicted class, score (%), image path, hit marker
results = []
model.eval()  # evaluation mode does not change between images, so set it once before the loop
with torch.no_grad():
    for i, (image, true_label) in enumerate(test_data):
        # The model expects a batch, so add a leading batch dimension
        output = model(image.unsqueeze(0).to(device))
        # Convert logits to probabilities
        probabilities = F.softmax(output, dim=1)
        _, predicted_label = torch.max(output, 1)  # Get the predicted label
        predicted_idx = predicted_label.item()
        # Probability of the predicted label, as a percentage string with 2 decimals
        predicted_score = "{:.2f}".format(probabilities[0][predicted_idx].item() * 100)

        # Get the class names from the indices
        predicted_class = test_data.classes[predicted_idx]
        true_class = test_data.classes[true_label]

        # ' (X)' marks a row where the predicted class equals the true class
        true_true = ' (X)' if true_class == predicted_class else ' ( )'

        # Store the complete image path as listed by the dataset
        results.append([true_class, predicted_class, predicted_score, test_data.imgs[i][0], true_true])

# Create a DataFrame from the results
results_df = pd.DataFrame(results, columns=['True', 'Predicted', 'Score', 'Image Path', 'Correct'])

# Save the DataFrame to an Excel file inside the run folder
results_df.to_excel(f'../../checkpoints/clasificador/{exp_name}/{run_name}/{run_name}.xlsx', index=False)

results_df

### Accuracy and Classification report

In [None]:
# Predicted and ground-truth class indices, accumulated over the whole test set
all_preds, all_labels = [], []

# Batched inference in evaluation mode, without tracking gradients
model.eval()
with torch.no_grad():
    for images, labels in test_dataloader:
        images = images.to(device)
        labels = labels.to(device)
        outputs = model(images)
        preds = torch.max(outputs, 1)[1]  # index of the highest logit per image
        all_preds.extend(preds.cpu().numpy())
        all_labels.extend(labels.cpu().numpy())

# Overall metrics
accuracy = accuracy_score(all_labels, all_preds)
conf_matrix = confusion_matrix(all_labels, all_preds)
report = classification_report(
    all_labels,
    all_preds,
    target_names=test_data.classes,
    zero_division=0,
    digits=5,
)

print(f"\n Accuracy: {accuracy}\n")
print("Classification Report:\n", report)

### Confusion Matrix

In [None]:
# Heatmap of the confusion matrix with class names on both axes
fig, ax = plt.subplots(figsize=(10, 8))
sns.heatmap(
    conf_matrix,
    annot=True,
    fmt='g',
    cmap='Blues',
    xticklabels=test_data.classes,
    yticklabels=test_data.classes,
    ax=ax,
)
ax.set_xlabel('Predicted')
ax.set_ylabel('True')
plt.show()

### Precision-Recall Multiclass

In [None]:
# Collect the ground-truth label and the per-class probabilities of every test image
y_true = []
y_scores = []

model.eval()
with torch.no_grad():
    for images, labels in test_dataloader:
        images = images.to(device)
        outputs = model(images)
        probabilities = torch.softmax(outputs, dim=1).cpu().numpy()

        y_true.extend(labels.cpu().numpy())
        y_scores.extend(probabilities)

y_true = np.array(y_true)
y_scores = np.array(y_scores)

# One-vs-rest binarization of the labels: one column per class
n_classes = len(test_data.classes)
y_true_bin = label_binarize(y_true, classes=np.arange(n_classes))
if n_classes == 2:
    # label_binarize yields a single column for two classes; expand it so column i is class i
    y_true_bin = np.hstack([1 - y_true_bin, y_true_bin])

# Precision-recall curve and average precision for each class
precision = dict()
recall = dict()
average_precision = dict()
for i in range(n_classes):
    precision[i], recall[i], _ = precision_recall_curve(y_true_bin[:, i], y_scores[:, i])
    average_precision[i] = average_precision_score(y_true_bin[:, i], y_scores[:, i])

# Colors for different classes
colors = cycle(['blue', 'red', 'green', 'yellow', 'orange', 'purple'])

# Draw the iso-F1 curves first, so every "f1=..." annotation sits on the curve it names
plt.figure(figsize=(7, 8))
f_scores = np.linspace(0.2, 0.8, num=6)
x = np.linspace(0.01, 1)  # recall values; loop-invariant
for f_score in f_scores:
    # Precision that gives this F1 at recall x; negative values lie left of the asymptote
    y = f_score * x / (2 * x - f_score)
    plt.plot(x[y >= 0], y[y >= 0], color="gray", alpha=0.2)
    # Two decimals: the levels are 0.20, 0.32, 0.44, ... and one decimal would mislabel them
    plt.annotate(f"f1={f_score:0.2f}", xy=(0.9, y[45] + 0.02))

# Draw the Precision-Recall curve of each class
for i, color in zip(range(n_classes), colors):
    plt.plot(recall[i], precision[i], color=color, lw=2,
             label=f'Precision-Recall curve of {test_data.classes[i]} (AP = {average_precision[i]:0.2f})')

# Clamp the axes: near their asymptote the iso-F1 curves shoot far above 1
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.05])
plt.xlabel("Recall")
plt.ylabel("Precision")
plt.legend(loc="best")
plt.title("Precision-Recall curve to multi-class")
plt.show()

### ROC Curve || FPR vs TPR

In [None]:
# One-vs-rest binarization of the labels: one column per class
n_classes = len(test_data.classes)
y_true_bin = label_binarize(y_true, classes=np.arange(n_classes))
if n_classes == 2:
    # label_binarize yields a single column for two classes; expand it so column i is class i
    y_true_bin = np.hstack([1 - y_true_bin, y_true_bin])

# ROC curve and area under it for each class, keyed by class index
fpr = dict()
tpr = dict()
roc_auc = dict()
for i in range(n_classes):
    fpr[i], tpr[i], _ = roc_curve(y_true_bin[:, i], y_scores[:, i])
    roc_auc[i] = auc(fpr[i], tpr[i])

# Colors for different classes
colors = cycle(['blue', 'red', 'green', 'yellow', 'orange', 'purple'])

# Draw the ROC curve for each class
plt.figure(figsize=(10, 8))
for i, color in zip(range(n_classes), colors):
    plt.plot(fpr[i], tpr[i], color=color, lw=2,
             label=f'ROC curve of {test_data.classes[i]} (area = {roc_auc[i]:0.2f})')

plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.05])
plt.xlabel('False Positive Rate (Fall out)')
plt.ylabel('True Positive Rate (Sensitivity)')
plt.title('ROC Curve for Multi-Class FPR vs TPR')
plt.legend(loc="lower right")
plt.show()

### ROC Curve || Specificity vs Sensitivity

In [None]:
# One-vs-rest binarization of the labels: one column per class
n_classes = len(test_data.classes)
y_true_bin = label_binarize(y_true, classes=np.arange(n_classes))
if n_classes == 2:
    # label_binarize yields a single column for two classes; expand it so column i is class i
    y_true_bin = np.hstack([1 - y_true_bin, y_true_bin])

# Colors for different classes
colors = cycle(['blue', 'red', 'green', 'yellow', 'orange', 'purple'])

plt.figure(figsize=(10, 8))

# Per-class ROC curve drawn as TNR (specificity) against TPR (sensitivity).
# The per-class values use their own names so the fpr/tpr/roc_auc dicts built by the
# FPR-vs-TPR cell are not overwritten with arrays and a float.
for i, color in zip(range(n_classes), colors):
    class_fpr, class_tpr, _ = roc_curve(y_true_bin[:, i], y_scores[:, i])
    class_tnr = 1 - class_fpr  # specificity = 1 - false positive rate
    class_auc = auc(class_tnr, class_tpr)

    plt.plot(class_tnr, class_tpr, color=color, lw=2, label=f'{test_data.classes[i]} (AROC = {class_auc:.2f})')

plt.xlabel('True Negatives Rate (TNR - Specificity)')
plt.ylabel('True Positive Rate (TPR - Sensitivity)')
plt.title('ROC Curve for Multi-Class Specificity vs Sensitivity')
plt.legend(loc="best")
plt.show()

### Correct and incorrect prediction balance

In [None]:
exps_path = "" # Add absolute path experiments logs
info = parameters(exps_path)
optimizer = info['optimizer']
lr = info['initial_lr']
epochs = info['max_epochs']
# Keep the logged training device under its own name. Assigning it to `device` would
# overwrite the torch device chosen at the top of the notebook (possibly with None),
# and any inference cell re-run afterwards would then use the wrong device.
train_device = info['device_type']
print (f'Optimizer: {optimizer}\nLr: {lr}\nEpochs: {epochs}\nDevice: {train_device}' )


In [None]:
copy_df = results_df.copy()
copy_df['Score'] = copy_df['Score'].astype(float)
# '(X)' marks a correct prediction; encode it as 1 and everything else as 0
copy_df['Correct'] = copy_df['Correct'].apply(lambda x: 1 if x.strip() == '(X)' else 0)

min_score = copy_df['Score'].min()
max_score = copy_df['Score'].max()

# Expand the range slightly to include edge cases
range_expand = (max_score - min_score) * 0.01
if range_expand == 0:
    # Every score is identical, so the bins would have zero width; use a fixed margin instead
    range_expand = 0.5
min_score -= range_expand
max_score += range_expand

# Create exactly 10 bins within the expanded range
bins = pd.interval_range(start=min_score, end=max_score, periods=10, closed='right')

# Assign each 'Score' to a bin
copy_df['Score_bin'] = pd.cut(copy_df['Score'], bins, include_lowest=True)

# Group by the bins and sum the 'Correct' values
grouped = copy_df.groupby('Score_bin', observed=False)['Correct'].sum().reset_index()

# Format the bin labels for the x-axis
grouped['Score_bin'] = grouped['Score_bin'].apply(lambda x: f"{x.left:.1f} - {x.right:.1f}")

# Shared y-axis limit for the correct and incorrect plots: 100 by default, raised (with ~5%
# headroom) when a bin holds more samples, so no bar is silently clipped.
largest_bin = int(copy_df.groupby('Score_bin', observed=False).size().max())
y_max = max(100, int(largest_bin * 1.05) + 1)

# Plot the bar plot
plt.figure(figsize=(10, 6))
sns.barplot(x='Score_bin', y='Correct', data=grouped, color='blue')

plt.xlabel('Score Range')
plt.ylabel('Correct predictions')
plt.ylim(0, y_max)
plt.title(f'{run_name}, {epochs} epochs, lr {lr}, {optimizer} - Correct')
plt.xticks(rotation=45)
plt.grid(False)  # No grid lines on this plot
plt.show()

In [None]:
# An incorrect prediction is the complement of the 0/1 'Correct' flag
copy_df['Incorrect'] = 1 - copy_df['Correct']

# Number of incorrect predictions per score bin, with readable bin labels
grouped_incorrect = (
    copy_df.groupby('Score_bin', observed=False)['Incorrect']
    .sum()
    .reset_index()
)
grouped_incorrect['Score_bin'] = grouped_incorrect['Score_bin'].apply(
    lambda interval: f"{interval.left:.1f} - {interval.right:.1f}"
)

# Bar plot on the same y-axis scale as the correct-predictions plot
plt.figure(figsize=(10, 6))
sns.barplot(data=grouped_incorrect, x='Score_bin', y='Incorrect', color='red')
plt.title(f'{run_name}, {epochs} epochs, lr {lr}, {optimizer} - Incorrect')
plt.xlabel('Score Range')
plt.ylabel('Incorrect predictions')
plt.ylim(0, y_max)  # Set the Y-axis maximum value
plt.xticks(rotation=45)
plt.grid(False)
plt.show()

In [None]:
# Overall number of correct and incorrect predictions across all bins
total_correct, total_incorrect = (
    copy_df[column].sum() for column in ('Correct', 'Incorrect')
)

print(f"Correct: {total_correct}\nIncorrect: {total_incorrect}")