In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory tree and print the full path of every file in it.
for folder, _, files_in_folder in os.walk('/kaggle/input'):
    for file_name in files_in_folder:
        full_path = os.path.join(folder, file_name)
        print(full_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
import torch
import torch.nn as nn
import torch.optim as optim
import torchvision.transforms as transforms
from torchvision.datasets import FashionMNIST
from torch.utils.data import DataLoader
import torch.nn.functional as F
from tqdm import tqdm
import pandas as pd
from tabulate import tabulate

# Hyperparameters
seed = 42                            # fixed RNG seed so weight init, shuffling and augmentation repeat across runs
batch_size = 64                      # mini-batch size shared by the train and test loaders
learning_rate = 0.001                # learning rate handed to the Adam optimizer
num_epochs = 2                       # reassigned to 10 right before the training loop further down
confidence_margin = 0.2              # minimum top-1 minus top-2 probability gap for a prediction to be kept
absolute_confidence_threshold = 0.9  # minimum top-1 probability for a prediction to be kept
unconfident_penalty = 5.0            # weight of the confidence penalty inside custom_loss
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Seed PyTorch's global generator (CPU and CUDA) before any model, loader or
# augmentation draws random numbers. Some GPU kernels may still be nondeterministic.
torch.manual_seed(seed)

# Preprocessing shared by both splits: PIL image -> tensor, then (x - 0.5) / 0.5.
to_tensor = transforms.ToTensor()
normalize = transforms.Normalize((0.5,), (0.5,))

# Training pipeline: random horizontal flip and a padded random 28x28 crop,
# followed by the shared preprocessing.
transform = transforms.Compose([
    transforms.RandomHorizontalFlip(),
    transforms.RandomCrop(28, padding=4),
    to_tensor,
    normalize,
])

# Evaluation pipeline: shared preprocessing only, no augmentation.
eval_transform = transforms.Compose([to_tensor, normalize])

# FashionMNIST splits (downloaded into ./data when missing).
train_dataset = FashionMNIST(root="./data", train=True, download=True, transform=transform)
test_dataset = FashionMNIST(root="./data", train=False, download=True, transform=eval_transform)

# Only the training loader shuffles; the test loader keeps dataset order.
train_loader = DataLoader(dataset=train_dataset, batch_size=batch_size, shuffle=True)
test_loader = DataLoader(dataset=test_dataset, batch_size=batch_size, shuffle=False)
# Ground-truth test labels as plain Python ints, collected in the (unshuffled)
# order in which test_loader yields them.
actual_labels = [
    int(label)
    for _, label_batch in test_loader
    for label in label_batch.numpy()
]


# Define a CNN model with batch normalization and dropout
class HighConfidenceFashionCNN(nn.Module):
    """Three-stage convolutional classifier that returns raw class logits.

    Each stage is Conv2d(3x3, padding=1) -> BatchNorm2d -> ReLU -> 2x2 max-pool,
    with 32, 64 and 128 output channels. A 1x28x28 input is therefore reduced to
    128x3x3 before the fully connected head (1152 -> 256 -> 128 -> num_classes),
    which applies dropout with p=0.4 after each hidden layer.

    Args:
        num_classes: Number of output logits. Defaults to 10.
    """

    def __init__(self, num_classes=10):
        super().__init__()
        self.conv1 = nn.Conv2d(1, 32, kernel_size=3, padding=1)
        self.bn1 = nn.BatchNorm2d(32)
        self.conv2 = nn.Conv2d(32, 64, kernel_size=3, padding=1)
        self.bn2 = nn.BatchNorm2d(64)
        self.conv3 = nn.Conv2d(64, 128, kernel_size=3, padding=1)
        self.bn3 = nn.BatchNorm2d(128)
        self.pool = nn.MaxPool2d(2, 2)
        self.fc1 = nn.Linear(128 * 3 * 3, 256)
        self.dropout1 = nn.Dropout(0.4)
        self.fc2 = nn.Linear(256, 128)
        self.dropout2 = nn.Dropout(0.4)
        self.fc3 = nn.Linear(128, num_classes)

    def forward(self, x):
        """Map a batch of images to logits.

        Args:
            x: Tensor of shape (batch, 1, 28, 28).

        Returns:
            Tensor of shape (batch, num_classes) holding unnormalized logits.

        Raises:
            RuntimeError: If the spatial size does not reduce to 3x3, because
                the flattened features no longer match ``fc1``.
        """
        x = self.pool(F.relu(self.bn1(self.conv1(x))))
        x = self.pool(F.relu(self.bn2(self.conv2(x))))
        x = self.pool(F.relu(self.bn3(self.conv3(x))))
        # Flatten everything except the batch dimension. Unlike view(-1, 1152),
        # this cannot silently fold a wrongly sized input into extra batch rows.
        x = torch.flatten(x, 1)
        x = F.relu(self.fc1(x))
        x = self.dropout1(x)
        x = F.relu(self.fc2(x))
        x = self.dropout2(x)
        x = self.fc3(x)
        return x

# Instantiate the network and move its parameters to the selected device.
model = HighConfidenceFashionCNN().to(device)

# Optimizer and learning rate scheduler
# The one-cycle schedule is sized as len(train_loader) steps per epoch times
# num_epochs, using the value num_epochs has at this point in the cell.
# NOTE(review): the training loop further down reassigns num_epochs, and nothing
# in the visible cells calls scheduler.step(), so the schedule never advances.
# NOTE(review): per the PyTorch docs, constructing OneCycleLR already rewrites the
# optimizer's learning rate to max_lr / div_factor (default 25) and, with the
# default cycle_momentum=True, Adam's beta1 -- so training presumably runs at a
# constant 0.01 / 25 rather than learning_rate. Confirm, then either step the
# scheduler once per batch (sized for the real epoch count) or remove it.
optimizer = optim.Adam(model.parameters(), lr=learning_rate)
scheduler = torch.optim.lr_scheduler.OneCycleLR(
    optimizer, max_lr=0.01, steps_per_epoch=len(train_loader), epochs=num_epochs, anneal_strategy="linear"
)

# Improved loss function to further prioritize high-confidence predictions
def custom_loss(outputs, labels, confidence_margin=0.2, penalty_weight=None):
    """Cross-entropy plus a penalty for a small top-1 / top-2 probability gap.

    For each sample the softmax gap between the two most probable classes is
    compared with ``confidence_margin``. When the gap falls short, the squared
    shortfall times ``penalty_weight`` is added to that sample's cross-entropy.
    The penalty looks only at the predicted distribution, not at ``labels``.

    Args:
        outputs: Logits of shape (batch, num_classes), num_classes >= 2.
        labels: Integer class targets of shape (batch,).
        confidence_margin: Gap below which the penalty applies.
        penalty_weight: Multiplier of the squared shortfall. ``None`` (default)
            uses the module-level ``unconfident_penalty``.

    Returns:
        Scalar tensor: mean over the batch of cross-entropy plus penalty.
    """
    if penalty_weight is None:
        # Backward-compatible default: read the notebook-level hyperparameter.
        penalty_weight = unconfident_penalty

    per_sample_ce = F.cross_entropy(outputs, labels, reduction='none')

    top2_probs = torch.topk(F.softmax(outputs, dim=1), 2, dim=1).values
    gap = top2_probs[:, 0] - top2_probs[:, 1]

    # Zero when the gap already meets the margin, otherwise the missing amount.
    shortfall = torch.clamp(confidence_margin - gap, min=0)

    return (per_sample_ce + penalty_weight * shortfall ** 2).mean()

def calculate_accuracy(model, loader):
    """Return the top-1 accuracy of ``model`` over ``loader`` as a percentage.

    The model is switched to eval mode and gradients are disabled while every
    batch is moved to the module-level ``device`` and scored.
    """
    model.eval()
    num_correct, num_seen = 0, 0
    with torch.no_grad():
        for batch_images, batch_labels in loader:
            batch_images = batch_images.to(device)
            batch_labels = batch_labels.to(device)
            logits = model(batch_images)
            # Index of the largest logit per row is the predicted class.
            predicted_classes = torch.max(logits, 1).indices
            num_seen += batch_labels.size(0)
            num_correct += (predicted_classes == batch_labels).sum().item()
    return 100 * num_correct / num_seen


# Training function without progress bar
def train(model, loader, optimizer, epoch, max_epochs, scheduler=None):
    """Run one training epoch and return the mean per-batch loss.

    Args:
        model: Network to optimize; switched to train mode.
        loader: Iterable of (images, labels) batches that also supports ``len``.
        optimizer: Optimizer stepped once per batch.
        epoch: Zero-based index of the current epoch.
        max_epochs: Total number of epochs, used to scale the margin.
        scheduler: Optional learning-rate scheduler that expects one ``step()``
            per batch (for example OneCycleLR). It must be sized for the number
            of batches actually trained. ``None`` (default) leaves the learning
            rate untouched, which is the previous behavior.

    Returns:
        Sum of the batch losses divided by ``len(loader)``.
    """
    model.train()

    # The margin grows linearly with the epoch, from 0.2 towards 0.21.
    dynamic_confidence_margin = 0.2 + 0.01 * (epoch / max_epochs)

    running_loss = 0.0
    for images, labels in loader:
        images, labels = images.to(device), labels.to(device)
        optimizer.zero_grad()
        loss = custom_loss(model(images), labels, confidence_margin=dynamic_confidence_margin)
        loss.backward()
        optimizer.step()
        if scheduler is not None:
            # Per-batch schedulers must be advanced after the optimizer step.
            scheduler.step()
        running_loss += loss.item()

    return running_loss / len(loader)







# Evaluation function to generate predictions DataFrame
def evaluate_with_temperature(model, loader, temperature=1.0):
    """Predict a class, or -1 when unconfident, for every sample in ``loader``.

    Logits are divided by ``temperature`` before the softmax. A sample keeps its
    top-1 class only when both module-level thresholds are met: the top-1 minus
    top-2 probability gap is at least ``confidence_margin`` and the top-1
    probability is at least ``absolute_confidence_threshold``. Otherwise its
    prediction is -1.

    Args:
        model: Network returning logits; switched to eval mode.
        loader: Iterable of (images, labels) batches. Labels are ignored.
        temperature: Positive softmax temperature. 1.0 leaves logits unchanged.

    Returns:
        (sample_ids, predicted_labels): two equally long lists of Python ints.
        IDs count samples from 0 in iteration order.

    Raises:
        ValueError: If ``temperature`` is not positive.
    """
    if temperature <= 0:
        raise ValueError(f"temperature must be positive, got {temperature}")

    model.eval()
    sample_ids = []
    predicted_labels = []
    next_sample_id = 0  # running offset, so IDs do not depend on a global batch size

    with torch.no_grad():
        for images, _ in loader:
            images = images.to(device)

            # Apply temperature scaling, then take the two most probable classes.
            probs = F.softmax(model(images) / temperature, dim=1)
            top2_probs, top2_classes = torch.topk(probs, 2, dim=1)

            # Compare in float64 on the CPU so the threshold tests give the same
            # result as comparing the individual values as Python floats.
            top2_probs = top2_probs.cpu().double()
            top_prob = top2_probs[:, 0]
            runner_up_prob = top2_probs[:, 1]
            is_confident = ((top_prob - runner_up_prob) >= confidence_margin) & (
                top_prob >= absolute_confidence_threshold
            )

            # Keep the top class where confident, otherwise -1.
            top_class = top2_classes[:, 0].cpu()
            batch_predictions = torch.where(
                is_confident, top_class, torch.full_like(top_class, -1)
            ).tolist()

            sample_ids.extend(range(next_sample_id, next_sample_id + len(batch_predictions)))
            predicted_labels.extend(batch_predictions)
            next_sample_id += len(batch_predictions)

    return sample_ids, predicted_labels



def _score_contribution(accuracy, correct, predicted, threshold):
    """Return ``correct`` when ``accuracy`` meets ``threshold``, else ``-2 * predicted``."""
    return correct if accuracy >= threshold else -2 * predicted


def calculate_summary(actual_labels, predicted_labels, confidence_threshold=0.90):
    """Build a per-class table of prediction counts and threshold scores.

    Classes are the distinct values of ``actual_labels`` other than -1. A
    prediction of -1 is counted as unconfident for the sample's actual class.
    Predictions of a class that never occurs in ``actual_labels`` are not
    counted in any row. "Accuracy" is correct predictions divided by predictions
    made for that class (0 when none were made).

    Args:
        actual_labels: Re-iterable collection of true class labels. A value of
            -1 raises KeyError because -1 is excluded from the class set.
        predicted_labels: Predicted labels aligned with ``actual_labels``.
            Pairs are formed with ``zip``, so extra items on the longer input
            are ignored.
        confidence_threshold: Accuracy a class must reach for its
            "Score_Contribution" to be its correct count instead of
            ``-2 * predicted``.

    Returns:
        pandas.DataFrame with one row per class plus a final "Total" row. The
        score columns use the fixed thresholds 0.90, 0.95 and 0.99. For empty
        input only the all-zero "Total" row is returned.
    """
    thresholds = [0.90, 0.95, 0.99]
    score_columns = [f"Score_{threshold}" for threshold in thresholds]
    columns = [
        "Class",
        "Actual",
        "Predicted",
        "Correct",
        "Incorrect",
        "Unconfident (-1)",
        "Accuracy",
        "Score_Contribution",
        *score_columns,
    ]

    # Determine the classes dynamically from the ground truth.
    classes = sorted(set(actual_labels) - {-1})
    class_counts = {cls: 0 for cls in classes}
    predictions_made = {cls: 0 for cls in classes}
    correct_predictions = {cls: 0 for cls in classes}
    unconfident_predictions = {cls: 0 for cls in classes}

    # Tally every (actual, predicted) pair.
    for actual, predicted in zip(actual_labels, predicted_labels):
        class_counts[actual] += 1
        if predicted == actual:
            correct_predictions[actual] += 1
        if predicted == -1:
            unconfident_predictions[actual] += 1
        elif predicted in predictions_made:
            predictions_made[predicted] += 1

    # One row per class.
    rows = []
    for cls in classes:
        made = predictions_made[cls]
        correct = correct_predictions[cls]
        accuracy = correct / made if made > 0 else 0
        row = {
            "Class": cls,
            "Actual": class_counts[cls],
            "Predicted": made,
            "Correct": correct,
            "Incorrect": made - correct,
            "Unconfident (-1)": unconfident_predictions[cls],
            "Accuracy": accuracy,
            "Score_Contribution": _score_contribution(accuracy, correct, made, confidence_threshold),
        }
        for threshold, column in zip(thresholds, score_columns):
            row[column] = _score_contribution(accuracy, correct, made, threshold)
        rows.append(row)

    # Totals are plain column sums; "Accuracy" is intentionally left blank.
    total_row = {"Class": "Total", "Accuracy": ""}
    for column in columns:
        if column not in total_row:
            total_row[column] = sum(row[column] for row in rows)
    total_df = pd.DataFrame([total_row], columns=columns)

    if not rows:
        # No classes at all: previously a KeyError on the missing score column.
        return total_df

    per_class_df = pd.DataFrame(rows, columns=columns)
    return pd.concat([per_class_df, total_df], ignore_index=True)





# NOTE(review): this overrides the num_epochs = 2 assigned with the other
# hyperparameters, after the OneCycleLR scheduler was already sized from that value.
num_epochs = 10

# Temperature used for the per-epoch summaries. It is defined here so this cell
# runs on a fresh kernel; the only other assignment of `temperature` lives in a
# later cell, which sets the same value.
temperature = 2.0

for epoch in range(num_epochs):
    avg_train_loss = train(model, train_loader, optimizer, epoch, num_epochs)
    overall_accuracy = calculate_accuracy(model, test_loader)
    print(f"Epoch [{epoch+1}/{num_epochs}], Average Training Loss: {avg_train_loss:.4f}, Overall Test Accuracy: {overall_accuracy:.2f}%")

    # Print the summary during the final 10 epochs and on every 10th epoch.
    # With num_epochs = 10 the first condition holds for every epoch.
    if epoch >= num_epochs - 10 or epoch % 10 == 0:
        sample_ids, predicted_labels = evaluate_with_temperature(model, test_loader, temperature=temperature)
        summary_df = calculate_summary(actual_labels, predicted_labels, confidence_threshold=0.99)
        print(f"\nSummary for Confidence Threshold 0.99 - Epoch {epoch+1}")
        print(tabulate(summary_df, headers='keys', tablefmt='fancy_grid'))

Epoch [1/10], Average Training Loss: 0.8311
Epoch [1/10], Overall Test Accuracy: 81.59%

Summary for Confidence Threshold 0.99 - Epoch 1
╒════╤═════════╤══════════╤═════════════╤═══════════╤═════════════╤════════════════════╤════════════╤══════════════════════╤═════════════╤══════════════╤══════════════╕
│    │ Class   │   Actual │   Predicted │   Correct │   Incorrect │   Unconfident (-1) │ Accuracy   │   Score_Contribution │   Score_0.9 │   Score_0.95 │   Score_0.99 │
╞════╪═════════╪══════════╪═════════════╪═══════════╪═════════════╪════════════════════╪════════════╪══════════════════════╪═════════════╪══════════════╪══════════════╡
│  0 │ 0       │     1000 │           0 │         0 │           0 │               1000 │ 0.0        │                    0 │           0 │            0 │            0 │
├────┼─────────┼──────────┼─────────────┼───────────┼─────────────┼────────────────────┼────────────┼──────────────────────┼─────────────┼──────────────┼──────────────┤
│  1 │ 1       │  

Epoch [1/10], Average Training Loss: 0.2322
Epoch [1/10], Overall Test Accuracy: 92.01%
Epoch [2/10], Average Training Loss: 0.2233
Epoch [2/10], Overall Test Accuracy: 91.66%
Epoch [3/10], Average Training Loss: 0.2203
Epoch [3/10], Overall Test Accuracy: 92.33%
Epoch [4/10], Average Training Loss: 0.2171
Epoch [4/10], Overall Test Accuracy: 92.17%
Epoch [5/10], Average Training Loss: 0.2180
Epoch [5/10], Overall Test Accuracy: 92.35%
Epoch [6/10], Average Training Loss: 0.2133
Epoch [6/10], Overall Test Accuracy: 92.23%
Epoch [7/10], Average Training Loss: 0.2129
Epoch [7/10], Overall Test Accuracy: 92.46%
Epoch [8/10], Average Training Loss: 0.2115
Epoch [8/10], Overall Test Accuracy: 92.40%
Epoch [9/10], Average Training Loss: 0.2085
Epoch [9/10], Overall Test Accuracy: 92.50%
Epoch [10/10], Average Training Loss: 0.2088
Epoch [10/10], Overall Test Accuracy: 92.33%



Summary for Confidence Threshold 0.99 - Epoch 10
╒════╤═════════╤══════════╤═════════════╤═══════════╤═════════════╤════════════════════╤════════════════════╤══════════════════════╤═════════════╤══════════════╤══════════════╕
│    │ Class   │   Actual │   Predicted │   Correct │   Incorrect │   Unconfident (-1) │ Accuracy           │   Score_Contribution │   Score_0.9 │   Score_0.95 │   Score_0.99 │
╞════╪═════════╪══════════╪═════════════╪═══════════╪═════════════╪════════════════════╪════════════════════╪══════════════════════╪═════════════╪══════════════╪══════════════╡
│  0 │ 0       │     1000 │         645 │       625 │          20 │                360 │ 0.9689922480620154 │                -1290 │         625 │          625 │        -1290 │
├────┼─────────┼──────────┼─────────────┼───────────┼─────────────┼────────────────────┼────────────────────┼──────────────────────┼─────────────┼──────────────┼──────────────┤
│  1 │ 1       │     1000 │         981 │       980 │           1

In [8]:
# Temperature scaling parameter: evaluate_with_temperature divides the logits by
# this value before the softmax. Only cells executed after this assignment see it.
temperature = 2.0  # Adjust temperature to a suitable value (can be tuned)
# NOTE(review): the training cell earlier in this notebook also defines
# calculate_summary; whichever cell ran last is the definition in effect.
# Keep the two in sync or delete one of them.
def _score_contribution(accuracy, correct, predicted, threshold):
    """Return ``correct`` when ``accuracy`` meets ``threshold``, else ``-2 * predicted``."""
    return correct if accuracy >= threshold else -2 * predicted


def calculate_summary(actual_labels, predicted_labels, confidence_threshold=0.90):
    """Build a per-class table of prediction counts and threshold scores.

    Classes are the distinct values of ``actual_labels`` other than -1. A
    prediction of -1 is counted as unconfident for the sample's actual class.
    Predictions of a class that never occurs in ``actual_labels`` are not
    counted in any row. "Accuracy" is correct predictions divided by predictions
    made for that class (0 when none were made).

    Args:
        actual_labels: Re-iterable collection of true class labels. A value of
            -1 raises KeyError because -1 is excluded from the class set.
        predicted_labels: Predicted labels aligned with ``actual_labels``.
            Pairs are formed with ``zip``, so extra items on the longer input
            are ignored.
        confidence_threshold: Accuracy a class must reach for its
            "Score_Contribution" to be its correct count instead of
            ``-2 * predicted``.

    Returns:
        pandas.DataFrame with one row per class plus a final "Total" row. The
        score columns use the fixed thresholds 0.90, 0.95 and 0.99. For empty
        input only the all-zero "Total" row is returned.
    """
    thresholds = [0.90, 0.95, 0.99]
    score_columns = [f"Score_{threshold}" for threshold in thresholds]
    columns = [
        "Class",
        "Actual",
        "Predicted",
        "Correct",
        "Incorrect",
        "Unconfident (-1)",
        "Accuracy",
        "Score_Contribution",
        *score_columns,
    ]

    # Determine the classes dynamically from the ground truth.
    classes = sorted(set(actual_labels) - {-1})
    class_counts = {cls: 0 for cls in classes}
    predictions_made = {cls: 0 for cls in classes}
    correct_predictions = {cls: 0 for cls in classes}
    unconfident_predictions = {cls: 0 for cls in classes}

    # Tally every (actual, predicted) pair.
    for actual, predicted in zip(actual_labels, predicted_labels):
        class_counts[actual] += 1
        if predicted == actual:
            correct_predictions[actual] += 1
        if predicted == -1:
            unconfident_predictions[actual] += 1
        elif predicted in predictions_made:
            predictions_made[predicted] += 1

    # One row per class.
    rows = []
    for cls in classes:
        made = predictions_made[cls]
        correct = correct_predictions[cls]
        accuracy = correct / made if made > 0 else 0
        row = {
            "Class": cls,
            "Actual": class_counts[cls],
            "Predicted": made,
            "Correct": correct,
            "Incorrect": made - correct,
            "Unconfident (-1)": unconfident_predictions[cls],
            "Accuracy": accuracy,
            "Score_Contribution": _score_contribution(accuracy, correct, made, confidence_threshold),
        }
        for threshold, column in zip(thresholds, score_columns):
            row[column] = _score_contribution(accuracy, correct, made, threshold)
        rows.append(row)

    # Totals are plain column sums; "Accuracy" is intentionally left blank.
    total_row = {"Class": "Total", "Accuracy": ""}
    for column in columns:
        if column not in total_row:
            total_row[column] = sum(row[column] for row in rows)
    total_df = pd.DataFrame([total_row], columns=columns)

    if not rows:
        # No classes at all: previously a KeyError on the missing score column.
        return total_df

    per_class_df = pd.DataFrame(rows, columns=columns)
    return pd.concat([per_class_df, total_df], ignore_index=True)

# Score the test set with temperature-scaled probabilities, then print the
# per-class summary using 0.99 as the accuracy threshold for "Score_Contribution".
sample_ids, predicted_labels = evaluate_with_temperature(model, test_loader, temperature=temperature)
summary_df = calculate_summary(actual_labels, predicted_labels, confidence_threshold=0.99)
print(f"\nSummary with Temperature Scaling (T={temperature})")
print(tabulate(summary_df, headers='keys', tablefmt='fancy_grid'))



Summary with Temperature Scaling (T=2.0)
╒════╤═════════╤══════════╤═════════════╤═══════════╤═════════════╤════════════════════╤════════════════════╤══════════════════════╤═════════════╤══════════════╤══════════════╕
│    │ Class   │   Actual │   Predicted │   Correct │   Incorrect │   Unconfident (-1) │ Accuracy           │   Score_Contribution │   Score_0.9 │   Score_0.95 │   Score_0.99 │
╞════╪═════════╪══════════╪═════════════╪═══════════╪═════════════╪════════════════════╪════════════════════╪══════════════════════╪═════════════╪══════════════╪══════════════╡
│  0 │ 0       │     1000 │         402 │       401 │           1 │                594 │ 0.9975124378109452 │                  401 │         401 │          401 │          401 │
├────┼─────────┼──────────┼─────────────┼───────────┼─────────────┼────────────────────┼────────────────────┼──────────────────────┼─────────────┼──────────────┼──────────────┤
│  1 │ 1       │     1000 │         959 │       959 │           0 │      