In [1]:
import os
import torch
from torch import nn, optim
from torch.utils.data import TensorDataset, DataLoader
from torchvision import transforms
from PIL import Image
from sklearn.metrics import f1_score

# Dataset location: every input path below lives under this directory.
_DATA_ROOT = '/Users/louieburns/Library/CloudStorage/OneDrive-UniversityofLeeds/Year 3/AI and Machine Learning/Term 1/Coursework 1/Actual Coursework/dataoriginal'

# Folder holding the image files
image_folder = f'{_DATA_ROOT}/images'

# Text files listing the image ids and labels of each split
train_label_file = f'{_DATA_ROOT}/images_family_train.txt'
validation_label_file = f'{_DATA_ROOT}/images_family_validation.txt'
test_label_file = f'{_DATA_ROOT}/images_family_test.txt'

# Map each label name to its integer class index.
# The index of a name is its position in the tuple below, so the order
# of the entries is significant and must not be rearranged.
label_mapping = {
    family: class_index
    for class_index, family in enumerate((
        "Boeing 707", "Boeing 727", "Boeing 737", "Boeing 747", "Boeing 757",
        "Boeing 767", "Boeing 777",
        "A300", "A310", "A320", "A330", "A340", "A380",
        "ATR-42", "ATR-72", "An-12",
        "BAE 146", "BAE-125", "Beechcraft 1900", "Boeing 717",
        "C-130", "C-47", "CRJ-200", "CRJ-700",
        "Cessna 172", "Cessna 208", "Cessna Citation", "Challenger 600",
        "DC-10", "DC-3", "DC-6", "DC-8", "DC-9",
        "DH-82", "DHC-1", "DHC-6", "Dash 8", "DR-400", "Dornier 328",
        "Embraer E-Jet", "EMB-120", "Embraer ERJ 145", "Embraer Legacy 600",
        "Eurofighter Typhoon",
        "F-16", "F/A-18", "Falcon 2000", "Falcon 900",
        "Fokker 100", "Fokker 50", "Fokker 70",
        "Global Express", "Gulfstream", "Hawk T1", "Il-76", "L-1011",
        "MD-11", "MD-80", "MD-90", "Metroliner", "King Air",
        "PA-28", "SR-20", "Saab 2000", "Saab 340",
        "Spitfire", "Tornado", "Tu-134", "Tu-154", "Yak-42",
    ))
}

# Image preprocessing pipeline; the steps run in the order listed.
_transform_steps = [
    # Bring every image to a fixed 128x128 size
    transforms.Resize(size=(128, 128)),
    # Random augmentation: horizontal flip, rotation of up to 15 degrees,
    # and jitter of brightness, contrast, saturation and hue
    transforms.RandomHorizontalFlip(),
    transforms.RandomRotation(degrees=15),
    transforms.ColorJitter(brightness=0.2, contrast=0.2, saturation=0.2, hue=0.1),
    # Convert the PIL image to a tensor
    transforms.ToTensor(),
]
transform = transforms.Compose(_transform_steps)

def process_data(label_file, image_transform=None):
    """Load the labelled images listed in ``label_file`` into two tensors.

    Every line of ``label_file`` is split once on whitespace into an image
    id and a label name.  The image is read from
    ``<image_folder>/<id>.jpg``, converted to RGB and passed through the
    transform; the label name is translated to its integer index through
    ``label_mapping``.

    Lines that do not split into exactly two parts are skipped silently.
    Entries with an unknown label, a missing image file, or an image that
    fails to load/transform are skipped with a printed message, so one bad
    entry does not abort the whole load.

    NOTE: the transform is applied exactly once per image, here, at load
    time.  Any random operation in it is therefore sampled a single time
    and frozen into the returned tensor, for whichever split is being
    loaded.  Pass a deterministic ``image_transform`` for data that should
    not be augmented.

    Args:
        label_file: Path of the text file listing ``<image id> <label>``.
        image_transform: Optional callable applied to each RGB PIL image;
            it must return tensors of identical shape.  Defaults to the
            module-level ``transform``.

    Returns:
        ``(images, labels)`` where ``images`` is the stack of all
        transformed images and ``labels`` is a ``torch.long`` tensor with
        one class index per image, in file order.

    Raises:
        ValueError: If no usable sample could be loaded from ``label_file``.
        RuntimeError: If the transformed images cannot be stacked (raised
            by ``torch.stack``, e.g. for differing shapes).
    """
    pipeline = transform if image_transform is None else image_transform

    image_data = []
    labels = []

    with open(label_file, "r") as f:
        for line in f:
            parts = line.strip().split(maxsplit=1)
            if len(parts) != 2:
                continue  # Skip malformed lines
            filename, label = parts

            if label not in label_mapping:
                print(f"Warning: Label '{label}' not found in label_mapping.")
                continue

            image_path = os.path.join(image_folder, filename + ".jpg")
            if not os.path.exists(image_path):
                # Report the real cause instead of blaming the label.
                print(f"Warning: Image file '{image_path}' not found.")
                continue

            try:
                # The context manager closes the file handle as soon as the
                # pixel data has been copied out by convert().
                with Image.open(image_path) as raw_image:
                    image_tensor = pipeline(raw_image.convert("RGB"))
            except Exception as e:
                # Deliberate best effort: log the bad image and keep going.
                print(f"Error processing {image_path}: {e}")
                continue

            # Both lists are only ever extended together, so they cannot
            # get out of step.
            image_data.append(image_tensor)
            labels.append(label_mapping[label])

    if not image_data:
        # torch.stack() rejects an empty list; fail with a clear message.
        raise ValueError(f"No usable samples were loaded from '{label_file}'.")

    # Stacking errors propagate unchanged rather than being swallowed.
    stacked_images = torch.stack(image_data)
    label_tensor = torch.tensor(labels, dtype=torch.long)

    return stacked_images, label_tensor

# Load the training split and wrap it in a TensorDataset.
# The tensors are read back from the dataset so both names stay available.
train_dataset = TensorDataset(*process_data(train_label_file))
train_image_tensor, train_label_tensor = train_dataset.tensors

# Same for the validation split.
validation_dataset = TensorDataset(*process_data(validation_label_file))
validation_image_tensor, validation_label_tensor = validation_dataset.tensors

# Batch both splits; only the training loader shuffles its samples.
batch_size = 64
train_loader = DataLoader(dataset=train_dataset, batch_size=batch_size, shuffle=True)
validation_loader = DataLoader(dataset=validation_dataset, batch_size=batch_size, shuffle=False)
print("Training and Validation DataLoaders created successfully with batch size:", batch_size)

# Load the test split and batch it in file order.
test_dataset = TensorDataset(*process_data(test_label_file))
test_image_tensor, test_label_tensor = test_dataset.tensors
test_loader = DataLoader(dataset=test_dataset, batch_size=batch_size, shuffle=False)
print("Testing DataLoader created successfully with batch size:", batch_size)

# Define the CNN model
class EnhancedCNN(nn.Module):
    """Three-stage convolutional classifier for square RGB images.

    Each stage is a 3x3 convolution (padding 1), ReLU and 2x2 max pooling,
    with 32, 64 and 128 output channels respectively.  The pooled feature
    map is flattened per sample and fed through a 256-unit hidden layer
    with dropout (p=0.3) into the output layer, which returns raw logits.

    Args:
        num_classes: Number of output logits.  Defaults to
            ``len(label_mapping)``.
        input_size: Height/width in pixels of the square input images.
            Defaults to 128.

    Raises:
        ValueError: If ``input_size`` is smaller than 8, which would leave
            no features after the three pooling steps.
    """

    def __init__(self, num_classes=None, input_size=128):
        super().__init__()
        if num_classes is None:
            # Original behaviour: one logit per entry of the label mapping.
            num_classes = len(label_mapping)
        if input_size < 8:
            raise ValueError(f"input_size must be at least 8, got {input_size}.")

        # The convolutions keep the spatial size; each of the three 2x2
        # poolings halves it (rounding down), i.e. divides it by 8 overall.
        pooled_side = input_size // 8

        self.conv1 = nn.Conv2d(3, 32, kernel_size=3, padding=1)
        self.conv2 = nn.Conv2d(32, 64, kernel_size=3, padding=1)
        self.conv3 = nn.Conv2d(64, 128, kernel_size=3, padding=1)
        self.pool = nn.MaxPool2d(2, 2)
        self.dropout = nn.Dropout(0.3)
        self.fc1 = nn.Linear(128 * pooled_side * pooled_side, 256)
        self.fc2 = nn.Linear(256, num_classes)
        self.relu = nn.ReLU()

    def forward(self, x):
        """Return the class logits for a batch of images ``x``."""
        x = self.pool(self.relu(self.conv1(x)))
        x = self.pool(self.relu(self.conv2(x)))
        x = self.pool(self.relu(self.conv3(x)))
        # Flatten everything except the batch dimension.  A hard-coded
        # view(-1, features) would silently fold samples together when the
        # input size is wrong; this way the mismatch is reported by fc1.
        x = x.flatten(start_dim=1)
        x = self.dropout(self.relu(self.fc1(x)))
        x = self.fc2(x)
        return x

# Build the network, then its loss function and optimiser
# (cross-entropy on the logits, Adam with a learning rate of 0.001).
model = EnhancedCNN()
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(params=model.parameters(), lr=0.001)

# Train for a fixed number of epochs, validating after every epoch.
num_epochs = 15
for epoch in range(num_epochs):
    # ---- optimisation pass over the training batches ----
    model.train()
    running_loss = 0.0
    for images, labels in train_loader:
        optimizer.zero_grad()
        outputs = model(images)
        loss = criterion(outputs, labels)
        loss.backward()
        optimizer.step()
        running_loss += loss.item()

    # Reported loss is the mean of the per-batch losses.
    epoch_loss = running_loss / len(train_loader)
    print(f"Epoch [{epoch+1}/{num_epochs}], Loss: {epoch_loss:.4f}")

    # ---- validation pass: no gradients, dropout disabled by eval() ----
    model.eval()
    val_loss, correct, total = 0.0, 0, 0
    with torch.no_grad():
        for images, labels in validation_loader:
            outputs = model(images)
            loss = criterion(outputs, labels)
            val_loss += loss.item()
            # Predicted class = index of the largest logit in each row.
            predicted = torch.max(outputs, dim=1).indices
            total += labels.size(0)
            correct += (predicted == labels).sum().item()

    val_accuracy = 100 * correct / total
    mean_val_loss = val_loss / len(validation_loader)
    print(f"Validation Loss: {mean_val_loss:.4f}, Validation Accuracy: {val_accuracy:.2f}%")

# Run the trained model over the test batches and collect, per sample,
# the reference label and the predicted class index.
model.eval()
true_labels, predicted_labels = [], []
with torch.no_grad():
    for images, labels in test_loader:
        outputs = model(images)
        # Predicted class = index of the largest logit in each row.
        predicted = torch.max(outputs, dim=1).indices
        true_labels.extend(labels.cpu().numpy())
        predicted_labels.extend(predicted.cpu().numpy())

# Weighted F1 over all collected test predictions
average_f1 = f1_score(y_true=true_labels, y_pred=predicted_labels, average='weighted')
print(f"Average F1-Score: {average_f1:.4f}")

Training DataLoader created successfully with batch size: 64
Testing DataLoader created successfully with batch size: 64
Epoch [1/15], Loss: 4.1378
Epoch [2/15], Loss: 4.0729
Epoch [3/15], Loss: 3.9789
Epoch [4/15], Loss: 3.8675
Epoch [5/15], Loss: 3.7341
Epoch [6/15], Loss: 3.5566
Epoch [7/15], Loss: 3.3014
Epoch [8/15], Loss: 3.0455
Epoch [9/15], Loss: 2.6671
Epoch [10/15], Loss: 2.2090
Epoch [11/15], Loss: 1.7782
Epoch [12/15], Loss: 1.3288
Epoch [13/15], Loss: 1.0242
Epoch [14/15], Loss: 0.7911
Epoch [15/15], Loss: 0.6255
Accuracy on test data: 19.50%
