In [1]:
# BLOCK 1

import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, TensorDataset, Dataset
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from sklearn.model_selection import KFold
from tqdm import tqdm
import os
from datetime import datetime
import optuna


  from .autonotebook import tqdm as notebook_tqdm


In [None]:
# BLOCK 6

# Load data directly from files.
# os.listdir returns entries in arbitrary order, so every listing is sorted to keep
# the file order (and any index-based split built on top of it) reproducible.
train_files = sorted(os.path.join('./npz_results/train', f) for f in os.listdir('./npz_results/train') if f.endswith('.npz'))
val_files = sorted(os.path.join('./npz_results/val', f) for f in os.listdir('./npz_results/val') if f.endswith('.npz'))
test_files = sorted(os.path.join('./npz_results/test', f) for f in os.listdir('./npz_results/test') if f.endswith('.npz'))

def print_first_entry(files):
    """Print the first image, its shape, and the first label of the first file.

    Args:
        files: Sequence of paths to .npz archives that contain 'images' and
            'labels' arrays. Nothing is printed when the sequence is empty.
    """
    if len(files) == 0:
        return

    # NOTE(review): allow_pickle=True can execute arbitrary code when loading an
    # untrusted archive; keep it only if the .npz files come from a trusted source.
    # The context manager closes the archive's file handle once the arrays are read.
    with np.load(files[0], allow_pickle=True) as data:
        images = data['images']
        labels = data['labels']

    # Print the first entry of images and labels
    print("NR13: First entry of images:")
    print(images[0])
    print("NR14: Shape of first entry:", images[0].shape)

    print("\nNR15: First entry of labels:")
    print(labels[0])

# Sanity check: print the first image/label pair stored in the first training file
print_first_entry(train_files)




NR13: First entry of images:
[[  0   0   0 ... 255 255 255]
 [255 255 255 ... 255 255 255]
 [255 255 255 ... 255 255 255]
 ...
 [255 255 255 ...   0   0   0]
 [255 255 255 ...   0   0   0]
 [255 255 255 ... 255   0   0]]
NR14: Shape of first entry: (256, 256)

NR15: First entry of labels:
0


In [3]:
# BLOCK 2

import numpy as np
import os
from torch.utils.data import Dataset, DataLoader
import torch

class NumpyDataset(Dataset):
    """Dataset whose items are whole .npz archives of images and labels.

    Only archives whose 'images' array has exactly `expected_length` entries are
    kept. Each item is the full content of one archive, not a single sample.

    Args:
        data_dir: Directory that is scanned (non-recursively) for '.npz' files.
        expected_length: Required number of entries in an archive's 'images'
            array for the archive to be included. Defaults to 10000.
    """

    def __init__(self, data_dir, expected_length=10000):
        # Sort so that index -> file mapping does not depend on os.listdir order.
        candidates = sorted(
            os.path.join(data_dir, f) for f in os.listdir(data_dir) if f.endswith('.npz')
        )
        self.expected_length = expected_length
        self.files = [f for f in candidates if self._get_data_length(f) == expected_length]
        print(f"NR1: Found {len(self.files)} files with {expected_length} entries in {data_dir}")

    def __len__(self):
        """Return the number of archives that passed the length filter."""
        return len(self.files)

    def __getitem__(self, idx):
        """Load archive `idx` and return `(images, labels)` as tensors.

        Images are converted to float32 and get a new axis inserted at
        position 1; labels are converted to int64.
        """
        # NOTE(review): allow_pickle=True is unsafe for untrusted archives.
        # The context manager closes the archive after the arrays are read.
        with np.load(self.files[idx], allow_pickle=True) as data:
            images = data['images']
            labels = data['labels']
        return torch.tensor(images, dtype=torch.float32).unsqueeze(1), torch.tensor(labels, dtype=torch.long)

    def _get_data_length(self, file_path):
        """Return the number of entries in the 'images' array of `file_path`."""
        with np.load(file_path, allow_pickle=True) as data:
            return len(data['images'])


In [4]:
# BLOCK 3

import torch
import torch.nn as nn
import torch.nn.functional as F

class CNNModel(nn.Module):
    """Three-stage convolutional classifier for single-channel images with 2 output classes.

    Each stage is a 3x3 convolution (padding 1) followed by ReLU and 2x2 max
    pooling, so the spatial size is halved three times before the two fully
    connected layers.

    Args:
        conv1_filters: Output channels of the first convolution.
        conv2_filters: Output channels of the second convolution.
        conv3_filters: Output channels of the third convolution.
        fc1_units: Width of the hidden fully connected layer.
        dropout_rate: Dropout probability applied after the hidden layer.
        input_size: Spatial size of the input images, either an int (square
            images) or a `(height, width)` pair. Defaults to 256, which gives
            the 32 x 32 feature map the layer sizes were originally written for.

    Raises:
        ValueError: If the input is too small to survive three 2x2 poolings.
    """

    def __init__(self, conv1_filters=32, conv2_filters=64, conv3_filters=128, fc1_units=128, dropout_rate=0.5, input_size=256):
        super().__init__()
        if isinstance(input_size, int):
            height, width = input_size, input_size
        else:
            height, width = input_size

        # Three floor-halvings are equivalent to a single floor division by 8.
        pooled_h, pooled_w = height // 8, width // 8
        if pooled_h < 1 or pooled_w < 1:
            raise ValueError(f"input_size {input_size!r} is too small for three 2x2 poolings")

        self.conv1 = nn.Conv2d(1, conv1_filters, kernel_size=3, padding=1)  # 1 input channel: grayscale
        self.pool = nn.MaxPool2d(2, 2)
        self.conv2 = nn.Conv2d(conv1_filters, conv2_filters, kernel_size=3, padding=1)
        self.conv3 = nn.Conv2d(conv2_filters, conv3_filters, kernel_size=3, padding=1)
        self.fc1 = nn.Linear(conv3_filters * pooled_h * pooled_w, fc1_units)
        self.dropout = nn.Dropout(dropout_rate)
        self.fc2 = nn.Linear(fc1_units, 2)

    def forward(self, x):
        """Return raw class scores of shape `(batch, 2)` for input `x` of shape `(batch, 1, H, W)`."""
        for conv in (self.conv1, self.conv2, self.conv3):
            x = self.pool(F.relu(conv(x)))
        x = x.view(x.size(0), -1)  # flatten everything except the batch axis
        x = self.dropout(F.relu(self.fc1(x)))
        return self.fc2(x)


In [5]:
# BLOCK 4

def _npz_batches(file, batch_size, shuffle):
    """Load one .npz archive and return a DataLoader over its (image, label) pairs."""
    # NOTE(review): allow_pickle=True is unsafe for untrusted archives.
    # The context manager closes the archive once both arrays are copied into tensors.
    with np.load(file, allow_pickle=True) as data:
        images = torch.tensor(data['images'], dtype=torch.float32).unsqueeze(1)
        labels = torch.tensor(data['labels'], dtype=torch.long)
    return DataLoader(TensorDataset(images, labels), batch_size=batch_size, shuffle=shuffle)


def train_and_evaluate(model, criterion, optimizer, train_files, val_files, fold, batch_size=64, epochs=20):
    """Train `model` on `train_files`, then score it on `val_files`.

    Files are processed one at a time: each archive is loaded, wrapped in a
    DataLoader, and iterated in mini-batches.

    Args:
        model: Module to train; left in eval mode on return.
        criterion: Loss called as `criterion(outputs, labels)`.
        optimizer: Optimizer that updates `model`'s parameters.
        train_files: Paths of .npz archives (with 'images' and 'labels') used for training.
        val_files: Paths of .npz archives used for validation.
        fold: Zero-based fold index, used only for the progress-bar label.
        batch_size: Mini-batch size for both training and validation.
        epochs: Number of passes over `train_files`. Defaults to 20.

    Returns:
        Tuple `(accuracy, precision, recall, f1)`; precision, recall and F1 are
        macro-averaged.
    """
    model.train()
    for _ in tqdm(range(epochs), desc=f'Training fold {fold + 1}'):
        for file in train_files:
            for batch_images, batch_labels in _npz_batches(file, batch_size, shuffle=True):
                optimizer.zero_grad()
                outputs = model(batch_images)
                loss = criterion(outputs, batch_labels)
                loss.backward()
                optimizer.step()

    model.eval()
    val_predictions = []
    val_true = []
    with torch.no_grad():
        for file in val_files:
            for batch_images, batch_labels in _npz_batches(file, batch_size, shuffle=False):
                outputs = model(batch_images)
                # Index of the highest score per row is the predicted class.
                _, predicted = torch.max(outputs, 1)
                val_predictions.extend(predicted.numpy())
                val_true.extend(batch_labels.numpy())

    accuracy = accuracy_score(val_true, val_predictions)
    precision = precision_score(val_true, val_predictions, average='macro')
    recall = recall_score(val_true, val_predictions, average='macro')
    f1 = f1_score(val_true, val_predictions, average='macro')

    return accuracy, precision, recall, f1

def train_with_folds(train_files, val_files, fold_count=7, conv1_filters=32, conv2_filters=64, conv3_filters=128, fc1_units=128, dropout_rate=0.5, batch_size=64):
    """Run K-fold cross-validation over `train_files` and return the best fold's model.

    For every fold a fresh CNNModel is trained on the fold's training files and
    scored on the fold's held-out files.

    Args:
        train_files: Paths of the .npz archives that are split into folds.
        val_files: Kept for interface compatibility. It is not used to build
            folds: the fold indices are produced by splitting `train_files`,
            so they are only valid as indices into `train_files`.
        fold_count: Number of folds.
        conv1_filters, conv2_filters, conv3_filters, fc1_units, dropout_rate:
            Hyperparameters forwarded to CNNModel.
        batch_size: Mini-batch size forwarded to train_and_evaluate.

    Returns:
        Tuple `(best_model, results)` where `best_model` is the model of the
        fold with the highest F1 and `results` is a list with one
        `(accuracy, precision, recall, f1)` tuple per fold.
    """
    print(f"NR7: Training with {fold_count} folds")
    results = []
    best_model = None
    best_f1 = 0.0
    kf = KFold(n_splits=fold_count)

    for fold, (train_index, val_index) in enumerate(kf.split(train_files)):
        print(f"NR8: Starting fold {fold + 1}")

        train_fold_files = [train_files[i] for i in train_index]
        # val_index refers to positions in train_files (the sequence that was split);
        # indexing any other list with it picks unrelated files or raises IndexError.
        val_fold_files = [train_files[i] for i in val_index]

        model = CNNModel(conv1_filters, conv2_filters, conv3_filters, fc1_units, dropout_rate)
        criterion = nn.CrossEntropyLoss()
        optimizer = optim.Adam(model.parameters(), lr=0.001)

        accuracy, precision, recall, f1 = train_and_evaluate(model, criterion, optimizer, train_fold_files, val_fold_files, fold, batch_size)
        results.append((accuracy, precision, recall, f1))

        # Always keep the first model so a model is returned even if every fold scores F1 == 0.
        if best_model is None or f1 > best_f1:
            best_f1 = f1
            best_model = model

    avg_results = np.mean(results, axis=0)
    print(f"NR9: Average results across all folds:")
    print(f"  Accuracy: {avg_results[0]}")
    print(f"  Precision: {avg_results[1]}")
    print(f"  Recall: {avg_results[2]}")
    print(f"  F1 Score: {avg_results[3]}")

    return best_model, results

In [6]:
# BLOCK 6

# Load data directly from files.
# os.listdir returns entries in arbitrary order, so every listing is sorted to keep
# the file order (and any index-based split built on top of it) reproducible.
train_files = sorted(os.path.join('./npz_results/train', f) for f in os.listdir('./npz_results/train') if f.endswith('.npz'))
val_files = sorted(os.path.join('./npz_results/val', f) for f in os.listdir('./npz_results/val') if f.endswith('.npz'))
test_files = sorted(os.path.join('./npz_results/test', f) for f in os.listdir('./npz_results/test') if f.endswith('.npz'))

def print_first_entry(files):
    """Print the first image, its shape, and the first label of the first file.

    Args:
        files: Sequence of paths to .npz archives that contain 'images' and
            'labels' arrays. Nothing is printed when the sequence is empty.
    """
    if len(files) == 0:
        return

    # NOTE(review): allow_pickle=True can execute arbitrary code when loading an
    # untrusted archive; keep it only if the .npz files come from a trusted source.
    # The context manager closes the archive's file handle once the arrays are read.
    with np.load(files[0], allow_pickle=True) as data:
        images = data['images']
        labels = data['labels']

    # Print the first entry of images and labels
    print("NR13: First entry of images:")
    print(images[0])
    print("NR14: Shape of first entry:", images[0].shape)

    print("\nNR15: First entry of labels:")
    print(labels[0])

# Sanity check: print the first image/label pair stored in the first training file
print_first_entry(train_files)




NR13: First entry of images:
[[  0   0   0 ... 255 255 255]
 [255 255 255 ... 255 255 255]
 [255 255 255 ... 255 255 255]
 ...
 [255 255 255 ...   0   0   0]
 [255 255 255 ...   0   0   0]
 [255 255 255 ... 255   0   0]]
NR14: Shape of first entry: (256, 256)

NR15: First entry of labels:
0


In [7]:
# BLOCK 8

# Use the best hyperparameters from the study
# best_hyperparameters = best_trial.params
best_hyperparameters = {'conv1_filters': 48, 'conv2_filters': 64, 'conv3_filters': 128, 'fc1_units': 128, 'dropout_rate': 0.3}

# Train the model with full data using the best hyperparameters
best_model, results = train_with_folds(
    train_files,
    val_files,
    fold_count=7,
    conv1_filters=best_hyperparameters['conv1_filters'],
    conv2_filters=best_hyperparameters['conv2_filters'],
    conv3_filters=best_hyperparameters['conv3_filters'],
    fc1_units=best_hyperparameters['fc1_units'],
    dropout_rate=best_hyperparameters['dropout_rate'],
    batch_size=64
)

# Per-metric mean over the folds (columns: accuracy, precision, recall, F1)
avg_results = np.mean(results, axis=0)
print("NR16: Final training with best hyperparameters:")
print(f"  Accuracy: {avg_results[0]}")
print(f"  Precision: {avg_results[1]}")
print(f"  Recall: {avg_results[2]}")
print(f"  F1 Score: {avg_results[3]}")

# Timestamped file name so repeated runs do not overwrite earlier checkpoints
now = datetime.now()
timestamp = now.strftime("%Y%m%d_%H%M%S")
filename = f"best_simple_images_cnn_model_{timestamp}.pth"
print(f"NR17: Model will be saved as: {filename}")

if best_model is not None:
    # Save the parameters only (state_dict), not the pickled module object:
    # the checkpoint is restored with model.load_state_dict(torch.load(filename)),
    # which requires a state_dict and fails on a whole-module pickle.
    torch.save(best_model.state_dict(), filename)
    print(f"NR18: Best model saved to '{filename}'.")


NR7: Training with 7 folds
NR8: Starting fold 1


Training fold 1:   5%|▌         | 1/20 [1:50:27<34:58:43, 6627.56s/it]


KeyboardInterrupt: 

In [None]:
# BLOCK 9 - Testing the model on the test set

def evaluate_model_on_test_set(model, test_files, batch_size=64):
    """Score `model` on every archive in `test_files` and print the metrics.

    Args:
        model: Module returning per-class scores; switched to eval mode.
        test_files: Paths of .npz archives containing 'images' and 'labels'.
        batch_size: Mini-batch size used while predicting.

    Returns:
        Tuple `(accuracy, precision, recall, f1)`; precision, recall and F1 are
        macro-averaged over all samples of all files.
    """
    model.eval()
    test_predictions = []
    test_true = []
    with torch.no_grad():
        for file in test_files:
            # NOTE(review): allow_pickle=True is unsafe for untrusted archives.
            # The context manager closes the archive once the tensors are built.
            with np.load(file, allow_pickle=True) as data:
                images = torch.tensor(data['images'], dtype=torch.float32).unsqueeze(1)
                labels = torch.tensor(data['labels'], dtype=torch.long)

            dataloader = DataLoader(TensorDataset(images, labels), batch_size=batch_size, shuffle=False)

            for batch_images, batch_labels in dataloader:
                outputs = model(batch_images)
                # Index of the highest score per row is the predicted class.
                _, predicted = torch.max(outputs, 1)
                test_predictions.extend(predicted.numpy())
                test_true.extend(batch_labels.numpy())

    accuracy = accuracy_score(test_true, test_predictions)
    precision = precision_score(test_true, test_predictions, average='macro')
    recall = recall_score(test_true, test_predictions, average='macro')
    f1 = f1_score(test_true, test_predictions, average='macro')

    print("NR19: Test set evaluation results:")
    print(f"  Accuracy: {accuracy}")
    print(f"  Precision: {precision}")
    print(f"  Recall: {recall}")
    print(f"  F1 Score: {f1}")

    return accuracy, precision, recall, f1

# Load the best model.
# A checkpoint written by torch.save can hold either a state_dict or a whole pickled
# module; load_state_dict only accepts the former, so both forms are handled here.
checkpoint = torch.load(filename)
if isinstance(checkpoint, nn.Module):
    model = checkpoint
else:
    model = CNNModel(**best_hyperparameters)
    model.load_state_dict(checkpoint)

# Evaluate the model on the test set
evaluate_model_on_test_set(model, test_files)