In [None]:
from datetime import datetime
import json
import math
import os

import numpy as np
from PIL import Image
from sklearn.metrics import (
    confusion_matrix,
    accuracy_score,
    f1_score,
    roc_auc_score
)
from sklearn.model_selection import train_test_split
import torch
import torch.nn as nn
import torch.optim
from torch.utils.data import Dataset, DataLoader
from torch.utils.tensorboard import SummaryWriter
from torchvision import models, transforms
from tqdm import tqdm
import pytz

from warnings import filterwarnings
filterwarnings("ignore")

In [None]:
class PathLabelProcessor:
    """Collect ``(image_path, label)`` pairs from an annotated image tree.

    On construction the tree under ``base_path`` is scanned immediately: every
    directory whose name contains ``folder_name`` is walked recursively for
    files ending in ``jpg``/``png`` that have a sibling ``.json`` file with the
    same stem. An image is labelled 1 when ``data['metaData']['lesions']`` in
    its JSON equals ``lesions`` and 0 otherwise.

    Args:
        base_path: root directory to search.
        folder_name: substring a directory name must contain to be scanned.
        lesions: lesion value that marks a positive (label 1) sample.

    Attributes:
        labeled_image_paths: list of ``(image_path, label)`` tuples.
        class_weights: tensor ``[w0, w1]`` holding ``1 / count`` per class;
            a class without any sample gets weight 0.0.

    Raises:
        KeyError: if a JSON file lacks ``metaData`` or ``lesions``.
    """

    def __init__(self, base_path, folder_name, lesions):
        self.base_path = base_path
        self.folder_name = folder_name
        self.lesions = lesions

        self.label_images()

    def find_folders_by_name(self):
        """Return every directory below ``base_path`` whose name contains ``folder_name``."""
        matching_folders = []

        for root, dirs, _ in os.walk(self.base_path):
            matching_folders.extend(
                os.path.join(root, dir_name)
                for dir_name in dirs
                if self.folder_name in dir_name
            )

        return matching_folders

    def find_image_json_pairs(self, folder_path):
        """Return parallel lists of image paths and their same-stem JSON paths.

        Images without a sibling ``.json`` file are skipped.
        """
        image_paths = []
        json_paths = []

        for root, _, files in os.walk(folder_path):
            for image_file in files:
                if not image_file.lower().endswith(('jpg', 'png')):
                    continue
                image_path = os.path.join(root, image_file)
                json_file = f"{os.path.splitext(image_path)[0]}.json"
                if os.path.isfile(json_file):
                    image_paths.append(image_path)
                    json_paths.append(json_file)

        return image_paths, json_paths

    def label_images(self):
        """Populate ``labeled_image_paths`` and ``class_weights`` and print a summary."""
        self.labeled_image_paths = []
        # A matching folder nested inside another matching folder is reached by
        # both walks; remember visited images so each one is labelled once.
        seen_images = set()

        for folder_path in self.find_folders_by_name():
            image_paths, json_paths = self.find_image_json_pairs(folder_path)

            for image_path, json_path in zip(image_paths, json_paths):
                if image_path in seen_images:
                    continue
                seen_images.add(image_path)

                with open(json_path, encoding='utf-8') as f:
                    data = json.load(f)

                label = 1 if data['metaData']['lesions'] == self.lesions else 0
                self.labeled_image_paths.append((image_path, label))

        symptomatic_count = sum(label for _, label in self.labeled_image_paths)
        asymptomatic_count = len(self.labeled_image_paths) - symptomatic_count

        print(f'Total cases: {len(self.labeled_image_paths)}')
        print(f'Number of asymptomatic cases: {asymptomatic_count}, Number of symptomatic cases: {symptomatic_count}')

        # Inverse-frequency weights; an empty class gets 0.0 instead of raising
        # ZeroDivisionError (e.g. an evaluation folder containing one class only).
        self.class_weights = torch.tensor([
            1.0 / count if count else 0.0
            for count in (asymptomatic_count, symptomatic_count)
        ])

In [None]:
%%time
# Raw string: in a normal literal the '\U' of 'C:\Users' starts a unicode
# escape, which is a SyntaxError, and '\01' / '\1' would become control chars.
base_path = r'C:\Users\user\Desktop\pythonProject\skin\01. data\1.Training'
folder_name = '일반'
pet_type = 'dog'  # not passed on: PathLabelProcessor has no pet_type parameter
lesions = 'A1'

# Scans the training tree and labels each image 1 when its lesion code equals `lesions`.
processor = PathLabelProcessor(base_path=base_path, folder_name=folder_name, lesions=lesions)

data = processor.labeled_image_paths
class_weights = processor.class_weights

Total cases: 433822
Number of symptomatic cases: 209523, Number of asymptomatic cases: 224299
CPU times: user 1min 13s, sys: 26.6 s, total: 1min 40s
Wall time: 9min 55s


In [None]:
class CustomDataset(Dataset):
    """Map-style dataset over ``(image_path, label)`` pairs.

    Args:
        data: indexable sequence of ``(image_path, label)`` tuples.
        transform: optional callable applied to the loaded RGB PIL image; when
            ``None`` the PIL image is returned unchanged.
    """

    def __init__(self, data, transform=None):
        self.data = data
        self.transform = transform

    def __len__(self):
        """Return the number of samples."""
        return len(self.data)

    def __getitem__(self, idx):
        """Load sample ``idx`` and return ``(image, label)``."""
        image_path, label = self.data[idx]
        # Force three channels so grayscale / palette / RGBA files do not break a
        # 3-channel Normalize; the context manager closes the file handle promptly.
        with Image.open(image_path) as handle:
            image = handle.convert('RGB')

        if self.transform is not None:
            image = self.transform(image)

        return image, label

class ImageDataset():
    """Build per-split datasets and DataLoaders from ``(image_path, label)`` pairs.

    Args:
        data: sequence of ``(image_path, label)`` tuples.
        transform: dict mapping split name (``'train'``/``'val'`` or ``'test'``)
            to the transform used for that split.
        test_size: validation fraction handed to ``train_test_split``; a falsy
            value puts all of ``data`` into a single ``'test'`` split.
        seed: random state for the train/val split.
        batch_size: batch size for every loader.
        shuffle: whether to shuffle the ``'train'`` loader.
        num_workers: worker processes per loader.

    Attributes:
        dataloader: dict mapping split name to its ``DataLoader``.
    """

    def __init__(self,
                 data,
                 transform,
                 test_size,
                 seed,
                 batch_size,
                 shuffle,
                 num_workers):
        dataset = self.make_dataset(data, transform, test_size, seed)
        self.dataloader = self.make_dataloader(dataset, batch_size, shuffle, num_workers)

    def make_dataset(self, data, transform, test_size=None, seed=42):
        """Return a dict of ``CustomDataset`` objects keyed by split name."""
        if test_size:
            train_data, val_data = train_test_split(data,
                                                    test_size=test_size,
                                                    random_state=seed)
            dataset_dict = {'train': train_data,
                            'val': val_data}
        else:
            dataset_dict = {'test': data}

        return {split: CustomDataset(samples, transform[split])
                for split, samples in dataset_dict.items()}

    def make_dataloader(self, dataset, batch_size, shuffle, num_workers):
        """Wrap each split dataset in a ``DataLoader``.

        Only the ``'train'`` split honours ``shuffle``; validation and test
        loaders always iterate in dataset order, since shuffling them has no
        benefit and makes per-sample outputs harder to line up.
        """
        # Pinned host memory only helps host-to-GPU copies; requesting it
        # without CUDA just produces a warning.
        pin_memory = torch.cuda.is_available()

        return {split: DataLoader(dataset=split_dataset,
                                  batch_size=batch_size,
                                  shuffle=bool(shuffle) and split == 'train',
                                  num_workers=num_workers,
                                  pin_memory=pin_memory)
                for split, split_dataset in dataset.items()}

In [None]:
%%time
# Per-split preprocessing, keyed by the split names ImageDataset produces when
# test_size is set: augmentation for 'train', deterministic resize for 'val'.
transform = dict(
    train=transforms.Compose([
        transforms.Resize((240, 240)),
        transforms.RandomHorizontalFlip(),
        transforms.RandomVerticalFlip(),
        transforms.RandomRotation(degrees=10),
        transforms.ColorJitter(brightness=0.2, contrast=0.2, saturation=0.2, hue=0.2),
        transforms.ToTensor(),
        transforms.Normalize(mean=[0.5, 0.5, 0.5], std=[0.5, 0.5, 0.5]),
    ]),
    val=transforms.Compose([
        transforms.Resize((240, 240)),
        transforms.ToTensor(),
        transforms.Normalize(mean=[0.5, 0.5, 0.5], std=[0.5, 0.5, 0.5]),
    ]),
)

# Split and loader settings.
test_size, seed = 0.15, 42
batch_size, shuffle = 256, True
num_workers = os.cpu_count()

# Arguments follow ImageDataset's parameter order:
# data, transform, test_size, seed, batch_size, shuffle, num_workers.
dataloader = ImageDataset(data, transform, test_size, seed, batch_size, shuffle, num_workers)

Class Distribution for train:
  Class 0: 167708 samples
  Class 1: 179349 samples
Class Distribution for val:
  Class 0: 41815 samples
  Class 1: 44950 samples
CPU times: user 3min 31s, sys: 3min 44s, total: 7min 16s
Wall time: 1h 30min 58s


In [None]:
%%time

class CosineAnnealingWarmUpRestarts(torch.optim.lr_scheduler._LRScheduler):
    """Cosine annealing with linear warm-up and warm restarts.

    Within a cycle of length ``T_i`` (initially ``T_0``) the learning rate of
    each param group rises linearly from its base lr to ``eta_max`` during the
    first ``T_up`` steps and then follows a half cosine back to the base lr.
    At every restart the peak becomes ``eta_max * gamma ** cycle`` and the
    annealing part of the cycle (``T_i - T_up``) is multiplied by ``T_mult``.

    Args:
        optimizer: wrapped optimizer; its initial lr is the floor of the schedule.
        T_0: length of the first cycle (positive int).
        T_mult: growth factor applied at each restart (int >= 1).
        eta_max: peak learning rate of the first cycle.
        T_up: number of warm-up steps per cycle (int >= 0).
        gamma: multiplicative decay of the peak learning rate per cycle.
        last_epoch: index of the last epoch; -1 starts a fresh schedule.

    Raises:
        ValueError: if ``T_0``, ``T_mult`` or ``T_up`` fail the checks above.
    """

    def __init__(self, optimizer, T_0, T_mult=1, eta_max=0.1, T_up=0, gamma=1., last_epoch=-1):
        if T_0 <= 0 or not isinstance(T_0, int):
            raise ValueError("Expected positive integer T_0, but got {}".format(T_0))
        if T_mult < 1 or not isinstance(T_mult, int):
            raise ValueError("Expected integer T_mult >= 1, but got {}".format(T_mult))
        if T_up < 0 or not isinstance(T_up, int):
            raise ValueError("Expected positive integer T_up, but got {}".format(T_up))
        self.T_0 = T_0
        self.T_mult = T_mult
        self.base_eta_max = eta_max
        self.eta_max = eta_max
        self.T_up = T_up
        self.T_i = T_0
        self.gamma = gamma
        self.cycle = 0
        self.T_cur = last_epoch
        # The base constructor calls step() once, so every attribute that
        # step()/get_lr() read has to be assigned before this line.
        super(CosineAnnealingWarmUpRestarts, self).__init__(optimizer, last_epoch)

    def get_lr(self):
        """Return one learning rate per param group for the current ``T_cur``."""
        if self.T_cur == -1:
            return self.base_lrs
        elif self.T_cur < self.T_up:
            # Linear warm-up from base_lr towards eta_max.
            return [(self.eta_max - base_lr)*self.T_cur / self.T_up + base_lr for base_lr in self.base_lrs]
        else:
            # Half-cosine decay from eta_max back to base_lr over the rest of the cycle.
            return [base_lr + (self.eta_max - base_lr) * (1 + math.cos(math.pi * (self.T_cur-self.T_up) / (self.T_i - self.T_up))) / 2
                    for base_lr in self.base_lrs]

    def step(self, epoch=None):
        """Advance the schedule by one step, or jump to ``epoch`` when given."""
        if epoch is None:
            epoch = self.last_epoch + 1
            self.T_cur = self.T_cur + 1
            if self.T_cur >= self.T_i:
                # Cycle finished: restart and stretch the annealing part.
                self.cycle += 1
                self.T_cur = self.T_cur - self.T_i
                self.T_i = (self.T_i - self.T_up) * self.T_mult + self.T_up
        else:
            if epoch >= self.T_0:
                if self.T_mult == 1:
                    self.T_cur = epoch % self.T_0
                    self.cycle = epoch // self.T_0
                else:
                    # Closed form for the cycle index of a geometric sequence of cycle lengths.
                    n = int(math.log((epoch / self.T_0 * (self.T_mult - 1) + 1), self.T_mult))
                    self.cycle = n
                    self.T_cur = epoch - self.T_0 * (self.T_mult ** n - 1) / (self.T_mult - 1)
                    self.T_i = self.T_0 * self.T_mult ** (n)
            else:
                self.T_i = self.T_0
                self.T_cur = epoch

        self.eta_max = self.base_eta_max * (self.gamma**self.cycle)
        self.last_epoch = math.floor(epoch)
        for param_group, lr in zip(self.optimizer.param_groups, self.get_lr()):
            param_group['lr'] = lr
        # The base class records this inside its own step(); because step() is
        # overridden here it must be set explicitly, otherwise get_last_lr()
        # raises AttributeError.
        self._last_lr = [param_group['lr'] for param_group in self.optimizer.param_groups]

class FocalLoss(nn.Module):
    """Focal loss built on per-sample cross entropy.

    Each sample's cross-entropy ``ce`` is scaled by ``(1 - exp(-ce)) ** gamma``
    and, when ``alpha`` is given, by ``alpha[target]``.

    Args:
        gamma: focusing exponent.
        alpha: optional 1-D tensor of per-class weights indexed by target class.
        reduction: ``'mean'``, ``'sum'`` or ``'none'``.
        device: device on which ``alpha`` is initially placed. A CUDA request
            falls back to CPU when CUDA is unavailable, and ``forward`` moves
            the weights to the device of its inputs in any case.
    """

    def __init__(self, gamma=2, alpha=None, reduction='mean', device='cuda'):
        super(FocalLoss, self).__init__()
        self.gamma = gamma
        self.reduction = reduction
        self.device = device

        if alpha is not None:
            target_device = torch.device(device)
            # The default 'cuda' would otherwise make construction fail on CPU-only hosts.
            if target_device.type == 'cuda' and not torch.cuda.is_available():
                target_device = torch.device('cpu')
            alpha = alpha.to(target_device)
        self.alpha = alpha

    def forward(self, inputs, targets):
        """Return the focal loss for ``inputs`` (logits) and ``targets`` (class indices).

        Raises:
            ValueError: if ``reduction`` is not ``'mean'``, ``'sum'`` or ``'none'``.
        """
        ce_loss = nn.functional.cross_entropy(inputs, targets, reduction='none', ignore_index=-100)

        pt = torch.exp(-ce_loss)
        focal_loss = (1 - pt) ** self.gamma * ce_loss

        if self.alpha is not None:
            alpha = self.alpha.to(inputs.device)
            # Ignored targets (-100) already have zero loss; remap them to class 0
            # so the weight lookup does not index out of range.
            safe_targets = targets.masked_fill(targets == -100, 0)
            focal_loss = alpha[safe_targets] * focal_loss

        if self.reduction == 'mean':
            return focal_loss.mean()
        elif self.reduction == 'sum':
            return focal_loss.sum()
        elif self.reduction == 'none':
            return focal_loss
        else:
            raise ValueError("Invalid reduction option")

num_epochs = 30

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Fine-tune only the classification head of a pretrained EfficientNet-B1.
model = models.efficientnet_b1(weights='DEFAULT')
num_ftrs = model.classifier[1].in_features
model.classifier[1] = nn.Linear(num_ftrs, 2)
for param_name, param in model.named_parameters():
    # No EfficientNet parameter name contains "last_layer", so filtering on it
    # froze the whole network (new head included) and backward() had nothing to
    # differentiate. Keep the classifier trainable and freeze the backbone.
    param.requires_grad = param_name.startswith('classifier')
# Inputs are moved to `device` in the loop, so the model has to live there too.
model = model.to(device)

start_time = datetime.now(pytz.timezone('Asia/Seoul')).strftime('%Y%m%d-%H%M%S')
name = f'{start_time}_{model.__class__.__name__}_b1_symptoms.pt'
writer = SummaryWriter(log_dir=f'drharu/ML/diagnosis/runs/{name}')

# Pass the resolved device so the class weights are not sent to 'cuda' on CPU-only hosts.
criterion = FocalLoss(gamma=2, alpha=class_weights, reduction='mean', device=device)
trainable_params = [param for param in model.parameters() if param.requires_grad]
optimizer = torch.optim.AdamW(trainable_params, lr=0, weight_decay=1e-5)
# NOTE(review): with T_up == T_0 the schedule appears to be warm-up only, and the
# first epoch runs at lr = 0 - confirm this is intended.
scheduler = CosineAnnealingWarmUpRestarts(optimizer, T_0=10, T_mult=1, eta_max=1e-3,  T_up=10, gamma=1e-1)

# Best validation F1 so far; -inf guarantees the first epoch writes a checkpoint.
best_f1_score = float('-inf')

for epoch in range(num_epochs):
    for phase in ['train', 'val']:
        model.train() if phase == 'train' else model.eval()
        # `dataloader` is the ImageDataset wrapper; the per-split loaders live in
        # its `.dataloader` dict. A separate name keeps the wrapper intact for
        # the next phase and epoch.
        phase_loader = dataloader.dataloader[phase]

        total_loss = 0.0
        correct = 0
        total = 0
        all_predicted = []
        all_scores = []
        all_labels = []

        for inputs, labels in tqdm(phase_loader, desc=f'{phase.capitalize()} Epoch {epoch + 1}/{num_epochs}', unit='batch'):
            inputs, labels = inputs.to(device), labels.to(device)

            with torch.set_grad_enabled(phase == 'train'):
                outputs = model(inputs)
                loss = criterion(outputs, labels)

                if phase == 'train':
                    optimizer.zero_grad()
                    loss.backward()
                    optimizer.step()

            total_loss += loss.item()

            logits = outputs.detach()
            _, predicted = torch.max(logits, 1)
            total += labels.size(0)
            correct += (predicted == labels).sum().item()

            all_predicted.extend(predicted.cpu().numpy())
            # Positive-class probability: ROC-AUC needs a ranking score, not hard labels.
            all_scores.extend(torch.softmax(logits, dim=1)[:, 1].cpu().numpy())
            all_labels.extend(labels.cpu().numpy())

        avg_loss = total_loss / len(phase_loader)
        accuracy = correct / total
        writer.add_scalar(f'Loss/{phase}', avg_loss, epoch)
        writer.add_scalar(f'Accuracy/{phase}', accuracy, epoch)

        if phase == 'val':
            current_f1_score = f1_score(np.array(all_labels), np.array(all_predicted), average='binary')
            current_auc_roc = roc_auc_score(np.array(all_labels), np.array(all_scores))

            writer.add_scalar('F1 Score/valid', current_f1_score, epoch)
            writer.add_scalar('AUC-ROC/valid', current_auc_roc, epoch)

            if current_f1_score > best_f1_score:
                best_f1_score = current_f1_score
                torch.save(model.state_dict(), name)

    # Read the lr from the optimizer: the custom scheduler overrides step(), so
    # get_last_lr() is not guaranteed to be populated.
    lr_value = optimizer.param_groups[0]['lr']
    writer.add_scalar('LearningRate', lr_value, epoch)

    scheduler.step()

writer.close()
torch.cuda.empty_cache()

Train Epoch 1/50:   0%|          | 0/2712 [00:00<?, ?batch/s]

Train Epoch 1/50: 100%|██████████| 2712/2712 [1:14:57<00:00,  1.66s/batch]
Val Epoch 1/50: 100%|██████████| 678/678 [20:49<00:00,  1.84s/batch]
Train Epoch 2/50: 100%|██████████| 2712/2712 [1:01:36<00:00,  1.36s/batch]
Val Epoch 2/50: 100%|██████████| 678/678 [20:51<00:00,  1.85s/batch]
Train Epoch 3/50: 100%|██████████| 2712/2712 [1:01:36<00:00,  1.36s/batch]
Val Epoch 3/50: 100%|██████████| 678/678 [20:39<00:00,  1.83s/batch]
Train Epoch 4/50: 100%|██████████| 2712/2712 [1:01:45<00:00,  1.37s/batch]
Val Epoch 4/50: 100%|██████████| 678/678 [20:48<00:00,  1.84s/batch]
Train Epoch 5/50: 100%|██████████| 2712/2712 [1:01:26<00:00,  1.36s/batch]
Val Epoch 5/50: 100%|██████████| 678/678 [20:47<00:00,  1.84s/batch]
Train Epoch 6/50:  66%|██████▌   | 1790/2712 [40:50<20:54,  1.36s/batch]

In [None]:
%%time
# Raw string: in a normal literal the '\U' of 'C:\Users' starts a unicode
# escape, which is a SyntaxError, and '\01' / '\2' would become control chars.
base_path = r'C:\Users\user\Desktop\pythonProject\skin\01. data\2.Validation'
folder_name = '일반'
# PathLabelProcessor requires the positive lesion code.
# NOTE(review): 'A1' mirrors the training-data cell - confirm it matches the
# lesion the evaluated checkpoint was trained on.
lesions = 'A1'

processor = PathLabelProcessor(base_path=base_path, folder_name=folder_name, lesions=lesions)

data = processor.labeled_image_paths
class_weights = processor.class_weights

In [None]:
%%time
# Deterministic preprocessing for the single 'test' split that ImageDataset
# builds when test_size is falsy.
# NOTE(review): the training cell resizes to 240x240 while this cell uses
# 380x380 - confirm which resolution the loaded checkpoint expects.
transform = dict(
    test=transforms.Compose([
        transforms.Resize((380, 380)),
        transforms.ToTensor(),
        transforms.Normalize(mean=[0.5, 0.5, 0.5], std=[0.5, 0.5, 0.5]),
    ]),
)

# No hold-out split and no shuffling for evaluation.
test_size, seed = None, 42
batch_size, shuffle = 128, False
num_workers = os.cpu_count()

# Arguments follow ImageDataset's parameter order:
# data, transform, test_size, seed, batch_size, shuffle, num_workers.
dataloader = ImageDataset(data, transform, test_size, seed, batch_size, shuffle, num_workers)

state_dict_path = '20231229-102149_EfficientNet_b1_개_안검종양.pth'
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Rebuild the architecture used for training: EfficientNet-B1 with a 2-class head.
model = models.efficientnet_b1()
num_ftrs = model.classifier[1].in_features
model.classifier[1] = nn.Linear(num_ftrs, 2)

# map_location lets a checkpoint saved from a GPU run load on a CPU-only machine.
model.load_state_dict(torch.load(state_dict_path, map_location=device))
model.to(device)

model.eval()
predictions = []
labels = []
probabilities = []

with torch.no_grad():
    for inputs, targets in tqdm(dataloader.dataloader['test']):
        inputs, targets = inputs.to(device), targets.to(device)
        outputs = model(inputs)

        _, predicted = torch.max(outputs, 1)

        predictions.extend(predicted.cpu().numpy())
        labels.extend(targets.cpu().numpy())
        probabilities.extend(torch.nn.functional.softmax(outputs, dim=1).cpu().numpy())

# The results gathered above are used directly; the former call to an undefined
# `classify()` raised NameError and would have discarded them.
cm = confusion_matrix(labels, predictions)
accuracy = accuracy_score(labels, predictions)
f1 = f1_score(labels, predictions, average='weighted')

probabilities = np.array(probabilities)
# Summarise the probability of the predicted class per sample. Statistics over
# both softmax columns are uninformative: each row sums to 1, so the overall
# mean is always 0.5.
confidences = probabilities.max(axis=1)
min_probs = np.min(confidences)
max_probs = np.max(confidences)
std_probs = np.std(confidences)
mean_probs = np.mean(confidences)

print('Evaluation Results:')
print(f'Confusion Matrix:\n{cm}')
print(f'Accuracy: {accuracy*100:.2f}%')
print(f'F1 Score: {f1*100:.2f}%')
print(f'Mean Probability: {mean_probs*100:.2f}%')
print(f'Max Probability: {max_probs*100:.2f}%')
print(f'Min Probability: {min_probs*100:.2f}%')
print(f'Standard Deviation of Probabilities: {std_probs:.4f}')