# Обучение моделей

In [None]:
# обучение
import torch
import torch.nn as nn
import torch.optim as optim
import torchvision.transforms as transforms

from PIL import Image
from torch.utils.data import Dataset, DataLoader
from efficientnet_pytorch import EfficientNet
from sklearn.metrics import precision_score, recall_score, f1_score, accuracy_score

# utils
import os
from tqdm.notebook import tqdm

# логирование и версионирование экспериментов
from clearml import Task, TaskTypes, OutputModel

In [None]:
# Report whether PyTorch can see a CUDA device and which CUDA version
# this PyTorch build reports.
cuda_report = {
    "Is CUDA available: ": torch.cuda.is_available(),
    "CUDA version: ": torch.version.cuda,
}
for caption, value in cuda_report.items():
    print(caption, value)

In [None]:
# Hyperparameters of the upcoming experiment.
batch_size = 64
learning_rate = 1e-3
data_augment = "2"
num_epochs = 6

# The experiment name is reused later when saving the model and checkpoints,
# so it must be changed before every new experiment.
experiment_name = f'EffNetB7_batch_{batch_size}_lr_{learning_rate}_augment_{data_augment}_epochs_{num_epochs}'

# Directory that receives the checkpoints, weights and metrics of this run.
exp_dir_path = f"experiments/{experiment_name}"
os.makedirs(name=exp_dir_path, exist_ok=True)

In [None]:
# Configure the connection to the ClearML server through environment variables
# (IPython `%env` magics, so this cell only runs inside a notebook).

%env CLEARML_WEB_HOST=https://app.clear.ml
%env CLEARML_API_HOST=https://api.clear.ml
%env CLEARML_FILES_HOST=https://files.clear.ml

# mfdp
# NOTE(review): the two values below look like placeholders that must be
# replaced with real credentials before running - confirm.
%env CLEARML_API_ACCESS_KEY=%insert_access_key%
%env CLEARML_API_SECRET_KEY=%insert_secret_key%

In [None]:
# Create a new ClearML task for this experiment. The task is named after the
# experiment and tagged with the hyperparameter values.
task = Task.init(
    project_name='MyFirstDataProject',
    task_name=experiment_name,
    task_type=TaskTypes.training,
    tags=[
        f'data_augmentation_{data_augment}',
        f'batch_size_{batch_size}',
        f'learning_rate_{learning_rate}',
        f'num_epochs_{num_epochs}',
    ],
    auto_connect_frameworks={'pytorch': True},
    auto_connect_streams={'stdout': True, 'stderr': True},
)

In [None]:
class ImageDataset(Dataset):
    """Image-folder dataset: every sub-directory of ``root_dir`` is one class.

    Attributes:
        root_dir: Directory that contains one sub-directory per class.
        transform: Optional callable applied to each loaded PIL image.
        classes: Class names (sub-directory names) in sorted order; the
            position of a name in this list is its integer label.
        class_to_idx: Mapping from class name to integer label.
        files: ``(image_path, class_name)`` pairs, grouped by class and
            sorted by file name within each class.
    """

    def __init__(self, root_dir, transform=None):
        """Index all files found in the class sub-directories of ``root_dir``.

        Args:
            root_dir: Directory with one sub-directory per class.
            transform: Optional callable applied to each image in
                ``__getitem__``.
        """
        self.root_dir = root_dir
        self.transform = transform
        # os.listdir() returns entries in arbitrary order. Sorting makes the
        # class -> index mapping identical for every dataset that has the same
        # class folders (e.g. train and test) and stable across runs.
        # Non-directory entries are skipped so a stray file is not treated as
        # a class.
        self.classes = sorted(
            entry for entry in os.listdir(root_dir)
            if os.path.isdir(os.path.join(root_dir, entry))
        )
        self.class_to_idx = {name: idx for idx, name in enumerate(self.classes)}
        self.files = []
        for class_name in self.classes:
            class_dir = os.path.join(root_dir, class_name)
            # Sorted so that the sample order is deterministic as well.
            self.files += [
                (os.path.join(class_dir, f), class_name)
                for f in sorted(os.listdir(class_dir))
            ]

    def __len__(self):
        """Return the number of indexed image files."""
        return len(self.files)

    def __getitem__(self, idx):
        """Return ``(image, label)`` for the sample at position ``idx``.

        The image is loaded as single-channel grayscale (mode ``'L'``) and
        passed through ``transform`` when one was given; ``label`` is the
        integer index of the sample's class in ``self.classes``.
        """
        img_path, class_name = self.files[idx]
        # convert() returns a fully loaded copy, so the file handle can be
        # closed right away instead of staying open until garbage collection.
        with Image.open(img_path) as img:
            image = img.convert('L')
        if self.transform is not None:
            image = self.transform(image)
        return image, self.class_to_idx[class_name]


In [None]:
# Preprocessing applied to every image: resize to 48x48, replicate the
# grayscale channel to 3 channels, convert to a tensor and normalize each
# channel to [-1, 1].
transform = transforms.Compose([
    transforms.Resize(size=(48, 48)),
    transforms.Grayscale(num_output_channels=3),
    transforms.ToTensor(),
    transforms.Normalize(mean=(0.5, 0.5, 0.5), std=(0.5, 0.5, 0.5)),
])


In [None]:
# Build the train and test datasets from their folders; both share the same
# preprocessing pipeline.
train_data = ImageDataset(root_dir='dataset/train_dataset_transformed_2', transform=transform)
test_data = ImageDataset(root_dir='dataset/test_dataset/', transform=transform)


In [None]:
# Training batches are reshuffled on every epoch.
train_loader = DataLoader(train_data, batch_size=batch_size, shuffle=True, num_workers=8)
# Evaluation does not need shuffling: the metrics do not depend on sample
# order, and a fixed order keeps every test pass over the data identical.
test_loader = DataLoader(test_data, batch_size=128, shuffle=False, num_workers=4)


In [None]:
# Load EfficientNet-B7 with pretrained weights and replace its final fully
# connected layer with one that has an output per class of the training set.
model = EfficientNet.from_pretrained('efficientnet-b7')
num_ftrs = model._fc.in_features
model._fc = nn.Linear(in_features=num_ftrs, out_features=len(train_data.classes))

In [None]:
# Loss function, optimizer, and a scheduler that multiplies the learning rate
# by 0.1 once the monitored value has stopped decreasing for 5 epochs.
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(params=model.parameters(), lr=learning_rate)
scheduler = optim.lr_scheduler.ReduceLROnPlateau(
    optimizer,
    mode='min',
    factor=0.1,
    patience=5,
)

In [None]:
# Select the compute device: the first CUDA GPU when available, else the CPU.
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

print("loading model to device")
model = model.to(device)
print("model loaded")


if torch.cuda.is_available():
    print(f'Using {torch.cuda.get_device_name(0)}')
else:
    print('Using CPU')


# Put the model into training mode.
model.train()

# Fix the random seeds for reproducibility.
torch.manual_seed(42)
if torch.cuda.is_available():
    torch.cuda.manual_seed_all(42)


# Early-stopping state.
patience = 5   # number of epochs without improvement before training stops
best_loss = float('inf')
epochs_no_improve = 0

# Loop-invariant values, computed once instead of on every epoch.
model_saving_name = f"{exp_dir_path}/{experiment_name}_best.pth"
logger = task.get_logger()
# Explicit label lists keep the per-class metric arrays aligned with the class
# lists even when some class never occurs in the labels or predictions.
train_class_ids = list(range(len(train_data.classes)))
test_class_ids = list(range(len(test_data.classes)))


# NOTE(review): the epoch range is hard-coded to 10..14 and does not use
# `num_epochs` - presumably this continues an earlier run; confirm.
for epoch in range(10,15):
    running_loss = 0.0
    samples_seen = 0
    all_labels = []
    all_predictions = []
    progress_bar = tqdm(enumerate(train_loader), total=len(train_loader), leave=False)

    for i, (images, labels) in progress_bar:
        images = images.to(device)
        labels = labels.to(device)

        optimizer.zero_grad()

        outputs = model(images)  # forward pass
        loss = criterion(outputs, labels)  # compute the loss
        loss.backward()  # backward pass
        optimizer.step()  # update the weights

        # Keep labels and predictions for the epoch-level metrics.
        _, predictions = torch.max(outputs, 1)
        all_labels.extend(labels.cpu().numpy())
        all_predictions.extend(predictions.cpu().numpy())

        # `loss` is the batch mean, so scale it back to a per-sample sum and
        # show the running mean over the samples seen so far. Dividing the sum
        # by the number of batches would inflate the displayed value by
        # roughly the batch size.
        batch_len = images.size(0)
        running_loss += loss.item() * batch_len
        samples_seen += batch_len
        progress_bar.set_description(f"Epoch {epoch+1} Loss: {running_loss/samples_seen:.2f}")

    # Training metrics for this epoch (per class and macro-averaged).
    precision = precision_score(all_labels, all_predictions, labels=train_class_ids, average=None)
    recall = recall_score(all_labels, all_predictions, labels=train_class_ids, average=None)
    f1 = f1_score(all_labels, all_predictions, labels=train_class_ids, average=None)
    avg_precision = precision_score(all_labels, all_predictions, average='macro')
    avg_recall = recall_score(all_labels, all_predictions, average='macro')
    avg_f1 = f1_score(all_labels, all_predictions, average='macro')
    accuracy = accuracy_score(all_labels, all_predictions)

    epoch_loss = running_loss / len(train_data)

    print(f"\nEpoch {epoch+1}, Loss: {epoch_loss:.4f}, Accuracy: {accuracy:.4f}, Precision: {avg_precision:.4f}, \
Recall: {avg_recall:.4f}, F1 Score: {avg_f1:.4f}")

    scheduler.step(epoch_loss)  # step the learning rate scheduler


    # Early-stopping check, driven by the training loss of the epoch.
    if epoch_loss < best_loss:
        best_loss = epoch_loss
        epochs_no_improve = 0
        best_epoch = epoch+1

        # Save the weights of the best model so far.
        torch.save(model.state_dict(), model_saving_name)

    else:
        epochs_no_improve += 1
        if epochs_no_improve >= patience:
            print('Оставливаем обучение, модель перестала обучаться')
            break


    # Save the latest checkpoint in case training gets interrupted.
    # NOTE(review): 'loss' stores the loss tensor of the last training batch,
    # not `epoch_loss` - confirm this is what consumers of the checkpoint expect.
    torch.save({
        'epoch': epoch,
        'model_state_dict': model.state_dict(),
        'optimizer_state_dict': optimizer.state_dict(),
        'scheduler_state_dict' : scheduler.state_dict(),
        'loss': loss,
    }, f"{exp_dir_path}/{experiment_name}_checkpoint.pth")


    # Log the training metrics to ClearML.
    logger.report_scalar('Loss', 'train', epoch_loss, epoch)
    logger.report_scalar('Accuracy', 'train', accuracy, epoch)
    logger.report_scalar('Average Metrics/Precision', 'train', avg_precision, epoch)
    logger.report_scalar('Average Metrics/Recall', 'train', avg_recall, epoch)
    logger.report_scalar('Average Metrics/F1', 'train', avg_f1, epoch)
    for class_id, (p, r, f) in enumerate(zip(precision, recall, f1)):
        class_name = train_data.classes[class_id]
        logger.report_scalar('Metrics by Class/Precision', class_name, p, epoch)
        logger.report_scalar('Metrics by Class/Recall', class_name, r, epoch)
        logger.report_scalar('Metrics by Class/F1', class_name, f, epoch)



    # Switch the model to evaluation mode for the per-epoch test pass.
    model.eval()

    all_test_labels = []
    all_test_predictions = []
    running_loss = 0.0

    # Iterate over the test set.
    progress_bar = tqdm(enumerate(test_loader), total=len(test_loader), leave=False)
    for i, (images, labels) in progress_bar:
        images = images.to(device)
        labels = labels.to(device)

        # Model predictions (no gradients needed).
        with torch.no_grad():
            outputs = model(images)

        # Accumulate the per-sample loss sum.
        loss = criterion(outputs, labels)
        running_loss += loss.item() * images.size(0)

        # Keep labels and predictions for the metrics.
        _, predictions = torch.max(outputs, 1)
        all_test_labels.extend(labels.cpu().numpy())
        all_test_predictions.extend(predictions.cpu().numpy())

    # Test metrics for this epoch.
    test_loss = running_loss / len(test_data)
    test_precision = precision_score(all_test_labels, all_test_predictions, average='macro')
    test_recall = recall_score(all_test_labels, all_test_predictions, average='macro')
    test_f1 = f1_score(all_test_labels, all_test_predictions, average='macro')
    test_accuracy = accuracy_score(all_test_labels, all_test_predictions)

    print(f"Test Loss: {test_loss:.4f}, Test Accuracy: {test_accuracy:.4f}, Test Precision: {test_precision:.4f}, \
Test Recall: {test_recall:.4f}, Test F1 Score: {test_f1:.4f}")

    # Log the test metrics to ClearML.
    logger.report_scalar('Loss', 'test', test_loss, epoch)
    logger.report_scalar('Accuracy', 'test', test_accuracy, epoch)
    logger.report_scalar('Average Metrics/Precision', 'test', test_precision, epoch)
    logger.report_scalar('Average Metrics/Recall', 'test', test_recall, epoch)
    logger.report_scalar('Average Metrics/F1', 'test', test_f1, epoch)

    # Log the per-class test metrics to ClearML.
    test_precision_class = precision_score(all_test_labels, all_test_predictions, labels=test_class_ids, average=None)
    test_recall_class = recall_score(all_test_labels, all_test_predictions, labels=test_class_ids, average=None)
    test_f1_class = f1_score(all_test_labels, all_test_predictions, labels=test_class_ids, average=None)

    for class_id, (p, r, f) in enumerate(zip(test_precision_class, test_recall_class, test_f1_class)):
        class_name = test_data.classes[class_id]  # get class name from class id
        logger.report_scalar('Test Metrics by Class/Precision', class_name, p, epoch)
        logger.report_scalar('Test Metrics by Class/Recall', class_name, r, epoch)
        logger.report_scalar('Test Metrics by Class/F1', class_name, f, epoch)

    model.train()  # switch the model back to training mode

## Оценка результатов обучения на тестовой выборке, логирование в ClearML

In [None]:
# Load the best saved model weights. `map_location` places the tensors on the
# current device, so the file also loads on a machine whose devices differ
# from the one that wrote it (e.g. CPU-only).
model_path = model_saving_name
model.load_state_dict(torch.load(model_path, map_location=device))

# Switch the model to evaluation mode.
model.eval()

all_labels = []
all_predictions = []
running_loss = 0.0

# Iterate over the test set.
progress_bar = tqdm(enumerate(test_loader), total=len(test_loader), leave=False)
for i, (images, labels) in progress_bar:
    images = images.to(device)
    labels = labels.to(device)

    # Model predictions (no gradients needed).
    with torch.no_grad():
        outputs = model(images)

    # Accumulate the per-sample loss sum.
    loss = criterion(outputs, labels)
    running_loss += loss.item() * images.size(0)

    # Keep labels and predictions for the metrics.
    _, predictions = torch.max(outputs, 1)
    all_labels.extend(labels.cpu().numpy())
    all_predictions.extend(predictions.cpu().numpy())

# Final metrics of the best model on the test set.
test_loss = running_loss / len(test_data)
test_precision = precision_score(all_labels, all_predictions, average='macro')
test_recall = recall_score(all_labels, all_predictions, average='macro')
test_f1 = f1_score(all_labels, all_predictions, average='macro')
test_accuracy = accuracy_score(all_labels, all_predictions)

print(f"Test Loss: {test_loss:.2f}, Test Accuracy: {test_accuracy:.2f}, Test Precision: {test_precision}, \
Test Recall: {test_recall}, Test F1 Score: {test_f1}")

# Log the metrics to ClearML at the iteration of the best epoch.
logger = task.get_logger()
logger.report_scalar('Loss', 'test', test_loss, best_epoch-1)
logger.report_scalar('Accuracy', 'test', test_accuracy, best_epoch-1)
logger.report_scalar('Average Metrics/Precision', 'test', test_precision, best_epoch-1)
logger.report_scalar('Average Metrics/Recall', 'test', test_recall, best_epoch-1)
logger.report_scalar('Average Metrics/F1', 'test', test_f1, best_epoch-1)

# Per-class metrics. The explicit label list keeps position i of each array
# tied to test_data.classes[i] even when a class is missing from both the
# labels and the predictions.
test_class_ids = list(range(len(test_data.classes)))
test_precision_class = precision_score(all_labels, all_predictions, labels=test_class_ids, average=None)
test_recall_class = recall_score(all_labels, all_predictions, labels=test_class_ids, average=None)
test_f1_class = f1_score(all_labels, all_predictions, labels=test_class_ids, average=None)

for class_id, (p, r, f) in enumerate(zip(test_precision_class, test_recall_class, test_f1_class)):
    class_name = test_data.classes[class_id]  # get class name from class id
    logger.report_scalar('Test Metrics by Class/Precision', class_name, p, best_epoch-1)
    logger.report_scalar('Test Metrics by Class/Recall', class_name, r, best_epoch-1)
    logger.report_scalar('Test Metrics by Class/F1', class_name, f, best_epoch-1)

# Сохранение результатов, построение confusion matrix

In [None]:
import pandas as pd
from sklearn.metrics import confusion_matrix
import seaborn as sns
import matplotlib.pyplot as plt

## Сохранение метрик в csv

In [None]:
# Aggregate test metrics, each stored as a one-element list so the dictionary
# turns into a single-row DataFrame.
results_dict = {
    'test_loss': [test_loss],
    'test_accuracy': [test_accuracy],
    'test_precision_macro': [test_precision],
    'test_recall_macro': [test_recall],
    'test_f1_macro': [test_f1],
}

# Append three columns (precision, recall, F1) for every class.
per_class_scores = zip(test_precision_class, test_recall_class, test_f1_class)
for class_id, (p, r, f) in enumerate(per_class_scores):
    class_name = test_data.classes[class_id]  # class name for this class id
    results_dict.update({
        f'{class_name}_precision': [p],
        f'{class_name}_recall': [r],
        f'{class_name}_f1': [f],
    })

# Write the single-row table to a CSV file in the experiment directory.
results_df = pd.DataFrame(results_dict)
results_df.to_csv(f'{exp_dir_path}/{experiment_name}.csv', index=False)


## Вывод матрицы ошибок

In [None]:
# Compute the confusion matrix. Passing the full label list guarantees an
# N x N matrix for N classes even if some class never shows up in the labels
# or predictions; otherwise the DataFrame below would fail on a shape mismatch.
# NOTE(review): all_test_labels / all_test_predictions come from the test pass
# of the last executed training epoch, while the CSV metrics come from the
# reloaded best model (all_labels / all_predictions) - confirm which is intended.
cm = confusion_matrix(
    all_test_labels,
    all_test_predictions,
    labels=list(range(len(train_data.classes))),
)

# Transform it to a DataFrame for easier plotting
cm_df = pd.DataFrame(cm, index=train_data.classes, columns=train_data.classes)

plt.figure(figsize=(10, 7))

# Plot the matrix as a heatmap
sns.heatmap(cm_df, annot=True, cmap='Blues', fmt='d')

plt.title("Confusion Matrix")
plt.ylabel('True label')
plt.xlabel('Predicted label')
plt.savefig(f'{exp_dir_path}/{experiment_name}_confusion_matrix.png')
plt.show()


In [None]:
import numpy as np

# Compute the confusion matrix. Passing the full label list guarantees an
# N x N matrix for N classes even if some class never shows up in the labels
# or predictions.
cm = confusion_matrix(
    all_test_labels,
    all_test_predictions,
    labels=list(range(len(train_data.classes))),
)

# Transform it to a DataFrame for easier plotting
cm_df = pd.DataFrame(cm, index=train_data.classes, columns=train_data.classes)

correct_per_class = np.diagonal(cm)
# Share of each true class that was predicted correctly (per-class recall).
# When a row total is 0 the diagonal entry is 0 too, so clamping the
# denominator to 1 yields 0 instead of a division-by-zero NaN.
percentage_correct = correct_per_class / np.maximum(np.sum(cm, axis=1), 1)
# Share of each predicted class that was actually correct (per-class
# precision); same clamping for classes that were never predicted.
percentage_true_positives = correct_per_class / np.maximum(np.sum(cm, axis=0), 1)

plt.figure(figsize=(10, 7))

# Plot the matrix as a heatmap
sns.heatmap(cm_df, annot=True, cmap='Blues', fmt='d')

# Tick labels: class name plus its percentage (rows: recall, columns: precision).
class_names_with_percentage = [f'{class_name}\n{percentage*100:.1f}%' for class_name, percentage in zip(train_data.classes, percentage_correct)]
class_names_with_percentage_x = [f'{class_name}\n{percentage*100:.1f}%' for class_name, percentage in zip(train_data.classes, percentage_true_positives)]


# +0.5 centres each tick on its heatmap cell.
plt.yticks(ticks=np.arange(len(train_data.classes)) + 0.5, labels=class_names_with_percentage, va='center')
plt.xticks(ticks=np.arange(len(train_data.classes)) + 0.5, labels=class_names_with_percentage_x, va='center')

plt.title("Confusion Matrix")
plt.ylabel('True label')
plt.xlabel('Predicted label')

# Save the confusion matrix
# NOTE(review): this writes to the current working directory, unlike the plain
# confusion matrix which is saved under exp_dir_path - confirm this is intended.
plt.savefig('confusion_matrix.png')

plt.show()

## Логирование модели, завершение обучения

In [None]:
# Create a new OutputModel attached to the task, together with the
# configuration the model was trained with.
output_model = OutputModel(
    task=task,
    config_dict={
        'num_epochs': num_epochs,
        'early_stopping': True,
        'classes': train_data.classes,  # class names, in label-index order
        'batch_size': batch_size,
        # Read the value from the training loader so the recorded
        # configuration cannot drift from what was actually used.
        'num_workers': train_loader.num_workers,
        'data_augmented': data_augment
    }
)

# Attach the best saved weights file to the OutputModel.
output_model.update_weights(
    weights_filename=model_saving_name
)

# Publish the model.
output_model.publish()

In [None]:
# Drop the notebook's reference to the model, then ask PyTorch to empty its
# CUDA cache.
del model
torch.cuda.empty_cache()  # clear the GPU

In [None]:
# Finish and close the ClearML task.
# Task.completed() is a deprecated alias of Task.mark_completed();
# Task.close() on its own closes the task and sets its status to completed,
# so the deprecated call is dropped.
task.close()