## Обучение модели

Можно также запустить на [Colab](https://colab.research.google.com/drive/1rvnfzs4RFacs28ulCl63FRZu4nZzBsCn?usp=sharing)

In [1]:
import argparse
import os
import zipfile
from collections.abc import Callable
from pathlib import Path, PosixPath
from typing import Tuple

import numpy as np
import pandas as pd
import torch
import torch.nn as nn
from PIL import Image
from sklearn.model_selection import train_test_split
from torch.utils.data import DataLoader, Dataset
from torchvision import models, transforms
from torchvision.models import Inception_V3_Weights
from tqdm import tqdm

In [2]:
import wandb
# Authenticate against Weights & Biases before any run is created below.
wandb.login()

[34m[1mwandb[0m: Currently logged in as: [33mbalakinakate2022[0m. Use [1m`wandb login --relogin`[0m to force relogin


True

In [3]:
"""Load dataset from W&B."""
# Directory the competition archive is unpacked into.
DIR = './data/'
# Default location wandb uses for this artifact; kept for backward compatibility.
DIR_ZIP = './artifacts/my-dataset:v0/'

run = wandb.init(project="pipeline_competition")
artifact = run.use_artifact('balakinakate2022/pipeline_competition/my-dataset:v0', type='dataset')
# Use the directory reported by wandb rather than assuming it equals DIR_ZIP:
# the real location is decided by the wandb client, not by this notebook.
artifact_dir = Path(artifact.download())

with zipfile.ZipFile(artifact_dir / "shift-cv-winter-2023.zip", 'r') as zip_ref:
    zip_ref.extractall(DIR)


[34m[1mwandb[0m: Downloading large artifact my-dataset:v0, 150.47MB. 1 files... 
[34m[1mwandb[0m:   1 of 1 files downloaded.  
Done. 0:0:0.3


In [4]:
"""Create model."""

def get_model(n_classes: int = 2) -> nn.Module:
    """
    Build an ImageNet-pretrained Inception v3 with new classification heads.

    Args:
        n_classes (int): number of classes the model predicts. Defaults to 2.

    Returns:
        nn.Module: Inception v3 whose main and auxiliary fully-connected
        layers are replaced by freshly initialised linear classifiers.
    """
    model_inception_v3 = models.inception_v3(weights=Inception_V3_Weights.IMAGENET1K_V1)
    # The training loop in this file feeds the model output straight into the
    # criterion, so the auxiliary output is switched off.
    model_inception_v3.aux_logits = False
    # Take the input widths from the layers being replaced instead of
    # hard-coding them (2048 for the main head, 768 for the auxiliary one).
    model_inception_v3.fc = nn.Linear(
        in_features=model_inception_v3.fc.in_features,
        out_features=n_classes,
    )
    model_inception_v3.AuxLogits.fc = nn.Linear(
        in_features=model_inception_v3.AuxLogits.fc.in_features,
        out_features=n_classes,
    )

    return model_inception_v3


In [5]:
"""Prepare datasets."""

# dataset modes
DATA_MODES = ['train', 'val', 'test']
# every image is rescaled to 299x299 px
RESCALE_SIZE = 299

class CastomDataset(Dataset):
    """
    Image dataset that loads pictures from disk on access, rescales them
    to RESCALE_SIZE x RESCALE_SIZE and applies an optional transform.
    """
    def __init__(self, files: list, mode: str, data_labels: pd.core.frame.DataFrame = None,
                 transform: transforms.Compose = None):
        """
        Build the dataset.

        Args:
            files (list): paths to the image files
            mode (str): dataset kind, one of ['train', 'val', 'test']
            data_labels (pd.core.frame.DataFrame, optional): table whose first
                column holds file names and whose second column holds labels;
                required unless mode is 'test'. Defaults to None.
            transform (transforms.Compose, optional): transform applied to
                every rescaled sample

        Raises:
            NameError: if mode is not one of DATA_MODES
            ValueError: if labels are needed but data_labels is None
            KeyError: if a file has no row in data_labels
        """
        super().__init__()
        if mode not in DATA_MODES:
            raise NameError(f"{mode} is not correct; correct modes: {DATA_MODES}")
        if mode != 'test' and data_labels is None:
            raise ValueError(f"data_labels is required for mode '{mode}'")

        # files to load, in a stable order
        self.files = sorted(files)
        # dataset mode
        self.mode = mode
        self.transform = transform
        self.data_labels = data_labels
        self.len_ = len(self.files)

        # resolve the label of every file
        if self.mode != 'test':
            # One pass over the table instead of one DataFrame scan per file.
            # setdefault keeps the first row when a file name is duplicated.
            label_by_name = {}
            for name, label in zip(data_labels.iloc[:, 0], data_labels.iloc[:, 1]):
                label_by_name.setdefault(name, label)
            try:
                labels = [label_by_name[path.name] for path in self.files]
            except KeyError as err:
                raise KeyError(f"no label found for image {err.args[0]!r}") from err
            self.labels = torch.tensor(labels, dtype=torch.long)

    def __len__(self) -> int:
        """
        Number of samples in the dataset.

        Returns:
            int: number of samples
        """
        return self.len_

    def load_sample(self, file: PosixPath) -> Image.Image:
        """
        Load the image stored at `file` as a 3-channel RGB image.

        Args:
            file (PosixPath): path to the image

        Returns:
            Image.Image: the decoded RGB image
        """
        with Image.open(file) as image:
            # convert() reads the pixel data and returns a new image, so the
            # result stays usable after the file is closed. Forcing RGB keeps
            # grayscale/CMYK files compatible with the 3-channel Normalize.
            return image.convert('RGB')

    def __getitem__(self, index: int) -> Tuple[np.ndarray, int]:
        """
        Return one sample of the dataset.

        Args:
            index (int): index of the sample

        Returns:
            Tuple[np.ndarray, int]: the (transformed) image and its label;
            in 'test' mode only the image is returned.
        """
        x = self.load_sample(self.files[index])
        x = self._prepare_sample(x)
        if self.transform:
            x = self.transform(x)
        if self.mode == 'test':
            return x
        return x, self.labels[index]

    def _prepare_sample(self, image: Image.Image) -> np.ndarray:
        """
        Rescale the image to RESCALE_SIZE x RESCALE_SIZE.

        Args:
            image (Image.Image): input image

        Returns:
            np.ndarray: the rescaled image as an array
        """
        image = image.resize((RESCALE_SIZE, RESCALE_SIZE))
        return np.array(image)
    

def get_datasets(
        data_dir: str = './data/',
        val_size: float = 0.25,
        random_state: int = None,
    ) -> Tuple[CastomDataset, CastomDataset, CastomDataset]:
    """
    Create datasets for train, validate, predict.

    Args:
        data_dir (str): directory containing `train/train`, `test/test`
            and `train.csv`. Defaults to './data/'.
        val_size (float): share of labelled images held out for validation.
            Defaults to 0.25.
        random_state (int, optional): seed for the train/val split; with the
            default None the split draws from NumPy's global random state.

    Returns:
        Tuple[CastomDataset, CastomDataset, CastomDataset]: train_dataset, val_dataset, test_dataset

    Raises:
        KeyError: if a training image has no row in `train.csv`
    """
    root = Path(data_dir)
    train_dir = root / 'train' / 'train'
    test_dir = root / 'test' / 'test'

    # Sorted so the split depends only on the random seed, not on the order
    # in which the filesystem happens to list the files.
    train_val_files = sorted(train_dir.rglob('*.jpg'))
    test_files = sorted(test_dir.rglob('*.jpg'))

    data_labels = pd.read_csv(root / 'train.csv')
    data_labels[['blur']] = data_labels[['blur']].astype('long')

    normalize = transforms.Normalize([0.485, 0.456, 0.406], [0.229, 0.224, 0.225])

    # Training samples get random flips as augmentation.
    transform_train = transforms.Compose([
        transforms.ToPILImage(),
        transforms.RandomVerticalFlip(0.5),
        transforms.RandomHorizontalFlip(0.5),
        transforms.ToTensor(),
        normalize,
    ])

    transform_test = transforms.Compose([
        transforms.ToTensor(),
        normalize,
    ])

    # First column: file name, second column: label. setdefault keeps the
    # first row when a file name is duplicated.
    label_by_name = {}
    for name, label in zip(data_labels.iloc[:, 0], data_labels.iloc[:, 1]):
        label_by_name.setdefault(name, label)
    train_val_labels = [label_by_name[path.name] for path in train_val_files]

    train_files, val_files = train_test_split(
        train_val_files,
        test_size=val_size,
        stratify=train_val_labels,
        random_state=random_state,
    )

    train_dataset = CastomDataset(train_files, data_labels=data_labels, mode='train', transform=transform_train)
    val_dataset = CastomDataset(val_files, data_labels=data_labels, mode='val', transform=transform_test)
    test_dataset = CastomDataset(test_files, mode='test', transform=transform_test)

    return train_dataset, val_dataset, test_dataset


In [6]:
"""Functions for training and validation."""

def seed_everything(seed: int):
    """
    Seed every random number generator used here and make cuDNN deterministic.

    Seeds Python's `random`, NumPy and PyTorch (CPU and all CUDA devices)
    and exports PYTHONHASHSEED.

    Args:
        seed (int): seed for random
    """
    import random
    random.seed(seed)
    os.environ['PYTHONHASHSEED'] = str(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)
    torch.cuda.manual_seed_all(seed)
    torch.backends.cudnn.deterministic = True
    # Benchmark mode lets cuDNN auto-tune and pick different algorithms from
    # run to run, which defeats the determinism requested above.
    torch.backends.cudnn.benchmark = False


def model_learning(
    model: nn.Module,
    optimizer: torch.optim.Optimizer,
    criterion: Callable,
    epochs: int,
    batch_size: int,
    device: torch.device,
    ):
    """
    Train the model for several epochs, validating after each one.

    After every epoch the metrics are logged to W&B. Whenever validation loss
    is not worse and validation accuracy is not worse than the best seen so
    far, the weights are saved to './outs/best_model.pth'.

    Args:
        model: current model
        optimizer: optimizer for this learning
        criterion: loss function for this learning
        epochs: number of epochs
        batch_size: size of batch
        device: set 'cpu' or 'cuda'

    Returns:
        dict with per-epoch lists under the keys
        'train_loss', 'train_acc', 'val_loss' and 'val_acc'
    """

    train_dataset, val_dataset, _ = get_datasets()
    train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
    val_loader = DataLoader(val_dataset, batch_size=batch_size, shuffle=False)
    # Start from +inf so the first epoch always produces a checkpoint, even
    # when the validation loss is above 1.
    best_val_loss = float('inf')
    best_val_acc = 0
    history = {"train_loss": [], "train_acc": [], "val_loss": [], "val_acc": []}
    log_template = ("\nEpoch {ep:03d} train_loss: {t_loss:0.4f} "
                    "    val_loss {v_loss:0.4f} train_acc {t_acc:0.4f} val_acc {v_acc:0.4f}")
    with tqdm(desc="epoch", total=epochs) as pbar_outer:
        for epoch in range(epochs):
            train_loss, train_acc = fit_epoch(model, train_loader, criterion, optimizer, device)
            print("loss", train_loss)
            val_loss, val_acc = eval_epoch(model, val_loader, criterion, device)
            # Plain floats keep logging and comparisons independent of the
            # tensor/numpy types the epoch helpers may return.
            train_acc = float(train_acc)
            val_acc = float(val_acc)

            # Save the model for later predictions when both validation
            # metrics are at least as good as the best so far.
            if best_val_loss >= val_loss and best_val_acc <= val_acc:
                os.makedirs('./outs', exist_ok=True)

                best_val_loss = val_loss
                best_val_acc = val_acc
                torch.save(model.state_dict(), './outs/best_model.pth')
                print(f"\n\nSave model's completed on {epoch+1} epoch's")

            history["train_loss"].append(train_loss)
            history["train_acc"].append(train_acc)
            history["val_loss"].append(val_loss)
            history["val_acc"].append(val_acc)

            wandb.log({"train_loss": train_loss, "train_acc": train_acc,
                       "val_loss": val_loss, "val_acc": val_acc,
                       "epoch": epoch})
            pbar_outer.update(1)
            tqdm.write(log_template.format(ep=epoch+1, t_loss=train_loss,
                                           v_loss=val_loss, t_acc=train_acc, v_acc=val_acc))

    return history
               


def fit_epoch(
        model: nn.Module,
        train_loader: DataLoader,
        criterion: Callable,
        optimizer: torch.optim.Optimizer,
        device: torch.device
    ) -> Tuple[float, float]:
    """
    Train the model for one epoch (one full pass over train_loader).

    Args:
        model (nn.Module): model being trained
        train_loader (DataLoader): loader yielding (inputs, labels) batches
        criterion (Callable): loss function
        optimizer (torch.optim.Optimizer): optimizer
        device (torch.device): 'cpu' or 'cuda'

    Returns:
        Tuple[float, float]: mean loss and accuracy over the epoch

    Raises:
        ValueError: if train_loader yields no samples
    """
    # eval_epoch leaves the model in eval mode; without switching back,
    # every epoch after the first would train in eval mode.
    model.train()
    running_loss = 0.0
    running_corrects = 0
    processed_data = 0

    for inputs, labels in train_loader:
        inputs = inputs.to(device)
        labels = labels.to(device)
        optimizer.zero_grad()

        outputs = model(inputs)
        loss = criterion(outputs, labels)
        loss.backward()
        optimizer.step()

        preds = torch.argmax(outputs, 1)
        n_samples = inputs.size(0)
        # The criterion result is scaled by the batch size so that dividing
        # by the sample count below yields a per-sample average.
        running_loss += loss.item() * n_samples
        running_corrects += (preds == labels).sum().item()
        processed_data += n_samples

    if processed_data == 0:
        raise ValueError("train_loader yielded no samples")

    train_loss = running_loss / processed_data
    train_acc = running_corrects / processed_data
    return train_loss, train_acc


def eval_epoch(
        model: nn.Module,
        val_loader: DataLoader,
        criterion: Callable,
        device: torch.device
    ) -> Tuple[float, float]:
    """
    Evaluate the model on one full pass over val_loader.

    The model is switched to eval mode and left in that mode.

    Args:
        model (nn.Module): model being evaluated
        val_loader (DataLoader): loader yielding (inputs, labels) batches
        criterion (Callable): loss function
        device (torch.device): 'cpu' or 'cuda'

    Returns:
        Tuple[float, float]: mean loss and accuracy as plain Python floats

    Raises:
        ValueError: if val_loader yields no samples
    """
    model.eval()
    running_loss = 0.0
    running_corrects = 0
    processed_size = 0

    with torch.no_grad():
        for inputs, labels in val_loader:
            inputs = inputs.to(device)
            labels = labels.to(device)

            outputs = model(inputs)
            loss = criterion(outputs, labels)
            preds = torch.argmax(outputs, 1)

            n_samples = inputs.size(0)
            running_loss += loss.item() * n_samples
            # .item() keeps the counter a Python int, so the accuracy is a
            # float rather than a tensor living on `device`.
            running_corrects += (preds == labels).sum().item()
            processed_size += n_samples

    if processed_size == 0:
        raise ValueError("val_loader yielded no samples")

    val_loss = running_loss / processed_size
    val_acc = running_corrects / processed_size
    return val_loss, val_acc


def prediction(
        model: nn.Module,
        test_loader: DataLoader,
        device: torch.device
    ) -> np.ndarray:
    """
    Predict class probabilities for every sample in test_loader.

    Args:
        model: model used for inference
        test_loader: loader yielding batches of inputs (no labels)
        device: "cpu" or "cuda"

    Returns:
        numpy.ndarray: softmax probabilities, one row per sample, in the
        order the loader produced them
    """
    # Switching to eval mode is loop-invariant, so do it once up front.
    model.eval()
    logits = []

    with torch.no_grad():
        for inputs in test_loader:
            inputs = inputs.to(device)
            # Move each batch to the CPU right away so all batches can be
            # concatenated and converted to NumPy below.
            logits.append(model(inputs).cpu())

    probs = nn.functional.softmax(torch.cat(logits), dim=-1).numpy()
    return probs

In [7]:
"""Train model."""

def train_model(
        config: dict,
        device: Callable = torch.device('cpu'),
    ):
    """
    Run one complete training job inside a W&B run.

    Logs in to W&B, opens a run in the "pipeline_competition" project with
    the given config, seeds the random generators, then builds the model,
    the AdamW optimizer and the cross-entropy loss and starts training.

    Args:
        config (dict): hyperparameters for the run; the values `seed`, `lr`,
            `epochs` and `batch_size` are read back from wandb.config
        device : set "cpu" or "cuda"
    """

    wandb.login()
    with wandb.init(project="pipeline_competition", config=config):
        # Read the values back from W&B rather than from the raw dict.
        run_config = wandb.config
        seed_everything(run_config.seed)

        network = get_model().to(device)
        adamw = torch.optim.AdamW(network.parameters(), lr=run_config.lr)
        loss_fn = nn.CrossEntropyLoss()

        model_learning(
            network,
            adamw,
            loss_fn,
            epochs=run_config.epochs,
            batch_size=run_config.batch_size,
            device=device,
        )

# Default run metadata and hyperparameters; train() packs them into the W&B config.
architecture="Inception_V3"
dataset="shift-cv-winter-2023"
epochs=1
batch_size=4
lr=0.001
seed=42

def train():
    """Collect the module-level hyperparameters and start a training run."""
    device_name = 'cuda' if torch.cuda.is_available() else 'cpu'
    device = torch.device(device_name)

    config_for_training = {
        'architecture': architecture,
        'dataset': dataset,
        'epochs': epochs,
        'batch_size': batch_size,
        'lr': lr,
        'seed': seed,
    }
    train_model(config_for_training, device=device)


In [8]:
# Run a single training job with the module-level hyperparameters defined above train().
train()




epoch:   0%|          | 0/1 [00:00<?, ?it/s]

loss 0.6290224368000651


epoch: 100%|██████████| 1/1 [01:30<00:00, 90.97s/it]



Save model's completed on 1 epoch's

Epoch 001 train_loss: 0.6290     val_loss 0.6248 train_acc 0.6782 val_acc 0.6577





0,1
epoch,▁
train_acc,▁
train_loss,▁
val_acc,▁
val_loss,▁

0,1
epoch,0.0
train_acc,0.67818
train_loss,0.62902
val_acc,0.65766
val_loss,0.62481


In [None]:
# Random-search sweep over batch size and learning rate.
# The training function is handed to wandb.agent() directly, so the
# configuration carries no "function" entry.
sweep_configuration = {
    'method': 'random',
    'name': 'sweep',
    # The metric name must be a key that is actually passed to wandb.log();
    # model_learning logs "val_acc".
    'metric': {
        'goal': 'maximize',
        'name': 'val_acc',
    },
    'parameters': {
        'batch_size': {'values': [2, 8, 32]},
        'epochs': {'values': [10]},
        'lr': {'values': [0.003, 0.0003, 0.00003]},
    },
}

sweep_id = wandb.sweep(sweep=sweep_configuration, project='pipeline_competition')

def sweep_func():
    """
    Train once with the hyperparameters chosen by the W&B sweep.

    Opens a single W&B run, reads `lr`, `epochs` and `batch_size` from its
    config, then builds the model, the AdamW optimizer and the cross-entropy
    loss and starts training.
    """
    # One init per trial; the context manager closes the run even if
    # training raises.
    with wandb.init() as run:
        sweep_config = run.config
        device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
        model = get_model().to(device)
        optimizer = torch.optim.AdamW(model.parameters(), lr=sweep_config.lr)
        criterion = nn.CrossEntropyLoss()
        model_learning(model, optimizer, criterion, epochs=sweep_config.epochs,
                       batch_size=sweep_config.batch_size, device=device)

# Start an agent for the sweep created above, with sweep_func as the trial function and count=9.
wandb.agent(sweep_id, function=sweep_func, count=9)




Create sweep with ID: 6hhkplnj
Sweep URL: https://wandb.ai/balakinakate2022/pipeline_competition/sweeps/6hhkplnj


[34m[1mwandb[0m: While tearing down the service manager. The following error has occurred: [Errno 32] Broken pipe
[34m[1mwandb[0m: Agent Starting Run: 9srag0h0 with config:
[34m[1mwandb[0m: 	batch_size: 32
[34m[1mwandb[0m: 	epochs: 10
[34m[1mwandb[0m: 	lr: 3e-05




VBox(children=(Label(value='Waiting for wandb.init()...\r'), FloatProgress(value=0.01666866761665915, max=1.0)…

epoch:   0%|          | 0/10 [00:00<?, ?it/s]

loss 0.580927411595861


epoch:  10%|█         | 1/10 [01:14<11:07, 74.21s/it]



Save model's completed on 1 epoch's

Epoch 001 train_loss: 0.5809     val_loss 0.3772 train_acc 0.6942 val_acc 0.8634
loss 0.21427644156777226


epoch:  20%|██        | 2/10 [02:21<09:21, 70.15s/it]



Save model's completed on 2 epoch's

Epoch 002 train_loss: 0.2143     val_loss 0.1234 train_acc 0.9229 val_acc 0.9655
loss 0.10675372146778517


epoch:  30%|███       | 3/10 [03:28<08:00, 68.68s/it]



Save model's completed on 3 epoch's

Epoch 003 train_loss: 0.1068     val_loss 0.0911 train_acc 0.9630 val_acc 0.9730
