# [Отчет](https://api.wandb.ai/links/cowboy_bebop/rfspqu29)

### Быстрое повторение семинара / модификации

In [1]:
import matplotlib.pyplot as plt
import numpy as np

import torch
import tqdm
import json
import cv2
import os

from torch.nn            import Module, Sequential, Conv2d, AvgPool2d, GRU, Linear
from torch.utils.data    import Dataset, DataLoader
from torch.nn.functional import ctc_loss, log_softmax
from torchvision         import models
from torchvision import transforms as T
from torch.optim import lr_scheduler

from string import digits, ascii_uppercase

from glob import glob
import pandas as pd
import wandb
import gc


np.random.seed(42)

  warn(


In [2]:
PATH_TO_DATA = "./seminar/data/seminar_crnn_data/"  # Change to your path with unzipped data
config_path = os.path.join(PATH_TO_DATA, "config.json")
images_path = os.path.join(PATH_TO_DATA, "images")

assert os.path.isfile(config_path)
assert os.path.isdir(images_path)

with open(config_path, "rt") as fp:
    config = json.load(fp)

config_full_paths = []
for item in config:
    config_full_paths.append({"file": os.path.join(images_path, item["file"]),
                              "text": item["text"]})
seminar_config = config_full_paths

In [3]:
abc = "0123456789ABEKMHOPCTYX"  # this is our alphabet for predictions.
def compute_mask(text):
    """Compute letter-digit mask of text, e.g. 'E506EC152' -> 'LDDDLLDDD'.
    
    Args:
        - text: String of text. 
        
    Returns:
        String of the same length but with every letter replaced by 'L' and every digit replaced by 'D' 
        or None if non-letter and non-digit character met in text.
    """
    mask = []
    
    # YOUR CODE HERE
    for char in text:
        if char in digits:
            mask.append("D")
        elif char in ascii_uppercase:
            mask.append("L")
        else:
            return None
    # END OF YOUR CODE
    
    return "".join(mask)

assert compute_mask("E506EC152") == "LDDDLLDDD"
assert compute_mask("E123KX99") == "LDDDLLDD"
assert compute_mask("P@@@KA@@") is None

def check_in_alphabet(text, alphabet=abc):
    """Check if all chars in text come from alphabet.
    
    Args:
        - text: String of text.
        - alphabet: String of alphabet.
        
    Returns:
        True if all chars in text are from alphabet and False otherwise.
    """
    
    # YOUR CODE HERE
    for char in text:
        if char not in alphabet:
            return False
    # END OF YOUR CODE
    
    return True

assert check_in_alphabet("E506EC152") is True
assert check_in_alphabet("A123GG999") is False

def filter_data(config):
    """Filter config items keeping only ones with correct text.
    
    Args:
        - config: List of dicts, each dict having keys "file" and "text".
        
    Returns:
        Filtered list (config subset).
    """
    config_filtered = []
    for item in tqdm.tqdm(config):
        text = item["text"]
        mask = compute_mask(text)
        if check_in_alphabet(text) and (mask == "LDDDLLDD" or mask == "LDDDLLDDD"):
            config_filtered.append({"file": item["file"],
                                    "text": item["text"]})
    return config_filtered

In [4]:
seminar_config = filter_data(seminar_config)
print("Total items in data after filtering:", len(seminar_config))

100%|██████████| 41141/41141 [00:00<00:00, 730716.03it/s]

Total items in data after filtering: 31345





In [5]:
class RecognitionDataset(Dataset):
    """Class for training image-to-text mapping using CTC-Loss."""

    def __init__(self, config, alphabet=abc, transforms=None):
        """Constructor for class.
        
        Args:
            - config: List of items, each of which is a dict with keys "file" & "text".
            - alphabet: String of chars required for predicting.
            - transforms: Transformation for items, should accept and return dict with keys "image", "seq", "seq_len" & "text".
        """
        super(RecognitionDataset, self).__init__()
        self.config = config
        self.alphabet = alphabet
        self.image_names, self.texts = self._parse_root_()
        self.transforms = transforms

    def _parse_root_(self):
        image_names, texts = [], []
        for item in self.config:
            image_name = item["file"]
            text = item['text']
            texts.append(text)
            image_names.append(image_name)
        return image_names, texts

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, item):
        """Returns dict with keys "image", "seq", "seq_len" & "text".
        Image is a numpy array, float32, [0, 1].
        Seq is list of integers.
        Seq_len is an integer.
        Text is a string.
        """
        image = cv2.imread(self.image_names[item]).astype(np.float32) / 255.
        text = self.texts[item]
        seq = self.text_to_seq(text)
        seq_len = len(seq)
        output = dict(image=image, seq=seq, seq_len=seq_len, text=text)
        if self.transforms is not None:
            output = self.transforms(output)
        return output

    def text_to_seq(self, text):
        """Encode text to sequence of integers.
        
        Args:
            - String of text.
            
        Returns:
            List of integers where each number is index of corresponding characted in alphabet + 1.
        """
        
        # YOUR CODE HERE
        seq = [self.alphabet.find(c) + 1 for c in text]
        # END OF YOUR CODE
        
        return seq

In [6]:
class Resize(object):

    def __init__(self, size=(320, 64)):
        self.size = size

    def __call__(self, item):
        """Apply resizing.
        
        Args: 
            - item: Dict with keys "image", "seq", "seq_len", "text".
        
        Returns: 
            Dict with image resized to self.size.
        """
        
        # YOUR CODE HERE
        interpolation = cv2.INTER_AREA if self.size[0] < item["image"].shape[1] else cv2.INTER_LINEAR
        item["image"] = cv2.resize(item["image"], self.size, interpolation=interpolation)
        # END OF YOUR CODE
        
        return item

In [7]:
class RandomRotation:
    def __init__(self, max_angle, prob):
        self.max_angle = max_angle
        self.prob = prob

    def __call__(self, item):
        if np.random.random() < self.prob:
            angle = (np.random.random() * 2 - 1) * self.max_angle

            (h, w) = item['image'].shape[:2]
            (cX, cY) = (w // 2, h // 2)
            # rotate our image by 45 degrees around the center of the image
            M = cv2.getRotationMatrix2D((cX, cY), 45, 1.0)
            item['image'] = cv2.warpAffine(item['image'], M, (w, h))

        return item

In [8]:
class Normalize:
    def __init__(self, mean, std):
        self.mean = mean
        self.std = std

    def __call__(self, item):
        item['image'] = (item['image'] - self.mean) / self.std
        return item        


In [9]:
def collate_fn(batch):
    """Function for torch.utils.data.Dataloader for batch collecting.
    
    Args:
        - batch: List of dataset __getitem__ return values (dicts).
        
    Returns:
        Dict with same keys but values are either torch.Tensors of batched images or sequences or so.
    """
    images, seqs, seq_lens, texts = [], [], [], []
    for item in batch:
        images.append(torch.from_numpy(item["image"]).permute(2, 0, 1).float())
        seqs.extend(item["seq"])
        seq_lens.append(item["seq_len"])
        texts.append(item["text"])
    images = torch.stack(images)
    seqs = torch.Tensor(seqs).int()
    seq_lens = torch.Tensor(seq_lens).int()
    batch = {"image": images, "seq": seqs, "seq_len": seq_lens, "text": texts}
    return batch

In [10]:
class FeatureExtractor(Module):
    
    def __init__(self, input_size=(64, 320), output_len=20, model='resnet18'):
        super(FeatureExtractor, self).__init__()
        
        h, w = input_size
        resnet = getattr(models, model)(pretrained=True)
        self.cnn = Sequential(*list(resnet.children())[:-2])
        
        self.pool = AvgPool2d(kernel_size=(h // 32, 1))        
        self.proj = Conv2d(w // 32, output_len, kernel_size=1)
  
        self.num_output_features = self.cnn[-1][-1].bn2.num_features    
    
    def apply_projection(self, x):
        """Use convolution to increase width of a features.
        
        Args:
            - x: Tensor of features (shaped B x C x H x W).
            
        Returns:
            New tensor of features (shaped B x C x H x W').
        """
        x = x.permute(0, 3, 2, 1).contiguous()
        x = self.proj(x)
        x = x.permute(0, 2, 3, 1).contiguous()
        
        return x
   
    def forward(self, x):
        # Apply conv layers
        features = self.cnn(x)
        
        # Pool to make height == 1
        features = self.pool(features)
        
        # Apply projection to increase width
        features = self.apply_projection(features)
        
        return features

In [11]:
feature_extractor = FeatureExtractor(model='resnet50')
x = torch.randn(1, 3, 64, 320)
y = feature_extractor(x)
assert y.size() == (1, 1, 512, 20)

Downloading: "https://download.pytorch.org/models/resnet50-0676ba61.pth" to /home/denis/.cache/torch/hub/checkpoints/resnet50-0676ba61.pth
100%|██████████| 97.8M/97.8M [02:29<00:00, 686kB/s]


AssertionError: 

In [12]:
y.size()

torch.Size([1, 1, 2048, 20])

In [None]:
class SequencePredictor(Module):
    
    def __init__(self, input_size, hidden_size, num_layers, num_classes, dropout=0.3, bidirectional=False):
        super(SequencePredictor, self).__init__()
        
        self.num_classes = num_classes        
        self.rnn = GRU(input_size=input_size,
                       hidden_size=hidden_size,
                       num_layers=num_layers,
                       dropout=dropout,
                       bidirectional=bidirectional)
        
        fc_in = hidden_size if not bidirectional else 2 * hidden_size
        self.fc = Linear(in_features=fc_in,
                         out_features=num_classes)
    
    def _init_hidden(self, batch_size):
        """Initialize new tensor of zeroes for RNN hidden state.
        
        Args:
            - batch_size: Int size of batch
            
        Returns:
            Tensor of zeros shaped (num_layers * num_directions, batch, hidden_size).
        """
        num_directions = 2 if self.rnn.bidirectional else 1
        
        # YOUR CODE HERE
        h = torch.zeros(self.rnn.num_layers * num_directions, batch_size, self.rnn.hidden_size)
        # END OF YOUR CODE
        
        return h
        
    def _reshape_features(self, x):
        """Change dimensions of x to fit RNN expected input.
        
        Args:
            - x: Tensor x shaped (B x (C=1) x H x W).
        
        Returns:
            New tensor shaped (W x B x H).
        """
        
        # YOUR CODE HERE
        x = x.squeeze(1)
        x = x.permute(2, 0, 1)
        # END OF YOUR CODE
        
        return x
    
    def forward(self, x):
        x = self._reshape_features(x)
        
        batch_size = x.size(1)
        h_0 = self._init_hidden(batch_size)
        h_0 = h_0.to(x.device)
        x, h = self.rnn(x, h_0)
        
        x = self.fc(x)
        return x

In [None]:
class CRNN(Module):
    
    def __init__(self, alphabet=abc,
                 cnn_input_size=(64, 320), cnn_output_len=20, cnn_model='resnet18',
                 rnn_hidden_size=128, rnn_num_layers=2, rnn_dropout=0.3, rnn_bidirectional=False):
        super(CRNN, self).__init__()
        self.alphabet = alphabet
        self.features_extractor = FeatureExtractor(
            input_size=cnn_input_size, output_len=cnn_output_len, model=cnn_model,
        )
        self.sequence_predictor = SequencePredictor(
            input_size=self.features_extractor.num_output_features,
            hidden_size=rnn_hidden_size, num_layers=rnn_num_layers,
            num_classes=len(alphabet)+1, dropout=rnn_dropout,
            bidirectional=rnn_bidirectional
        )
    
    def forward(self, x):
        features = self.features_extractor(x)
        sequence = self.sequence_predictor(features)
        return sequence

In [None]:
def pred_to_string(pred, abc):
    seq = []
    for i in range(len(pred)):
        label = np.argmax(pred[i])
        seq.append(label - 1)
    out = []
    for i in range(len(seq)):
        if len(out) == 0:
            if seq[i] != -1:
                out.append(seq[i])
        else:
            if seq[i] != -1 and seq[i] != seq[i - 1]:
                out.append(seq[i])
    out = ''.join([abc[c] for c in out])
    return out

def decode(pred, abc):
    pred = pred.permute(1, 0, 2).cpu().data.numpy()
    outputs = []
    for i in range(len(pred)):
        outputs.append(pred_to_string(pred[i], abc))
    return outputs

In [None]:
device = torch.device("cuda:0") if torch.cuda.is_available() else torch.device("cpu")

In [None]:
def read_config(path_folder, name_as_text=False):
    config = []
    assert os.path.isdir(path_folder)

    for path in glob(os.path.join(path_folder, '*.png')):
        if not name_as_text:
            text = path.split('_')[-1][:-4]
        else:
            text = path.split('/')[-1].split('.')[0]

        config.append({'file': path, 'text': text})

    return config

In [None]:
simple_config = read_config('kaggle/train/train/simple/')
complex_config = read_config('kaggle/train/train/complex/')
test_config = read_config('kaggle/test/result/', name_as_text=True)

In [None]:
def save_checkpoint(model, filename):

    with open(filename, "wb") as fp:
        torch.save(model.state_dict(), fp)


def load_checkpoint(model, filename):

    with open(filename, "rb") as fp:
        state_dict = torch.load(fp, map_location="cpu")
    model.load_state_dict(state_dict)

In [None]:
def get_train_val_test_loaders(data_config, test_config, model_config):
    batch_size = model_config['batch_size']
    num_workers = model_config['num_workers']
    train_perc = model_config['train_perc']

    # Get mean and std
    # mean = np.asarray([0.52157311, 0.5122762 , 0.50537334])
    # std = np.asarray([0.29142887, 0.29255962, 0.29303916])

    # Transforms
    transforms_train = T.Compose([
        Resize(),
        # Normalize(mean, std),
        RandomRotation(15, 0.3),
    ])

    transforms_val = T.Compose([
        Resize(),
        # Normalize(mean, std),
    ])


    np.random.shuffle(data_config)
    train_size = int(len(data_config) * train_perc)
    config_train = data_config[:train_size]
    config_val   = data_config[train_size:]

    train_dataset = RecognitionDataset(config_train, transforms=transforms_train)
    val_dataset   = RecognitionDataset(config_val  , transforms=transforms_val)


    train_dataloader = DataLoader(train_dataset, 
                                batch_size=batch_size, shuffle=True, num_workers=num_workers, pin_memory=True, 
                                drop_last=True, collate_fn=collate_fn)
    val_dataloader = DataLoader(val_dataset, 
                                batch_size=batch_size, shuffle=False, num_workers=num_workers, pin_memory=True, 
                                drop_last=False, collate_fn=collate_fn)
    

    test_dataset = RecognitionDataset(test_config, transforms=transforms_val)
    test_dataloader = DataLoader(
        test_dataset, 
        batch_size=batch_size, shuffle=False, num_workers=num_workers, pin_memory=True, 
        drop_last=False, collate_fn=collate_fn
    )

    return train_dataloader, val_dataloader, test_dataloader


In [None]:
def train_epoch_model(model, train_dataloader, optimizer):
    model.train()
    epoch_losses = []

    for j, b in enumerate(tqdm.tqdm(train_dataloader, total=len(train_dataloader))):
        images = b["image"].to(device)
        seqs_gt = b["seq"]
        seq_lens_gt = b["seq_len"]

        seqs_pred = model(images).cpu()
        log_probs = log_softmax(seqs_pred, dim=2)
        seq_lens_pred = torch.Tensor([seqs_pred.size(0)] * seqs_pred.size(1)).int()

        loss = ctc_loss(log_probs=log_probs,  # (T, N, C)
                        targets=seqs_gt,  # N, S or sum(target_lengths)
                        input_lengths=seq_lens_pred,  # N
                        target_lengths=seq_lens_gt)  # N

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        epoch_losses.append(loss.item())

        wandb.log({'train_loss': loss.item()})
    
    return model, np.mean(epoch_losses)

In [None]:
def test_model(model, val_dataloader):
    model.eval()
    val_losses = []
    for i, b in enumerate(tqdm.tqdm(val_dataloader, total=len(val_dataloader))):
        images = b["image"].to(device)
        seqs_gt = b["seq"]
        seq_lens_gt = b["seq_len"]

        with torch.no_grad():
            seqs_pred = model(images).cpu()
            
        log_probs = log_softmax(seqs_pred, dim=2)
        seq_lens_pred = torch.Tensor([seqs_pred.size(0)] * seqs_pred.size(1)).int()

        loss = ctc_loss(log_probs=log_probs,  # (T, N, C)
                        targets=seqs_gt,  # N, S or sum(target_lengths)
                        input_lengths=seq_lens_pred,  # N
                        target_lengths=seq_lens_gt)  # N

        val_losses.append(loss.item())

    return np.mean(val_losses)

In [None]:
def train_model(model, train_dataloader, val_dataloader, num_epochs, optimizer, sheduler, name):
    
    best_val_loss = np.infty

    for epoch in range(num_epochs):
        model, train_loss = train_epoch_model(model, train_dataloader, optimizer)
        val_loss = test_model(model, val_dataloader)
        
        print(f'train loss {train_loss:.4f} val loss {val_loss:.4f}')
        wandb.log({
            'train_loss_epoch': train_loss, 
            'val_loss_epoch': val_loss, 
            'epoch': epoch,
            'lr': sheduler.get_last_lr()[0]
        })

        if val_loss < best_val_loss:
            save_checkpoint(model, f'models/{name}_{val_loss:.4f}.pth')
            best_val_loss = val_loss

        sheduler.step()

    load_checkpoint(model, f'models/{name}_{best_val_loss:.4f}.pth')
    return model

In [None]:
def make_prediction(model, test_dataloader):
    model.eval()
    test_pred = {'index': [], 'label': []}

    for batch in test_dataloader:
        images = batch['image'].to(device)
        
        index = batch['text']

        with torch.no_grad():
            seqs_pred = model(images).cpu()
            
        texts_pred = decode(seqs_pred, model.alphabet)

        test_pred['index'] += index
        test_pred['label'] += texts_pred

    return pd.DataFrame(test_pred)

In [None]:
def run_experiment(seminar_config, simple_config, complex_config, test_config, model_config):
    
    # Wandb
    name = model_config['name']
    
    wandb.init(
        project="hw4",
        name=name,
        reinit=True,
        config=model_config
    )

    # Data
    all_config = seminar_config + simple_config + complex_config
    train_dataloader, val_dataloader, test_dataloader = get_train_val_test_loaders(all_config, test_config, model_config)


    # Create model
    model = CRNN(
        cnn_input_size=(model_config['cnn_height'], model_config['cnn_width']),
        cnn_model=model_config['cnn_model'],
        cnn_output_len=model_config['cnn_output_len'],
        rnn_hidden_size=model_config['rnn_hidden_size'],
        rnn_num_layers=model_config['rnn_num_layers'],
        rnn_bidirectional=model_config['rnn_bidirectional']
    )

    model.to(device)

    # Create optimizer
    optimizer = torch.optim.Adam(
        model.parameters(), 
        lr=model_config['lr'], 
        weight_decay=model_config['weight_decay']
    )

    sheduler = lr_scheduler.StepLR(
        optimizer, 
        step_size=model_config['sheduler_step'], 
        gamma=model_config['sheduler_gamma']
    )

    # Train model
    model = train_model(
        model, train_dataloader, val_dataloader, 
        model_config['num_epochs'], 
        optimizer, 
        sheduler,
        name,
    )
    
    gc.collect()
    torch.cuda.empty_cache()

    # Predict
    predictions = make_prediction(model, test_dataloader)
    predictions.to_csv(f'predictions/{name}.csv', index=None)

    wandb.finish()
    

In [None]:
model_config = {
    'name': 'Resnet50',
    'train_perc': 0.8,

    'num_epochs': 25,
    'batch_size': 256,
    'num_workers': 4,

    'cnn_height': 64,
    'cnn_width': 320,
    'cnn_model': 'resnet50',
    'cnn_output_len': 20,

    'rnn_hidden_size': 128, 
    'rnn_num_layers': 2,
    'rnn_dropout': 0.3,
    'rnn_bidirectional': False,

    'lr': 3e-4,
    'sheduler': 'StepLR',
    'sheduler_gamma': 0.5,
    'sheduler_step': 8,
    'weight_decay': 3e-4,
}

In [None]:
all_config = seminar_config + simple_config + complex_config

In [None]:
train_dataloader, val_dataloader, test_dataloader = get_train_val_test_loaders(
    all_config, test_config, model_config
)

In [None]:
run_experiment(seminar_config, simple_config, complex_config, test_config, model_config)

In [None]:
df = pd.read_csv(f'predictions/{model_config["name"]}.csv')

In [None]:
df['mask'] = df['label'].apply(compute_mask)
masks = []
df.query('(mask != "LDDDLLDD") and (mask != "LDDDLLDDD")').shape

(21, 3)