# Объединение моделей сегментации и OCR в единый пайплайн

В данном ноутбуке представлен baseline для пайплайна сегментации и OCR для чтения школьных тетрадей. Сегментация реализована с помощью фреймворка detectron2, OCR - с помощью модели CRNN.

Мы представляем подход, как объединить две модели сегментации и распознавания в одну систему, чтобы делать предсказание на целой странице тетради. В этом бейзлайне нет обучения. Мы используем модели, которые были обучены в бейзлайнах 1 и 2 этапов олимпиады. Участники могут использовать представленные нами ранее бейзлайны, переобучить модели на новых данных, и использовать их веса для текущего этапа.

### Установка библиотек

Установка библиотек, под которыми запускается данный бейзлайн.

In [None]:
!wget https://storage.yandexcloud.net/datasouls-competitions/ai-nto-final-2022/data.zip

In [None]:
!unzip data.zip

Archive:  data.zip
   creating: data/
  inflating: data/baseline_recognition.ipynb  
  inflating: __MACOSX/data/._baseline_recognition.ipynb  
  inflating: data/.DS_Store          
  inflating: __MACOSX/data/._.DS_Store  
  inflating: data/font.otf           
  inflating: __MACOSX/data/._font.otf  
  inflating: data/ocr-model-last.ckpt  
  inflating: __MACOSX/data/._ocr-model-last.ckpt  
  inflating: data/baseline_segmentation.ipynb  
  inflating: __MACOSX/data/._baseline_segmentation.ipynb  
   creating: data/train_segmentation/
   creating: data/train_recognition/
  inflating: data/baseline.ipynb     
  inflating: __MACOSX/data/._baseline.ipynb  
  inflating: data/segm-model_final.pth  
  inflating: __MACOSX/data/._segm-model_final.pth  
  inflating: data/evaluate.py        
  inflating: __MACOSX/data/._evaluate.py  
  inflating: data/train_segmentation/annotations_extended.json  
  inflating: data/train_segmentation/.DS_Store  
  inflating: __MACOSX/data/train_segmentation/._.DS_Sto

In [None]:
!nvcc --version

In [11]:
!pip install transformers

Collecting transformers
  Downloading transformers-4.17.0-py3-none-any.whl (3.8 MB)
     |████████████████████████████████| 3.8 MB 1.4 MB/s            
[?25hCollecting tokenizers!=0.11.3,>=0.11.1
  Downloading tokenizers-0.11.6-cp37-cp37m-manylinux_2_12_x86_64.manylinux2010_x86_64.whl (6.5 MB)
     |████████████████████████████████| 6.5 MB 26.4 MB/s            
Collecting huggingface-hub<1.0,>=0.1.0
  Downloading huggingface_hub-0.4.0-py3-none-any.whl (67 kB)
     |████████████████████████████████| 67 kB 2.7 MB/s             
Collecting sacremoses
  Downloading sacremoses-0.0.47-py2.py3-none-any.whl (895 kB)
     |████████████████████████████████| 895 kB 82.9 MB/s            
Installing collected packages: tokenizers, sacremoses, huggingface-hub, transformers
Successfully installed huggingface-hub-0.4.0 sacremoses-0.0.47 tokenizers-0.11.6 transformers-4.17.0


### Загрузим необходимые библиотеки для создания модели

In [7]:
import cv2
import random
from PIL import ImageFont, ImageDraw, Image
import json
import os
from tqdm import tqdm
# from shapely.geometry import Polygon
from matplotlib import pyplot as plt
import numpy as np
import pandas as pd

import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)
warnings.filterwarnings("ignore")
import ipywidgets as widgets
from ipywidgets import interact, interact_manual
import shutil

import torch, torchvision
import torch.nn as nn

import logging
logger = logging.getLogger('detectron2')
logger.setLevel(logging.CRITICAL)

Прежде чем переходить к загрузке данных, посмотрим, доступны ли нам GPU-мощности.

In [2]:
# Report whether PyTorch can see a CUDA device.
print(f'GPU: {torch.cuda.is_available()}')

GPU: True


**TrOCR fine-tuning**

In [8]:
# Read the recognition labels and shuffle the rows (frac=1 samples every row).
train_csv = pd.read_csv('./data/train_recognition/labels.csv').sample(frac=1)

# Map each image file name to its transcription text.
train_data = dict(zip(train_csv['file_name'], train_csv['text']))

In [9]:
# Keep only the entries whose file name contains 'eng', ordered by file name.
# NOTE(review): sorting the keys discards whatever order the dict had before
# this cell — confirm that a name-sorted order is what the later split expects.
train_data = {
    name: train_data[name]
    for name in sorted(key for key in train_data if 'eng' in key)
}

In [10]:
# Turn the mapping into a list of (file_name, text) pairs.
train_data = list(train_data.items())
print('train len', len(train_data))

# The leading 85% of the pairs are used for training, the remainder for
# validation.
# NOTE(review): the split is purely positional, so its composition depends on
# the order of train_data at this point — confirm that order is suitable.
split_coef = 0.85
train_len = int(len(train_data) * split_coef)

train_data_splitted, val_data_splitted = (
    train_data[:train_len],
    train_data[train_len:],
)

print('train len after split', len(train_data_splitted))
print('val len after split', len(val_data_splitted))

train len 14334
train len after split 12183
val len after split 2151


In [11]:
from torch.utils.data import Dataset

def collate_fn(batch):
    """Collate ``(image, text, encoded_text)`` samples into a single batch.

    Args:
        batch: Sequence of ``(image, text, enc_text)`` triples. The images
            must be tensors that ``torch.stack`` can combine, and the encoded
            texts must be tensors that ``pad_sequence`` can pad.

    Returns:
        A tuple ``(images, texts, enc_pad_texts, text_lens)`` where
        ``images`` is the stacked image tensor, ``texts`` is the tuple of raw
        texts, ``enc_pad_texts`` is the batch-first tensor of encoded texts
        padded with ``0``, and ``text_lens`` is a ``LongTensor`` holding
        ``len(text)`` for each raw text.
    """
    # Local import so the function does not depend on a module-level import
    # of pad_sequence being present.
    from torch.nn.utils.rnn import pad_sequence

    images, texts, enc_texts = zip(*batch)
    images = torch.stack(images, 0)
    text_lens = torch.LongTensor([len(text) for text in texts])
    enc_pad_texts = pad_sequence(enc_texts, batch_first=True, padding_value=0)
    return images, texts, enc_pad_texts, text_lens


def get_data_loader(
    transforms, json_path, root_path, tokenizer, batch_size, drop_last
):
    """Build a ``DataLoader`` over an ``OCRDataset``.

    Args:
        transforms: Forwarded to ``OCRDataset`` as its fourth argument.
        json_path: Forwarded to ``OCRDataset`` as its first argument.
        root_path: Forwarded to ``OCRDataset`` as its second argument.
        tokenizer: Forwarded to ``OCRDataset`` as its third argument.
        batch_size: Number of samples per batch.
        drop_last: Whether the loader drops a final incomplete batch.

    Returns:
        A ``torch.utils.data.DataLoader`` that batches with ``collate_fn``
        and uses 8 worker processes.
    """
    # NOTE(review): the OCRDataset class visible in this file is declared as
    # (root_dir, df, processor, max_target_length) and returns dicts, while
    # this call passes (json_path, root_path, tokenizer, transforms) and
    # collate_fn expects triples — confirm which OCRDataset this helper targets.
    dataset = OCRDataset(json_path, root_path, tokenizer, transforms)
    return torch.utils.data.DataLoader(
        dataset=dataset,
        collate_fn=collate_fn,
        batch_size=batch_size,
        num_workers=8,
        # Honour the caller's choice instead of silently ignoring the argument.
        drop_last=drop_last,
    )


class OCRDataset(Dataset):
    """Dataset of ``(file_name, text)`` pairs prepared for a seq2seq OCR model.

    Each item is a dict with two entries: ``pixel_values`` (the processed
    image tensor) and ``labels`` (the tokenized text as a tensor, with pad
    tokens replaced by ``-100``).
    """

    def __init__(self, root_dir, df, processor, max_target_length=128):
        """Store the data source and the processing configuration.

        Args:
            root_dir: Directory that contains the image files.
            df: Indexable, sized collection whose items hold the file name at
                position 0 and the text at position 1.
            processor: Object that is callable on a PIL image (returning
                something with ``pixel_values``) and exposes a ``tokenizer``.
            max_target_length: Length the tokenized text is padded and
                truncated to.
        """
        self.root_dir = root_dir
        self.df = df
        self.processor = processor
        self.max_target_length = max_target_length

    def __len__(self):
        """Return the number of samples."""
        return len(self.df)

    def __getitem__(self, idx):
        """Load, process and return the sample at position ``idx``.

        Returns:
            Dict with ``pixel_values`` and ``labels`` tensors.
        """
        sample = self.df[idx]
        file_name, text = sample[0], sample[1]

        # os.path.join works whether or not root_dir ends with a separator;
        # the context manager releases the file handle once the RGB copy exists.
        with Image.open(os.path.join(self.root_dir, file_name)) as raw_image:
            image = raw_image.convert("RGB")
        pixel_values = self.processor(image, return_tensors="pt").pixel_values

        tokenizer = self.processor.tokenizer
        # Truncate as well as pad so every label vector has exactly
        # max_target_length entries and batches can be stacked.
        token_ids = tokenizer(
            text,
            padding="max_length",
            truncation=True,
            max_length=self.max_target_length,
        ).input_ids

        # Replace pad tokens with -100 (presumably so that padded positions
        # are ignored by the loss — confirm against the model in use).
        pad_id = tokenizer.pad_token_id
        labels = [token if token != pad_id else -100 for token in token_ids]

        return {
            # Drop only the leading batch dimension added by return_tensors="pt".
            "pixel_values": pixel_values.squeeze(0),
            "labels": torch.tensor(labels),
        }

In [None]:
from transformers import TrOCRProcessor

# Pretrained processor of the small handwritten TrOCR checkpoint.
processor = TrOCRProcessor.from_pretrained('microsoft/trocr-small-handwritten')

# Both datasets read images from the same directory and share the processor;
# they differ only in which split of (file_name, text) pairs they wrap.
train_dataset = OCRDataset(
    processor=processor,
    df=train_data_splitted,
    root_dir='./data/train_recognition/images/',
)
eval_dataset = OCRDataset(
    processor=processor,
    df=val_data_splitted,
    root_dir='./data/train_recognition/images/',
)

In [None]:
# Show how many samples each dataset holds.
print(f"Number of training examples: {len(train_dataset)}")
print(f"Number of validation examples: {len(eval_dataset)}")

In [None]:
from transformers import VisionEncoderDecoderModel

# Load the pretrained small handwritten TrOCR checkpoint as the model to
# fine-tune.
model = VisionEncoderDecoderModel.from_pretrained("microsoft/trocr-small-handwritten")

In [12]:
# Special tokens: take the start, padding and end-of-sequence ids from the
# processor's tokenizer (CLS, PAD and SEP respectively).
model.config.decoder_start_token_id = processor.tokenizer.cls_token_id
model.config.pad_token_id = processor.tokenizer.pad_token_id
model.config.eos_token_id = processor.tokenizer.sep_token_id

# Mirror the decoder's vocabulary size on the top-level config.
model.config.vocab_size = model.config.decoder.vocab_size

# Text-generation settings.
model.config.max_length = 64
model.config.num_beams = 4
model.config.early_stopping = True
model.config.no_repeat_ngram_size = 3
model.config.length_penalty = 2.0

In [16]:
from transformers import Seq2SeqTrainingArguments, Seq2SeqTrainer

training_args = Seq2SeqTrainingArguments(
    # Where outputs are written and how long to train.
    output_dir="./",
    num_train_epochs=9,
    # Batch sizes and mixed precision.
    per_device_train_batch_size=64,
    per_device_eval_batch_size=64,
    fp16=True,
    # Evaluate every 500 steps, generating text for the metric.
    evaluation_strategy="steps",
    eval_steps=500,
    predict_with_generate=True,
    # Logging and checkpoint cadence, in steps.
    logging_steps=2,
    save_steps=1000,
)

In [17]:
from datasets import load_metric

# Load the "cer" metric; compute_metrics uses it to score decoded predictions
# against decoded labels.
cer_metric = load_metric("cer")

In [18]:
def compute_metrics(pred):
    """Score generated text against the reference text with ``cer_metric``.

    Args:
        pred: Object exposing ``predictions`` (generated token ids) and
            ``label_ids`` (reference token ids in which ``-100`` marks
            positions that carry no real token).

    Returns:
        Dict with a single key ``"cer"`` holding the value returned by
        ``cer_metric.compute``.
    """
    pred_ids = pred.predictions
    # Work on a copy so the caller's label array is left untouched.
    labels_ids = np.array(pred.label_ids)

    # -100 is not a token id: put the pad token back before decoding, so it
    # can be skipped together with the other special tokens.
    labels_ids[labels_ids == -100] = processor.tokenizer.pad_token_id

    pred_str = processor.batch_decode(pred_ids, skip_special_tokens=True)
    label_str = processor.batch_decode(labels_ids, skip_special_tokens=True)

    cer = cer_metric.compute(predictions=pred_str, references=label_str)

    return {"cer": cer}

In [44]:
!pip install wandb

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
Collecting wandb
  Downloading wandb-0.12.11-py2.py3-none-any.whl (1.7 MB)
     |████████████████████████████████| 1.7 MB 1.2 MB/s            
Collecting promise<3,>=2.0
  Downloading promise-2.3.tar.gz (19 kB)
  Preparing metadata (setup.py) ... [?25ldone
[?25hCollecting pathtools
  Downloading pathtools-0.1.2.tar.gz (11 kB)
  Preparing metadata (setup.py) ... [?25ldone
Collecting shortuuid>=0.5.0
  Downloading shortuuid-1.0.8-py3-none-any.whl (9.5 kB)
Collecting sentry-sdk>=1.0.0
  Downloading sentry_sdk-1.5.6-py2.py3-none-any.whl (144 kB)
     |████████████████████████████████| 144 kB 27.6 MB/s            
[?25hCollecting setproctitle
  Downloading setproctitle-1.2.2-cp37-cp37m-manylinux1_x86_64.whl (

In [29]:
from transformers import default_data_collator

# Assemble the trainer from the model, training arguments, datasets and
# metric function, then run training.
trainer = Seq2SeqTrainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=eval_dataset,
    data_collator=default_data_collator,
    compute_metrics=compute_metrics,
    # The processor's feature extractor is passed in the tokenizer slot.
    tokenizer=processor.feature_extractor,
)
trainer.train()

Using amp half precision backend
