<a href="https://colab.research.google.com/github/cs-iuu/ocr-2025-fall-cv/blob/main/notebooks/12.Mongolian_OCR_starter.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Mongolian OCR Starter

## Setup

In [1]:
import cv2
import numpy as np
import matplotlib.pyplot as plt
import os
from tensorflow.keras import layers, models
from sklearn.model_selection import train_test_split


## 1. Preprocessing

In [2]:
def preprocess_image(img_path):
    img = cv2.imread(img_path, cv2.IMREAD_COLOR)
    gray = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)

    # adaptive thresholding works well for documents
    thresh = cv2.adaptiveThreshold(
        gray, 255,
        cv2.ADAPTIVE_THRESH_GAUSSIAN_C,
        cv2.THRESH_BINARY_INV,
        25, 15
    )

    # noise removal
    kernel = np.ones((3, 3), np.uint8)
    clean = cv2.morphologyEx(thresh, cv2.MORPH_OPEN, kernel)

    return clean


## 2. Character Segmentation

In [3]:
def segment_characters(binary_img):
    contours, hierarchy = cv2.findContours(
        binary_img, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE
    )

    char_regions = []

    for cnt in contours:
        x, y, w, h = cv2.boundingRect(cnt)
        # filter out noise
        if h > 10 and w > 5:
            char_regions.append(binary_img[y:y+h, x:x+w])

    # sort left-to-right
    char_regions = sorted(char_regions, key=lambda x: x.shape[1])

    return char_regions


## 3. Resize

In [4]:
def prepare_char_for_cnn(char_img):
    resized = cv2.resize(char_img, (28, 28))
    normalized = resized.astype("float32") / 255.0
    return normalized.reshape(28, 28, 1)


## 4. Build a CNN

In [5]:
num_classes = 35  # adjust based on Cyrillic letters used

model = models.Sequential([
    layers.Conv2D(32, (3,3), activation='relu', input_shape=(28,28,1)),
    layers.MaxPooling2D((2,2)),

    layers.Conv2D(64, (3,3), activation='relu'),
    layers.MaxPooling2D((2,2)),

    layers.Flatten(),
    layers.Dense(64, activation='relu'),
    layers.Dense(num_classes, activation='softmax')
])

model.compile(optimizer='adam',
              loss='sparse_categorical_crossentropy',
              metrics=['accuracy'])


  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


## 5. Load the dataset
Expected folder structure

```
data/
  А/
    img001.png
    img002.png
  Б/
  В/
  Г/
  ...
```

In [None]:
def load_dataset(root_dir="data"):
    images = []
    labels = []
    label_map = {}

    for i, letter in enumerate(sorted(os.listdir(root_dir))):
        label_map[i] = letter
        folder = os.path.join(root_dir, letter)
        for file in os.listdir(folder):
            img_path = os.path.join(folder, file)
            img = preprocess_image(img_path)
            char_img = prepare_char_for_cnn(img)
            images.append(char_img)
            labels.append(i)

    return np.array(images), np.array(labels), label_map

X, y, label_map = load_dataset()
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)


## 6. Train

In [None]:
model.fit(X_train, y_train, epochs=10, validation_data=(X_test, y_test))


## 7. Inference

In [None]:
def ocr_image(img_path):
    binary = preprocess_image(img_path)
    chars = segment_characters(binary)

    result = ""
    for c in chars:
        x = prepare_char_for_cnn(c)
        pred = model.predict(x.reshape(1,28,28,1))
        idx = pred.argmax()
        result += label_map[idx]

    return result
