# Data‑Scaling Study for CAD Text OCR with PaddleOCR

This starter notebook walks you through:
1. Preparing a labelled **crop‑level** dataset from CAD drawings
2. Fine‑tuning the PaddleOCR recognition model on incrementally larger subsets
3. Plotting a learning curve to find the data‑saturation point

> **Tip** Run each section sequentially after filling in paths relevant to your environment.

In [2]:
# --- 1. Environment setup ---------------------------------------------------
# Uncomment the next line if PaddleOCR isn't installed in your kernel
# !pip install --upgrade paddlepaddle paddleocr matplotlib pandas scikit-learn tqdm

import os, json, random, shutil, time, pathlib, pickle
from pathlib import Path
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from tqdm import tqdm

# PaddleOCR imports (lazy – comment out if you want to test without training)
from paddleocr import PaddleOCR

In [None]:
os.path.getsize(path)¶


In [None]:

"PP-OCRv4_server_rec"

"PP-OCRv5_server_rec"


In [3]:
# --- 2. Project paths --------------------------------------------------------
DATA_ROOT = Path('data/cad_crops')          # folder with images
ANNOT_CSV = Path('data/labels.csv')         # CSV: image_path,transcription
MODEL_DIR = Path('models/')                 # where fine‑tuned checkpoints go
RESULTS_CSV = Path('results_scaling.csv')   # learning‑curve log

# Make sure directories exist
MODEL_DIR.mkdir(parents=True, exist_ok=True)

In [38]:
os.listdir("data/cad_crops")

['d17da87d-7c46-435a-b37e-85719ffdcc84_crop_0.gt.txt',
 '3_00845_crop_0.gt.txt',
 '4_04257_crop_0.gt.txt',
 '4_06143_crop_0.gt.txt',
 '4_01673_crop_1.gt.txt',
 '2_00493_crop_0.gt.txt',
 '1_02138_crop_1.gt.txt',
 '4_01393_crop_0.jpg',
 '1_02138_crop_0.jpg',
 '0_12000_crop_0.jpg',
 '4_06556_crop_0.jpg',
 '3_01064_crop_0.jpg',
 '0_07231_crop_0.gt.txt',
 '3b1413f4-8fb5-4dd9-b2a7-813305fcf319_crop_10.gt.txt',
 '72ca758c-51c2-4d16-9303-bcd05a6969e8_crop_2.gt.txt',
 '9bb674ff-bfa1-4137-9887-4c53885679a1_crop_0.gt.txt',
 '6_00415_crop_0.jpg',
 '0_03967_crop_0.gt.txt',
 '4_03549_crop_0.jpg',
 '4_02755_crop_0.jpg',
 'c52fb089-f312-4688-b9a0-9017d357557a_crop_0.jpg',
 '4a0e6c10-77c7-4ca4-a92b-569e7edcc325_crop_0.jpg',
 '2_02063_crop_0.jpg',
 '7f085b86-db90-4aa4-838e-057c65e6e1d2_crop_3.gt.txt',
 '11dc141d-b526-48e6-bdcc-7078ef257e78_crop_0.jpg',
 '29d552ae-018d-4cec-ad19-f0ad0d80abdb_crop_0.jpg',
 '0_08392_crop_0.gt.txt',
 '4_00851_crop_0.gt.txt',
 '4f9971ea-377c-44fa-9c32-52c44d235b47_crop_0.gt.

In [37]:


print(len([name for name in os.listdir("data/cad_crops")]))


file_count = len([name for name in os.listdir("data/cad_crops")])


batches = file_count / 367


print(f"Batches: {batches}")


36777
Batches: 100.20980926430518


In [4]:
# --- 3. Load annotations -----------------------------------------------------
df = pd.read_csv(ANNOT_CSV)
print(f'Total labelled crops: {len(df):,}')
df.head()

FileNotFoundError: [Errno 2] No such file or directory: 'data/labels.csv'

In [None]:
# --- 4. Train/Val/Test split -------------------------------------------------
from sklearn.model_selection import train_test_split

train_df, temp_df = train_test_split(df, test_size=0.2, random_state=42, shuffle=True)
val_df, test_df   = train_test_split(temp_df, test_size=0.5, random_state=42, shuffle=True)

print(f'Train: {len(train_df):,}  |  Val: {len(val_df):,}  |  Test: {len(test_df):,}')

In [None]:
# Utility: save a subset CSV for PaddleOCR training ---------------------------
def save_subset(df_subset, name):
    subset_path = ANNOT_CSV.parent / f'{name}.csv'
    df_subset.to_csv(subset_path, index=False)
    return subset_path

In [None]:
# --- 5. Fine‑tuning function -------------------------------------------------
def finetune_paddleocr(train_csv, val_csv, out_dir, epochs=20, lr=1e-4):
    """Minimal wrapper around PaddleOCR CLI training.
    Replace this with your preferred training loop or the official tools.
    """
    # NOTE: This is a placeholder. Insert your own training invocation here.
    # You could call `paddleocr/tools/train.py -c configs/rec/PP-OCRv4/...` programmatically
    # and parse logs to extract best‑val accuracy.
    
    # Simulate training time & accuracy for demo purposes
    import random, time
    time.sleep(2)  # pretend we trained
    val_acc = random.uniform(0.7, 0.93)  # dummy accuracy
    return val_acc

In [None]:
# --- 6. Data‑scaling experiment --------------------------------------------
subset_sizes = [100, 300, 600, 1000, 2000, 4000, 8000]
results = []

for n in subset_sizes:
    print(f'\n=== Training on {n:,} samples ===')
    # Sample n rows from train_df (or use all if n exceeds size)
    subset_df = train_df.sample(n=min(n, len(train_df)), random_state=42)
    subset_csv = save_subset(subset_df, f'train_{n}')
    val_csv    = save_subset(val_df, 'val')  # use full val set

    acc = finetune_paddleocr(subset_csv, val_csv, MODEL_DIR / f'model_{n}')
    print(f'Val accuracy: {acc*100:.2f}%')
    results.append({'subset': n, 'val_acc': acc})

# Save results
pd.DataFrame(results).to_csv(RESULTS_CSV, index=False)
results

In [None]:
# --- 7. Plot learning curve --------------------------------------------------
df_res = pd.read_csv(RESULTS_CSV)

plt.figure(figsize=(6,4))
plt.plot(df_res['subset'], df_res['val_acc']*100, marker='o')
plt.title('Learning curve: CAD OCR fine‑tuning')
plt.xlabel('Number of annotated image crops')
plt.ylabel('Validation accuracy (%)')
plt.grid(True)
plt.tight_layout()
plt.show()

## 8 Results interpretation & next steps

* **Look for the elbow:** where accuracy gain per data‑doubling drops below ~0.5 pp.
* If the curve is still rising steeply at 8 k samples, plan to label more or use synthetic augmentation.
* Otherwise, focus on data quality: balanced glyph coverage, realistic noise/blur augmentations, etc.

Good luck, and happy fine‑tuning!