In [None]:
import os
import shutil

import numpy as np
import pandas as pd
from tqdm.notebook import tqdm
import cv2
from PIL import Image

tqdm.pandas()

# 2.1 Hindi

Dataset Source: https://drive.google.com/open?id=1E5kI8CLoC-XffqQMTWwSpBIPp1Wb2tne

## 2.1.1 Merge Annotations 

In [None]:
# Per-folder annotation directory template; "{}" is filled in with the folder number.
ann_dir = "Data/Synthetic-Hindi/Annotation/{}"

In [None]:
# Read every per-image annotation file and merge them into a single frame.
# The frames are collected in a list and concatenated once: DataFrame.append
# was removed in pandas 2.0 and re-copied the whole frame on every call.
frames = []
for i in tqdm(range(1, 25+1)):
    folder = ann_dir.format(i)
    # os.listdir returns names in arbitrary order; sorting keeps the merged row
    # order (and therefore the seeded sampling done later) reproducible.
    for file_name in tqdm(sorted(os.listdir(folder)), leave=False):
        df = pd.read_csv(os.path.join(folder, file_name), sep=' ', header=None)
        df.columns = ['x1', 'x2', 'x3', 'x4', 'y1', 'y2', 'y3', 'y4', 'text']
        df['folder_id'] = i
        df['file_id'] = file_name.split(".")[0]
        frames.append(df)
# pd.concat rejects an empty list, so fall back to an empty frame as before.
ann_df = pd.concat(frames, ignore_index=True) if frames else pd.DataFrame()
print(ann_df.shape)
ann_df.head()

In [None]:
# Put the identifying columns first and interleave the corners as (x, y) pairs.
cols = ['folder_id', 'file_id', 'text','x1', 'y1', 'x2', 'y2', 'x3', 'y3', 'x4', 'y4']
ann_df = ann_df[cols]
# `file_path` was never defined in this section: on a fresh kernel this cell
# raised NameError, and on a stale kernel it could overwrite another language's
# file. Use the path that the filtering step reads back.
file_path = "Data/Annotations/Annotations-Hindi.csv"
os.makedirs(os.path.dirname(file_path), exist_ok=True)
ann_df.to_csv(file_path, index=False)

## 2.1.2 Filter improper data points

In [None]:
# Load the merged annotations and the character table.
ann_df = pd.read_csv("Data/Annotations/Annotations-Hindi.csv")
characters_df = pd.read_csv("Data/Characters/Characters-Hindi.csv")

# Allowed symbols: every distinct glyph and consonant, plus the Character
# value of each row whose Consonant column is "-".
glyphs = characters_df['Glyph'].unique().tolist()
consonants = characters_df['Consonant'].unique().tolist()
vowels = characters_df.loc[characters_df['Consonant'] == "-", 'Character'].tolist()
characters = [*glyphs, *consonants, *vowels]
print(len(characters))

In [None]:
def correct_text(text):
    """Return `text` with every symbol that is not in the global `characters` removed."""
    kept = []
    for symbol in text:
        if symbol in characters:
            kept.append(symbol)
    return "".join(kept)

In [None]:
# Strip unsupported symbols and record how many were dropped from each row.
corrected = ann_df['text'].progress_apply(correct_text)
ann_df['corrected_text'] = corrected
ann_df['num_removals'] = ann_df['text'].str.len() - corrected.str.len()

# Relative image path "<folder_id>/<file_id>.jpg".
folder_part = ann_df['folder_id'].astype('str')
file_part = ann_df['file_id'].astype('str')
ann_df['image_id'] = folder_part + "/" + file_part + ".jpg"

# Negative coordinates are raised to 0.
coords_cols = ['x1', 'x2', 'x3', 'x4', 'y1', 'y2', 'y3', 'y4']
ann_df[coords_cols] = ann_df[coords_cols].clip(lower=0)
ann_df.head()

In [None]:
# Number of rows that lost at least one character, next to the total row count.
print((ann_df['num_removals']>0).sum(), len(ann_df))

In [None]:
# Keep only rows whose text needed no correction, then drop the helper columns.
print(len(ann_df))
is_clean = ann_df['num_removals'] == 0
ann_df = ann_df.loc[is_clean].drop(columns=['corrected_text', 'num_removals'])
print(len(ann_df))

## 2.1.3 Split dataset

In [None]:
# Remaining row count per source folder.
ann_df.groupby('folder_id').size()

In [None]:
# Training split: rows with folder_id <= 23, sampled down to 500000 with a fixed seed.
train = ann_df.loc[ann_df['folder_id'].le(23)]
print(train.shape)
train = train.sample(random_state=0, n=500000)
print(train.shape)

In [None]:
# Validation split: rows from folder 24, sampled down to 5000 with a fixed seed.
val = ann_df.loc[ann_df['folder_id'].eq(24)]
print(val.shape)
val = val.sample(random_state=0, n=5000)
print(val.shape)

In [None]:
# Test split: rows from folder 25, sampled down to 5000 with a fixed seed.
test = ann_df.loc[ann_df['folder_id'].eq(25)]
print(test.shape)
test = test.sample(random_state=0, n=5000)
print(test.shape)

In [None]:
# Output directories for the three splits; each is wiped and recreated empty.
train_dir = "Data/Hindi/Train"
val_dir = "Data/Hindi/Val"
test_dir = "Data/Hindi/Test"

for split_dir in (train_dir, val_dir, test_dir):
    if os.path.exists(split_dir):
        shutil.rmtree(split_dir)
    os.makedirs(split_dir)

In [None]:
def save_crops(data, data_dir, method=1, image_dir="Data/Synthetic-Hindi/Image/"):
    """
    Crop every annotated word out of its source image and save it as a JPEG.

    Rows are grouped by `image_id` so each image is read once. Every crop is
    resized to 200x50 (width x height) and written to
    `{data_dir}/{text}_{folder_id}_{file_id}_{i}.jpg`, where `i` is the
    position of the row inside its image group.

    Unreadable images, empty crops and failed writes are reported with the
    image id and row index and skipped, instead of aborting the whole run.

    Parameters
    ----------
    data: pandas.DataFrame
        Needs the columns image_id, folder_id, file_id, text and x1, y1 ... x4, y4.
    data_dir: str
        Existing directory that receives the crops.
    method: int
        1 - bounding rect
        2 - min area rect (not implemented)
    image_dir: str
        Directory that `image_id` is relative to.

    Raises
    ------
    NotImplementedError
        If `method` is anything other than 1.
    """
    if method != 1:
        # Previously an unknown method left `crop` unbound (or stale from the
        # previous row); fail up front with a clear error instead.
        raise NotImplementedError(f"crop method {method} is not implemented")

    coords_cols = ['x1', 'y1', 'x2', 'y2', 'x3', 'y3', 'x4', 'y4']
    crop_size = (200, 50)  # (width, height), the order cv2.resize expects
    for image_id, image_data in tqdm(data.groupby('image_id')):
        image = cv2.imread(os.path.join(image_dir, image_id))
        if image is None:
            # cv2.imread returns None for a missing or undecodable file.
            for row_id in image_data.index:
                print(f"skipped {image_id} {row_id}: image could not be read")
            continue

        for i, (row_id, row) in enumerate(image_data.iterrows()):
            coords = row[coords_cols].values
            coords = coords.reshape(4, 1, 2).astype(np.float32)

            x, y, w, h = cv2.boundingRect(coords)
            # Clamp the start so a negative origin cannot wrap around via
            # Python's negative indexing.
            crop = image[max(y, 0):y+h, max(x, 0):x+w]
            if crop.size == 0:
                # Box lies outside the image; cv2.resize rejects empty input.
                print(f"skipped {image_id} {row_id}: empty crop")
                continue

            crop = cv2.resize(crop, crop_size)
            crop_path = f"{data_dir}/{row['text']}_{row['folder_id']}_{row['file_id']}_{i}.jpg"
            if not cv2.imwrite(crop_path, crop):
                print(f"skipped {image_id} {row_id}: could not write {crop_path}")

In [None]:
# Write the training crops into train_dir.
save_crops(train, train_dir)

In [None]:
# Write the validation crops into val_dir.
save_crops(val, val_dir)

In [None]:
# Write the test crops into test_dir.
save_crops(test, test_dir)

# 2.2 Tamil

Dataset Source: https://drive.google.com/drive/u/0/folders/1hnNxuHbBBZrrI7Ee6FePTsUfW97qrJAS
(Tamil 1-30)

## 2.2.1 Merge Annotations 

In [None]:
# Per-folder annotation directory template; "{}" is filled in with the folder number.
ann_dir = "Data/Synthetic-Tamil/Annotation/{}"

In [None]:
# Merge every per-image annotation file into one CSV by appending to it on
# disk, then load the merged file back.
file_path = "Data/Annotations/Annotations-Tamil.csv"
cols = ['x1', 'x2', 'x3', 'x4', 'y1', 'y2', 'y3', 'y4', 'text', 'folder_id', 'file_id']
os.makedirs(os.path.dirname(file_path), exist_ok=True)
# Start the file with the header row only; the frames below are appended without one.
ann_df = pd.DataFrame(columns=cols)
ann_df.to_csv(file_path, index=False)
for i in tqdm(range(1, 30+1)):
    folder = ann_dir.format(i)
    # os.listdir returns names in arbitrary order; sorting keeps the merged row
    # order (and therefore the seeded sampling done later) reproducible.
    for file_name in tqdm(sorted(os.listdir(folder)), leave=False):
        df = pd.read_csv(os.path.join(folder, file_name), sep=' ', header=None)
        df.columns = ['x1', 'x2', 'x3', 'x4', 'y1', 'y2', 'y3', 'y4', 'text']
        df['folder_id'] = i
        df['file_id'] = file_name.split(".")[0]
        df.to_csv(file_path, index=False, header=False, mode='a')

ann_df = pd.read_csv(file_path)
print(ann_df.shape)
ann_df.head()

In [None]:
# Reorder the columns: identifiers and text first, then the corners as (x, y) pairs.
corner_cols = [f"{axis}{k}" for k in range(1, 5) for axis in ('x', 'y')]
cols = ['folder_id', 'file_id', 'text'] + corner_cols
ann_df = ann_df.loc[:, cols]
ann_df.to_csv("Data/Annotations-Tamil.csv", index=False)

## 2.2.2 Filter improper data points

In [None]:
# Load the merged annotations and the character table.
ann_df = pd.read_csv("Data/Annotations-Tamil.csv")
characters_df = pd.read_csv("Data/Characters/Characters-Tamil.csv")

# Allowed symbols: every distinct glyph and consonant, plus the Character
# value of each row whose Consonant column is "-".
glyphs = characters_df['Glyph'].unique().tolist()
consonants = characters_df['Consonant'].unique().tolist()
vowels = characters_df.loc[characters_df['Consonant'] == "-", 'Character'].tolist()
characters = [*glyphs, *consonants, *vowels]
print(len(characters))

In [None]:
def correct_text(text):
    """Return `text` with every symbol that is not in the global `characters` removed."""
    kept = []
    for symbol in text:
        if symbol in characters:
            kept.append(symbol)
    return "".join(kept)

In [None]:
# Strip unsupported symbols and record how many were dropped from each row.
corrected = ann_df['text'].progress_apply(correct_text)
ann_df['corrected_text'] = corrected
ann_df['num_removals'] = ann_df['text'].str.len() - corrected.str.len()

# Relative image path "<folder_id>/<file_id>.jpg".
folder_part = ann_df['folder_id'].astype('str')
file_part = ann_df['file_id'].astype('str')
ann_df['image_id'] = folder_part + "/" + file_part + ".jpg"

# Negative coordinates are raised to 0.
coords_cols = ['x1', 'x2', 'x3', 'x4', 'y1', 'y2', 'y3', 'y4']
ann_df[coords_cols] = ann_df[coords_cols].clip(lower=0)
ann_df.head()

In [None]:
# Number of rows that lost at least one character, next to the total row count.
print((ann_df['num_removals']>0).sum(), len(ann_df))

In [None]:
# Keep only rows whose text needed no correction, then drop the helper columns.
print(len(ann_df))
is_clean = ann_df['num_removals'] == 0
ann_df = ann_df.loc[is_clean].drop(columns=['corrected_text', 'num_removals'])
print(len(ann_df))

## 2.2.3 Split dataset

In [None]:
# Remaining row count per source folder.
ann_df.groupby('folder_id').size()

In [None]:
# Training split: rows with folder_id <= 28, sampled down to 500000 with a fixed seed.
train = ann_df.loc[ann_df['folder_id'].le(28)]
print(train.shape)
train = train.sample(random_state=0, n=500000)
print(train.shape)

In [None]:
# Validation split: rows from folder 29, sampled down to 5000 with a fixed seed.
val = ann_df.loc[ann_df['folder_id'].eq(29)]
print(val.shape)
val = val.sample(random_state=0, n=5000)
print(val.shape)

In [None]:
# Test split: rows from folder 30, sampled down to 5000 with a fixed seed.
test = ann_df.loc[ann_df['folder_id'].eq(30)]
print(test.shape)
test = test.sample(random_state=0, n=5000)
print(test.shape)

In [None]:
# Output directories for the three splits; each is wiped and recreated empty.
train_dir = "Data/Tamil/Train"
val_dir = "Data/Tamil/Val"
test_dir = "Data/Tamil/Test"

for split_dir in (train_dir, val_dir, test_dir):
    if os.path.exists(split_dir):
        shutil.rmtree(split_dir)
    os.makedirs(split_dir)

In [None]:
def save_crops(data, data_dir, method=1, image_dir="Data/Synthetic-Tamil/Image/"):
    """
    Crop every annotated word out of its source image and save it as a JPEG.

    Rows are grouped by `image_id` so each image is read once. Every crop is
    resized to 200x50 (width x height) and written to
    `{data_dir}/{text}_{folder_id}_{file_id}_{i}.jpg`, where `i` is the
    position of the row inside its image group.

    Unreadable images, empty crops and failed writes are reported with the
    image id and row index and skipped, instead of aborting the whole run.

    Parameters
    ----------
    data: pandas.DataFrame
        Needs the columns image_id, folder_id, file_id, text and x1, y1 ... x4, y4.
    data_dir: str
        Existing directory that receives the crops.
    method: int
        1 - bounding rect
        2 - min area rect (not implemented)
    image_dir: str
        Directory that `image_id` is relative to.

    Raises
    ------
    NotImplementedError
        If `method` is anything other than 1.
    """
    if method != 1:
        # Previously an unknown method left `crop` unbound (or stale from the
        # previous row); fail up front with a clear error instead.
        raise NotImplementedError(f"crop method {method} is not implemented")

    coords_cols = ['x1', 'y1', 'x2', 'y2', 'x3', 'y3', 'x4', 'y4']
    crop_size = (200, 50)  # (width, height), the order cv2.resize expects
    for image_id, image_data in tqdm(data.groupby('image_id')):
        image = cv2.imread(os.path.join(image_dir, image_id))
        if image is None:
            # cv2.imread returns None for a missing or undecodable file.
            for row_id in image_data.index:
                print(f"skipped {image_id} {row_id}: image could not be read")
            continue

        for i, (row_id, row) in enumerate(image_data.iterrows()):
            coords = row[coords_cols].values
            coords = coords.reshape(4, 1, 2).astype(np.float32)

            x, y, w, h = cv2.boundingRect(coords)
            # Clamp the start so a negative origin cannot wrap around via
            # Python's negative indexing.
            crop = image[max(y, 0):y+h, max(x, 0):x+w]
            if crop.size == 0:
                # Box lies outside the image; cv2.resize rejects empty input.
                print(f"skipped {image_id} {row_id}: empty crop")
                continue

            crop = cv2.resize(crop, crop_size)
            crop_path = f"{data_dir}/{row['text']}_{row['folder_id']}_{row['file_id']}_{i}.jpg"
            if not cv2.imwrite(crop_path, crop):
                print(f"skipped {image_id} {row_id}: could not write {crop_path}")

In [None]:
# Write the training crops into train_dir.
save_crops(train, train_dir)

In [None]:
# Write the validation crops into val_dir.
save_crops(val, val_dir)

In [None]:
# Write the test crops into test_dir.
save_crops(test, test_dir)

# 2.3 Malayalam

Dataset Source: https://drive.google.com/drive/u/0/folders/1hnNxuHbBBZrrI7Ee6FePTsUfW97qrJAS
(Malayalam 1-30)

## 2.3.1 Merge Annotations 

In [None]:
# Per-folder annotation directory template; "{}" is filled in with the folder number.
ann_dir = "Data/Synthetic-Malayalam/Annotation/{}"

In [None]:
# Merge every per-image annotation file into one CSV by appending to it on
# disk, then load the merged file back.
file_path = "Data/Annotations/Annotations-Malayalam.csv"
cols = ['x1', 'x2', 'x3', 'x4', 'y1', 'y2', 'y3', 'y4', 'text', 'folder_id', 'file_id']
os.makedirs(os.path.dirname(file_path), exist_ok=True)
# Start the file with the header row only; the frames below are appended without one.
ann_df = pd.DataFrame(columns=cols)
ann_df.to_csv(file_path, index=False)
for i in tqdm(range(1, 30+1)):
    folder = ann_dir.format(i)
    # os.listdir returns names in arbitrary order; sorting keeps the merged row
    # order (and therefore the seeded sampling done later) reproducible.
    for file_name in tqdm(sorted(os.listdir(folder)), leave=False):
        df = pd.read_csv(os.path.join(folder, file_name), sep=' ', header=None)
        df.columns = ['x1', 'x2', 'x3', 'x4', 'y1', 'y2', 'y3', 'y4', 'text']
        df['folder_id'] = i
        df['file_id'] = file_name.split(".")[0]
        df.to_csv(file_path, index=False, header=False, mode='a')

ann_df = pd.read_csv(file_path)
print(ann_df.shape)
ann_df.head()

In [None]:
# Put the identifying columns first and interleave the corners as (x, y) pairs.
cols = ['folder_id', 'file_id', 'text','x1', 'y1', 'x2', 'y2', 'x3', 'y3', 'x4', 'y4']
ann_df = ann_df[cols]
# Overwrite the merged file itself: the filtering step reads
# Data/Annotations/Annotations-Malayalam.csv, so saving to
# Data/Annotations-Malayalam.csv left the reordered copy unused.
ann_df.to_csv("Data/Annotations/Annotations-Malayalam.csv", index=False)

## 2.3.2 Filter improper data points

In [None]:
# Load the merged annotations and the character table.
ann_df = pd.read_csv("Data/Annotations/Annotations-Malayalam.csv")
characters_df = pd.read_csv("Data/Characters/Characters-Malayalam.csv")

# Allowed symbols: every distinct glyph and consonant, plus the Character
# value of each row whose Consonant column is "-".
glyphs = characters_df['Glyph'].unique().tolist()
consonants = characters_df['Consonant'].unique().tolist()
vowels = characters_df.loc[characters_df['Consonant'] == "-", 'Character'].tolist()
characters = [*glyphs, *consonants, *vowels]
print(len(characters))

In [None]:
def correct_text(text):
    """Return `text` with every symbol that is not in the global `characters` removed."""
    kept = []
    for symbol in text:
        if symbol in characters:
            kept.append(symbol)
    return "".join(kept)

In [None]:
# Strip unsupported symbols and record how many were dropped from each row.
corrected = ann_df['text'].progress_apply(correct_text)
ann_df['corrected_text'] = corrected
ann_df['num_removals'] = ann_df['text'].str.len() - corrected.str.len()

# Relative image path "<folder_id>/<file_id>.jpg".
folder_part = ann_df['folder_id'].astype('str')
file_part = ann_df['file_id'].astype('str')
ann_df['image_id'] = folder_part + "/" + file_part + ".jpg"

# Negative coordinates are raised to 0.
coords_cols = ['x1', 'x2', 'x3', 'x4', 'y1', 'y2', 'y3', 'y4']
ann_df[coords_cols] = ann_df[coords_cols].clip(lower=0)
ann_df.head()

In [None]:
# Number of rows that lost at least one character, next to the total row count.
print((ann_df['num_removals']>0).sum(), len(ann_df))

In [None]:
# Preview rows where characters were removed: original text, corrected text and image.
ann_df[ann_df['num_removals']>0][['text', 'corrected_text', 'image_id']].head()

In [None]:
# Keep only rows whose text needed no correction, then drop the helper columns.
print(len(ann_df))
is_clean = ann_df['num_removals'] == 0
ann_df = ann_df.loc[is_clean].drop(columns=['corrected_text', 'num_removals'])
print(len(ann_df))

## 2.3.3 Split dataset

In [None]:
# Remaining row count per source folder.
ann_df.groupby('folder_id').size()

In [None]:
# NOTE(review): only folders 1-22 are used here although 30 folders are merged
# for this language — confirm that leaving out 23-30 is intended.

# Training split: rows with folder_id <= 20, sampled down to 400000 with a fixed seed.
train = ann_df.loc[ann_df['folder_id'].le(20)]
print(train.shape)
train = train.sample(random_state=0, n=400000)
print(train.shape)

# Validation split: rows from folder 21, sampled down to 5000.
val = ann_df.loc[ann_df['folder_id'].eq(21)]
print(val.shape)
val = val.sample(random_state=0, n=5000)
print(val.shape)

# Test split: rows from folder 22, sampled down to 5000.
test = ann_df.loc[ann_df['folder_id'].eq(22)]
print(test.shape)
test = test.sample(random_state=0, n=5000)
print(test.shape)

In [None]:
# Output directories for the three splits; each is wiped and recreated empty.
train_dir = "Data/Malayalam/Train"
val_dir = "Data/Malayalam/Val"
test_dir = "Data/Malayalam/Test"

for split_dir in (train_dir, val_dir, test_dir):
    if os.path.exists(split_dir):
        shutil.rmtree(split_dir)
    os.makedirs(split_dir)

In [None]:
def save_crops(data, data_dir, method=1, image_dir="Data/Synthetic-Malayalam/Image/"):
    """
    Crop every annotated word out of its source image and save it as a JPEG.

    Rows are grouped by `image_id` so each image is read once. Every crop is
    resized to 200x50 (width x height) and written to
    `{data_dir}/{text}_{folder_id}_{file_id}_{i}.jpg`, where `i` is the
    position of the row inside its image group.

    Unreadable images, empty crops and failed writes are reported with the
    image id and row index and skipped, instead of aborting the whole run.

    Parameters
    ----------
    data: pandas.DataFrame
        Needs the columns image_id, folder_id, file_id, text and x1, y1 ... x4, y4.
    data_dir: str
        Existing directory that receives the crops.
    method: int
        1 - bounding rect
        2 - min area rect (not implemented)
    image_dir: str
        Directory that `image_id` is relative to.

    Raises
    ------
    NotImplementedError
        If `method` is anything other than 1.
    """
    if method != 1:
        # Previously an unknown method left `crop` unbound (or stale from the
        # previous row); fail up front with a clear error instead.
        raise NotImplementedError(f"crop method {method} is not implemented")

    coords_cols = ['x1', 'y1', 'x2', 'y2', 'x3', 'y3', 'x4', 'y4']
    crop_size = (200, 50)  # (width, height), the order cv2.resize expects
    for image_id, image_data in tqdm(data.groupby('image_id')):
        image = cv2.imread(os.path.join(image_dir, image_id))
        if image is None:
            # cv2.imread returns None for a missing or undecodable file.
            for row_id in image_data.index:
                print(f"skipped {image_id} {row_id}: image could not be read")
            continue

        for i, (row_id, row) in enumerate(image_data.iterrows()):
            coords = row[coords_cols].values
            coords = coords.reshape(4, 1, 2).astype(np.float32)

            x, y, w, h = cv2.boundingRect(coords)
            # Clamp the start so a negative origin cannot wrap around via
            # Python's negative indexing.
            crop = image[max(y, 0):y+h, max(x, 0):x+w]
            if crop.size == 0:
                # Box lies outside the image; cv2.resize rejects empty input.
                print(f"skipped {image_id} {row_id}: empty crop")
                continue

            crop = cv2.resize(crop, crop_size)
            crop_path = f"{data_dir}/{row['text']}_{row['folder_id']}_{row['file_id']}_{i}.jpg"
            if not cv2.imwrite(crop_path, crop):
                print(f"skipped {image_id} {row_id}: could not write {crop_path}")

In [None]:
# Write the crops for each split into its own directory.
save_crops(train, train_dir)
save_crops(val, val_dir)
save_crops(test, test_dir)

# 2.4 Telugu

Dataset Source: https://drive.google.com/drive/u/0/folders/1hnNxuHbBBZrrI7Ee6FePTsUfW97qrJAS
(Telugu 1-30)

## 2.4.1 Merge Annotations 

In [None]:
# Per-folder annotation directory template; "{}" is filled in with the folder number.
ann_dir = "Data/Synthetic-Telugu/Annotation/{}"

In [None]:
# Merge every per-image annotation file into one CSV by appending to it on
# disk, then load the merged file back.
file_path = "Data/Annotations/Annotations-Telugu.csv"
cols = ['x1', 'x2', 'x3', 'x4', 'y1', 'y2', 'y3', 'y4', 'text', 'folder_id', 'file_id']
os.makedirs(os.path.dirname(file_path), exist_ok=True)
# Start the file with the header row only; the frames below are appended without one.
ann_df = pd.DataFrame(columns=cols)
ann_df.to_csv(file_path, index=False)
for i in tqdm(range(1, 30+1)):
    folder = ann_dir.format(i)
    # os.listdir returns names in arbitrary order; sorting keeps the merged row
    # order (and therefore the seeded sampling done later) reproducible.
    for file_name in tqdm(sorted(os.listdir(folder)), leave=False):
        df = pd.read_csv(os.path.join(folder, file_name), sep=' ', header=None)
        df.columns = ['x1', 'x2', 'x3', 'x4', 'y1', 'y2', 'y3', 'y4', 'text']
        df['folder_id'] = i
        df['file_id'] = file_name.split(".")[0]
        df.to_csv(file_path, index=False, header=False, mode='a')

ann_df = pd.read_csv(file_path)
print(ann_df.shape)
ann_df.head()

In [None]:
# Put the identifying columns first and interleave the corners as (x, y) pairs.
cols = ['folder_id', 'file_id', 'text','x1', 'y1', 'x2', 'y2', 'x3', 'y3', 'x4', 'y4']
ann_df = ann_df[cols]
# Overwrite the merged file itself: the filtering step reads
# Data/Annotations/Annotations-Telugu.csv, so saving to
# Data/Annotations-Telugu.csv left the reordered copy unused.
ann_df.to_csv("Data/Annotations/Annotations-Telugu.csv", index=False)

## 2.4.2 Filter improper data points

In [None]:
# Load the merged annotations and the character table.
ann_df = pd.read_csv("Data/Annotations/Annotations-Telugu.csv")
characters_df = pd.read_csv("Data/Characters/Characters-Telugu.csv")

# Allowed symbols: every distinct glyph and consonant, plus the Character
# value of each row whose Consonant column is "-".
glyphs = characters_df['Glyph'].unique().tolist()
consonants = characters_df['Consonant'].unique().tolist()
vowels = characters_df.loc[characters_df['Consonant'] == "-", 'Character'].tolist()
characters = [*glyphs, *consonants, *vowels]
print(len(characters))

In [None]:
def correct_text(text):
    """Return `text` with every symbol that is not in the global `characters` removed."""
    kept = []
    for symbol in text:
        if symbol in characters:
            kept.append(symbol)
    return "".join(kept)

In [None]:
# Strip unsupported symbols and record how many were dropped from each row.
corrected = ann_df['text'].progress_apply(correct_text)
ann_df['corrected_text'] = corrected
ann_df['num_removals'] = ann_df['text'].str.len() - corrected.str.len()

# Relative image path "<folder_id>/<file_id>.jpg".
folder_part = ann_df['folder_id'].astype('str')
file_part = ann_df['file_id'].astype('str')
ann_df['image_id'] = folder_part + "/" + file_part + ".jpg"

# Negative coordinates are raised to 0.
coords_cols = ['x1', 'x2', 'x3', 'x4', 'y1', 'y2', 'y3', 'y4']
ann_df[coords_cols] = ann_df[coords_cols].clip(lower=0)
ann_df.head()

In [None]:
# Number of rows that lost at least one character, next to the total row count.
print((ann_df['num_removals']>0).sum(), len(ann_df))

In [None]:
# Keep only rows whose text needed no correction, then drop the helper columns.
print(len(ann_df))
is_clean = ann_df['num_removals'] == 0
ann_df = ann_df.loc[is_clean].drop(columns=['corrected_text', 'num_removals'])
print(len(ann_df))

## 2.4.3 Split dataset

In [None]:
# Remaining row count per source folder.
ann_df.groupby('folder_id').size()

In [None]:
# Training split: rows with folder_id <= 28, sampled down to 500000 with a fixed seed.
train = ann_df.loc[ann_df['folder_id'].le(28)]
print(train.shape)
train = train.sample(random_state=0, n=500000)
print(train.shape)

# Validation split: rows from folder 29, sampled down to 5000.
val = ann_df.loc[ann_df['folder_id'].eq(29)]
print(val.shape)
val = val.sample(random_state=0, n=5000)
print(val.shape)

# Test split: rows from folder 30, sampled down to 5000.
test = ann_df.loc[ann_df['folder_id'].eq(30)]
print(test.shape)
test = test.sample(random_state=0, n=5000)
print(test.shape)

In [None]:
# Output directories for the three splits; each is wiped and recreated empty.
train_dir = "Data/Telugu/Train"
val_dir = "Data/Telugu/Val"
test_dir = "Data/Telugu/Test"

for split_dir in (train_dir, val_dir, test_dir):
    if os.path.exists(split_dir):
        shutil.rmtree(split_dir)
    os.makedirs(split_dir)

In [None]:
def save_crops(data, data_dir, method=1, image_dir="Data/Synthetic-Telugu/Image/"):
    """
    Crop every annotated word out of its source image and save it as a JPEG.

    Rows are grouped by `image_id` so each image is read once. Every crop is
    resized to 200x50 (width x height) and written to
    `{data_dir}/{text}_{folder_id}_{file_id}_{i}.jpg`, where `i` is the
    position of the row inside its image group.

    Unreadable images, empty crops and failed writes are reported with the
    image id and row index and skipped, instead of aborting the whole run.

    Parameters
    ----------
    data: pandas.DataFrame
        Needs the columns image_id, folder_id, file_id, text and x1, y1 ... x4, y4.
    data_dir: str
        Existing directory that receives the crops.
    method: int
        1 - bounding rect
        2 - min area rect (not implemented)
    image_dir: str
        Directory that `image_id` is relative to.

    Raises
    ------
    NotImplementedError
        If `method` is anything other than 1.
    """
    if method != 1:
        # Previously an unknown method left `crop` unbound (or stale from the
        # previous row); fail up front with a clear error instead.
        raise NotImplementedError(f"crop method {method} is not implemented")

    coords_cols = ['x1', 'y1', 'x2', 'y2', 'x3', 'y3', 'x4', 'y4']
    crop_size = (200, 50)  # (width, height), the order cv2.resize expects
    for image_id, image_data in tqdm(data.groupby('image_id')):
        image = cv2.imread(os.path.join(image_dir, image_id))
        if image is None:
            # cv2.imread returns None for a missing or undecodable file.
            for row_id in image_data.index:
                print(f"skipped {image_id} {row_id}: image could not be read")
            continue

        for i, (row_id, row) in enumerate(image_data.iterrows()):
            coords = row[coords_cols].values
            coords = coords.reshape(4, 1, 2).astype(np.float32)

            x, y, w, h = cv2.boundingRect(coords)
            # Clamp the start so a negative origin cannot wrap around via
            # Python's negative indexing.
            crop = image[max(y, 0):y+h, max(x, 0):x+w]
            if crop.size == 0:
                # Box lies outside the image; cv2.resize rejects empty input.
                print(f"skipped {image_id} {row_id}: empty crop")
                continue

            crop = cv2.resize(crop, crop_size)
            crop_path = f"{data_dir}/{row['text']}_{row['folder_id']}_{row['file_id']}_{i}.jpg"
            if not cv2.imwrite(crop_path, crop):
                print(f"skipped {image_id} {row_id}: could not write {crop_path}")

In [None]:
# Write the crops for each split into its own directory.
save_crops(train, train_dir)
save_crops(val, val_dir)
save_crops(test, test_dir)

# 2.5 Punjabi

Dataset Source: https://drive.google.com/drive/u/0/folders/1hnNxuHbBBZrrI7Ee6FePTsUfW97qrJAS
(Punjabi 1-30)

## 2.5.1 Merge Annotations 

In [None]:
# Per-folder annotation directory template; "{}" is filled in with the folder number.
ann_dir = "Data/Synthetic-Punjabi/Annotation/{}"

In [None]:
# Merge every per-image annotation file into one CSV by appending to it on
# disk, then load the merged file back.
file_path = "Data/Annotations/Annotations-Punjabi.csv"
cols = ['x1', 'x2', 'x3', 'x4', 'y1', 'y2', 'y3', 'y4', 'text', 'folder_id', 'file_id']
os.makedirs(os.path.dirname(file_path), exist_ok=True)
# Start the file with the header row only; the frames below are appended without one.
ann_df = pd.DataFrame(columns=cols)
ann_df.to_csv(file_path, index=False)
for i in tqdm(range(1, 30+1)):
    folder = ann_dir.format(i)
    # os.listdir returns names in arbitrary order; sorting keeps the merged row
    # order (and therefore the seeded sampling done later) reproducible.
    for file_name in tqdm(sorted(os.listdir(folder)), leave=False):
        df = pd.read_csv(os.path.join(folder, file_name), sep=' ', header=None)
        df.columns = ['x1', 'x2', 'x3', 'x4', 'y1', 'y2', 'y3', 'y4', 'text']
        df['folder_id'] = i
        df['file_id'] = file_name.split(".")[0]
        df.to_csv(file_path, index=False, header=False, mode='a')

ann_df = pd.read_csv(file_path)
print(ann_df.shape)
ann_df.head()

In [None]:
# Put the identifying columns first and interleave the corners as (x, y) pairs.
cols = ['folder_id', 'file_id', 'text','x1', 'y1', 'x2', 'y2', 'x3', 'y3', 'x4', 'y4']
ann_df = ann_df[cols]
# Overwrite the merged file itself: the filtering step reads
# Data/Annotations/Annotations-Punjabi.csv, so saving to
# Data/Annotations-Punjabi.csv left the reordered copy unused.
ann_df.to_csv("Data/Annotations/Annotations-Punjabi.csv", index=False)

## 2.5.2 Filter improper data points

In [None]:
# Load the merged annotations and the character table.
ann_df = pd.read_csv("Data/Annotations/Annotations-Punjabi.csv")
characters_df = pd.read_csv("Data/Characters/Characters-Punjabi.csv")

# Allowed symbols: every distinct glyph and consonant, plus the Character
# value of each row whose Consonant column is "-".
glyphs = characters_df['Glyph'].unique().tolist()
consonants = characters_df['Consonant'].unique().tolist()
vowels = characters_df.loc[characters_df['Consonant'] == "-", 'Character'].tolist()
characters = [*glyphs, *consonants, *vowels]
print(len(characters))

In [None]:
def correct_text(text):
    """Return `text` with every symbol that is not in the global `characters` removed."""
    kept = []
    for symbol in text:
        if symbol in characters:
            kept.append(symbol)
    return "".join(kept)

In [None]:
# Strip unsupported symbols and record how many were dropped from each row.
corrected = ann_df['text'].progress_apply(correct_text)
ann_df['corrected_text'] = corrected
ann_df['num_removals'] = ann_df['text'].str.len() - corrected.str.len()

# Relative image path "<folder_id>/<file_id>.jpg".
folder_part = ann_df['folder_id'].astype('str')
file_part = ann_df['file_id'].astype('str')
ann_df['image_id'] = folder_part + "/" + file_part + ".jpg"

# Negative coordinates are raised to 0.
coords_cols = ['x1', 'x2', 'x3', 'x4', 'y1', 'y2', 'y3', 'y4']
ann_df[coords_cols] = ann_df[coords_cols].clip(lower=0)
ann_df.head()

In [None]:
# Number of rows that lost at least one character, next to the total row count.
print((ann_df['num_removals']>0).sum(), len(ann_df))

In [None]:
# Keep only rows whose text needed no correction, then drop the helper columns.
print(len(ann_df))
is_clean = ann_df['num_removals'] == 0
ann_df = ann_df.loc[is_clean].drop(columns=['corrected_text', 'num_removals'])
print(len(ann_df))

## 2.5.3 Split dataset

In [None]:
# Remaining row count per source folder.
ann_df.groupby('folder_id').size()

In [None]:
# Training split: rows with folder_id <= 28, sampled down to 500000 with a fixed seed.
train = ann_df.loc[ann_df['folder_id'].le(28)]
print(train.shape)
train = train.sample(random_state=0, n=500000)
print(train.shape)

# Validation split: rows from folder 29, sampled down to 5000.
val = ann_df.loc[ann_df['folder_id'].eq(29)]
print(val.shape)
val = val.sample(random_state=0, n=5000)
print(val.shape)

# Test split: rows from folder 30, sampled down to 5000.
test = ann_df.loc[ann_df['folder_id'].eq(30)]
print(test.shape)
test = test.sample(random_state=0, n=5000)
print(test.shape)

In [None]:
# Output directories for the three splits; each is wiped and recreated empty.
train_dir = "Data/Punjabi/Train"
val_dir = "Data/Punjabi/Val"
test_dir = "Data/Punjabi/Test"

for split_dir in (train_dir, val_dir, test_dir):
    if os.path.exists(split_dir):
        shutil.rmtree(split_dir)
    os.makedirs(split_dir)

In [None]:
def save_crops(data, data_dir, method=1, image_dir="Data/Synthetic-Punjabi/Image/"):
    """
    Crop every annotated word out of its source image and save it as a JPEG.

    Rows are grouped by `image_id` so each image is read once. Every crop is
    resized to 200x50 (width x height) and written to
    `{data_dir}/{text}_{folder_id}_{file_id}_{i}.jpg`, where `i` is the
    position of the row inside its image group.

    Unreadable images, empty crops and failed writes are reported with the
    image id and row index and skipped, instead of aborting the whole run.

    Parameters
    ----------
    data: pandas.DataFrame
        Needs the columns image_id, folder_id, file_id, text and x1, y1 ... x4, y4.
    data_dir: str
        Existing directory that receives the crops.
    method: int
        1 - bounding rect
        2 - min area rect (not implemented)
    image_dir: str
        Directory that `image_id` is relative to.

    Raises
    ------
    NotImplementedError
        If `method` is anything other than 1.
    """
    if method != 1:
        # Previously an unknown method left `crop` unbound (or stale from the
        # previous row); fail up front with a clear error instead.
        raise NotImplementedError(f"crop method {method} is not implemented")

    coords_cols = ['x1', 'y1', 'x2', 'y2', 'x3', 'y3', 'x4', 'y4']
    crop_size = (200, 50)  # (width, height), the order cv2.resize expects
    for image_id, image_data in tqdm(data.groupby('image_id')):
        image = cv2.imread(os.path.join(image_dir, image_id))
        if image is None:
            # cv2.imread returns None for a missing or undecodable file.
            for row_id in image_data.index:
                print(f"skipped {image_id} {row_id}: image could not be read")
            continue

        for i, (row_id, row) in enumerate(image_data.iterrows()):
            coords = row[coords_cols].values
            coords = coords.reshape(4, 1, 2).astype(np.float32)

            x, y, w, h = cv2.boundingRect(coords)
            # Clamp the start so a negative origin cannot wrap around via
            # Python's negative indexing.
            crop = image[max(y, 0):y+h, max(x, 0):x+w]
            if crop.size == 0:
                # Box lies outside the image; cv2.resize rejects empty input.
                print(f"skipped {image_id} {row_id}: empty crop")
                continue

            crop = cv2.resize(crop, crop_size)
            crop_path = f"{data_dir}/{row['text']}_{row['folder_id']}_{row['file_id']}_{i}.jpg"
            if not cv2.imwrite(crop_path, crop):
                print(f"skipped {image_id} {row_id}: could not write {crop_path}")

In [None]:
# Write the crops for each split into its own directory.
save_crops(train, train_dir)
save_crops(val, val_dir)
save_crops(test, test_dir)

In [None]:
# Log kept for reference: each line is an image path followed by a number.
# NOTE(review): presumably "<image_id> <row index>" pairs that failed during
# crop generation; the cell does not say for which language — confirm.
"""
Error: 
10/915.jpg 213646
14/2061.jpg 279943
21/309.jpg 422288
21/309.jpg 422290
21/309.jpg 422289
22/3198.jpg 440716
22/3198.jpg 440715
22/3198.jpg 440714
24/2489.jpg 478702
8/2646.jpg 156305
8/2646.jpg 156304
"""