### Basic Imports

In [None]:
import pandas as pd
import re
import warnings
import torch
import logging
from io import BytesIO
from concurrent.futures import ThreadPoolExecutor, as_completed
import numpy as np
from tqdm import tqdm

#### Pre-Process Training Dataset

Load the CSV file

In [None]:
# Load the raw training table; later cells read its 'entity_value' column.
df = pd.read_csv('/dataset/train.csv')

Check if the entity_value matches the desired pattern

In [None]:
def is_valid_entity_value(value):
    """Return True when ``value`` reads as "<number> <unit>".

    The argument is converted with ``str`` first, so non-string inputs are
    judged by their string form. Accepted form: digits with an optional
    decimal part, at least one whitespace character, then a unit made of
    ASCII letters only.
    """
    text = str(value)
    matched = re.match(r'^\d+(\.\d+)?\s+[a-zA-Z]+$', text)
    return matched is not None

Filter the DataFrame

# Boolean mask: True for rows whose entity_value passes is_valid_entity_value.
valid_rows = df['entity_value'].apply(is_valid_entity_value)
df_filtered = df[valid_rows]

Save the filtered rows back to the CSV file (this overwrites the original `train.csv`)

In [None]:
# NOTE: the destination is the same path the raw data was read from, so the
# unfiltered train.csv is replaced by the filtered rows.
df_filtered.to_csv('/dataset/train.csv', index=False)

# Easy-OCR Feature Extraction


Imports

In [None]:
import cv2
import easyocr
from PIL import Image
import requests

 Setup logging

In [None]:
# Suppress FutureWarning messages.
warnings.filterwarnings("ignore", category=FutureWarning)

# Send INFO-and-above records to a log file as "timestamp - level - message".
logging.basicConfig(
    filename='image_processing.log',
    level=logging.INFO,
    format='%(asctime)s - %(levelname)s - %(message)s',
)
logging.info('Starting image processing.')

 Check if GPU is available for EasyOCR

In [None]:
# Record whether PyTorch can see a CUDA device and log which one OCR will use.
use_gpu = torch.cuda.is_available()
logging.info(
    "CUDA is available. Using GPU for OCR."
    if use_gpu
    else "CUDA is not available. Using CPU for OCR."
)

Ensure EasyOCR model is downloaded

In [None]:
# Build the English-language EasyOCR reader used by process_image; `gpu`
# follows the result of the CUDA availability check.
reader = easyocr.Reader(['en'], gpu=use_gpu)

Function to load image from URL

In [None]:
def load_image_from_url(url, timeout=30):
    """Download an image and return it as a BGR NumPy array for OpenCV.

    Args:
        url: HTTP(S) address of the image.
        timeout: Seconds to wait for the server before giving up, so a stalled
            connection cannot block a worker thread indefinitely.

    Returns:
        The decoded image as a 3-channel array in BGR channel order.

    Raises:
        requests.RequestException: On connection errors, timeouts, or a
            4xx/5xx HTTP status.
        PIL.UnidentifiedImageError: If the response body is not a readable image.
    """
    response = requests.get(url, timeout=timeout)
    # Fail on HTTP errors instead of trying to decode an error page as an image.
    response.raise_for_status()
    with Image.open(BytesIO(response.content)) as img:
        # Grayscale and palette images decode to 2-D arrays, which the
        # RGB->BGR conversion rejects; normalise every mode to RGB first.
        rgb = np.array(img.convert('RGB'))
    return cv2.cvtColor(rgb, cv2.COLOR_RGB2BGR)


Function to process a single image and extract text

In [None]:
def process_image(url):
    """Run OCR on the image at ``url`` and return the recognised text.

    The image is fetched with ``load_image_from_url``, resized to 600x600 and
    passed to the module-level EasyOCR ``reader``. The text of every detection
    is joined with single spaces. Any exception is logged and results in an
    empty string instead of propagating.
    """
    try:
        frame = load_image_from_url(url)
        if frame is None:
            logging.warning(f"Failed to load image: {url}")
            return ""

        # Resize to a fixed 600x600 for faster processing.
        frame = cv2.resize(frame, (600, 600), interpolation=cv2.INTER_AREA)

        # Each detection carries its text at index 1.
        detections = reader.readtext(frame)
        fragments = [detection[1] for detection in detections]
        recognized_text = ' '.join(fragments)

        logging.info(f"Processed image: {url} - Extracted text: {recognized_text}")
        return recognized_text

    except Exception as e:
        logging.error(f"Error processing image {url}: {str(e)}")
        return ""

Load Training CSV file

In [None]:
input_csv = '/dataset/train.csv'  # training split read for OCR
logging.info(f"Loading CSV file: {input_csv}")
# Rebinds `df` to the contents of input_csv.
df = pd.read_csv(input_csv)

Prepare image URLs

In [None]:
# One URL per row, in DataFrame row order.
image_urls = df['image_link'].tolist()

Process images in parallel with progress tracking

In [None]:
# OCR every image concurrently. Futures finish in arbitrary order, so each one
# is mapped back to the row position it was submitted for and its text is
# written into that slot, keeping `outputs` aligned with `image_urls`.
outputs = [""] * len(image_urls)
with ThreadPoolExecutor() as executor:
    # process_image takes only the URL and returns the text; the row index is
    # recovered from this dict rather than being passed through the worker.
    futures = {executor.submit(process_image, url): idx for idx, url in enumerate(image_urls)}
    for future in tqdm(as_completed(futures), total=len(futures), desc="Processing images"):
        outputs[futures[future]] = future.result()

Add the output to DataFrame

In [None]:
# Attach the OCR text as a new 'output' column; `outputs` has one entry per image URL.
df['output'] = outputs

Save the updated DataFrame to a new CSV file using a tilde (`~`) as the delimiter (commas inside the OCR text can interfere with comma-separated files)

In [None]:
output_csv = '/dataset/Train_output.csv'  # Replace with the desired output path
logging.info(f"Saving updated CSV file: {output_csv}")
# '~' is the field separator for this file; it must be read back with the same delimiter.
df.to_csv(output_csv, index=False, sep='~')
logging.info("Process completed successfully.")

#### Now Run OCR for Test dataset

Load Test CSV file

In [None]:
input_csv = '/dataset/test.csv'  # test split read for OCR
logging.info(f"Loading CSV file: {input_csv}")
# Rebinds `df` to the contents of input_csv.
df = pd.read_csv(input_csv)

Prepare image URLs

In [None]:
# One URL per row, in DataFrame row order.
image_urls = df['image_link'].tolist()

Process images in parallel with progress tracking

In [None]:
# OCR every image concurrently. Futures finish in arbitrary order, so each one
# is mapped back to the row position it was submitted for and its text is
# written into that slot, keeping `outputs` aligned with `image_urls`.
outputs = [""] * len(image_urls)
with ThreadPoolExecutor() as executor:
    # process_image takes only the URL and returns the text; the row index is
    # recovered from this dict rather than being passed through the worker.
    futures = {executor.submit(process_image, url): idx for idx, url in enumerate(image_urls)}
    for future in tqdm(as_completed(futures), total=len(futures), desc="Processing images"):
        outputs[futures[future]] = future.result()

Add the output to DataFrame

In [None]:
# Attach the OCR text as a new 'output' column; `outputs` has one entry per image URL.
df['output'] = outputs

Save the updated DataFrame to a new CSV file using a tilde (`~`) as the delimiter (commas inside the OCR text can interfere with comma-separated files)

In [None]:
output_csv = '/dataset/Test_output.csv'  # Replace with the desired output path
logging.info(f"Saving updated CSV file: {output_csv}")
# '~' is the field separator for this file; it must be read back with the same delimiter.
df.to_csv(output_csv, index=False, sep='~')
logging.info("Process completed successfully.")

# We now have Train_output.csv with OCR data that we will use to train our model on.

Imports

In [None]:
from transformers import T5Tokenizer, T5ForConditionalGeneration, Trainer, TrainingArguments, TrainerCallback

#### Unit conversion dictionary (Expansion of Constants.py)

In [None]:
# Maps unit spellings and abbreviations to the canonical unit name written to
# the predictions. Lookups are exact and case-sensitive (e.g. 'kV', 'mL').
unit_conversions = {
    # Length
    'cm': 'centimetre', 'centimeter': 'centimetre',
    'ft': 'foot', 'foot': 'foot', "'": 'foot', 'feet': 'foot',
    'in': 'inch', 'inch': 'inch', '"': 'inch',
    'm': 'metre', 'meter': 'metre',
    'mm': 'millimetre', 'millimeter': 'millimetre',
    'yd': 'yard', 'yard': 'yard',
    # Weight
    'g': 'gram', 'gram': 'gram',
    'kg': 'kilogram', 'kilogram': 'kilogram',
    'µg': 'microgram', 'mcg': 'microgram', 'microgram': 'microgram',
    'mg': 'milligram', 'milligram': 'milligram',
    'oz': 'ounce', 'ounce': 'ounce',
    'lb': 'pound', 'lbs': 'pound', 'pound': 'pound',
    't': 'ton', 'ton': 'ton', 'short ton': 'ton', 'long ton': 'ton',
    # Voltage
    'kV': 'kilovolt', 'kilovolt': 'kilovolt',
    'mV': 'millivolt', 'millivolt': 'millivolt',
    'V': 'volt', 'volt': 'volt',
    # Wattage
    'kW': 'kilowatt', 'kilowatt': 'kilowatt',
    'W': 'watt', 'watt': 'watt',
    # Volume
    'cL': 'centilitre', 'centiliter': 'centilitre',
    'ft³': 'cubic foot', 'cu ft': 'cubic foot', 'cubic foot': 'cubic foot',
    'in³': 'cubic inch', 'cu in': 'cubic inch', 'cubic inch': 'cubic inch',
    'cup': 'cup',
    'dL': 'decilitre', 'deciliter': 'decilitre',
    'fl oz': 'fluid ounce', 'oz fl': 'fluid ounce', 'fluid ounce': 'fluid ounce',
    'gal': 'gallon', 'gallon': 'gallon',
    # 'imperial gallon' used to appear twice with different values; only the
    # last one took effect, so that single mapping is kept.
    'imp gal': 'imperial gallon', 'imperial gallon': 'imperial gallon',
    'L': 'litre', 'liter': 'litre', 'litre': 'litre',
    'µL': 'microlitre', 'mcL': 'microlitre', 'microliter': 'microlitre',
    'mL': 'millilitre', 'ml': 'millilitre', 'milliliter': 'millilitre',
    'pint': 'pint',
    'qt': 'quart', 'quart': 'quart',
    # NOTE(review): 'ter' -> 'tre' is not a unit; it looks like a leftover from a
    # "-ter" to "-tre" suffix rewrite. Kept to preserve behaviour — confirm and remove.
    'ter': 'tre',
}

Load data from CSV

In [None]:
# Load the OCR-augmented training table. It was written to
# '/dataset/Train_output.csv' with '~' as the separator, so the same path and
# delimiter are used here.
data = pd.read_csv('/dataset/Train_output.csv', delimiter='~')

Prepare input and output texts

In [None]:
# Model input: a fixed task prefix, the entity name, then the OCR text.
input_texts = [
    f"extract entity_value: {entity_name} {ocr_text}"
    for entity_name, ocr_text in zip(data['entity_name'], data['output'])
]
# Model target: the entity value for the same row.
output_texts = data['entity_value'].tolist()

Load the T5-small model and tokenizer

In [None]:
# Load the tokenizer and the sequence-to-sequence model from the 't5-small' checkpoint.
tokenizer = T5Tokenizer.from_pretrained('t5-small')
model = T5ForConditionalGeneration.from_pretrained('t5-small')

Tokenize data

In [None]:
# Tokenize inputs and targets, padding to the longest sequence and truncating at 128 tokens.
train_encodings = tokenizer(input_texts, truncation=True, padding=True, max_length=128)  # Consider reducing max_length if possible
labels = tokenizer(output_texts, truncation=True, padding=True, max_length=128)

# Padding positions in the targets must not contribute to the training loss:
# replace the pad token id with -100, the label index the model's loss ignores.
labels['input_ids'] = [
    [token_id if token_id != tokenizer.pad_token_id else -100 for token_id in sequence]
    for sequence in labels['input_ids']
]

Create Dataset class

In [None]:
class Seq2SeqDataset(torch.utils.data.Dataset):
    """Map-style dataset pairing tokenized inputs with tokenized targets."""

    def __init__(self, encodings, labels):
        # `encodings` maps field name -> per-example sequences;
        # `labels` must provide the target ids under 'input_ids'.
        self.encodings = encodings
        self.labels = labels

    def __len__(self):
        # Number of examples, taken from the target token ids.
        return len(self.labels['input_ids'])

    def __getitem__(self, idx):
        """Return example ``idx`` as a dict of tensors, with targets under 'labels'."""
        example = {}
        for field, values in self.encodings.items():
            example[field] = torch.tensor(values[idx])
        example['labels'] = torch.tensor(self.labels['input_ids'][idx])
        return example


Custom callback for status tracking

In [None]:
class StatusCallback(TrainerCallback):
    """Trainer callback that prints step, loss and learning rate on each log event."""

    def on_log(self, args, state, control, logs=None, **kwargs):
        # `logs` may be None or empty; missing metrics are reported as 'N/A'.
        metrics = logs if logs else {}
        loss = metrics.get('loss', 'N/A')
        learning_rate = metrics.get('learning_rate', 'N/A')
        print(f"Step: {state.global_step}, Loss: {loss}, Learning Rate: {learning_rate}")


Set up training arguments and trainer

In [None]:
# The Trainer needs a Dataset object; build it from the tokenized inputs and targets.
train_dataset = Seq2SeqDataset(train_encodings, labels)

training_args = TrainingArguments(
    output_dir='./results',
    num_train_epochs=3,
    per_device_train_batch_size=8,  # Reduced batch size to handle larger datasets and faster processing
    per_device_eval_batch_size=16,
    warmup_steps=500,
    weight_decay=0.01,
    logging_dir='./logs',
    logging_steps=50,
    fp16=torch.cuda.is_available(),  # mixed precision only when a CUDA device is present
    dataloader_num_workers=2,
    save_steps=1000,  # Saving model checkpoint every 1000 steps
)
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    callbacks=[StatusCallback()]  # Add the custom callback here
)

## Train the model

In [None]:
# Run fine-tuning with the arguments configured on `trainer`.
trainer.train()

Save the trained model

In [None]:
# Write the model and the tokenizer to the same directory so both can be
# reloaded from that one path.
model.save_pretrained('./fine_tuned_t5_model')
tokenizer.save_pretrained('./fine_tuned_t5_model')

# Saving Predictions to Output File

### Use Test_Output for Predictions

Load data from CSV

In [None]:
# Load the OCR-augmented test table. It was written to
# '/dataset/Test_output.csv' with '~' as the separator, so the same path and
# delimiter are used here.
data = pd.read_csv('/dataset/Test_output.csv', delimiter='~')

Load the fine-tuned model and tokenizer

In [None]:
# Directory the fine-tuned model and tokenizer were saved to.
model_path = './fine_tuned_t5_model'
tokenizer = T5Tokenizer.from_pretrained(model_path)
model = T5ForConditionalGeneration.from_pretrained(model_path)
# NOTE: the device is hard-coded, so this cell requires a CUDA-capable GPU.
model.to('cuda')  # Move model to GPU

Custom Dataset class for efficient batching

In [None]:
class EntityDataset(torch.utils.data.Dataset):
    """Map-style dataset that yields one model prompt string per DataFrame row.

    The base class is referenced through ``torch.utils.data`` because the bare
    name ``Dataset`` is never imported in this notebook.
    """

    def __init__(self, dataframe):
        # Expects 'entity_name' and 'output' (OCR text) columns.
        self.data = dataframe

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        """Return the prompt for the row at position ``idx``."""
        row = self.data.iloc[idx]
        entity_name = row['entity_name']
        ocr_text = row['output']
        # Same prompt layout as the training inputs: task prefix, entity name, OCR text.
        text = f"extract entity_value: {entity_name} {ocr_text}"
        return text

Create the dataset and dataloader

In [None]:
def _normalize_prediction(predicted_value):
    """Convert raw model text to "<value> <canonical unit>", or "" if malformed.

    The text is split into a value and a unit. Known unit spellings, including
    multi-word ones such as 'fl oz', are mapped through ``unit_conversions``.
    An unknown single-word unit is passed through unchanged. Anything else
    (no unit, or an unknown multi-word tail) yields an empty string.
    """
    parts = predicted_value.split(maxsplit=1)
    if len(parts) != 2:
        # Empty output or a lone token: not in "<value> <unit>" form.
        return ""
    value, unit = parts
    unit = ' '.join(unit.split())  # collapse repeated whitespace inside the unit
    if unit in unit_conversions:
        return f"{value} {unit_conversions[unit]}"
    if ' ' in unit:
        # More than two tokens and not a known multi-word unit.
        return ""
    return f"{value} {unit}"


dataset = EntityDataset(data)
batch_size = 64
# DataLoader is referenced through torch.utils.data because the bare name is
# never imported in this notebook. shuffle=False keeps predictions in row order.
dataloader = torch.utils.data.DataLoader(dataset, batch_size=batch_size, shuffle=False)

predictions = []
for batch in tqdm(dataloader, desc="Predicting"):
    inputs = tokenizer(batch, return_tensors='pt', padding=True, truncation=True)
    # Send inputs to whichever device the model lives on instead of assuming 'cuda'.
    inputs = {key: val.to(model.device) for key, val in inputs.items()}
    generated_ids = model.generate(**inputs)
    batch_predictions = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)

    # Post-process predictions: one entry per input row, "" when nothing usable was produced.
    predictions.extend(_normalize_prediction(predicted_value) for predicted_value in batch_predictions)

Add predictions to the dataframe

In [None]:
# Store the post-processed predictions as a new 'prediction' column.
data['prediction'] = predictions

Create the output CSV with 'index' and 'prediction'

In [None]:
# Keep only the row identifier and the prediction.
# Assumes the test CSV provides an 'index' column — TODO confirm.
output_data = data[['index', 'prediction']]
output_data.to_csv('/dataset/OUTPUT.csv', index=False)

## Run Sanity check using src/sanity.py

In [None]:
# NOTE(review): the predictions are written to '/dataset/OUTPUT.csv', while this
# check is pointed at '../dataset/test_out.csv' — confirm the intended output file.
!python sanity.py --test_filename ../dataset/test.csv --output_filename ../dataset/test_out.csv