In [5]:
import pandas as pd
import torch
from torch.utils.data import Dataset, DataLoader
from transformers import T5Tokenizer, T5ForConditionalGeneration
from tqdm import tqdm

In [6]:
# Pick the GPU when PyTorch can see one, otherwise fall back to the CPU
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")

# Restore the saved model and tokenizer, then move the model to the chosen device
model = T5ForConditionalGeneration.from_pretrained('t5_ocr_model')
tokenizer = T5Tokenizer.from_pretrained('t5_ocr_tokenizer')
model = model.to(device)

# Read the test set into a DataFrame (update the path as needed)
test_data = pd.read_csv(filepath_or_buffer='merged_datatest.csv')

In [7]:
# Define the test dataset class
class TestDataset(Dataset):
    """Map-style dataset that tokenizes one text column of a DataFrame.

    Each item is a dict with two entries, ``input_ids`` and ``attention_mask``.
    The tokenizer is asked to pad/truncate to ``max_len`` and to return
    PyTorch tensors; the leading batch axis it adds is removed so that a
    ``DataLoader`` can stack the items into a batch.

    Args:
        dataframe: Frame holding the input texts, one row per example.
        tokenizer: Callable tokenizer, invoked as ``tokenizer(text,
            max_length=..., padding='max_length', truncation=True,
            return_tensors="pt")``. It is expected to return a mapping with
            ``input_ids`` and ``attention_mask`` tensors whose first axis has
            size 1.
        max_len: Token length passed to the tokenizer as ``max_length``.
        text_column: Name of the column the text is read from. Defaults to
            ``'Merged_Info'``; change it if your dataset uses another name.
    """

    def __init__(self, dataframe, tokenizer, max_len=512, text_column='Merged_Info'):
        self.dataframe = dataframe
        self.tokenizer = tokenizer
        self.max_len = max_len
        self.text_column = text_column

    def __len__(self):
        """Return the number of rows in the wrapped DataFrame."""
        return len(self.dataframe)

    def __getitem__(self, index):
        """Tokenize the text at positional ``index`` and return its tensors."""
        # Select the column first, then the row: this avoids materialising a
        # full row Series per item and keeps the value in its column dtype.
        raw_value = self.dataframe[self.text_column].iloc[index]
        # str() keeps non-string cells usable; note that a missing value
        # (NaN) therefore reaches the tokenizer as the literal text "nan".
        input_text = str(raw_value)
        encoded = self.tokenizer(
            input_text,
            max_length=self.max_len,
            padding='max_length',
            truncation=True,
            return_tensors="pt"
        )
        # squeeze(0) removes only the batch axis. A bare squeeze() would also
        # collapse the sequence axis whenever it has length 1.
        return {
            'input_ids': encoded['input_ids'].squeeze(0),
            'attention_mask': encoded['attention_mask'].squeeze(0)
        }

# Wrap the test frame in the dataset, then batch it in its original row order
# (no shuffling) so that predictions line up with the rows of test_data
test_dataset = TestDataset(dataframe=test_data, tokenizer=tokenizer, max_len=512)
test_loader = DataLoader(dataset=test_dataset, batch_size=16, shuffle=False)

# Function to predict using DataLoader with tqdm
def _trim_shared_padding(input_ids, attention_mask):
    """Slice off leading/trailing columns that are padding in every row of a batch."""
    if attention_mask.dim() != 2:
        return input_ids, attention_mask
    used_columns = attention_mask.ne(0).any(dim=0).nonzero().flatten()
    if used_columns.numel() == 0:
        # Nothing is marked as a real token; leave the batch untouched.
        return input_ids, attention_mask
    start, stop = int(used_columns[0]), int(used_columns[-1]) + 1
    return input_ids[:, start:stop], attention_mask[:, start:stop]


def predict(model, data_loader, device, tokenizer=None, max_length=128, num_beams=5):
    """Generate one decoded text per example in ``data_loader`` using beam search.

    Args:
        model: Model exposing ``eval()`` and ``generate(input_ids=...,
            attention_mask=..., max_length=..., num_beams=...,
            early_stopping=...)``.
        data_loader: Iterable of batches; each batch is a mapping with
            ``input_ids`` and ``attention_mask`` tensors.
        device: Device the batch tensors are moved to before generation.
        tokenizer: Object whose ``decode(ids, skip_special_tokens=True)``
            turns one generated sequence into text. When omitted, the
            module-level ``tokenizer`` variable is used, which is what this
            function relied on before the parameter existed.
        max_length: Forwarded to ``model.generate``.
        num_beams: Forwarded to ``model.generate``. Lowering it reduces the
            memory needed per batch.

    Returns:
        list[str]: Decoded predictions in the order the loader yielded them.
    """
    if tokenizer is None:
        tokenizer = globals().get('tokenizer')
        if tokenizer is None:
            raise NameError("name 'tokenizer' is not defined; pass tokenizer=... to predict()")

    model.eval()
    predictions = []
    with torch.no_grad():
        # Wrap the data loader with tqdm to show progress
        for batch in tqdm(data_loader, desc="Predicting", leave=False):
            # Inputs arrive padded to a fixed length. Columns that are padding
            # for every row carry no information (they are masked out), so
            # drop them before the transfer to keep beam search memory down.
            input_ids, attention_mask = _trim_shared_padding(
                batch['input_ids'], batch['attention_mask']
            )
            input_ids = input_ids.to(device)
            attention_mask = attention_mask.to(device)
            outputs = model.generate(
                input_ids=input_ids,
                attention_mask=attention_mask,
                max_length=max_length,
                num_beams=num_beams,
                early_stopping=True
            )
            predictions.extend(
                tokenizer.decode(output, skip_special_tokens=True) for output in outputs
            )
    return predictions

In [8]:
# Run inference over the whole test loader and attach the generated texts
# to the test frame as a new column
test_predictions = predict(model=model, data_loader=test_loader, device=device)
test_data['predicted_entity_values'] = test_predictions

# Write the augmented frame (without the index) to a new CSV file
test_data.to_csv(path_or_buf='test_with_predictions.csv', index=False)
print("Predictions saved to 'test_with_predictions.csv'.")

                                                    

OutOfMemoryError: CUDA out of memory. Tried to allocate 120.00 MiB. GPU 0 has a total capacity of 23.64 GiB of which 61.50 MiB is free. Process 227914 has 10.94 GiB memory in use. Process 228006 has 6.76 GiB memory in use. Process 228099 has 970.00 MiB memory in use. Process 232715 has 970.00 MiB memory in use. Including non-PyTorch memory, this process has 3.98 GiB memory in use. Of the allocated memory 2.26 GiB is allocated by PyTorch, and 398.67 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation.  See documentation for Memory Management  (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables)