In [None]:
from transformers import NougatProcessor, VisionEncoderDecoderModel
import torch
import pandas as pd
from pdf2image import convert_from_path
from time import time 

# Batch OCR script: for every row of ocr_df.xlsx, render the PDF named by the
# "item_filename" column (looked up as test_files/<item_filename>.pdf), run
# each page through Nougat, and store the concatenated page text in a
# "nougat" column. The spreadsheet is overwritten in place at the end.

# Load the processor (image preprocessing + tokenizer) and the model once;
# both are reused for every page of every PDF.
processor = NougatProcessor.from_pretrained("facebook/nougat-base")
model = VisionEncoderDecoderModel.from_pretrained("facebook/nougat-base")

device = "cuda" if torch.cuda.is_available() else "cpu"
model.to(device)

# Upper bound on tokens generated per page. The requested budget is 8000, but
# it is clamped to the decoder's position-embedding size when the config
# exposes one, so a page that never emits an end-of-sequence token cannot
# index past the embedding table and abort the whole run.
# NOTE(review): assumes the decoder uses a fixed-size learned position table;
# if the attribute is missing, the original 8000 budget is kept unchanged.
max_new_tokens = 8000
decoder_positions = getattr(model.decoder.config, "max_position_embeddings", None)
if decoder_positions is not None:
    max_new_tokens = min(max_new_tokens, decoder_positions - 1)

# Folder holding the PDFs; the same for every row, so set once.
directory = 'test_files'

# Spreadsheet listing the documents to transcribe (first column is the index).
ocr_df = pd.read_excel('ocr_df.xlsx', index_col=0)

start_time = time()
for index, row in ocr_df.iterrows():
    # One image per PDF page.
    images = convert_from_path(f'{directory}/{row["item_filename"]}.pdf')

    page_texts = []
    for image in images:
        # Prepare the page image for the model.
        pixel_values = processor(image, return_tensors="pt").pixel_values

        # Generate the transcription for this page, never emitting <unk>.
        outputs = model.generate(
            pixel_values.to(device),
            min_length=1,
            max_new_tokens=max_new_tokens,
            bad_words_ids=[[processor.tokenizer.unk_token_id]],
        )

        sequence = processor.batch_decode(outputs, skip_special_tokens=True)[0]
        page_texts.append(
            processor.post_process_generation(sequence, fix_markdown=False)
        )

    # Pages are concatenated with no separator, in page order.
    pdf_text = ''.join(page_texts)
    print(index)  # progress indicator: index of the row just finished
    ocr_df.at[index, 'nougat'] = pdf_text

# Total wall-clock seconds spent on OCR.
print(time() - start_time)
# Save to ocr_df excel (overwrites the input file, now with the "nougat" column)
ocr_df.to_excel('ocr_df.xlsx')