### OCR
with Mistral

In [21]:
import os, sys, json, tqdm
from pathlib import Path
import pandas as pd
from mistralai import Mistral

# Build the Mistral client. The API key is read from the file ~/.keys/mistral
# (surrounding whitespace stripped) so that it never appears in the notebook.
api_key = Path.home().joinpath(".keys", "mistral").read_text().strip()
client = Mistral(api_key=api_key)

def ocr_pdf_from_url(pdf_url: str):
    """
    Run Mistral OCR on a PDF that is reachable by URL.

    Args:
        pdf_url: Address of the PDF; it is handed to the API as a
            ``document_url`` document.

    Returns:
        Whatever ``client.ocr.process`` returns, unmodified. The rest of this
        notebook iterates its ``pages`` and reads each page's ``markdown``.
    """
    remote_document = {"type": "document_url", "document_url": pdf_url}

    # Images are not requested here; per the original note, switch
    # include_image_base64 to True if extracted images are wanted.
    return client.ocr.process(
        model="mistral-ocr-latest",
        document=remote_document,
        include_image_base64=False,
    )




In [22]:
# Demo (disabled by the `if False` guard): OCR a single paper and save the
# markdown that precedes its "# References" heading.
if False:
    # NOTE(review): this URL was labelled "Attention Is All You Need paper", but it
    # points at ACL Anthology entry 2022.coling-1.589 -- confirm which paper is
    # intended (the output file is also named attention.md).
    url = "https://aclanthology.org/2022.coling-1.589.pdf"
    ocr_result = ocr_pdf_from_url(url)
    all_pages = [p.markdown for p in ocr_result.pages]
    # Keep only the text before the first "# References" marker.
    body_text = "\n".join(all_pages).partition("# References")[0]
    pages_md = body_text.strip()
    Path("attention.md").write_text(pages_md)


Now run OCR over the entire list of ACL papers.

In [25]:
def _read_resume_state(output_path):
    """Scan an existing JSONL output file for already-finished work.

    Returns:
        (completed_urls, needs_newline) where ``completed_urls`` is the set of
        "url" values of records whose "status" is "success", and
        ``needs_newline`` is True when the file's last line has no trailing
        newline (so the next append must start with one).
    """
    completed_urls = set()
    needs_newline = False
    skipped = 0

    path = Path(output_path)
    if not path.exists():
        return completed_urls, needs_newline

    with open(path, "r", encoding="utf-8") as f:
        for raw_line in f:
            # Only the final line can lack "\n"; remember that for the writer.
            needs_newline = not raw_line.endswith("\n")
            line = raw_line.strip()
            if not line:
                continue
            try:
                record = json.loads(line)
            except json.JSONDecodeError:
                # An interrupted run can leave a half-written line behind; it
                # must not make every later resume crash.
                skipped += 1
                continue
            if isinstance(record, dict) and record.get("status") == "success":
                completed_urls.add(record.get("url"))

    if skipped:
        print("Skipped", skipped, "unreadable line(s) in", output_path)
    return completed_urls, needs_newline


def process_and_save(docs, output_path):
    """OCR every document that is not done yet and append the results as JSONL.

    The run is resumable: documents whose "url" already appears in
    ``output_path`` with status "success" are skipped, everything else
    (including earlier failures) is attempted again. One JSON line is appended
    per attempted document, so an interruption loses at most the document in
    flight.

    Args:
        docs: List of dicts; each must have a "url" key, which is passed to
            ``ocr_pdf_from_url``. The dicts themselves are never modified.
        output_path: JSONL file that is read for resuming and appended to.

    Returns:
        None. Each written record is the input dict plus "status", and on
        success also "fulltext", "cleantext" and "pages_md".
    """
    completed_urls, needs_newline = _read_resume_state(output_path)

    print("Docs, was", len(docs))
    docs = [d for d in docs if d["url"] not in completed_urls]
    print("Docs, remaining", len(docs))
    if not docs:
        # Nothing left to do; do not start an empty progress bar.
        return

    for doc in tqdm.tqdm(docs):
        try:
            ocr_response = ocr_pdf_from_url(doc["url"])

            pages_md = [page.markdown for page in ocr_response.pages]
            fulltext = "\n\n".join(pages_md)
            # Everything before the first "# References" marker.
            cleantext = fulltext.split("# References")[0].strip()
            record = {
                **doc,
                "fulltext": fulltext,
                "cleantext": cleantext,
                "pages_md": pages_md,
                "status": "success",
            }
        except Exception as e:
            # Best effort: record the failure and carry on with the next
            # document. A copy is written so the caller's dict stays untouched.
            print(str(e)[:50])
            record = {**doc, "status": "failed"}

        # OPEN, WRITE, CLOSE for every single document
        with open(output_path, 'a', encoding='utf-8') as f:
            if needs_newline:
                # Terminate a dangling last line so this record gets its own line.
                f.write("\n")
                needs_newline = False
            f.write(json.dumps(record, ensure_ascii=False) + "\n")
 

# Load the paper list as one dict per row, then OCR whatever is still missing
# from the JSONL output file.
source = pd.read_parquet("sapapers.parquet").to_dict(orient="records")
process_and_save(source, "acl_sapapers.jsonl")

Docs, was 1130
Docs, remaining 0


0it [00:00, ?it/s]


In [31]:
# Inspect: preview every field of the third-from-last record in the OCR output
# (raises IndexError if the file holds fewer than three records).
PREVIEW_CHARS = 300  # characters shown from the head, and from the tail, of each value
with open("acl_sapapers.jsonl", "r", encoding="utf-8") as f:
    # Blank lines carry no record; skip them instead of failing in json.loads.
    completed = [json.loads(line) for line in f if line.strip()]
for key, value in completed[-3].items():
    value = str(value)
    print("\n\n",key,">>\n", value[:PREVIEW_CHARS])
    # Show the tail only when the head did not already print the whole value;
    # a lower threshold would print short values twice.
    if len(value) > PREVIEW_CHARS:
        print ("---",value[-PREVIEW_CHARS:])



 semeval >>
 False


 id >>
 897


 bibkey >>
 cheng-etal-2024-learning


 year >>
 2024


 url >>
 https://aclanthology.org/2024.lrec-main.897.pdf


 fulltext >>
 # Learning Intrinsic Dimension via Information Bottleneck for Explainable Aspect-based Sentiment Analysis

Zhenxiao Cheng $^{1}$ , Jie Zhou $^{1,*}$ , Wen Wu $^{1}$ , Qin Chen $^{1}$ , Liang He $^{1}$

$^{1}$  School of Computer Science and Technology, East China Normal University, Shanghai, China


--- istics, pages 2152–2161.

Jie Zhou, Qi Zhang, Qin Chen, Liang He, and Xuan-Jing Huang. 2022. A multi-format transfer learning model for event argument extraction via variational information bottleneck. In Proceedings of the 29th International Conference on Computational Linguistics, pages 1990–2000.


 cleantext >>
 # Learning Intrinsic Dimension via Information Bottleneck for Explainable Aspect-based Sentiment Analysis

Zhenxiao Cheng $^{1}$ , Jie Zhou $^{1,*}$ , Wen Wu $^{1}$ , Qin Chen $^{1}$ , Liang He $^{1}$

$^{1}$  S