#### Pass each text and prompt to the LLM over the API and store the returned JSON

In [1]:
import json, tqdm
import os
from pathlib import Path
from openai import OpenAI

# 1. Configure the OpenRouter client
# The API key is read from the plain-text file ~/.keys/openrouter;
# surrounding whitespace (e.g. a trailing newline) is stripped.
key_path = Path.home().joinpath(".keys", "openrouter")
api_key = key_path.read_text().strip()

# The OpenAI SDK client is pointed at the OpenRouter endpoint.
client = OpenAI(api_key=api_key, base_url="https://openrouter.ai/api/v1")


def _parse_json_reply(content):
    """
    Parses the model's reply as JSON, tolerating a Markdown code fence around it.

    Raises ValueError when the reply is missing or empty, and
    json.JSONDecodeError when no valid JSON can be extracted from it.
    """
    if not content:
        # Fail with a readable message instead of the TypeError that
        # json.loads(None) would raise.
        raise ValueError("Model returned an empty response")

    try:
        return json.loads(content)
    except json.JSONDecodeError:
        fenced = content.strip()
        if not fenced.startswith("```"):
            raise
        # Drop the opening fence line (``` or ```json) and the closing fence,
        # then retry on what is left.
        body = fenced.partition("\n")[2].rstrip()
        if body.endswith("```"):
            body = body[:-3]
        return json.loads(body)


def analyze_paper(prompt_text, cleantext, model="google/gemini-2.5-flash"):
    """
    Sends the cleaned paper text to OpenRouter for analysis using a custom prompt.

    Args:
        prompt_text: Instructions for the model; placed before the paper text.
        cleantext: The paper text to analyse.
        model: Model identifier passed to the API.

    Returns:
        The parsed JSON reply. If anything fails (the request, an empty reply,
        an unparseable reply), the error is printed and a dict with
        "has_sentiment_modelling" set to "Error" and the message under
        "error_msg" is returned instead; no exception propagates to the caller.
    """

    # Stitch the prompt and the paper text together
    # We add a separator to help the model distinguish instructions from data
    full_user_content = f"{prompt_text}\n\n---\n\n{cleantext}"

    messages = [
        {
            "role": "system",
            # A generic system prompt is usually sufficient since specific instructions
            # are now in your 'prompt_text'
            "content": "You are an expert researcher in NLP and Machine Learning."
        },
        {
            "role": "user",
            "content": full_user_content
        }
    ]

    try:
        response = client.chat.completions.create(
            model=model,
            messages=messages,
            response_format={"type": "json_object"},
            temperature=0.1,
        )

        return _parse_json_reply(response.choices[0].message.content)

    except Exception as e:
        # Deliberately broad: one failing paper must not abort a long batch run.
        # The failure is reported and encoded in the returned dict instead.
        print(f"LLM Error: {e}")
        return {"has_sentiment_modelling": "Error", "error_msg": str(e)}


In [7]:
input_path = "acl_sapapers.jsonl"
output_path = "analyzed_papers.jsonl"
# Read as UTF-8 explicitly, like the JSONL files below, so the result does not
# depend on the platform's default encoding.
prompt = Path("analysis_prompt.txt").read_text(encoding="utf-8").strip()


def _read_jsonl(path):
    """Load a JSON Lines file into a list of objects, ignoring blank lines."""
    with open(path, "r", encoding="utf-8") as f:
        return [json.loads(line) for line in f if line.strip()]


# Read your processed OCR file
papers = _read_jsonl(input_path)

# Records written by a previous run (none when the output file does not exist yet)
completed = _read_jsonl(output_path) if Path(output_path).exists() else []

# A set keeps the membership test in the filter below O(1) per paper.
# NOTE: records whose "analysis" is the error dict returned by analyze_paper
# also count as completed here, so papers that failed are not retried on a re-run.
completed_urls = {c.get("url") for c in completed if c.get("analysis") is not None}
print("Docs, was", len(papers))
papers = [d for d in papers if d["url"] not in completed_urls]
print("Docs, remaining", len(papers))

# Process each paper
for paper in tqdm.tqdm(papers):
    # Skip if OCR failed
    if paper.get("status") != "success":
        continue

    print(f"Analyzing: {paper.get('bibkey', 'Unknown')}...")

    # Call the LLM
    # Note: 'cleantext' comes from your previous OCR step
    result = analyze_paper(prompt, paper["cleantext"])
    print(result)
    # Merge result back into the paper object
    paper["analysis"] = result

    # Save incrementally (append mode): the file is reopened for every paper so
    # each finished record is on disk even if the run is interrupted later.
    with open(output_path, "a", encoding="utf-8") as f_out:
        f_out.write(json.dumps(paper, ensure_ascii=False) + "\n")

Docs, was 1138
Docs, remaining 0


0it [00:00, ?it/s]


In [5]:
# Inspect the analysis of the third-from-last record in the output file
with open(output_path, "r", encoding="utf-8") as f:
    completed = [json.loads(line) for line in f if line.strip()]

preview_chars = 300  # number of characters shown from the start / end of a value
for key, value in completed[-3]["analysis"].items():
    value = str(value)
    print("\n",key,">>\n", value[:preview_chars])
    # Show the tail only when the head preview above was actually cut off;
    # otherwise the whole value would just be printed a second time.
    if len(value) > preview_chars:
        print("---", value[-preview_chars:])


 has_sentiment_modelling >>
 Yes

 modelling_categories >>
 ['FinetuningEncoder']

 other_method_categories >>
 ['Other']

 notes >>
 The paper proposes an Information Bottleneck-based Gradient (IBG) explanation framework for Aspect-based Sentiment Analysis (ABSA). It integrates this framework with existing ABSA models, primarily BERT-SPC and DualGCN-BERT, which are fine-tuned BERT-like encoders. The core idea is to learn a low-di
--- with existing ABSA models, primarily BERT-SPC and DualGCN-BERT, which are fine-tuned BERT-like encoders. The core idea is to learn a low-dimensional intrinsic representation of word embeddings via an information bottleneck to improve both performance and interpretability of sentiment classification.
