In [3]:
import os

# Get a list of papers with citations
def get_paper_ids_with_citations_json(papers_dir='papers'):
    """Return the sorted IDs of papers that have a 'citations.json' file.

    Each immediate subdirectory of ``papers_dir`` is treated as one paper,
    and the subdirectory name is used as the paper ID.

    Args:
        papers_dir: Path of the directory whose subdirectories are papers.

    Returns:
        A list of the subdirectory names that contain a regular file named
        'citations.json', sorted ascending so that the order (and therefore
        any index-based slicing of the result) is the same on every run.
        An empty list is returned when ``papers_dir`` is not an existing
        directory.
    """
    # isdir (rather than exists) also rejects a regular file at this path,
    # which would otherwise make os.listdir raise NotADirectoryError.
    if not os.path.isdir(papers_dir):
        print(f"Directory {papers_dir} not found")
        return []

    # os.listdir yields entries in arbitrary order, so sort the result.
    # isfile is False for plain files in papers_dir too, because a path
    # "<file>/citations.json" cannot exist.
    paper_ids_with_citations = sorted(
        paper_id
        for paper_id in os.listdir(papers_dir)
        if os.path.isfile(os.path.join(papers_dir, paper_id, 'citations.json'))
    )

    print(f"Found {len(paper_ids_with_citations)} papers with 'citations.json' files")
    return paper_ids_with_citations

# Collect the paper IDs found under the default 'papers' directory; the
# later cells slice this list by position to divide the work.
ids = get_paper_ids_with_citations_json()

Found 4076 papers with 'citations.json' files


In [4]:
# Split the paper IDs into three contiguous shares, one per person.
# The share size is derived from len(ids) (rounded up) instead of being
# hard-coded, so the split stays balanced if the number of papers changes.
# For 4076 papers it evaluates to 1359, i.e. the same slices as before.
share_size = -(-len(ids) // 3)  # ceiling division
liz_papers = ids[:share_size]
gaven_papers = ids[share_size:2 * share_size]
dawson_papers = ids[2 * share_size:]

In [None]:
import ollama
import sys
import time

async def process_papers(paper_ids=None, model='llama3.1:8b', papers_dir='papers'):
    """Ask an Ollama model to fill in the citation blanks of each paper.

    For every paper ID this reads ``<papers_dir>/<id>/replacedcite.tex``,
    counts its '____' placeholders, sends the text to the model wrapped in a
    fixed instruction prompt, and writes the model's reply to
    ``<papers_dir>/<id>/response.txt`` (overwriting any existing file). A
    one-line progress bar is redrawn on stdout after each paper.

    Papers whose ``replacedcite.tex`` does not exist are skipped and listed
    at the end instead of aborting the whole run.

    Args:
        paper_ids: Sequence of paper IDs to process. Defaults to the
            module-level ``liz_papers`` list.
        model: Name of the Ollama model passed to ``client.generate``.
        papers_dir: Directory that contains one subdirectory per paper.

    Returns:
        None.

    Raises:
        Any exception raised by the Ollama client, and any OSError other
        than FileNotFoundError on the input file, propagates to the caller.
    """
    if paper_ids is None:
        paper_ids = liz_papers

    client = ollama.AsyncClient()
    total_papers = len(paper_ids)
    start_time = time.time()
    skipped = []  # IDs that had no replacedcite.tex

    def show_progress(done):
        """Redraw the single-line progress bar for `done` handled papers."""
        elapsed = time.time() - start_time
        percent_done = done / total_papers * 100
        papers_per_sec = done / elapsed if elapsed > 0 else 0
        remaining = (total_papers - done) / papers_per_sec if papers_per_sec > 0 else 0

        # Progress bar is 50 characters wide
        bar_length = 50
        filled_length = int(bar_length * percent_done / 100)
        bar = '█' * filled_length + '░' * (bar_length - filled_length)

        # Leading '\r' returns to the start of the line so the bar is redrawn in place
        status = f"\r[{bar}] {percent_done:.1f}% ({done}/{total_papers}) | {papers_per_sec:.2f} papers/sec | {remaining/60:.1f} min left"
        sys.stdout.write(status)
        sys.stdout.flush()

    print(f"Starting processing of {total_papers} papers...")

    for i, paper_id in enumerate(paper_ids):
        paper_dir = os.path.join(papers_dir, paper_id)

        # Read the input inside a context manager so the handle is closed.
        # UTF-8 is requested explicitly so decoding does not depend on the
        # platform default; undecodable bytes are replaced rather than
        # aborting the batch.
        try:
            with open(os.path.join(paper_dir, 'replacedcite.tex'), 'r',
                      encoding='utf-8', errors='replace') as f:
                replacedcite = f.read()
        except FileNotFoundError:
            # The ID list may be built from a different file than the one
            # read here, so a paper can legitimately lack this input.
            skipped.append(paper_id)
            show_progress(i + 1)
            continue

        blankcount = replacedcite.count('____')

        prompt = f'''
        You are an expert in computer science citation prediction.

        TASK DEFINITION:
        - Below you will find text containing {blankcount} placeholders marked as '____'
        - Replace EACH placeholder with predicted paper citations in the specified format
        - Return ONLY the text with replacements - nothing else

        CITATION FORMAT:
        - Replace each '____' with: **Author(s) Last Name(s), "Full Paper Title"**
        - For multiple papers: **Author(s), "Paper Title 1"**__**Author2(s), "Paper Title 2"**
        - Include full titles in quotes, NOT publication years

        === INPUT TEXT STARTS BELOW THIS LINE (DO NOT INCLUDE THIS LINE IN OUTPUT) ===
        {replacedcite}
        === INPUT TEXT ENDS ABOVE THIS LINE (DO NOT INCLUDE THIS LINE IN OUTPUT) ===

        >>CRITICAL<< YOUR RESPONSE MUST CONTAIN:
        1. ONLY the original text with placeholders replaced
        2. NO introduction, explanation, or conclusion
        3. NO phrases like "Here is the text" or "I've replaced"
        4. NOTHING before the first character or after the last character of the original text

        Return EXACTLY the input text with each '____' replaced with appropriate citations.
        '''

        response = await client.generate(model, prompt)
        # Explicit UTF-8 so non-ASCII characters in the reply can always be written.
        with open(os.path.join(paper_dir, 'response.txt'), 'w', encoding='utf-8') as f:
            f.write(response['response'])

        show_progress(i + 1)

    # Final newline after completion
    sys.stdout.write('\n')
    print(f"✅ Completed processing {total_papers} papers in {(time.time() - start_time)/60:.1f} minutes!")
    if skipped:
        print(f"Skipped {len(skipped)} papers with no replacedcite.tex: {skipped}")

# Run the processing for the default paper list.
# NOTE(review): a top-level `await` is not valid in a plain Python script;
# this presumably relies on the notebook kernel's async support — confirm
# before moving this code into a .py module.
await process_papers()

Starting processing of 1359 papers...
[░░░░░░░░░░░░░░░░░░░░░░░░░░░░░░░░░░░░░░░░░░░░░░░░░░] 0.2% (3/1359) | 0.04 papers/sec | 531.7 min left