In [7]:
import os
from exa_py import Exa
from dotenv import load_dotenv
load_dotenv()

os.environ["EXA_API_KEY"] = os.getenv('EXA_API_KEY')
exa = Exa(api_key = os.getenv('EXA_API_KEY'))

result = exa.search_and_contents(
    "Find me pdfs of recent major algorithmic breakthroughs that optimize an existing algorithm",
    text = { "max_characters": 1000 },
    category = "research paper",
    num_results = 10,
    start_published_date = "2015-01-01T08:00:00.000Z",
    end_published_date = "2025-05-25T06:59:59.999Z"
)
print(result)




Title: Faster $(Δ+ 1)$-Edge Coloring: Breaking the $m \sqrt{n}$ Time Barrier
URL: https://arxiv.org/abs/2405.15449
ID: https://arxiv.org/abs/2405.15449
Score: 0.4087863266468048
Published Date: 2024-05-24T00:00:00.000Z
Author: Bhattacharya; Sayan; Carmon; Din; Costa; Martín; Solomon; Shay; Zhang; Tianyi
Image: None
Favicon: None
Extras: None
Subpages: None
Text: 
 View PDF 
 HTML (experimental) 
Vizing's theorem states that any $n$-vertex $m$-edge graph of maximum degree $\Delta$ can be {\em edge colored} using at most $\Delta + 1$ different colors [Diskret.~Analiz, '64]. Vizing's original proof is algorithmic and shows that such an edge coloring can be found in $\tilde{O}(mn)$ time. This was subsequently improved to $\tilde O(m\sqrt{n})$, independently by Arjomandi [1982] and by Gabow et al.~[1985].
 In this paper we present an algorithm that computes such an edge coloring in $\tilde O(mn^{1/3})$ time, giving the first polynomial improvement for this fundamental problem in over 40 yea

In [9]:
def download_breakthrough_papers(results):
    results = result.results
    print(results)
    for paper in results:
        url = paper.url
        title = paper.title
        import os
        import requests
        from urllib.parse import urlparse
        
        # Create papers directory if it doesn't exist
        if not os.path.exists('papers'):
            os.makedirs('papers')
            
        try:
            # Download the paper
            response = requests.get(url)
            response.raise_for_status()
            
            # Get filename from URL or use title if URL has no filename
            filename = os.path.basename(urlparse(url).path)
            if not filename or '.' not in filename:
                # Clean title and add .pdf extension
                filename = "".join(c for c in title if c.isalnum() or c in (' ', '-', '_'))
                filename = filename.strip().replace(' ', '_') + '.pdf'
                
            filepath = os.path.join('papers', filename)
            
            # Save the paper
            with open(filepath, 'wb') as f:
                f.write(response.content)
                
            print(f"Downloaded: {filename}")
                
        except Exception as e:
            print(f"Error downloading {title}: {str(e)}")
download_breakthrough_papers(result)


Downloaded: 1911.03572v1.pdf
Error downloading A novel lossless encoding algorithm for data compression–genomics data as an exemplar: 403 Client Error: Forbidden for url: https://pmc.ncbi.nlm.nih.gov/articles/PMC11799261/
Downloaded: A_Novel_Data_Compression_Methodology_Focused_on_Power_Quality_Signals_Using_Compressive_Sampling_Matching_Pursuit.pdf
Downloaded: s41598-023-29068-z.pdf
Downloaded: 2304.01106v1.pdf
Downloaded: 2304.07342v2.pdf
Downloaded: Arithmetic_N-gram_an_efficient_data_compression_technique.pdf


In [10]:
import os
from langchain_anthropic import ChatAnthropic
from langchain_core.messages import SystemMessage, HumanMessage
import PyPDF2

# Initialize Claude
os.environ["ANTHROPIC_API_KEY"] = os.getenv('ANTHROPIC_API_KEY')
llm = ChatAnthropic(model="claude-3-5-sonnet-20240620", temperature=0)

def extract_paper_details(pdf_path):
    """Extract key details from a PDF paper using Claude"""
    
    # Read PDF content
    try:
        with open(pdf_path, 'rb') as file:
            reader = PyPDF2.PdfReader(file)
            text = ""
            
            # Extract all pages from the PDF
            for i in range(len(reader.pages)):
                text += reader.pages[i].extract_text()
    except Exception as e:
        print(f"Error reading PDF {pdf_path}: {str(e)}")
        return None

    # Prompt Claude to extract key details
    system_message = SystemMessage(content="""
    You are an expert at analyzing academic papers focused on compression optimization. Extract and summarize the key details including:
    - Title and Authors
    - Main research question/objective
    - Key methodology and compression techniques used
    - Performance metrics and compression ratios achieved
    - Target data types and use cases
    - Hardware/software requirements and constraints
    - Main findings/conclusions
    - Limitations and trade-offs
    - Potential application domains where this compression could be valuable
    - Integration considerations for existing systems
    
    For each paper, identify:
    1. What types of data can this compression method handle?
    2. What are the key performance characteristics (compression ratio, speed, quality)?
    3. What are the computational requirements?
    4. What existing systems could benefit from this compression method?
    5. What would be needed to integrate this into an existing system?
    
    Present the information in a clear, structured format that helps evaluate potential applications.

    Return only the information in the formatted JSON response and nothing else. Format the response as JSON with the following structure:
    {
        "metadata": {
            "title": "",
            "authors": []
        },
        "research": {
            "objective": "",
            "methodology": [],
            "compression_techniques": [],
            "performance_metrics": {
                "compression_ratio": "",
                "speed": "",
                "quality": "",
                "other_metrics": []
            }
        },
        "technical_details": {
            "target_data_types": [],
            "use_cases": [],
            "requirements": {
                "hardware": [],
                "software": [],
                "constraints": []
            }
        },
        "findings": {
            "main_conclusions": [],
            "limitations": [],
            "trade_offs": []
        },
        "applications": {
            "potential_domains": [],
            "integration_considerations": [],
            "beneficiary_systems": []
        },
        "evaluation": {
            "data_types_supported": [],
            "performance_characteristics": {
                "compression_ratio": "",
                "speed": "",
                "quality": ""
            },
            "computational_requirements": [],
            "integration_requirements": []
        }
    }

    Ensure all fields are filled with relevant information from the paper. If any field is not applicable, use an empty array or string as appropriate.
    """)
    
    human_message = HumanMessage(content=f"Extract key details from this paper text: {text}")
    
    try:
        response = llm.invoke([system_message, human_message])
        return response.content
    except Exception as e:
        print(f"Error extracting details from {pdf_path}: {str(e)}")
        return None
# Process each paper in the papers directory
papers_dir = 'papers'
for filename in os.listdir(papers_dir):
    if filename.endswith('.pdf'):
        print(f"\nProcessing {filename}...")
        filepath = os.path.join(papers_dir, filename)
        
        # Create a folder for this paper (using filename without extension)
        paper_name = os.path.splitext(filename)[0]
        paper_dir = os.path.join('paper_analysis', paper_name)
        os.makedirs(paper_dir, exist_ok=True)
        
        # Copy the original paper to the new directory
        import shutil
        shutil.copy2(filepath, os.path.join(paper_dir, filename))
        
        # Extract and save the details
        details = extract_paper_details(filepath)
        if details:
            # Save the analysis as JSON
            import json
            analysis_path = os.path.join(paper_dir, 'analysis.json')
            with open(analysis_path, 'w', encoding='utf-8') as f:
                json.dump(details, f, indent=2)
            print(f"Analysis saved to {analysis_path}")
        print("-" * 80)



Processing s41598-023-29068-z.pdf...
Analysis saved to paper_analysis/s41598-023-29068-z/analysis.json
--------------------------------------------------------------------------------

Processing 2304.07342v2.pdf...
Analysis saved to paper_analysis/2304.07342v2/analysis.json
--------------------------------------------------------------------------------

Processing 2304.01106v1.pdf...
Analysis saved to paper_analysis/2304.01106v1/analysis.json
--------------------------------------------------------------------------------

Processing Arithmetic_N-gram_an_efficient_data_compression_technique.pdf...
Error reading PDF papers/Arithmetic_N-gram_an_efficient_data_compression_technique.pdf: EOF marker not found
--------------------------------------------------------------------------------

Processing A_Novel_Data_Compression_Methodology_Focused_on_Power_Quality_Signals_Using_Compressive_Sampling_Matching_Pursuit.pdf...
Error reading PDF papers/A_Novel_Data_Compression_Methodology_Focused

In [21]:
import os
import json

def generate_search_query(analysis_path):
    with open(analysis_path, 'r', encoding='utf-8') as f:
        analysis_text = f.read()
    system_message = SystemMessage(content="""
    Generate a search query to find research papers that could benefit from the applications described in the analysis.
    Focus on:
    1. The target data types and use cases
    2. The potential domains and beneficiary systems
    3. The performance characteristics and requirements
    
    Format the query as a natural language search query. For instance: 'research papers about <potential application> that could benefit from <breakthrough technique>'
    """)
    
    human_message = HumanMessage(content=f"Generate a natural language search query based on this analysis: {analysis_text}. Return only the query, no other text. The query should be concise and to the point, and in natural language. ")

    try:
        response = llm.invoke([system_message, human_message])
        return response.content
    except Exception as e:
        print(f"Error generating search query: {str(e)}")
    None
# Process each analysis.json file
papers_dir = 'paper_analysis'
for paper_folder in os.listdir(papers_dir):
    analysis_path = os.path.join(papers_dir, paper_folder, 'analysis.json')
    if os.path.exists(analysis_path):
        print(f"\nPaper: {paper_folder}")
        query = generate_search_query(analysis_path)
        print(f"Search query: {query}")
        
        # Save query to query.txt
        query_path = os.path.join(papers_dir, paper_folder, 'query.txt')
        with open(query_path, 'w', encoding='utf-8') as f:
            f.write(query)
        print(f"Query saved to {query_path}")
    


Paper: 2304.07342v2
Search query: research papers about GPU-accelerated lossless compression for multi-byte data in high-performance computing and deep learning applications that could benefit from LZSS optimization techniques
Query saved to paper_analysis/2304.07342v2/query.txt

Paper: 1911.03572v1
Search query: research papers about general-purpose lossless compression for sequential data that could benefit from neural network modeling and arithmetic coding techniques
Query saved to paper_analysis/1911.03572v1/query.txt

Paper: s41598-023-29068-z
Search query: research papers about image compression techniques for virtual reality, augmented reality, and metaverse applications that could benefit from soft compression algorithms with superior compression ratios
Query saved to paper_analysis/s41598-023-29068-z/query.txt

Paper: 2304.01106v1
Search query: research papers about semantic text compression techniques that could benefit from BERT and Transformer models for preserving meaning

In [31]:
import os

from exa_py import Exa

def search_papers_with_exa(query_path):
    with open(query_path, 'r', encoding='utf-8') as f:
        query = f.read().strip()
    
    # Use exa to search for papers
    try:
        result = exa.search_and_contents(
            query,
            text = { "max_characters": 1000 },
            category = "research paper",
            num_results = 10,
            start_published_date = "2015-01-01T08:00:00.000Z",
            end_published_date = "2025-05-25T06:59:59.999Z"
        )
        print(result)
        return result
    except Exception as e:
        print(f"Error searching with exa: {str(e)}")
        return None

# Process each query.txt file
papers_dir = 'paper_analysis'
for paper_folder in os.listdir(papers_dir):
    query_path = os.path.join(papers_dir, paper_folder, 'query.txt')
    if os.path.exists(query_path):
        print(f"\nSearching for papers related to: {paper_folder}")
        results = search_papers_with_exa(query_path)
        if results:
            print("Search results:")
            print(results)


def download_candidate_papers(results, paper_folder):
    # Create candidates directory if it doesn't exist
    candidates_dir = os.path.join('paper_analysis', paper_folder, 'candidates')
    if not os.path.exists(candidates_dir):
        os.makedirs(candidates_dir)
        
    for paper in results.results:
        url = paper.url
        title = paper.title
        
        try:
            # Download the paper
            import requests
            from urllib.parse import urlparse
            response = requests.get(url)
            response.raise_for_status()
            
            # Get filename from URL or use title if URL has no filename
            filename = os.path.basename(urlparse(url).path)
            if not filename or '.' not in filename:
                # Clean title and add .pdf extension
                filename = "".join(c for c in title if c.isalnum() or c in (' ', '-', '_'))
                filename = filename.strip().replace(' ', '_') + '.pdf'
                
            filepath = os.path.join(candidates_dir, filename)
            
            # Actually save the downloaded content to file
            with open(filepath, 'wb') as f:
                f.write(response.content)
                
            print(f"Downloaded candidate paper: {filename}")
                
        except Exception as e:
            print(f"Error downloading {title}: {str(e)}")

# Call the function for each paper folder
for paper_folder in os.listdir('paper_analysis'):
    query_path = os.path.join('paper_analysis', paper_folder, 'query.txt')
    if os.path.exists(query_path):
        results = search_papers_with_exa(query_path)
        if results:
            download_candidate_papers(results, paper_folder)



Searching for papers related to: 2304.07342v2
Title: GPULZ: Optimizing LZSS Lossless Compression for Multi-byte Data on Modern GPUs
URL: https://export.arxiv.org/pdf/2304.07342v2.pdf
ID: https://export.arxiv.org/pdf/2304.07342v2.pdf
Score: 0.4447816014289856
Published Date: 2023-06-21T00:00:00.000Z
Author: Boyuan Zhang,Jiannan Tian,Sheng Di,Xiangyao Yu,Martin Swany,Dingwen Tao,Franck Cappello
Image: None
Favicon: None
Extras: None
Subpages: None
Text: Boyuan Zhang bozhan@iu.edu
Jiannan Tian
Sheng Di
Xiaodong Yu
Martin Swany swany@indiana.edu
Dingwen Tao ditao@iu.edu
Franck Cappello cappello@mcs.anl.gov
Boyuan Zhang
Jiannan Tian
XiaodongSheng Di
Yu
Martin Swany
Ding-Wen Tao
Franck Cappello
Indiana University Bloomington
INUSA
Argonne National Laboratory Lemont
Indiana University Bloomington
IN, ILUSA, USA
Argonne National Laboratory Lemont
Indiana University Bloomington
IL, INUSA, USA
Department of Intelligent Systems Engineering, Luddy School of Informatics, Computing, and Engineering