In [16]:
import os
import json
import asyncio
import aiohttp
from typing import Optional, List, Dict, Any

from langchain.llms.base import LLM
from langchain.schema import Generation, LLMResult
from langchain_community.document_loaders import PyMuPDFLoader
from pydantic import Field
from openai import OpenAI
import nest_asyncio
nest_asyncio.apply()

# Corrected AsyncOpenAI class
class AsyncOpenAI:
    def __init__(self, api_key: str, base_url: str):
        self.client = OpenAI(api_key=api_key, base_url=base_url)

    async def __aenter__(self):
        return self

    async def __aexit__(self, *excinfo):
        return None

    async def chat_completion(self, **kwargs):
        return await asyncio.to_thread(self.client.chat.completions.create, **kwargs)


class DeepSeekLLM(LLM):
    api_key: str = Field(..., description="DeepSeek API Key")
    model: str = Field(..., description="DeepSeek Model Name")
    api_url: str = Field(..., description="DeepSeek API URL")
    temperature: float = Field(0, description="Temperature for generation")
    max_tokens: int = Field(8000, description="Maximum tokens for generation")
    request_timeout: int = Field(60, description="Timeout for API requests in seconds")

    @property
    def _llm_type(self) -> str:
        return "deepseek"

    async def _call(self, prompt: str, stop: Optional[List[str]] = None) -> str:
        async with AsyncOpenAI(api_key=self.api_key, base_url=self.api_url) as client:
            try:
                response = await client.chat_completion(
                    model=self.model,
                    messages=[
                        {"role": "system", "content": "You are a helpful assistant."},
                        {"role": "user", "content": prompt},
                    ],
                    temperature=self.temperature,
                    max_tokens=self.max_tokens,
                    stream=False,
                    timeout=self.request_timeout,
                )
                return response.choices[0].message.content.strip()
            except Exception as e:
                print(f"Error in LLM call: {str(e)}")
                return f"Error generating summary: {str(e)}"

    async def _agenerate(self, prompts: List[str], stop: Optional[List[str]] = None) -> LLMResult:
        tasks = [self._call(prompt, stop=stop) for prompt in prompts]
        texts = await asyncio.gather(*tasks)
        generations = [[Generation(text=text)] for text in texts]
        return LLMResult(generations=generations)

    def _identifying_params(self) -> Dict[str, Any]:
        return {
            "model": self.model,
            "api_url": self.api_url,
            "temperature": self.temperature,
            "max_tokens": self.max_tokens,
            "request_timeout": self.request_timeout,
        }


async def summarize_pdf(pdf_path: str, llm: DeepSeekLLM) -> dict:
    try:
        loader = PyMuPDFLoader(pdf_path)
        documents = loader.load()

        full_text = "\n\n".join([doc.page_content for doc in documents])[:8000]

        summary_prompt = (
            "Please provide a detailed summary (approximately 500 words) of the following scientific article. "
            "Structure the summary as follows:\n\n"
            "1. Key Findings and Conclusions\n"
            "2. Background and Context\n"
            "3. Methodology\n"
            "4. Results and Discussion\n"
            "At the end of the summary, on a new line, please add:\n"
            "Citation: [First Author's Last Name] - [Article Title]\n\n"
            "Use academic language and maintain scientific accuracy. "
            "Include specific details about experimental methods, key data points, and statistical significance where mentioned.\n\n"
            f"Article text:\n{full_text}"
        )

        try:
            summary_output = await llm._call(summary_prompt)
            return {
                "summary": summary_output,
                "file_name": os.path.basename(pdf_path)
            }
        except Exception as e:
            print(f"Error in LLM call: {str(e)}")
            return {
                "summary": f"Error generating summary: {str(e)}",
                "file_name": os.path.basename(pdf_path)
            }

    except Exception as e:
        print(f"Error processing {pdf_path}: {str(e)}")
        return {
            "summary": f"Error processing document: {str(e)}",
            "file_name": os.path.basename(pdf_path)
        }


async def process_pdfs_and_generate_summaries(pdf_dir: str, output_file: str, llm: DeepSeekLLM) -> List[dict]:
    summaries = []
    pdf_files = [os.path.join(pdf_dir, f) for f in os.listdir(pdf_dir) if f.lower().endswith(".pdf")]

    for pdf_file in pdf_files:  # Process the first PDF (adjust as needed)
        print(f"Processing {pdf_file} ...")
        summary_data = await summarize_pdf(pdf_file, llm)
        if summary_data:
            summaries.append(summary_data)
            with open(output_file, "w", encoding="utf-8") as f:
                json.dump(summaries, f, ensure_ascii=False, indent=2)
            print(f"Progress saved to {output_file}")

    return summaries


async def main():
    api_key = "sk-xxx"  # Replace with your key or environment variable

    llm = DeepSeekLLM(
        api_key=api_key,
        model="gpt-4o",  # Or the appropriate model name
        api_url="https://api.gpt.ge/v1/",  # Corrected API URL
        request_timeout=60,
    )

    PDF_DIR = "./pdf/"
    OUTPUT_JSON = "pdf_summaries.json"

    summaries = await process_pdfs_and_generate_summaries(PDF_DIR, OUTPUT_JSON, llm)


if __name__ == "__main__":
    asyncio.run(main())  # This will now work correctly in Jupyter



async def generate_scientific_review(summaries: List[dict], instructions: str, llm: LLM) -> str:
    combined_summary = "\n\n".join([s["summary"] for s in summaries])
    review_prompt = f"{instructions}\n\nCombined Summaries:\n{combined_summary}"

    try:
        review = await llm._call(review_prompt)
        return review
    except Exception as e:
        print(f"Error generating review: {e}")
        return f"Error generating review: {e}"


Processing ./pdf/Garralda-2024-MYC targeting by OMO-103 in soli.pdf ...
Progress saved to pdf_summaries.json
Processing ./pdf/Hauseman-2022-Structure of the MRAS–SHOC2–PP1C.pdf ...
Progress saved to pdf_summaries.json
Processing ./pdf/DeNicola-2011-Oncogene-induced Nrf2 transcript.pdf ...
Progress saved to pdf_summaries.json
Processing ./pdf/Maldegem-2021-Characterisation of tumour micro.pdf ...
Progress saved to pdf_summaries.json
Processing ./pdf/Stathis-2023-Results of an open-label phase 1b.pdf ...
Progress saved to pdf_summaries.json
Processing ./pdf/Commisso-2013-Macropinocytosis of protein is a.pdf ...
Progress saved to pdf_summaries.json
Processing ./pdf/Jones-2019-SHOC2 phosphatase-dependent RAF dim.pdf ...
Progress saved to pdf_summaries.json
Processing ./pdf/Lavoie-2015-Regulation of RAF protein kinases.pdf ...
Progress saved to pdf_summaries.json
Processing ./pdf/Kuboki-2024-Sotorasib with panitumumab in chem.pdf ...
Progress saved to pdf_summaries.json
Processing ./pdf/Mon

  func_call = functools.partial(ctx.run, func, *args, **kwargs)


Progress saved to pdf_summaries.json
Processing ./pdf/Molina-Arcas-2021-Drugging the undruggable_ ad.pdf ...
Progress saved to pdf_summaries.json
Processing ./pdf/Knudsen-2021-Targeting dual signalling pathway.pdf ...
Progress saved to pdf_summaries.json
Processing ./pdf/Bonsor-2022-Structure of the SHOC2–MRAS–PP1C c.pdf ...
Progress saved to pdf_summaries.json
Processing ./pdf/Chan-2021-PARP inhibitors in melanoma — an exp.pdf ...
Progress saved to pdf_summaries.json
Processing ./pdf/Emery-2009-MEK1 mutations confer resistance to.pdf ...
Progress saved to pdf_summaries.json
Processing ./pdf/Manchado-2016-A combinatorial strategy for tre.pdf ...
Progress saved to pdf_summaries.json
Processing ./pdf/Avraham-2011-Feedback regulation of EGFR signa.pdf ...
Progress saved to pdf_summaries.json
Processing ./pdf/Ryan-2018-Therapeutic strategies to target RAS.pdf ...
Error in LLM call: Request timed out.
Progress saved to pdf_summaries.json


In [27]:
async def main_review(llm: LLM):  # Pass llm as argument
    INPUT_JSON = "pdf_summaries.json"
    try:
        with open(INPUT_JSON, "r", encoding="utf-8") as f:
            summaries = json.load(f)
            for i in summaries:i['summary']=i['summary']+'-ref:'+i['file_name']
    except FileNotFoundError:
        print(f"Error: File '{INPUT_JSON}' not found. Did you run the PDF processing code first?")
        return
    except json.JSONDecodeError:
        print(f"Error: Invalid JSON format in '{INPUT_JSON}'.")
        return

    custom_review_instructions = (
        "Give me a long scientific review based on these summaries. reference to each manuscript by (XXX et.al, year), use time from key filename in the end of the json"
        "Focus on the key findings, trends, and any interesting points. "
        "Keep it concise and easy to read."
    )

    review = await generate_scientific_review(summaries, custom_review_instructions, llm)

    with open("scientific_review.txt", "w", encoding="utf-8") as f:
        f.write(review)
    print("Scientific review saved to scientific_review.txt")


if __name__ == "__main__":
    api_key = "sk-xxx"  # Replace with your key or environment variable
    if api_key is None:
        raise ValueError("DEEPSEEK_API_KEY environment variable not set.")

    llm = DeepSeekLLM(
        api_key=api_key,
        model="gemini-1.5-pro-latest",  # Or the appropriate model name
        api_url="https://api.gpt.ge/v1/",  # Corrected API URL
        request_timeout=1000,
    )
    asyncio.run(main_review(llm))  # This will cause an error if llm is NOT defined in this cell


Scientific review saved to scientific_review.txt
