In [8]:
import io
import os
from urllib.parse import urljoin

import requests
from bs4 import BeautifulSoup
from groq import Groq
from langchain.prompts import PromptTemplate
from langchain.schema import Document
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_community.vectorstores import Chroma
from langchain_huggingface import HuggingFaceEmbeddings
from PyPDF2 import PdfReader

In [None]:
def _fetch_soup(url, timeout):
    """Fetch ``url`` and return it parsed as BeautifulSoup, or ``None`` on failure.

    A network error is reported and swallowed, and a non-200 status is
    skipped silently, so one bad page cannot abort a long crawl.
    """
    try:
        response = requests.get(url, timeout=timeout)
    except requests.RequestException as e:
        print(f"Failed to fetch {url}: {e}")
        return None
    if response.status_code != 200:
        return None
    return BeautifulSoup(response.content, "html.parser")


def scrape_biorxiv_pdf_urls(max_pages=169, timeout=30):
    """Collect PDF download URLs from bioRxiv's "recent" listing pages.

    Walks listing pages 1..max_pages, follows every article title link
    (``a.highwire-cite-linked-title``), and records the href of the article's
    PDF download link (``a.article-dl-pdf-link``) when it ends in ``.pdf``.

    Args:
        max_pages: Number of listing pages to visit. The default keeps the
            previously hard-coded range of pages 1 through 169.
        timeout: Seconds to wait on each HTTP request before giving up.

    Returns:
        list[str]: Absolute PDF URLs in the order they were discovered.
        Pages or articles that cannot be fetched are skipped.
    """
    base_url = "https://www.biorxiv.org"
    pdf_urls = []

    for page_number in range(1, max_pages + 1):
        listing_url = f"{base_url}/content/early/recent?page={page_number}"
        listing_soup = _fetch_soup(listing_url, timeout)
        if listing_soup is None:
            continue

        for article_link in listing_soup.find_all("a", class_="highwire-cite-linked-title"):
            article_href = article_link.get("href")
            if not article_href:
                continue

            # urljoin copes with absolute hrefs and hrefs lacking a leading
            # slash, both of which plain string concatenation would mangle.
            article_soup = _fetch_soup(urljoin(base_url, article_href), timeout)
            if article_soup is None:
                continue

            pdf_link = article_soup.find("a", class_="article-dl-pdf-link")
            if pdf_link is None:
                continue

            pdf_href = pdf_link.get("href")
            if pdf_href and pdf_href.endswith(".pdf"):
                pdf_urls.append(urljoin(base_url, pdf_href))

    return pdf_urls

In [None]:
# The Groq API key is read from the environment so that no credential is
# stored in the notebook. Any key that was previously committed in this cell
# should be treated as compromised and revoked.
api_key = os.environ.get("GROQ_API_KEY")
if not api_key:
    raise RuntimeError(
        "Set the GROQ_API_KEY environment variable before running this notebook."
    )
groq_client = Groq(api_key=api_key)

def load_pdf_from_url(url, timeout=60):
    """Download a PDF and return the concatenated text of all its pages.

    The PDF is parsed from memory rather than via a shared ``temp.pdf`` in the
    working directory, so nothing is left on disk.

    Args:
        url: Address of the PDF to download.
        timeout: Seconds to wait on the HTTP request before giving up.

    Returns:
        str: Text of every page joined without a separator, or ``""`` when
        the download or the parsing fails (a message is printed in both cases).
    """
    try:
        response = requests.get(url, timeout=timeout)
        response.raise_for_status()
    except requests.RequestException as e:
        print(f"Failed to download PDF from {url}: {str(e)}")
        return ""

    try:
        reader = PdfReader(io.BytesIO(response.content))
        # `or ""` guards against a page yielding no text object at all.
        return "".join(page.extract_text() or "" for page in reader.pages)
    except Exception as e:
        # Deliberately broad: malformed PDFs surface as many different
        # exception types, and one bad file should not abort a bulk load.
        print(f"Failed to parse PDF from {url}: {e}")
        return ""

def scrape_website(url, timeout=30):
    """Fetch a web page and return the text of its ``<p>`` elements.

    Args:
        url: Address of the page to fetch.
        timeout: Seconds to wait on the HTTP request before giving up, so an
            unresponsive host cannot hang the cell.

    Returns:
        str: Paragraph texts joined with newlines, or ``""`` when the request
        fails (a message is printed).
    """
    try:
        response = requests.get(url, timeout=timeout)
        response.raise_for_status()
    except requests.RequestException as e:
        print(f"Failed to scrape website {url}: {str(e)}")
        return ""

    soup = BeautifulSoup(response.content, "html.parser")
    return "\n".join(para.get_text() for para in soup.find_all("p"))

def load_data():
    """Fetch all source documents, chunk them, and index them in Chroma.

    Sources are two fixed PDFs, every PDF found by
    ``scrape_biorxiv_pdf_urls()``, and two fixed web pages. Each source's text
    is split into 500-character chunks with no overlap, and every chunk is
    stored as a ``Document`` whose ``metadata["source"]`` is its URL. Sources
    that yield no text are skipped.

    Returns:
        The Chroma vector store built from the chunks, persisted under the
        ``chromadb`` directory.

    Raises:
        ValueError: If no source produced any text, so there is nothing to
            index.
    """
    pdf_urls = [
        "https://www.biorxiv.org/content/10.1101/2020.07.28.224253v1.full.pdf",
        "https://www.cartercenter.org/resources/pdfs/health/ephti/library/lecture_notes/health_extension_trainees/generalpathology.pdf"
    ]

    website_urls = [
        "https://en.wikipedia.org/wiki/Pathology",
        "https://www.mcgill.ca/pathology/about/definition#:~:text=Pathology%20is%20a%20branch%20of,the%20whole%20body%20(autopsy)."
    ]

    pdf_urls.extend(scrape_biorxiv_pdf_urls())

    # Pair each URL with its loader so PDFs and web pages share one loop.
    # PDFs still come first, which keeps the original document order.
    sources = [(url, load_pdf_from_url) for url in pdf_urls]
    sources += [(url, scrape_website) for url in website_urls]

    # The splitter's settings never change, so build it once, not per URL.
    text_splitter = RecursiveCharacterTextSplitter(chunk_size=500, chunk_overlap=0)

    documents_with_metadata = []
    for url, loader in sources:
        data = loader(url)
        if not data:
            continue
        documents_with_metadata.extend(
            Document(page_content=split, metadata={"source": url})
            for split in text_splitter.split_text(data)
        )

    # Fail early with a clear message instead of handing Chroma an empty list.
    if not documents_with_metadata:
        raise ValueError("No documents could be loaded; there is nothing to index.")

    embedding_model = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")

    # exist_ok avoids the check-then-create race of a separate exists() test.
    os.makedirs("chromadb", exist_ok=True)

    # NOTE(review): re-running this against an existing "chromadb" directory
    # presumably adds the same chunks again - confirm, and consider reusing
    # the persisted store instead of rebuilding it.
    vectorstore = Chroma.from_documents(
        documents=documents_with_metadata,
        embedding=embedding_model,
        persist_directory="chromadb"
    )

    return vectorstore


# Build the vector index when this cell runs. load_data() triggers the full
# bioRxiv crawl plus all downloads, so this is the expensive step.
# query_rag() reads this module-level `vectorstore`.
vectorstore = load_data()

In [None]:
def query_rag(question, model="llama3-8b-8192"):
    """Answer ``question`` using retrieved context and a Groq chat model.

    Retrieves relevant chunks from the module-level ``vectorstore``, builds a
    prompt from them, and sends it through the module-level ``groq_client``.
    Each chunk is preceded by its source URL: the prompt tells the model to
    quote where the information came from, which it can only do if the URL is
    part of the context.

    Args:
        question: The user's question.
        model: Groq model identifier to use for the completion.

    Returns:
        str: The content of the first completion choice.
    """
    template = """
    Use the following pieces of context to answer the question at the end.
    If you don't know the answer, just say that you don't know; don't try to make up an answer.
    Keep the answer as concise as possible, preferably in three sentences.
    Quote the url the info came from.
    {context}
    Question: {question}
    Answer:
    """

    retriever = vectorstore.as_retriever()
    # `invoke` is the current retriever entry point and replaces the
    # deprecated `get_relevant_documents`.
    context_docs = retriever.invoke(question)
    context_text = "\n".join(
        f"Source: {doc.metadata.get('source', 'unknown')}\n{doc.page_content}"
        for doc in context_docs
    )

    prompt = template.format(context=context_text, question=question)

    response = groq_client.chat.completions.create(
        messages=[
            {"role": "system", "content": "You are a helpful assistant."},
            {"role": "user", "content": prompt}
        ],
        model=model
    )

    return response.choices[0].message.content


# Smoke-test the pipeline end to end with a sentence-completion prompt.
# The prompt text is kept verbatim, spelling included.
question = "Finish this sentance, The study of pathology, including the detailed examination of the body,"
answer = query_rag(question)
print(f"Question: {question}")
print(f"Answer: {answer}")