<a href="https://colab.research.google.com/github/epicskills1/PdfQuery-Application-/blob/main/Pdf_QA.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Install necessary libraries
!pip install transformers torch PyPDF2 langchain langchain-community sentence-transformers chromadb streamlit pyngrok huggingface_hub

# Set Hugging Face token
import os
os.environ['HUGGINGFACE_TOKEN'] = "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"

# Login to Hugging Face
from huggingface_hub import login
login(token=os.getenv('HUGGINGFACE_TOKEN'))


Collecting PyPDF2
  Downloading pypdf2-3.0.1-py3-none-any.whl.metadata (6.8 kB)
Collecting langchain
  Downloading langchain-0.3.2-py3-none-any.whl.metadata (7.1 kB)
Collecting langchain-community
  Downloading langchain_community-0.3.1-py3-none-any.whl.metadata (2.8 kB)
Collecting sentence-transformers
  Downloading sentence_transformers-3.1.1-py3-none-any.whl.metadata (10 kB)
Collecting chromadb
  Downloading chromadb-0.5.11-py3-none-any.whl.metadata (6.8 kB)
Collecting streamlit
  Downloading streamlit-1.39.0-py2.py3-none-any.whl.metadata (8.5 kB)
Collecting pyngrok
  Downloading pyngrok-7.2.0-py3-none-any.whl.metadata (7.4 kB)
Collecting langchain-core<0.4.0,>=0.3.8 (from langchain)
  Downloading langchain_core-0.3.9-py3-none-any.whl.metadata (6.3 kB)
Collecting langchain-text-splitters<0.4.0,>=0.3.0 (from langchain)
  Downloading langchain_text_splitters-0.3.0-py3-none-any.whl.metadata (2.3 kB)
Collecting langsmith<0.2.0,>=0.1.17 (from langchain)
  Downloading langsmith-0.1.131-py

In [None]:
%%writefile pdf_qa_app.py
# Import libraries
import logging
import os
import warnings

import streamlit as st
from langchain.chains import RetrievalQA
from langchain_community.embeddings import SentenceTransformerEmbeddings
from langchain_community.llms import HuggingFacePipeline
from langchain_community.vectorstores import Chroma
from pyngrok import ngrok
from PyPDF2 import PdfReader
from transformers import AutoTokenizer, AutoModelForCausalLM, pipeline

# Suppress Streamlit warnings: the 'streamlit' logger only emits ERROR and above.
logging.getLogger('streamlit').setLevel(logging.ERROR)

# Ignore warnings: this silences every Python warning category for the whole
# process, including deprecation notices raised by the libraries imported above.
warnings.filterwarnings("ignore")


# Hugging Face checkpoint used for answer generation.
checkpoint = "meta-llama/Llama-3.2-1B"


@st.cache_resource
def llm_pipeline():
    """Build the text-generation LLM used by the QA chain.

    The tokenizer and model are loaded inside this cached function so that they
    are created once per server process; Streamlit re-executes the script on
    every interaction, and a module-level load would repeat on each rerun.

    The access token is read from the HUGGINGFACE_TOKEN environment variable
    instead of being hardcoded. When the variable is unset, ``None`` is passed
    and the Hugging Face libraries fall back to any locally stored login.

    Returns:
        HuggingFacePipeline: LangChain wrapper around the transformers pipeline.
    """
    hf_token = os.getenv("HUGGINGFACE_TOKEN")
    tokenizer = AutoTokenizer.from_pretrained(checkpoint, token=hf_token)
    base_model = AutoModelForCausalLM.from_pretrained(checkpoint, token=hf_token)

    pipe = pipeline(
        'text-generation',
        model=base_model,
        tokenizer=tokenizer,
        # max_length counts the prompt tokens as well; the "stuff" chain puts the
        # retrieved context into the prompt, so cap only the newly generated tokens.
        max_new_tokens=256,
        do_sample=True,
        temperature=0.3,
        top_p=0.95
    )
    return HuggingFacePipeline(pipeline=pipe)

@st.cache_resource
def qa_llm():
    """Assemble the retrieval question-answering chain (cached by Streamlit).

    The chain combines the LLM from ``llm_pipeline()`` with a retriever over the
    Chroma store persisted in the ``db`` directory, embedded with the
    ``all-MiniLM-L6-v2`` sentence-transformer model.

    Returns:
        The ``RetrievalQA`` chain, configured to also return source documents.
    """
    language_model = llm_pipeline()
    embedder = SentenceTransformerEmbeddings(model_name="all-MiniLM-L6-v2")
    # NOTE(review): this opens whatever is already persisted under "db";
    # nothing in this file writes documents into that store.
    vector_store = Chroma(persist_directory="db", embedding_function=embedder)
    return RetrievalQA.from_chain_type(
        llm=language_model,
        chain_type="stuff",
        retriever=vector_store.as_retriever(),
        return_source_documents=True,
    )

def extract_text_from_pdf(pdf_file):
    """Return the concatenated text of every page in a PDF.

    Args:
        pdf_file: Anything ``PdfReader`` accepts (here, the Streamlit upload).

    Returns:
        str: Page texts joined in page order with no separator. Pages for which
        no text is extracted contribute an empty string.
    """
    pdf_reader = PdfReader(pdf_file)
    # `or ""` guards against a page whose extract_text() yields None, which would
    # otherwise raise TypeError during concatenation. A single join avoids
    # rebuilding the string on every page.
    return "".join(page.extract_text() or "" for page in pdf_reader.pages)

def process_answer(instruction):
    """Answer a question with the retrieval-QA chain.

    Args:
        instruction: The user's question text.

    Returns:
        tuple: ``(answer, generated_text)`` where ``answer`` is the chain
        output's ``'result'`` entry and ``generated_text`` is the full output
        mapping returned by the chain.
    """
    qa = qa_llm()
    # Calling the chain object directly (Chain.__call__) is deprecated in favour
    # of invoke(); the deprecation warning is hidden by the global warnings
    # filter. RetrievalQA takes its question under the "query" key.
    generated_text = qa.invoke({"query": instruction})
    answer = generated_text['result']
    return answer, generated_text

def main():
    """Render the Streamlit page: PDF upload, question box and answer display."""
    st.title("Search Your PDF 🐦📄")

    with st.expander("About the App"):
        st.markdown("""This is a Generative AI powered Question and Answering app that responds to questions about your PDF File.""")

    uploaded_pdf = st.file_uploader("Upload your PDF", type="pdf")

    if uploaded_pdf is not None:
        with st.spinner("Extracting text from PDF..."):
            # NOTE(review): pdf_text is not used after extraction. Answers are
            # retrieved from the Chroma store under "db" opened by qa_llm(), so
            # the uploaded document is not what gets searched unless it was
            # indexed into that store elsewhere.
            pdf_text = extract_text_from_pdf(uploaded_pdf)
            st.success("Text extracted from PDF!")

        question = st.text_area("Enter your Question")

        if st.button("Ask"):
            # Do not send an empty or whitespace-only prompt to the chain.
            if not question.strip():
                st.warning("Please enter a question before asking.")
                return

            st.info("Your Question: " + question)
            st.info("Your Answer")
            answer, metadata = process_answer(question)
            st.write(answer)
            st.write(metadata)

if __name__ == '__main__':
    main()



Writing pdf_qa_app.py


In [None]:

!pip install --upgrade pyngrok
from pyngrok import ngrok
ngrok.set_auth_token("2n6Wbn6xiLAsyySCmFSos1gG0Ja_68ctrYXtCfCGaJkPZ896u")

# # Terminate all existing ngrok tunnels
ngrok.kill()

# # Create a new ngrok tunnel, explicitly specifying HTTP protocol
public_url = ngrok.connect(8501, proto="http") # Specify protocol as "http"
print(f"Streamlit App URL: {public_url}")

Streamlit App URL: NgrokTunnel: "https://c494-34-106-22-187.ngrok-free.app" -> "http://localhost:8501"


In [None]:

!pip install streamlit
import subprocess

# Run Streamlit app
subprocess.Popen(['streamlit', 'run', 'pdf_qa_app.py'])



<Popen: returncode: None args: ['streamlit', 'run', 'pdf_qa_app.py']>