In [19]:
# Core dependencies. langchain-groq and python-dotenv are imported by the URL
# question-answering cell, and WebBaseLoader relies on beautifulsoup4; none of the
# three is named in any other install cell of this notebook.
%pip install langchain langchain_community unstructured langchain-chroma langchain-groq python-dotenv beautifulsoup4



In [1]:
# %pip installs into the environment of the running kernel. The extras spec is
# quoted so the shell never treats the brackets as a glob pattern; it already
# implies unstructured==0.15.0, so that pin is not repeated separately.
%pip install langchain-text-splitters==0.2.2 langchain-huggingface==0.0.3 "unstructured[pdf]==0.15.0" nltk==3.8.1



In [11]:
# NOTE(review): --upgrade can move unstructured past the ==0.15.0 pin requested by
# the pinned install cell of this notebook — confirm which version is intended.
%pip install --upgrade --quiet unstructured

In [2]:
import os
import time
from dotenv import load_dotenv
from langchain_huggingface import HuggingFaceEmbeddings
from google.colab import userdata
from langchain_community.document_loaders import WebBaseLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.vectorstores import Chroma
from langchain_groq import ChatGroq
from langchain.chains import RetrievalQAWithSourcesChain


# Load variables from a .env file, if one is present, into the process environment.
load_dotenv()

# Export the Groq API key stored under the Colab secret name 'groqkey'.
# NOTE(review): presumably ChatGroq picks GROQ_API_KEY up from the environment — confirm.
os.environ['GROQ_API_KEY'] = userdata.get('groqkey')


def process_url(url):
    """
    Load a web page, split it into chunks, and index the chunks in Chroma.

    Args:
        url: Address of the page to fetch with WebBaseLoader.

    Returns:
        The Chroma vector store built from the page's chunks, or None when the
        page could not be fetched or produced nothing to index.
    """
    print(f"Loading data from {url}...")
    loader = WebBaseLoader(url)
    try:
        data = loader.load()
    except Exception as exc:
        # main() passes in a URL typed by the user, so a malformed or unreachable
        # address is reported through the None return instead of a traceback.
        print(f"Failed to load data from {url}: {exc}")
        return None

    if not data:
        print(f"Failed to load data from {url}")
        return None

    text_splitter = RecursiveCharacterTextSplitter(
        separators=['\n\n', '\n', '.', ','],
        chunk_size=1000
    )
    print("Splitting text into chunks...")
    docs = text_splitter.split_documents(data)

    if not docs:
        # Zero chunks means there is nothing to embed, so stop before building
        # the store.
        print(f"No text content found at {url}")
        return None

    embedding = HuggingFaceEmbeddings()

    vectorstore = Chroma.from_documents(docs, embedding)
    print("Data indexed and vectorstore created.")
    return vectorstore

def query_vectorstore(query, vectorstore):
    """
    Answer a question against an indexed vector store.

    A Groq chat model (temperature 0.9) is connected to the store's retriever
    through RetrievalQAWithSourcesChain, and the chain is invoked with the
    question.

    Args:
        query: The question text.
        vectorstore: Store object exposing ``as_retriever()``.

    Returns:
        Whatever the chain's ``invoke`` call returns.
    """
    chat_model = ChatGroq(temperature=0.9)
    doc_retriever = vectorstore.as_retriever()
    chain_with_sources = RetrievalQAWithSourcesChain.from_llm(
        llm=chat_model,
        retriever=doc_retriever,
    )
    return chain_with_sources.invoke({"question": query})

def main():
    """
    Run the interactive URL question-answering session.

    Asks for a URL, confirms that it should be processed, indexes it with
    process_url(), and then answers questions with query_vectorstore() until
    the user types 'exit'. Returns early when no URL is given, the user
    declines, or indexing fails.
    """
    url = input("Please enter the URL to process: ").strip()

    if not url:
        # Tell the user why the session ends instead of returning silently.
        print("No URL provided.")
        return

    process_urls_flag = input(f"Do you want to process the URL: {url}? (yes/no): ").strip().lower()
    if process_urls_flag != "yes":
        print("Skipping URL processing.")
        return

    vectorstore = process_url(url)
    # process_url() signals failure with None; compare against None explicitly
    # rather than relying on the truthiness of the store object.
    if vectorstore is None:
        print(f"Failed to process {url}.")
        return
    # The index is built with Chroma, so the message names Chroma.
    print("Processing completed. Chroma index created.")

    while True:
        query = input("Please enter your question (or type 'exit' to quit): ").strip()
        if query.lower() == 'exit':
            print("Exiting...")
            break
        if not query:
            # Blank line: prompt again without querying.
            continue
        result = query_vectorstore(query, vectorstore)
        if result:
            print("\nAnswer:")
            print(result["answer"])
        else:
            print("No answer found.")

# Start the interactive URL session when this code runs as the main module.
if __name__ == "__main__":
    main()



Please enter the URL to process: http://www.stanford.edu/
Do you want to process the URL: http://www.stanford.edu/? (yes/no): yes
Loading data from http://www.stanford.edu/...
Splitting text into chunks...


  from tqdm.autonotebook import tqdm, trange
The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


Data indexed and vectorstore created.
Processing completed. FAISS index created.
Please enter your question (or type 'exit' to quit): what is in the website


Token indices sequence length is longer than the specified maximum sequence length for this model (2122 > 1024). Running this sequence through the model will result in indexing errors



Answer:
The website StanfordUniversity.edu contains information about Stanford University, including academics, research, health care, online learning, admission, financial aid, student life, and recreation & wellness. There is a sponsored research budget of $1.98 Billion and a focus on precision health through Stanford Medicine, Stanford Health Care, and Stanford Children’s Health. The website also includes news, stories about people and research, information on science & engineering, and a list of awards.


Please enter your question (or type 'exit' to quit): what are courses by stanford

Answer:
Stanford University offers a variety of online courses and specializations through the Stanford Center for Professional Development, Stanford Online, Stanford Lagunita, iTunes U, and other platforms. However, the specific courses are not mentioned in the provided document. To find specific courses, you may need to visit the Stanford Engineering online learning websites provided.


Please en

In [3]:
# PDF parsing backend for the PyMuPDFLoader used in the PDF question-answering cell.
%pip install pymupdf



In [4]:
import os
from langchain_community.document_loaders import PyMuPDFLoader
from langchain_text_splitters import CharacterTextSplitter
from langchain_huggingface import HuggingFaceEmbeddings
from langchain_chroma import Chroma
from langchain_groq import ChatGroq
from langchain.chains import RetrievalQA

# Read the Groq key from the Colab secret named 'groqkey'. The call must not be
# wrapped in quotes: a quoted version exports the text "userdata.get('groqkey')"
# as the API key instead of the secret's value.
from google.colab import userdata

GROQ_API_KEY = userdata.get('groqkey')
os.environ["GROQ_API_KEY"] = GROQ_API_KEY

# Path of the PDF to index.
# NOTE(review): absolute /content path, presumably a Colab upload location — confirm.
pdf_path = "/content/a.pdf"

# Load the PDF into LangChain documents.
loader = PyMuPDFLoader(pdf_path)
documents = loader.load()

# Splitter configured with chunk_size=2000 and chunk_overlap=500.
text_splitter = CharacterTextSplitter(
    chunk_size=2000,
    chunk_overlap=500
)

# Chunked documents that get embedded and indexed below.
text_chunks = text_splitter.split_documents(documents)

# Embedding model with the library's defaults (no model name is passed).
embedding = HuggingFaceEmbeddings()

# Build a Chroma store from the chunks, using "doc_db" as its persist directory.
# NOTE(review): re-running this cell presumably adds the same chunks to the
# existing doc_db data again — confirm, and clear the directory if duplicates matter.
persist_directory = "doc_db"
vectorstore = Chroma.from_documents(
    documents=text_chunks,
    embedding=embedding,
    persist_directory=persist_directory
)

# Retriever with default search settings over the store.
retriever = vectorstore.as_retriever()

# Groq chat model used to answer questions; temperature is set to 0.
llm = ChatGroq(
    model="llama-3.1-8b-instant",
    temperature=0
)

# Retrieval QA chain using the "stuff" chain type. return_source_documents=True
# requests the retrieved documents alongside the answer; chatbot() only reads
# the "result" key of the response.
qa_chain = RetrievalQA.from_chain_type(
    llm=llm,
    chain_type="stuff",
    retriever=retriever,
    return_source_documents=True
)

def chatbot():
    """
    Run an interactive question loop over the indexed PDF.

    Reads questions from standard input and prints the answer produced by the
    module-level ``qa_chain`` until the user types 'exit' (case-insensitive).
    Blank input is ignored rather than sent to the chain.
    """
    print("Hello! I can help you answer questions from the PDF. Type 'exit' to quit.")
    while True:
        query = input("\nYour question: ").strip()
        if query.lower() == "exit":
            print("Goodbye!")
            break
        if not query:
            # Nothing was asked: skip the chain call and prompt again.
            continue
        response = qa_chain.invoke({"query": query})
        # Fall back to an apology when the response carries no "result" key.
        answer = response.get("result", "Sorry, I couldn't find an answer.")
        print("\nAnswer:", answer)

# Start the PDF chat loop when this code runs as the main module.
if __name__ == "__main__":
    chatbot()


Hello! I can help you answer questions from the PDF. Type 'exit' to quit.

Your question: what is the us gdp of manufacturing in 2011

Answer: According to the table, the US GDP of manufacturing in 2011 is 5581942 million dollars.

Your question: exit
Goodbye!
