In [2]:
import os 
import glob
from dotenv import load_dotenv
import gradio as gr

In [3]:
from langchain.document_loaders import DirectoryLoader, TextLoader, PyPDFLoader
from langchain.text_splitter import CharacterTextSplitter
from langchain.schema import Document
from langchain_openai import OpenAIEmbeddings, ChatOpenAI
from langchain_chroma import Chroma
import numpy as np
from sklearn.manifold import TSNE
import plotly.graph_objects as go

In [4]:
import ollama
from langchain_community.llms import Ollama
from langchain.memory import ConversationBufferMemory
from langchain.chains import ConversationalRetrievalChain

In [5]:
import re
from pdfminer.high_level import extract_text
from langchain.docstore.document import Document

In [6]:
def clean_pdf_text_linewise(pdf_path, repeat_threshold=5):
    """Extract the text of a PDF and clean it one line at a time.

    Processing order:
      1. Extract raw text with pdfminer's ``extract_text`` and split it into
         whitespace-stripped lines.
      2. Drop blank lines and every line whose stripped text occurs more than
         ``repeat_threshold`` times in the document (presumably repeated
         boilerplate such as page headers/footers).
      3. Remove a leading number followed by an optional ``.``, ``)`` or ``-``
         and whitespace, then remove ``http(s)://`` and ``www.`` URLs.
      4. Drop lines that step 3 left empty (e.g. URL-only lines) and trim the
         whitespace a removed URL leaves at the edges of a line.

    Args:
        pdf_path: Path of the PDF; passed unchanged to ``extract_text``.
        repeat_threshold: A line is discarded when it appears more than this
            many times. Defaults to 5.

    Returns:
        str: The surviving lines joined by single newlines, with no blank
        lines and no leading/trailing whitespace.
    """
    raw_text = extract_text(pdf_path)
    lines = [line.strip() for line in raw_text.splitlines()]

    # Count how often each stripped line occurs across the whole document.
    line_freq = {}
    for line in lines:
        line_freq[line] = line_freq.get(line, 0) + 1
    repeating_lines = {
        line for line, count in line_freq.items() if count > repeat_threshold
    }

    cleaned_lines = []
    for line in lines:
        if not line or line in repeating_lines:
            continue
        # Note: this also strips a leading number that is ordinary text
        # (e.g. "2023 results" -> "results"), not only list numbering.
        line = re.sub(r'^\d+[\.\)\-]?\s+', '', line)
        line = re.sub(r'https?://\S+', '', line)
        line = re.sub(r'www\.\S+', '', line)
        # URL removal can leave edge whitespace or an empty line; neither
        # should reach the output.
        line = line.strip()
        if line:
            cleaned_lines.append(line)

    return "\n".join(cleaned_lines)

In [7]:
# Read the source PDF, clean its text, and wrap the result in a single
# Document whose metadata records the file it came from.
pdf_path = "AI_train.pdf"
cleaned_text = clean_pdf_text_linewise(pdf_path)
doc = Document(
    page_content=cleaned_text,
    metadata={"source": pdf_path},
)

In [None]:
# Display the cleaned Document to eyeball the extracted text and metadata.
doc

In [9]:
# Split the cleaned document on newlines into chunks sized with len():
# chunk_size=1000 characters, with 200 characters of overlap between chunks.
text_splitter = CharacterTextSplitter(
    separator="\n",
    chunk_size=1000,
    chunk_overlap=200,
    length_function=len,
)
chunks = text_splitter.split_documents([doc])

In [10]:
# Number of chunks produced by the splitter.
len(chunks)

85

In [None]:
# Preview only the first few chunks; dumping the full list floods the
# notebook output and bloats the saved file.
chunks[:3]

In [None]:
pip install sentence-transformers

In [11]:
from langchain_community.embeddings import HuggingFaceEmbeddings
# Embedding model used for the vector store: sentence-transformers/all-MiniLM-L6-v2.
# NOTE(review): this cell's saved output echoes the line below, which looks like
# a deprecation warning for this constructor — confirm and plan a migration.
embeddings = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")

  embeddings = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")


In [12]:
# Directory in which the Chroma vector store is persisted.
db_name = "vector_db1"

In [14]:
# If a persisted store already exists at db_name, open it and delete its
# collection so the store built next does not add to old data.
if os.path.exists(db_name):
    Chroma(
        persist_directory=db_name,
        embedding_function=embeddings,
    ).delete_collection()

In [15]:
# Embed every chunk and persist the resulting vectors under db_name.
vectorstore = Chroma.from_documents(
    documents=chunks,
    embedding=embeddings,
    persist_directory=db_name,
)
# Report how many entries the underlying collection holds.
print(f"Vectorstore created with {vectorstore._collection.count()} documents")

Vectorstore created with 85 documents


In [16]:
# Fetch one stored embedding from the underlying collection and report its
# length, i.e. the dimensionality of the vectors.
collection = vectorstore._collection
sample_embedding = collection.get(
    limit=1,
    include=["embeddings"],
)["embeddings"][0]
dimensions = len(sample_embedding)
print(f"The vectors have {dimensions:,} dimensions")

The vectors have 384 dimensions


In [17]:
from langchain_core.prompts import PromptTemplate

# Prompt template for answering from the PDF. It tells the model to answer only
# from the excerpts substituted for {context}, to reply with a fixed fallback
# sentence when the answer is not in them, and to quote excerpts where useful.
# {question} is replaced by the user's question.
custom_pdf_prompt = PromptTemplate.from_template("""
You are a helpful and concise assistant. The user has a question related to the contents of a PDF document.

Use **only** the extracted excerpts below to answer the question. Do not use any outside knowledge, and avoid speculation. 
If the answer is not explicitly stated or cannot be clearly inferred from the excerpts, respond with:
"The information is not available in the provided document."

If applicable, reference or quote relevant excerpts to support your answer.

-------------------- EXCERPT FROM PDF --------------------
{context}
--------------------- END OF EXCERPT ---------------------

User's Question: {question}

Answer (based solely on the PDF content):
""")


In [18]:
# Assemble the conversational RAG pipeline: the llama3.2 model via Ollama, a
# retriever over the vector store (search_kwargs k=10), and a buffer memory
# that keeps the conversation under the "chat_history" key.
llm = Ollama(model="llama3.2")
retriever = vectorstore.as_retriever(search_kwargs={"k": 10})
memory = ConversationBufferMemory(
    memory_key="chat_history",
    return_messages=True,
)
rag_chain = ConversationalRetrievalChain.from_llm(
    llm=llm,
    retriever=retriever,
    memory=memory,
    # Replace the default document-combining prompt with the PDF-only prompt.
    combine_docs_chain_kwargs={"prompt": custom_pdf_prompt},
)

  llm = Ollama(model="llama3.2")
  memory = ConversationBufferMemory(memory_key="chat_history", return_messages=True)


In [19]:
# Optional smoke test for rag_chain, currently disabled.
# NOTE(review): the saved output under this cell presumably comes from an
# earlier run with these lines enabled — re-enable them or clear the output.
# query = "Can you describe ebay in a few sentences"
# result = rag_chain.invoke({"question":query})
# print(result["answer"])

eBay is a marketplace that allows users to offer, sell, and buy goods and services in various geographic locations using a variety of pricing formats. It provides a platform for buyers and sellers to interact with each other, but eBay itself does not act as an intermediary in the sale process.


In [20]:
# Fresh conversation memory plus a second chain configured exactly like
# rag_chain (same llm, retriever and prompt).
# NOTE(review): this repeats the rag_chain setup — consider keeping only one.
memory = ConversationBufferMemory(
    memory_key='chat_history',
    return_messages=True,
)
conversation_chain = ConversationalRetrievalChain.from_llm(
    llm=llm,
    retriever=retriever,
    memory=memory,
    combine_docs_chain_kwargs={"prompt": custom_pdf_prompt},
)