In [51]:
from langchain.document_loaders import DirectoryLoader, PyPDFLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_huggingface import HuggingFaceEmbeddings
from pinecone import Pinecone, ServerlessSpec,CloudProvider,VectorType,AwsRegion
from langchain_pinecone import PineconeVectorStore
from dotenv import load_dotenv
import os

load_dotenv()

True

In [13]:
def load_documents_from_pdf(directory_path):
    """Load the PDF files found under ``directory_path`` as documents.

    The directory is searched with the ``**/*.pdf`` glob and each match is
    read with ``PyPDFLoader``.

    Args:
        directory_path: Path of the directory that contains the PDF files.

    Returns:
        The list of documents produced by ``DirectoryLoader.load()``.
    """
    pdf_loader = DirectoryLoader(directory_path, glob="**/*.pdf", loader_cls=PyPDFLoader)
    return pdf_loader.load()


In [14]:
# Load the PDFs found under ../data/ (path is relative to the notebook's working directory).
docs = load_documents_from_pdf("../data/")

In [76]:
def split_into_chunks(documents, chunk_size=200, chunk_overlap=100, separators=None):
    """Split documents into smaller, overlapping chunks.

    Args:
        documents: Documents to split, e.g. the output of
            ``load_documents_from_pdf``.
        chunk_size: Target maximum size of a chunk, passed to
            ``RecursiveCharacterTextSplitter``. Defaults to 200.
        chunk_overlap: Overlap between consecutive chunks, passed to
            ``RecursiveCharacterTextSplitter``. Defaults to 100.
        separators: Separators tried by the splitter, in order. Defaults to
            paragraph breaks followed by line breaks (``["\\n\\n", "\\n"]``).

    Returns:
        The list of chunk documents returned by ``split_documents``.
    """
    # Resolve the default here instead of in the signature so callers never
    # share (and accidentally mutate) one list object.
    if separators is None:
        separators = ["\n\n", "\n"]

    splitter = RecursiveCharacterTextSplitter(
        separators=list(separators),
        chunk_size=chunk_size,
        chunk_overlap=chunk_overlap,
    )
    return splitter.split_documents(documents)

In [77]:
# Break the loaded documents into small overlapping chunks for embedding.
chunks = split_into_chunks(docs)

In [78]:
# Sanity check: how many chunks the splitter produced.
len(chunks)

4860

In [79]:
# Inspect the first chunk to see its text and the metadata attached to it.
chunks[0]

Document(metadata={'producer': 'Pdftools SDK', 'creator': 'PyPDF', 'creationdate': '', 'moddate': '2024-09-19T22:02:52+00:00', 'source': '..\\data\\Explorations in Artificial Intelligence and Machine Learning, CRC Press.pdf', 'total_pages': 178, 'page': 0, 'page_label': '1'}, page_content='Explorations in Artificial \nIntelligence and M achine \nLearning\nA CRC Press FreeBook')

In [52]:
def load_embeddings():
    """Build the embedding model used to vectorise text.

    Returns:
        A ``HuggingFaceEmbeddings`` instance configured with the
        ``sentence-transformers/all-MiniLM-L6-v2`` model.
    """
    model_id = 'sentence-transformers/all-MiniLM-L6-v2'
    return HuggingFaceEmbeddings(model_name=model_id)

    

In [53]:
# Show the embedding wrapper's configuration.
# Note: this calls load_embeddings() only for display; the instance used by
# the rest of the notebook is created in a later cell.
print(load_embeddings())

To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development
Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


model_name='sentence-transformers/all-MiniLM-L6-v2' cache_folder=None model_kwargs={} encode_kwargs={} query_encode_kwargs={} multi_process=False show_progress=False


In [54]:
# Embedding model instance shared by the rest of the notebook.
embeddings =load_embeddings()

# Smoke test: embed one sample query so the vector length can be checked.
response = embeddings.embed_query("What is LLM?")

In [56]:
# Length of the sample embedding; the Pinecone index dimension must match it.
len(response)

384

In [57]:
# Name of the Pinecone index used throughout this notebook.
index_name="aibot"

In [58]:
# Read the Pinecone key from the environment (populated by load_dotenv() above).
PINECONE_API_KEY = os.getenv("PINECONE_API_KEY")
if not PINECONE_API_KEY:
    # Assigning None to os.environ raises an opaque
    # "TypeError: str expected, not NoneType"; fail with an actionable message instead.
    raise RuntimeError(
        "PINECONE_API_KEY is not set. Add it to your environment or .env file."
    )
# Re-export so libraries that read the key from the environment can find it.
os.environ["PINECONE_API_KEY"] = PINECONE_API_KEY

In [59]:
def create_pinecone_index(index_name, dimension=384):
    """Create a dense serverless Pinecone index unless it already exists.

    The client is built with the module-level ``PINECONE_API_KEY``. The index
    is created on AWS in ``us-east-1``. A status message is printed in both
    the "created" and the "already exists" case.

    Args:
        index_name: Name of the index to create.
        dimension: Dimensionality of the vectors the index will store. It must
            equal the length of the vectors produced by the embedding model.
            Defaults to 384, the length observed for the embedding model used
            in this notebook.

    Returns:
        None.
    """
    pc = Pinecone(api_key=PINECONE_API_KEY)

    # Guard clause: nothing to do when the index is already there.
    if pc.has_index(index_name):
        print(f"Index {index_name} already exists.")
        return

    pc.create_index(
        name=index_name,
        dimension=dimension,
        spec=ServerlessSpec(
            cloud=CloudProvider.AWS,
            region=AwsRegion.US_EAST_1,
        ),
        vector_type=VectorType.DENSE,
    )
    print(f"Index {index_name} created successfully.")



In [110]:
# Create the index on first run; later runs only report that it already exists.
create_pinecone_index(index_name)

Index aibot created successfully.


In [105]:
from tqdm import tqdm

# Diagnostic pass: embed every text one by one and record which ones fail.
# NOTE(review): `cleaned_texts` is not defined in any visible cell, so this
# cell fails on a fresh kernel. It presumably held the page_content strings of
# the cleaned chunks; confirm and define it explicitly.
vectors = []          # embeddings that were computed successfully
failed_indexes = []   # (index, text, error message) for every failure

for i in tqdm(range(0, len(cleaned_texts))):
    try:
        vec = embeddings.embed_query(cleaned_texts[i])
        vectors.append(vec)
    except Exception as e:
        # Keep going so that all failing inputs are collected in one pass.
        failed_indexes.append((i, cleaned_texts[i], str(e)))


100%|██████████| 4826/4826 [02:29<00:00, 32.31it/s]


In [106]:
# Summarise the failures collected in `failed_indexes`: total count, then the
# first five entries with the error and an 80-character preview of the text.
print(f"❌ Total failed embeddings: {len(failed_indexes)}")
for idx, text, err in failed_indexes[:5]:
    print(f"[{idx}] Error: {err} | Text: {repr(text[:80])}")


❌ Total failed embeddings: 0


In [99]:
# Debugging aid: embed texts in order and stop at the first one that raises,
# printing its index, the error and a 60-character preview.
# NOTE(review): depends on `cleaned_texts`, which is not defined in any
# visible cell.
for i, text in enumerate(cleaned_texts):
    try:
        vec = embeddings.embed_query(text)
    except Exception as e:
        print(f"❌ Error at index {i}: {e} | Text: {repr(text[:60])}")
        break

❌ Error at index 1307: TextEncodeInput must be Union[TextInputSequence, Tuple[InputSequence, InputSequence]] | Text: 'we provide a quick overview of this crucial topic.\nThe notat'


In [111]:
from langchain_core.documents import Document
import unicodedata

def clean_metadata(metadata: dict) -> dict:
    """Return a copy of ``metadata`` with its string values sanitised.

    String values are round-tripped through UTF-8 with un-encodable
    characters dropped, then stripped of surrounding whitespace. Values of
    any other type are copied unchanged. The input dict is not modified.

    Args:
        metadata: Mapping of metadata keys to values.

    Returns:
        A new dict with the same keys and the sanitised values.
    """
    def _scrub(value):
        # Only strings need sanitising; everything else passes through.
        if not isinstance(value, str):
            return value
        return value.encode("utf-8", "ignore").decode("utf-8").strip()

    return {key: _scrub(value) for key, value in metadata.items()}



def clean_text(text):
    """Return ``text`` with control and invalid Unicode characters removed.

    Every character whose Unicode general category starts with ``C``
    (control, format, surrogate, private-use, unassigned) is dropped, except
    newline and tab, which are kept. Leading and trailing whitespace is then
    stripped.

    Args:
        text: The value to clean.

    Returns:
        The cleaned string, or ``""`` when ``text`` is not a ``str``.
    """
    if not isinstance(text, str):
        return ""

    # Lone surrogates have category "Cs", so this one filter also removes
    # characters that cannot be encoded as UTF-8. No encode/decode round trip
    # is needed; with the "replace" handler it left literal "?" placeholders
    # in the text instead of removing the bad characters.
    kept = "".join(
        ch for ch in text
        if ch in "\n\t" or unicodedata.category(ch)[0] != "C"
    )
    return kept.strip()

# Keep only chunks whose stripped text is longer than 10 characters, and
# rebuild each one with sanitised text and metadata.
cleaned_docs = [
    Document(
        page_content=clean_text(chunk.page_content),
        metadata=clean_metadata(chunk.metadata),
    )
    for chunk in chunks
    if isinstance(chunk.page_content, str) and len(chunk.page_content.strip()) > 10
]


In [112]:
# Number of chunks that survived the length filter and cleaning.
len(cleaned_docs)

4826

In [113]:
# Embed the cleaned chunks with `embeddings` and store them in the Pinecone
# index named by `index_name`.
# NOTE(review): re-running this cell presumably inserts the same chunks
# again; confirm before re-executing against a populated index.
vectorstore = PineconeVectorStore.from_documents(
    documents=cleaned_docs,
    index_name=index_name,
    embedding=embeddings
)

In [114]:
# Connect to the existing Pinecone index as a vector store.
# Despite its name, `retrieved_docs` is the vector-store handle, not a list of
# documents; it is turned into a retriever in the next cell.

retrieved_docs = PineconeVectorStore.from_existing_index(
    index_name=index_name,
    embedding=embeddings
)

In [125]:
# Similarity-search retriever over the index; k=3 requests three chunks per query.
retriever = retrieved_docs.as_retriever(search_type="similarity", search_kwargs={"k":3})

In [159]:
from langchain_mistralai import ChatMistralAI
from langchain.prompts import ChatPromptTemplate

# System-prompt template for the QA assistant. It tells the model to answer
# only from the supplied context, to reply "I don't know." otherwise, and to
# stay within three sentences. `{context}` is the template placeholder for the
# retrieved text.
prompt = '''
    You are a helpful question answering assistant.
    You will be provided with a question and you need to answer it based on the context provided.
    If you don't know the answer or if the question is out of context,say "I don't know."
    Do not make up the answer or hallucinate. Also don't answer questions that are not related to the context provided.
    Answer in maximum 3 sentences within the context  and keep it concise.
    \n\n
    {context}
'''

In [160]:
# Two-message chat template: the system message carries the instructions and
# `{context}`; the human message carries the user's question via `{input}`.
final_prompt = ChatPromptTemplate.from_messages(
    [
        ("system", prompt),
        ("human", "{input}")
    ]
)

In [149]:
# Read the Mistral key from the environment (populated by load_dotenv() above).
MISTRAL_API_KEY = os.getenv("MISTRAL_API_KEY")
if not MISTRAL_API_KEY:
    # Assigning None to os.environ raises an opaque
    # "TypeError: str expected, not NoneType"; fail with an actionable message instead.
    raise RuntimeError(
        "MISTRAL_API_KEY is not set. Add it to your environment or .env file."
    )
# Re-export so the Mistral client can read the key from the environment.
os.environ["MISTRAL_API_KEY"] = MISTRAL_API_KEY

In [161]:
# Chat model that generates the answers.
# max_tokens=500 presumably caps the length of each generated reply (confirm
# against the ChatMistralAI documentation).
llm = ChatMistralAI(
    model_name="mistral-large-latest",
    max_tokens=500

)

In [162]:
from langchain.chains import create_retrieval_chain
from langchain.chains.combine_documents import create_stuff_documents_chain

In [163]:
# qa_chain: combines the LLM with the prompt template that has the `{context}` slot.
qa_chain = create_stuff_documents_chain(llm=llm, prompt=final_prompt)
# rag_chain: couples the retriever with qa_chain; invoked with {"input": question}.
rag_chain =create_retrieval_chain(retriever,qa_chain)

In [153]:
# First end-to-end query. Printing the whole response shows the retrieved
# context documents next to the generated answer.
response = rag_chain.invoke({"input": "What is stats?"})
print(response)
print(response["answer"])

{'input': 'What is stats?', 'context': [Document(id='7fcfe7a8-4a5e-475f-8517-3066d62dbfa8', metadata={'creationdate': '', 'creator': 'PyPDF', 'page': 13.0, 'page_label': '14', 'producer': 'Skia/PDF m133 Google Docs Renderer', 'source': "..\\data\\Generative AI_ A Beginner's Guide.pdf", 'title': "Generative AI: A Beginner's Guide", 'total_pages': 62.0}, page_content='similar\n \nto\n \nthe\n \ndata\n \nthey\n \nwere\n \ntrained\n \non.\n \nThis\n \nallows\n \nthem\n \nto\n \nperform\n \na\n \nvariety\n \nof\n \nimpressive\n \ntasks,\n \nincluding:'), Document(id='b55a1a10-1d12-4c48-b577-882820549f81', metadata={'creationdate': '', 'creator': 'PyPDF', 'moddate': '2024-09-19T22:02:52+00:00', 'page': 24.0, 'page_label': '25', 'producer': 'Pdftools SDK', 'source': '..\\data\\Explorations in Artificial Intelligence and Machine Learning, CRC Press.pdf', 'total_pages': 178.0}, page_content='in the second example, we have much more data (the results of 100 tosses rather\nthan 10) and so we shou

In [164]:
# Query on a topic the corpus is expected to cover.
response = rag_chain.invoke({"input": "What is NLP?"})
print(response["answer"])

NLPis a subfield of artificial intelligence (AI) that focuses on the interaction between computers and humans through natural language. The goal of NLP is to enable computers to understand, interpret, and generate human language in a way that is both meaningful and useful.


In [165]:
# Off-topic query: checks that the prompt's "I don't know." instruction is followed.
response = rag_chain.invoke({"input": "What is Acne?"})
print(response["answer"])

I don't know.


In [166]:
# Another presumably off-topic query; the full response is printed to inspect
# which chunks the retriever returned for it.
response = rag_chain.invoke({"input": "What is headache meaning?"})
print(response)
print(response["answer"])

{'input': 'What is headache meaning?', 'context': [Document(id='c45997a2-ca4b-4eaf-91bc-2efc22824c75', metadata={'creationdate': '', 'creator': 'PyPDF', 'moddate': '2024-09-19T22:02:52+00:00', 'page': 9.0, 'page_label': '10', 'producer': 'Pdftools SDK', 'source': '..\\data\\Explorations in Artificial Intelligence and Machine Learning, CRC Press.pdf', 'total_pages': 178.0}, page_content='that we label as intelligent by learning from experience. Learning is what gives us ﬂexibility\nin our life; the fact that we can adjust and adapt to new circumstances, and learn new'), Document(id='172f5b51-f67e-4caf-a817-9b85f35ef1c1', metadata={'creationdate': '', 'creator': 'PyPDF', 'moddate': '2024-09-19T22:02:52+00:00', 'page': 172.0, 'page_label': '173', 'producer': 'Pdftools SDK', 'source': '..\\data\\Explorations in Artificial Intelligence and Machine Learning, CRC Press.pdf', 'total_pages': 178.0}, page_content='between a feeling and a thought is. Feeling pain and knowing about pain \nare cert

In [167]:
# Query on a topic the corpus is expected to cover.
response = rag_chain.invoke({"input": "What is prompt engineering?"})
print(response["answer"])

Prompt engineering is the art and science of crafting effective prompts to guide and control the context and specific goals for a model, ultimately influencing the quality and direction of its outputs. It focuses on specific functionalities and requirements.


In [168]:
# Off-topic query: the expected reply is "I don't know."
response = rag_chain.invoke({"input": "What are features of Java?"})
print(response["answer"])

I don't know.


In [169]:
# Covered-topic query with unusual capitalisation ("LLm") in the question.
response = rag_chain.invoke({"input": "What is LLm?"})
print(response["answer"])

LLM stands for Large Language Model. It is a type of artificial intelligence model designed to understand and generate human language based on input data. LLMs are foundational to many generative AI applications.
