In [45]:
from langchain.document_loaders import PyPDFLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.embeddings import HuggingFaceEmbeddings
import torch

from langchain.vectorstores import FAISS
from langchain.llms import HuggingFacePipeline
from transformers import pipeline

In [47]:
import os
from getpass import getpass

from huggingface_hub import login

# Never hard-code access tokens in a notebook: they end up in version control
# and in every shared copy. The token that was previously committed in this
# cell must be treated as leaked and revoked in the Hugging Face settings.
# Read the token from the HF_TOKEN environment variable, or prompt for it
# (without echoing) when the variable is not set.
hf_token = os.environ.get("HF_TOKEN") or getpass("Hugging Face token: ")
login(hf_token)

In [2]:
# Check whether PyTorch can see a CUDA device; later cells place models on "cuda".
torch.cuda.is_available()

True

In [3]:
# Load the book PDF; "hp3.pdf" is resolved relative to the notebook's working directory.
loader = PyPDFLoader("hp3.pdf")
# `pages` is the loader's output — presumably one Document per PDF page (confirm).
pages = loader.load()

In [4]:
# Chunking strategy: chunk_size=512 with chunk_overlap=100 between neighbouring
# chunks (units are those of the splitter's length function — characters by
# default, TODO confirm).
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=512,
    chunk_overlap=100,
)

In [12]:
# Split the loaded pages into chunks using the splitter configured above.
chunks = text_splitter.split_documents(pages)

In [13]:
# How many chunks the splitter produced.
len(chunks)

1585

In [14]:
# Spot-check one chunk to eyeball its metadata and text.
# NOTE(review): the hard-coded index assumes at least 1581 chunks; it raises
# IndexError if the PDF or the chunking parameters change.
chunks[1580]

Document(metadata={'producer': 'calibre (0.8.8) [http://calibre-ebook.com]', 'creator': 'calibre (0.8.8) [http://calibre-ebook.com]', 'creationdate': '2021-07-21T13:07:31+00:00', 'author': 'J. K. Rowling & Mary Grandpré', 'keywords': 'Fiction, Large Type Books, Juvenile Fiction, Magic, Wizards, Schools, England, Humorous Stories, Fantasy & Magic, School & Education', 'moddate': '2021-07-21T13:07:31+00:00', 'title': 'Harry Potter 3 - Harry Potter and the Prisoner of Azkaban', 'source': 'hp3.pdf', 'total_pages': 379, 'page': 377, 'page_label': '378'}, page_content='Ron’s eyes widened. The minute owl was still hooting\nexcitedly.\n“Keep him?” he said uncertainly. He looked closely at the\nowl for a moment; then, to Harry’s and Hermione’s great\nsurprise, he held him out for Crookshanks to sniﬀ.\n“What do’you reckon?” Ron asked the cat. “Deﬁnitely an\nowl?”\nCrookshanks purred.\n“That’s good enough for me,” said Ron happily. “He’s\nmine.”\nHarry read and reread the letter from Sirius all t

In [34]:
# Run the embedding model on the GPU when one is available and fall back to
# CPU otherwise, so this cell no longer fails on machines without CUDA.
# NOTE(review): the e5 model card asks for "query: " / "passage: " prefixes on
# input texts; no prefix is added in the cells shown here — confirm whether
# that is intended.
model_kwargs = {'device': 'cuda' if torch.cuda.is_available() else 'cpu'}
embedding_model = HuggingFaceEmbeddings(
    model_name="intfloat/e5-base-v2",
    model_kwargs=model_kwargs,
)

In [None]:
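# Unused draft: `docs` is not defined in the visible cells; the chunk texts are embedded via `chunk_list` further down.
#doc_embeddings = embedding_model.embed_documents([doc.page_content for doc in docs])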

In [35]:
# Plain-text content of every chunk, in the same order as `chunks`.
chunk_list = [doc.page_content for doc in chunks]

In [36]:
# Embed all chunk texts with the embedding model; the result is indexed per
# chunk in the next cell, so it is expected to hold one vector per text.
chunk_embeddings = embedding_model.embed_documents(chunk_list)

In [37]:
# Dimensionality check: length of the embedding vector of the second chunk.
len(chunk_embeddings[1])

768

In [38]:
# Build the FAISS index from the vectors already held in `chunk_embeddings`
# instead of FAISS.from_documents, which would embed every chunk a second time.
# Texts, vectors and metadata are all taken in `chunks` order so they stay aligned;
# the assert catches stale notebook state (e.g. `chunks` re-created without
# re-running the embedding cell).
assert len(chunk_list) == len(chunk_embeddings) == len(chunks), \
    "chunk_list / chunk_embeddings are out of sync with chunks; re-run the embedding cells"
vector_db = FAISS.from_embeddings(
    text_embeddings=list(zip(chunk_list, chunk_embeddings)),
    embedding=embedding_model,  # still needed to embed queries at search time
    metadatas=[doc.metadata for doc in chunks],
)

In [39]:
# Persist the FAISS index locally under "faiss_index_db" (relative path).
vector_db.save_local("faiss_index_db")

In [58]:
# Question used for retrieval and, further down, as the user question in the prompt.
query = "Who was Sirius Black?"
# NOTE(review): `query_embedding` is not read by any later cell shown here —
# the similarity search is given the raw `query` string instead.
query_embedding = embedding_model.embed_query(query)

In [59]:
# Retrieve the top 3 (k=3) chunks that the vector store ranks as most similar to `query`.
retrieved_similar_chunks = vector_db.similarity_search(query, k=3)

In [60]:
# Display the retrieved chunks to sanity-check them before they go into the prompt.
retrieved_similar_chunks

[Document(id='f7396f94-d364-49a3-b3b7-f6ed2c8c679f', metadata={'producer': 'calibre (0.8.8) [http://calibre-ebook.com]', 'creator': 'calibre (0.8.8) [http://calibre-ebook.com]', 'creationdate': '2021-07-21T13:07:31+00:00', 'author': 'J. K. Rowling & Mary Grandpré', 'keywords': 'Fiction, Large Type Books, Juvenile Fiction, Magic, Wizards, Schools, England, Humorous Stories, Fantasy & Magic, School & Education', 'moddate': '2021-07-21T13:07:31+00:00', 'title': 'Harry Potter 3 - Harry Potter and the Prisoner of Azkaban', 'source': 'hp3.pdf', 'total_pages': 379, 'page': 176, 'page_label': '177'}, page_content='Black’s capable of. …”\n“Do you know, I still have trouble believing it,” said\nMadam Rosmerta thoughtfully. “Of all the people to go over\nto the Dark Side, Sirius Black was the last I’d have thought\n… I mean, I remember him when he was a boy at Hogwarts.\nIf you’d told me then what he was going to become, I’d have\nsaid you’d had too much mead.”\n“You don’t know the half of it, Ro

In [48]:
# Text-generation pipeline for google/gemma-2-2b-it, loaded with bfloat16 weights.
# Use the GPU when one is present and fall back to CPU instead of failing on
# machines without CUDA.
pipe = pipeline(
    "text-generation",
    model="google/gemma-2-2b-it",
    model_kwargs={"torch_dtype": torch.bfloat16},
    device="cuda" if torch.cuda.is_available() else "cpu",  # replace with "mps" to run on a Mac device
)


config.json:   0%|          | 0.00/838 [00:00<?, ?B/s]

model.safetensors.index.json:   0%|          | 0.00/24.2k [00:00<?, ?B/s]

Downloading shards:   0%|          | 0/2 [00:00<?, ?it/s]

model-00001-of-00002.safetensors:   0%|          | 0.00/4.99G [00:00<?, ?B/s]

model-00002-of-00002.safetensors:   0%|          | 0.00/241M [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/187 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/47.0k [00:00<?, ?B/s]

tokenizer.model:   0%|          | 0.00/4.24M [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/17.5M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/636 [00:00<?, ?B/s]

Device set to use cuda


In [61]:
# Concatenate the retrieved chunk texts, each followed by a newline, into a
# single context string for the prompt.
context = "".join(doc.page_content + "\n" for doc in retrieved_similar_chunks)

In [62]:
# Book title corrected to match the PDF's own metadata
# ("Harry Potter and the Prisoner of Azkaban"); it was misspelled "Prisoners".
book_name = "Prisoner of Azkaban"
user_question = query
# Prompt layout: instruction line, retrieved context, then the question.
prompt = f'''
Use the following context from Harry Potter : {book_name} to answer the questions

{context}

Question : {user_question}
'''

In [63]:
# Single-turn chat: the whole retrieval-augmented prompt is sent as one user message.
messages = [{"role": "user", "content": prompt}]

# Cap the reply at 256 newly generated tokens.
outputs = pipe(messages, max_new_tokens=256)

# The reply text is read from the last message of the first result's
# "generated_text" list, with surrounding whitespace removed.
assistant_response = outputs[0]["generated_text"][-1]["content"].strip()
print(assistant_response)

Sirius Black is a character in the Harry Potter series. He is a close friend of James Potter and Remus Lupin, and is also Harry's godfather. 

The context you provided reveals that Sirius was a former friend of Harry's parents, and was wrongly accused of being a dangerous criminal.
