****Load and Preprocess Data****

In [1]:
#We'll use text data (e.g., PDFs, Markdown files) and convert them into vector embeddings

In [44]:
# from langchain.document_loaders import TextLoader 
from langchain.document_loaders import PyMuPDFLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter

# # Load text documents
# loader = TextLoader("Crypto_Encryption.pdf")  # Replace with your file
# docs = loader.load()

# Load PDF file
loader = PyMuPDFLoader("Fantastic family hotel_review.pdf")
docs = loader.load()

# Split text into smaller chunks
text_splitter = RecursiveCharacterTextSplitter(chunk_size=500, chunk_overlap=50)
documents = text_splitter.split_documents(docs)

for doc in documents:
    print(doc.page_content)  # Prints only the extracted text


Fantastic family hotel 
The Lost City Hotel, Dubai, UAE 
11/25/2018 
This Hotel is fantastic. I have stayed at a number of 5 star hotels and I can truly say this is a great 
hotel. It may not be subtle but there is nothiong cheap or tacky about the fitting out. Here are a few 
fab things about it:1. The interior design is impressive2. The staff and service are excellent - always 
there when you need them and never over the top - and they love kids 3. The half board restaurants
are the best I have seen in the world. Safron has an amazing array of quality foods inccluding a full 
Dim Sum bar and a fish counter with as much lobster on ice as you fancy and a chocolate fountain to 
put fruit on a stick into. 4. You can get a arabian style tent by the pool for an extra charge which is 
fantastic for the kids with full shade, bean bags and a toy chest of board games etc. 5. The water
park is great fun for all the family and is a nice mix to spend a few hours there and then back to the 
quiet 

**#Convert Text to Vector Embeddings**

In [9]:
#We'll use Hugging Face embeddings

In [11]:
from langchain.embeddings import HuggingFaceEmbeddings

embedding_model = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")
vectors = embedding_model.embed_documents([doc.page_content for doc in documents])


  from .autonotebook import tqdm as notebook_tqdm
To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development


In [15]:
print (vectors)
print(type(vectors))

[[0.04228374734520912, -0.00121607119217515, 0.010572902858257294, 0.10514087229967117, -0.051728080958127975, 0.07137061655521393, -0.01789899542927742, -0.05861427262425423, 0.046502336859703064, -0.014561470597982407, 0.04523598775267601, -0.07239063084125519, 0.12002267688512802, -0.007785748224705458, 0.04596809670329094, -0.05276365578174591, 0.0839981958270073, -0.06974755227565765, 0.11875966936349869, -0.09302471578121185, -0.06584137678146362, -0.00231244508177042, 0.04533674195408821, -0.010524832643568516, -0.03109215572476387, 0.07119133323431015, 0.01776944287121296, 0.06624025851488113, -0.03789560869336128, -0.06437001377344131, 0.00645821075886488, 0.11948733776807785, 0.005016718991100788, -0.030911600217223167, 0.047827914357185364, 0.13955451548099518, -0.02299565263092518, -0.06923990696668625, 0.004304250702261925, -0.02245198003947735, 0.05551778897643089, 0.05685687065124512, 0.07227253913879395, -0.032940708100795746, 0.05131571739912033, -0.033392470329999924,

In [16]:
print(len(vectors))  # Number of vectors (documents)
print(len(vectors[0]))  # Dimension of each vector


5
384


**Store Embeddings in FAISS**

In [18]:
import faiss
import numpy as np

# Convert list of embeddings into NumPy array
vector_dim = len(vectors[0])
index = faiss.IndexFlatL2(vector_dim)  # L2 Distance-based index
index.add(np.array(vectors))

# Save FAISS index for later use
faiss.write_index(index, "faiss_index.bin")

**Optional:How to read faiss_index.bin file**

In [None]:
# Step 1: Load FAISS Index
import faiss
import numpy as np
# Load the FAISS index from the file
index = faiss.read_index("faiss_index.bin")
# Check the number of vectors stored
print("Number of stored vectors:", index.ntotal)

#Step 2: Retrieve a Sample Vector
# Create a random query vector (same dimension as stored vectors)
d = index.d  # Get the dimension of vectors
query_vector = np.random.rand(1, d).astype('float32')
# Search for the nearest neighbor
D, I = index.search(query_vector, k=5)  # Find top 5 closest vectors
print("Nearest Neighbor Indices:", I)
print("Nearest Neighbor Distances:", D)

#Step 3: Retrieve an Exact Stored Vector
# Retrieve vectors stored at specific indices
stored_vectors = index.reconstruct(0)  # Fetch vector at index 0
print("Vector at index 0:", stored_vectors)

#Step 4: Export FAISS Index to a Readable Format
import pandas as pd

# Get all vectors
stored_vectors = np.array([index.reconstruct(i) for i in range(index.ntotal)])

# Save to CSV
df = pd.DataFrame(stored_vectors)
df.to_csv("faiss_vectors.csv", index=False)
print("Vectors saved to faiss_vectors.csv")



**Perform Retrieval on User Query**

In [28]:
query = "who is mohamed salah player football?"
query_vector = np.array(embedding_model.embed_query(query))

# Retrieve top 3 similar documents
D, I = index.search(query_vector.reshape(1, -1), 3)

# Display results
retrieved_docs = [documents[i].page_content for i in I[0]]
print("\n".join(retrieved_docs))

are the best I have seen in the world. Safron has an amazing array of quality foods inccluding a full 
Dim Sum bar and a fish counter with as much lobster on ice as you fancy and a chocolate fountain to 
put fruit on a stick into. 4. You can get a arabian style tent by the pool for an extra charge which is 
fantastic for the kids with full shade, bean bags and a toy chest of board games etc. 5. The water
Fantastic family hotel 
The Lost City Hotel, Dubai, UAE 
11/25/2018 
This Hotel is fantastic. I have stayed at a number of 5 star hotels and I can truly say this is a great 
hotel. It may not be subtle but there is nothiong cheap or tacky about the fitting out. Here are a few 
fab things about it:1. The interior design is impressive2. The staff and service are excellent - always 
there when you need them and never over the top - and they love kids 3. The half board restaurants
for a stress free fun time with a luxury feel.For our own experience, we paid the extra money to 
upgrade to a

In [45]:
print(documents)

[Document(metadata={'producer': '䵩捲潳潦璮⁗潲搠景爠佦晩捥″㘵㬠浯摩晩敤⁵獩湧\u2069呥硴卨慲瀠㐮ㄮ㘠批‱吳塔', 'creator': 'Microsoft® Word for Office 365', 'creationdate': '2019-04-13T19:42:16-07:00', 'source': 'Fantastic family hotel_review.pdf', 'file_path': 'Fantastic family hotel_review.pdf', 'total_pages': 1, 'format': 'PDF 1.7', 'title': 'Review', 'author': 'Reviewer', 'subject': 'Review', 'keywords': '', 'moddate': '2020-07-02T08:18:35-07:00', 'trapped': '', 'modDate': "D:20200702081835-07'00'", 'creationDate': "D:20190413194216-07'00'", 'page': 0}, page_content='Fantastic family hotel \nThe Lost City Hotel, Dubai, UAE \n11/25/2018 \nThis Hotel is fantastic. I have stayed at a number of 5 star hotels and I can truly say this is a great \nhotel. It may not be subtle but there is nothiong cheap or tacky about the fitting out. Here are a few \nfab things about it:1. The interior design is impressive2. The staff and service are excellent - always \nthere when you need them and never over the top - and they love kids

**Generate Response Using GPT0**

In [50]:
import os
import faiss
import numpy as np
from langchain.vectorstores import FAISS
from langchain.embeddings import OpenAIEmbeddings
from dotenv import load_dotenv

# ✅ Load API Key from .env file
load_dotenv()
api_key = os.getenv("OPENAI_API_KEY")
if not api_key:
    raise ValueError("API Key not found! Add it to your .env file.")

# ✅ Initialize OpenAI embeddings
embedding_model = OpenAIEmbeddings(api_key=api_key)

# ✅ Extract text content
documents = [doc.page_content for doc in split_docs]

# ✅ Convert documents to embeddings (Ensure float32 type)
doc_vectors = np.array(embedding_model.embed_documents(documents), dtype=np.float32)

# ✅ Ensure FAISS directory exists
faiss_index_path = "faiss_index"
os.makedirs(faiss_index_path, exist_ok=True)

# ✅ Create FAISS index
vector_dim = doc_vectors.shape[1]  # Get embedding size
index = faiss.IndexFlatL2(vector_dim)  # L2 distance-based FAISS index
index.add(doc_vectors)  # Add document vectors

# ✅ Save FAISS using LangChain (Creates `index.faiss` and `index.pkl`)
from langchain.docstore.in_memory import InMemoryDocstore
from langchain.storage import LocalFileStore

# ✅ Use FAISS.from_documents to properly save the FAISS index
from langchain.schema import Document

# ✅ Convert extracted text into `Document` objects
doc_objects = [Document(page_content=text) for text in documents]

# ✅ Save FAISS index correctly
vector_db = FAISS.from_documents(doc_objects, embedding_model)


# ✅ Save FAISS with both `index.faiss` and `index.pkl`
vector_db.save_local(faiss_index_path)

print("✅ FAISS index saved correctly with both `index.faiss` and `index.pkl`!")

# ✅ Load FAISS vector database safely
vector_db = FAISS.load_local(faiss_index_path, embeddings=embedding_model, allow_dangerous_deserialization=True)
retriever = vector_db.as_retriever()

print("✅ FAISS Index Loaded Successfully!")


✅ FAISS index saved correctly with both `index.faiss` and `index.pkl`!
✅ FAISS Index Loaded Successfully!


**Run a Query Using the FAISS Index**

In [57]:
# ✅ Example Query
query = "what is location of hotel"
response = retriever.get_relevant_documents(query, search_kwargs={"k": 2})


# ✅ Print Retrieved Documents
print("\nRetrieved Documents:\n")
for i, doc in enumerate(response, start=1):
    print(f"Document {i}:\n{doc.page_content}\n" + "-"*50)



Retrieved Documents:

Document 1:
Fantastic family hotel 
The Lost City Hotel, Dubai, UAE 
11/25/2018 
This Hotel is fantastic. I have stayed at a number of 5 star hotels and I can truly say this is a great 
hotel. It may not be subtle but there is nothiong cheap or tacky about the fitting out. Here are a few 
fab things about it:1. The interior design is impressive2. The staff and service are excellent - always 
there when you need them and never over the top - and they love kids 3. The half board restaurants
--------------------------------------------------
Document 2:
park is great fun for all the family and is a nice mix to spend a few hours there and then back to the 
quiet of the hotel pool.The hotel pools are great with a zero entry one ideal for toddlers 6. Safety is a 
given with about 5 life guards per pool and security staff all over the hotel.It does not have the 
intamacy of a small luxury hotel but for a family holiday with young kids it has everything you need
--------