In [4]:
import os

# Show every entry (files and folders) in the notebook's working directory
current_dir_entries = os.listdir()
print("Files in current directory:", current_dir_entries)


Files in current directory: ['# Example code using PyTorch.py', '.anaconda', '.android', '.cache', '.conda', '.condarc', '.continuum', '.gitconfig', '.idlerc', '.ipfs', '.ipynb_checkpoints', '.ipython', '.jupyter', '.kaggle', '.keras', '.matplotlib', '.VirtualBox', '.vscode', '.wdm', '.yarnrc', '1.NLP Intro.pdf', '2.Language Model.pdf', '3D Objects', '4.Tokenization.pdf', '9.NLTK.pdf', 'Advertising.csv', 'anaconda3', 'anaconda_projects', 'AppData', 'Application Data', 'archive (1).rar', 'bonus task.ipynb', 'BONUS.ipynb', 'bottom.jsx', 'Brochure.pdf', 'car.csv', 'cat.0.jpg', 'cat.1.jpg', 'cat.10.jpg', 'cat.11.jpg', 'cat.2.jpg', 'cat.3.jpg', 'cat.4.jpg', 'cat.5.jpg', 'cat.6.jpg', 'cat.7.jpg', 'cat.8.jpg', 'cat.9.jpg', 'Contacts', 'Cookies', 'customer_churn_data.csv', 'data', 'data.txt', 'Desktop', 'dhivya.img', 'Documents', 'dog.0.jpg', 'dog.1.jpg', 'dog.10.jpg', 'dog.11.jpg', 'dog.2.jpg', 'dog.3.jpg', 'dog.4.jpg', 'dog.5.jpg', 'dog.6.jpg', 'dog.7.jpg', 'dog.8.jpg', 'dog.9.jpg', 'Downloa

In [6]:
import shutil

# Create the 'data' folder. exist_ok=True replaces the separate existence
# check, so there is no check-then-create race and the cell can be re-run.
destination_folder = "data"
os.makedirs(destination_folder, exist_ok=True)

# Move PDF files from the current working directory into 'data'.
for file in os.listdir():
    # Compare the extension case-insensitively so names such as 'Notes.PDF'
    # are not silently skipped, and require a regular file so that a
    # directory whose name happens to end in '.pdf' is left where it is.
    if file.lower().endswith(".pdf") and os.path.isfile(file):
        shutil.move(file, os.path.join(destination_folder, file))

print("Files moved to 'data' folder successfully!")


Files moved to 'data' folder successfully!


In [10]:
# Confirm which entries now live inside the 'data' folder
data_folder_entries = os.listdir("data")
print("PDFs in data folder:", data_folder_entries)


PDFs in data folder: ['.ipynb_checkpoints', '1.NLP Intro.pdf', '2.Language Model.pdf', '4.Tokenization.pdf', '9.NLTK.pdf']


In [12]:
!pip install langchain pypdf sentence-transformers faiss-cpu



In [17]:
import os
from langchain.document_loaders import DirectoryLoader, PyPDFLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.embeddings import HuggingFaceEmbeddings
from langchain.vectorstores import FAISS

# Step 1: Load PDF Documents from 'data' Folder
# Only files directly matching '*.pdf' in data_folder are handed to PyPDFLoader.
data_folder = "data"
loader = DirectoryLoader(data_folder, glob="*.pdf", loader_cls=PyPDFLoader)
documents = loader.load()
if not documents:
    # Fail early with a readable message; an empty list would otherwise
    # surface later as an opaque error while building the FAISS index.
    raise ValueError(f"No PDF content could be loaded from '{data_folder}'")

# Step 2: Split Documents into Chunks
# Chunks of at most 500 characters with a 50-character overlap between neighbours.
text_splitter = RecursiveCharacterTextSplitter(chunk_size=500, chunk_overlap=50)
docs = text_splitter.split_documents(documents)
if not docs:
    # Possible when the PDFs contain no extractable text (e.g. scanned images).
    raise ValueError(f"PDFs in '{data_folder}' produced no text chunks to index")

# Step 3: Create Embeddings using HuggingFace
embedding_model = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")

# Step 4: Build FAISS Vector Store
# Embeds every chunk with embedding_model and indexes the resulting vectors.
vector_store = FAISS.from_documents(docs, embedding_model)

# Step 5: Save Vector Store to a Folder
faiss_folder = "vector_store"
vector_store.save_local(faiss_folder)

print(f"FAISS vector store saved in '{faiss_folder}'")

FAISS vector store saved in 'vector_store'


In [23]:
import os

# Inspect what save_local wrote into the vector store folder
faiss_folder = "vector_store"
vector_store_files = os.listdir(faiss_folder)
print("Vector Store Files:", vector_store_files)


Vector Store Files: ['index.faiss', 'index.pkl']


In [33]:
from langchain.vectorstores import FAISS
from langchain.embeddings import HuggingFaceEmbeddings

# Reload the FAISS vector store from disk, using the same embedding model name
# that was used when the index was built.
# SECURITY: allow_dangerous_deserialization=True is NOT a "safe" mode. It opts in
# to unpickling the stored 'index.pkl', and unpickling can execute arbitrary code.
# That is acceptable here only because the folder was written by this notebook;
# never enable it for a vector store obtained from an untrusted source.
embedding_model = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")
vector_store = FAISS.load_local("vector_store", embedding_model, allow_dangerous_deserialization=True)

# List of sample queries
queries = [
    "What is NLP?",
    "What are language models?",
    "Explain tokenization in NLP.",
    "What is NLTK used for?",
    "Describe different types of language models."
]

# Run queries and display results
for query in queries:
    print(f"\n Query: {query}")
    results = vector_store.similarity_search(query, k=2)  # Fetch top 2 matches

    # Number the matches from 1 for display.
    for rank, res in enumerate(results, start=1):
        print(f"\n Match {rank}:")
        print(res.page_content[:500])  # Cap the preview at 500 characters
    print("\n" + "-"*80)



 Query: What is NLP?

 Match 1:
2/919AD601 - Origines and challenges of NLP /NLP /IT / SNSCE
NLP
Natural Language Processing
• Natural language processing, or NLP , is the field that involves getting systems to understand human
languages.
• Natural language processing (NLP) refers to the branch of computer science—and more specifically, the
branch of artificial intelligence or AI—concerned with giving computers the ability to understand text and
spoken words in much the same way human beings can.

 Match 2:
• NLP combines computational linguistics—rule-based modeling of human language—with statistical, machine
learning, and deep learning models. Together, these technologies enable computers to process human
language in the form of text or voice data and to ‘understand’its full meaning, complete with the speaker or
writer’sintent and sentiment

--------------------------------------------------------------------------------

 Query: What are language models?

 Match 1:
correction etc. 