In [15]:
import chromadb
from sentence_transformers import SentenceTransformer, util
import pdfplumber
import nltk
from nltk.corpus import stopwords

# Step 1: Setup NLTK stop words
# Load the English stop-word list. The corpus is downloaded only when it is
# not already available locally (NLTK raises LookupError in that case), so
# repeated runs do not need network access.
try:
    stop_words = set(stopwords.words('english'))
except LookupError:
    nltk.download('stopwords')
    stop_words = set(stopwords.words('english'))

# Step 2: Initialize the embedding model and ChromaDB client
# Name of the pretrained sentence-embedding checkpoint loaded below.
EMBED_MODEL_NAME = "all-MiniLM-L6-v2"

# Model used to turn the query and the PDF sentences into vectors.
embed_model = SentenceTransformer(EMBED_MODEL_NAME)
# ChromaDB client created with default settings; it is not referenced again
# in the visible part of this script.
client = chromadb.Client()

# Step 3: Read the text of every page from the PDF
# pdf_text holds one string per page, in page order.
pdf_text = []
with pdfplumber.open("questions.pdf") as pdf:
    # extract_text() may yield None for a page without extractable text
    # (e.g. a scanned image); store "" instead so the later str.split()
    # calls never receive None.
    pdf_text.extend(page.extract_text() or "" for page in pdf.pages)

# Step 4: Split the page text into sentences and remove stop words
# Each page is split naively on '.' (nltk.sent_tokenize or a regex would be
# more accurate). Stop words are removed from every fragment, and fragments
# that end up with no words at all (whitespace-only pieces, or pieces made
# entirely of stop words) are dropped so that no empty string is embedded or
# returned as a search result.
sentences = []
for text in pdf_text:
    if not text:
        # Defensive: skip pages that produced no text (None or "").
        continue
    for fragment in text.split('.'):
        kept_words = [
            word for word in fragment.split() if word.lower() not in stop_words
        ]
        if kept_words:
            sentences.append(" ".join(kept_words))

# Step 5: Take user input
user_query = input("Enter your search query: ")
# Remove stop words from the query so it is normalised the same way as the
# PDF sentences.
cleaned_query = " ".join(
    word for word in user_query.split() if word.lower() not in stop_words
)
if not cleaned_query:
    # The query consisted only of stop words (e.g. "what is it"); fall back
    # to the raw text rather than searching with an empty string.
    cleaned_query = user_query.strip()

# Step 6: Embed sentences and the query
# Fail early with a clear message instead of an obscure error from the
# encoder / argmax when the PDF yielded no usable text.
if not sentences:
    raise ValueError("No sentences were extracted from the PDF; nothing to search.")

query_embedding = embed_model.encode(cleaned_query)
sentence_embeddings = embed_model.encode(sentences)

# Step 7: Calculate cosine similarity and find the closest sentence
# One row (the query) against one column per sentence.
similarity_scores = util.pytorch_cos_sim(query_embedding, sentence_embeddings)

# Index of the best-scoring sentence, converted from a 0-dim tensor to a
# plain int so it can be used directly as a list index.
most_similar_index = int(similarity_scores.argmax())
most_similar_sentence = sentences[most_similar_index]

# Step 8: Print the most similar sentence
print("Most similar sentence:", most_similar_sentence)


[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


Enter your search query: culture
Most similar sentence: Campus Lifeand Culture: - arethemajorcultural eventsandfestivalscelebrated oncampus? - Arethere anystudentclubsor societiesIcan join? HowdoI becomeamember? - Howdoestheuniversity supportextracurricularactivitiesandhobbies? - Arethere anyvolunteeringopportunities available students? 2
