In [2]:
import tqdm as notebook_tqdm
import glob
from pypdf import PdfReader
from sentence_transformers import SentenceTransformer
import chromadb
import nltk
from nltk.tokenize import sent_tokenize
# Fetch NLTK's 'punkt' data package. Presumably this is what sent_tokenize
# (imported above) relies on — TODO confirm against the installed NLTK version.
nltk.download('punkt')

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\balaj\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [3]:
# Collect the source PDFs. glob returns matches in a filesystem-dependent
# order, so sort them: the corpus is built by concatenating these files, and
# an unstable order would change every downstream chunk and chunk id.
pdf_files = sorted(glob.glob("../data/ml_pdfs/*.pdf"))

def extract_text(path):
    """Return the text of every page of the PDF at ``path`` as one string.

    Page texts are concatenated in page order with no separator. A page whose
    ``extract_text()`` result is falsy (for example ``None``) contributes an
    empty string.
    """
    pages = PdfReader(path).pages
    return "".join(page.extract_text() or "" for page in pages)

# Extract each PDF once, then merge all documents into a single corpus string
# (documents are separated by one space).
documents = list(map(extract_text, pdf_files))
full_text = " ".join(documents)

print(f"Total characters: {len(full_text)}")

Total characters: 61015


In [4]:
def fixed_chunk(text, size=500, overlap=100):
    """Split ``text`` into fixed-width character windows that overlap.

    Consecutive windows start ``size - overlap`` characters apart, so each
    chunk repeats the last ``overlap`` characters of the previous one.

    Args:
        text: String to split.
        size: Maximum number of characters per chunk. Must be positive.
        overlap: Number of characters shared by neighbouring chunks. Must
            satisfy ``0 <= overlap < size``.

    Returns:
        List of chunks in document order. The final chunk may be shorter than
        ``size``. An empty ``text`` yields an empty list.

    Raises:
        ValueError: If ``size`` is not positive or ``overlap`` is outside
            ``[0, size)``. Without this check a too-large overlap produces a
            zero or negative step (an opaque ``range`` error, or silently no
            chunks at all), and a negative overlap silently skips text.
    """
    if size <= 0:
        raise ValueError(f"size must be positive, got {size}")
    if not 0 <= overlap < size:
        raise ValueError(
            f"overlap must satisfy 0 <= overlap < size, got overlap={overlap}, size={size}"
        )

    step = size - overlap
    return [text[start:start + size] for start in range(0, len(text), step)]

# Chunk the whole corpus using fixed_chunk's default size and overlap.
fixed_chunks = fixed_chunk(full_text)
print(f"Fixed chunks: {len(fixed_chunks)}")

Fixed chunks: 153


In [5]:
# nltk is already imported (and 'punkt' already downloaded) in the first cell;
# the only new work here is fetching the 'punkt_tab' resource.
# NOTE(review): presumably added because sent_tokenize needs 'punkt_tab' on
# newer NLTK releases — confirm, then fold this into the setup cell.
import nltk
nltk.download('punkt')
nltk.download('punkt_tab')

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\balaj\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package punkt_tab to
[nltk_data]     C:\Users\balaj\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!


True

In [6]:
def sentence_chunk(text, max_size=500, tokenizer=None):
    """Group whole sentences of ``text`` into chunks of at most ``max_size`` characters.

    Sentences are packed greedily in order and joined with a single space.
    A chunk is closed as soon as adding the next sentence (plus the joining
    space) would push it past ``max_size``.

    Args:
        text: String to split.
        max_size: Target upper bound on chunk length in characters. A single
            sentence longer than this is kept intact as its own chunk rather
            than being cut.
        tokenizer: Optional callable mapping a string to an iterable of
            sentence strings. Defaults to ``sent_tokenize``.

    Returns:
        List of non-empty chunks in document order. No chunk carries a leading
        space, and no empty chunk is emitted even when the very first sentence
        already exceeds ``max_size``.
    """
    if tokenizer is None:
        tokenizer = sent_tokenize

    chunks = []
    current = ""
    for sentence in tokenizer(text):
        if not current:
            # Start a new chunk without a separator, so chunks never begin
            # with a stray space and an empty chunk is never flushed.
            current = sentence
        elif len(current) + 1 + len(sentence) <= max_size:
            current = f"{current} {sentence}"
        else:
            chunks.append(current)
            current = sentence
    if current:
        chunks.append(current)
    return chunks

# Chunk the whole corpus on sentence boundaries using the default max_size.
sentence_chunks = sentence_chunk(full_text)
print(f"Sentence chunks: {len(sentence_chunks)}")

Sentence chunks: 136


In [7]:
# Name of the sentence-transformers checkpoint used for chunks and queries.
EMBEDDING_MODEL_NAME = "all-MiniLM-L6-v2"
model = SentenceTransformer(EMBEDDING_MODEL_NAME)

# Embed both chunking strategies with the same model.
fixed_embeddings = model.encode(fixed_chunks)
sentence_embeddings = model.encode(sentence_chunks)

Loading weights: 100%|█████████████████████| 103/103 [00:00<00:00, 383.79it/s, Materializing param=pooler.dense.weight]
[1mBertModel LOAD REPORT[0m from: sentence-transformers/all-MiniLM-L6-v2
Key                     | Status     |  | 
------------------------+------------+--+-
embeddings.position_ids | UNEXPECTED |  | 

[3mNotes:
- UNEXPECTED[3m	:can be ignored when loading from different task/architecture; not ok if you expect identical arch.[0m


In [8]:
# NOTE: this cell fails as written. Chroma rejects the collection name "ml"
# because names must contain 3-512 characters (see the InvalidArgumentError
# in this cell's output), so the loop below never runs.
client = chromadb.Client()
collection = client.create_collection("ml")

# Intended to store each fixed-size chunk with its precomputed embedding,
# using the chunk's position in fixed_chunks as its id.
for i, chunk in enumerate(fixed_chunks):
    collection.add(
        documents=[chunk],
        embeddings=[fixed_embeddings[i].tolist()],
        ids=[str(i)]
    )

InvalidArgumentError: Validation error: name: Expected a name containing 3-512 characters from [a-zA-Z0-9._-], starting and ending with a character in [a-zA-Z0-9]. Got: ml

In [9]:
import chromadb

# Index the fixed-size chunks in Chroma.
# get_or_create_collection (rather than create_collection) keeps this cell
# re-runnable in a live kernel: create_collection raises once "ml_data"
# already exists.
client = chromadb.Client()
collection = client.get_or_create_collection("ml_data")

# Write records in batches instead of one call per chunk. upsert makes a
# re-run overwrite ids "0", "1", ... in place rather than adding them again.
# An empty fixed_chunks list results in no calls at all.
INDEX_BATCH_SIZE = 500
for start in range(0, len(fixed_chunks), INDEX_BATCH_SIZE):
    stop = min(start + INDEX_BATCH_SIZE, len(fixed_chunks))
    collection.upsert(
        documents=fixed_chunks[start:stop],
        embeddings=[fixed_embeddings[i].tolist() for i in range(start, stop)],
        ids=[str(i) for i in range(start, stop)],
    )

In [10]:
def retrieve(query, top_k=3):
    """Return the stored documents that best match ``query``.

    The query is embedded with the module-level ``model`` and looked up in the
    module-level ``collection``, requesting ``top_k`` results.

    Returns:
        The first entry of the ``"documents"`` field of the query response,
        i.e. the documents returned for this single query.
    """
    query_vector = model.encode([query])[0].tolist()
    response = collection.query(query_embeddings=[query_vector], n_results=top_k)
    documents_per_query = response["documents"]
    return documents_per_query[0]

In [11]:
# Evaluation prompts sent through retrieve(), one per machine-learning topic.
questions = [
    "What is overfitting?",
    "Explain bias-variance tradeoff.",
    "What is regularization?",
    "What is gradient descent?",
    "What is cross validation?",
    "Difference between classification and regression?",
    "What is supervised learning?",
    "What is feature engineering?",
    "What is model evaluation?",
    "What is logistic regression?",
]

In [12]:
# For every evaluation question, print the single top-ranked retrieved chunk
# followed by a separator rule.
for q in questions:
    print(f"QUESTION: {q}")
    results = retrieve(q)
    print("ANSWER (retrieved context):")
    print(results[0])
    print("=" * 60)

QUESTION: What is overfitting?
ANSWER (retrieved context):

answers (that is, the correct outputs), the algorithm iteratively makes predictions on the training data 
and is corrected by the teache r. Learning stops when the algorithm achieves an acceptable level of 
performance. 
 
Example 
Consider the following data regarding patients entering a clinic. The data consists of the gender 
and age of the patients and each patient is labeled as “healthy” or “sick”. 
 
 
 
1.5.2 Unsupervised learning 
Correct responses are not provided, but instead the algo
QUESTION: Explain bias-variance tradeoff.
ANSWER (retrieved context):
mpling Theory: Error Estimation and Estimating Binomial Proportions, The 
Binomial Distribution, Estimators, Bias, and Variance   
 
 
UNIT V:  
Genetic Algorithms:  Motivation, Genetic Algorithms : Representing Hypotheses , Genetic Operator, 
Fitness Function and Selection,  An Illustrative  Example, Hypothesis Space Search, Genetic 
Programming, Models of Evolution 