In [1]:
# Simple TF-IDF retrieval system (improved version)

import math
import re
from collections import Counter, defaultdict
from typing import Dict, List, Tuple
import pandas as pd
import nltk

# Uncomment the next two lines if you haven't downloaded stopwords yet
# nltk.download("stopwords")
# nltk.download("punkt")

from nltk.corpus import stopwords
from nltk.stem import PorterStemmer

# ========== 1. Load the CSV ==========
CSV_PATH = "documents.csv"
df = pd.read_csv(CSV_PATH)

# assume first column = document name, second column = document content
if df.shape[1] < 2:
    raise ValueError("CSV must have at least two columns: name, content")

doc_names = df.iloc[:, 0].astype(str).tolist()
doc_texts = df.iloc[:, 1].astype(str).tolist()
N = len(doc_texts)

# ========== 2. Tokenizer ==========
token_pattern = re.compile(r"[a-z]+")
stop_words = set(stopwords.words("english"))
stemmer = PorterStemmer()

def tokenize(text: str, do_stem: bool = True) -> List[str]:
    tokens = [t for t in token_pattern.findall(text.lower()) if t not in stop_words]
    if do_stem:
        tokens = [stemmer.stem(t) for t in tokens]
    return tokens

# ========== 3. Build postings & raw counts ==========
postings: Dict[str, List[int]] = defaultdict(list)
doc_term_counts: List[Counter] = []

for doc_id, text in enumerate(doc_texts):
    cnt = Counter(tokenize(text))
    doc_term_counts.append(cnt)
    for term in cnt:
        postings[term].append(doc_id)

# ========== 4. Document frequencies & IDF ==========
dfreq: Dict[str, int] = {term: len(docs) for term, docs in postings.items()}

def idf(term: str) -> float:
    df_t = dfreq.get(term, 0)
    return math.log10(N / df_t) if df_t > 0 else 0.0

idf_index: Dict[str, float] = {term: idf(term) for term in postings.keys()}

# ========== 5. TF with log scaling + TF-IDF ==========
def tf(freq: int) -> float:
    return 1.0 + math.log10(freq) if freq > 0 else 0.0

doc_tfidf: List[Dict[str, float]] = []
doc_norms: List[float] = []

for cnt in doc_term_counts:
    vec: Dict[str, float] = {}
    for term, f in cnt.items():
        w = tf(f) * idf_index.get(term, 0.0)
        if w != 0.0:
            vec[term] = w
    norm = math.sqrt(sum(w*w for w in vec.values()))
    doc_tfidf.append(vec)
    doc_norms.append(norm)

# ========== 6. Query vector ==========
def build_query_vector(query: str) -> Tuple[Dict[str, float], float, List[str]]:
    q_counts = Counter(tokenize(query))
    q_vec: Dict[str, float] = {}
    used_terms: List[str] = []
    for term, f in q_counts.items():
        if term in idf_index:
            w = tf(f) * idf_index[term]
            if w != 0.0:
                q_vec[term] = w
                used_terms.append(term)
    q_norm = math.sqrt(sum(w*w for w in q_vec.values()))
    return q_vec, q_norm, used_terms

# ========== 7. Scoring with cosine similarity ==========
def score_documents(query: str, top_k: int = 5):
    q_vec, q_norm, q_terms = build_query_vector(query)
    if q_norm == 0.0:
        return []
    # candidate docs must contain at least one query term
    candidate_docs = set()
    for t in q_terms:
        candidate_docs.update(postings.get(t, []))

    results = []
    for doc_id in candidate_docs:
        d_vec = doc_tfidf[doc_id]
        d_norm = doc_norms[doc_id]
        if d_norm == 0.0:
            continue
        dot = 0.0
        for t, qw in q_vec.items():
            dw = d_vec.get(t)
            if dw is not None:
                dot += qw * dw
        if dot > 0.0:
            sim = dot / (q_norm * d_norm)
            results.append((doc_id, sim))

    results.sort(key=lambda x: x[1], reverse=True)
    return results[:top_k]

# ========== 8. Search function (returns DataFrame) ==========
def search(query: str, top_k: int = 5) -> pd.DataFrame:
    results = score_documents(query, top_k=top_k)
    if not results:
        return pd.DataFrame(columns=["Document", "Score"])
    return pd.DataFrame(
        [(doc_names[doc_id], score) for doc_id, score in results],
        columns=["Document", "Score"]
    )

# ========== 9. Quick demo ==========
# Try searching your dataset
print(search("sample query"))


FileNotFoundError: [Errno 2] No such file or directory: 'documents.csv'

In [3]:
# Simple TF-IDF retrieval system (improved version, with Desktop path)

import math
import re
from collections import Counter, defaultdict
from typing import Dict, List, Tuple
import pandas as pd
import nltk

# Uncomment the next two lines if you haven't downloaded stopwords yet
# nltk.download("stopwords")
# nltk.download("punkt")

from nltk.corpus import stopwords
from nltk.stem import PorterStemmer

# ========== 1. Load the CSV ==========
# 🔥 Replace <your-username> with your Mac username (e.g. chirath)
CSV_PATH = "/Users/chirath/Desktop/documents.csv"
df = pd.read_csv(CSV_PATH)

# assume first column = document name, second column = document content
if df.shape[1] < 2:
    raise ValueError("CSV must have at least two columns: name, content")

doc_names = df.iloc[:, 0].astype(str).tolist()
doc_texts = df.iloc[:, 1].astype(str).tolist()
N = len(doc_texts)

# ========== 2. Tokenizer ==========
token_pattern = re.compile(r"[a-z]+")
stop_words = set(stopwords.words("english"))
stemmer = PorterStemmer()

def tokenize(text: str, do_stem: bool = True) -> List[str]:
    tokens = [t for t in token_pattern.findall(text.lower()) if t not in stop_words]
    if do_stem:
        tokens = [stemmer.stem(t) for t in tokens]
    return tokens

# ========== 3. Build postings & raw counts ==========
postings: Dict[str, List[int]] = defaultdict(list)
doc_term_counts: List[Counter] = []

for doc_id, text in enumerate(doc_texts):
    cnt = Counter(tokenize(text))
    doc_term_counts.append(cnt)
    for term in cnt:
        postings[term].append(doc_id)

# ========== 4. Document frequencies & IDF ==========
dfreq: Dict[str, int] = {term: len(docs) for term, docs in postings.items()}

def idf(term: str) -> float:
    df_t = dfreq.get(term, 0)
    return math.log10(N / df_t) if df_t > 0 else 0.0

idf_index: Dict[str, float] = {term: idf(term) for term in postings.keys()}

# ========== 5. TF with log scaling + TF-IDF ==========
def tf(freq: int) -> float:
    return 1.0 + math.log10(freq) if freq > 0 else 0.0

doc_tfidf: List[Dict[str, float]] = []
doc_norms: List[float] = []

for cnt in doc_term_counts:
    vec: Dict[str, float] = {}
    for term, f in cnt.items():
        w = tf(f) * idf_index.get(term, 0.0)
        if w != 0.0:
            vec[term] = w
    norm = math.sqrt(sum(w*w for w in vec.values()))
    doc_tfidf.append(vec)
    doc_norms.append(norm)

# ========== 6. Query vector ==========
def build_query_vector(query: str) -> Tuple[Dict[str, float], float, List[str]]:
    q_counts = Counter(tokenize(query))
    q_vec: Dict[str, float] = {}
    used_terms: List[str] = []
    for term, f in q_counts.items():
        if term in idf_index:
            w = tf(f) * idf_index[term]
            if w != 0.0:
                q_vec[term] = w
                used_terms.append(term)
    q_norm = math.sqrt(sum(w*w for w in q_vec.values()))
    return q_vec, q_norm, used_terms

# ========== 7. Scoring with cosine similarity ==========
def score_documents(query: str, top_k: int = 5):
    q_vec, q_norm, q_terms = build_query_vector(query)
    if q_norm == 0.0:
        return []
    # candidate docs must contain at least one query term
    candidate_docs = set()
    for t in q_terms:
        candidate_docs.update(postings.get(t, []))

    results = []
    for doc_id in candidate_docs:
        d_vec = doc_tfidf[doc_id]
        d_norm = doc_norms[doc_id]
        if d_norm == 0.0:
            continue
        dot = 0.0
        for t, qw in q_vec.items():
            dw = d_vec.get(t)
            if dw is not None:
                dot += qw * dw
        if dot > 0.0:
            sim = dot / (q_norm * d_norm)
            results.append((doc_id, sim))

    results.sort(key=lambda x: x[1], reverse=True)
    return results[:top_k]

# ========== 8. Search function (returns DataFrame) ==========
def search(query: str, top_k: int = 5) -> pd.DataFrame:
    results = score_documents(query, top_k=top_k)
    if not results:
        return pd.DataFrame(columns=["Document", "Score"])
    return pd.DataFrame(
        [(doc_names[doc_id], score) for doc_id, score in results],
        columns=["Document", "Score"]
    )

# ========== 9. Quick demo ==========
print(search("sample query"))


FileNotFoundError: [Errno 2] No such file or directory: '/Users/chirath/Desktop/documents.csv'

In [1]:
# Simple TF-IDF Retrieval System (Mac Desktop Version)

import math
import re
from collections import Counter, defaultdict
from typing import Dict, List, Tuple
import pandas as pd
import nltk
import os

# Uncomment if stopwords or punkt are not downloaded
# nltk.download("stopwords")
# nltk.download("punkt")

from nltk.corpus import stopwords
from nltk.stem import PorterStemmer

# ========== 1. Load the CSV ==========
CSV_PATH = "/Users/chirathwijeweera/Desktop/documents.csv"

if not os.path.exists(CSV_PATH):
    raise FileNotFoundError(f"CSV file not found at {CSV_PATH}")

df = pd.read_csv(CSV_PATH)

if df.shape[1] < 2:
    raise ValueError("CSV must have at least two columns: name, content")

doc_names = df.iloc[:, 0].astype(str).tolist()
doc_texts = df.iloc[:, 1].astype(str).tolist()
N = len(doc_texts)

# ========== 2. Tokenizer ==========
token_pattern = re.compile(r"[a-z]+")
stop_words = set(stopwords.words("english"))
stemmer = PorterStemmer()

def tokenize(text: str, do_stem: bool = True) -> List[str]:
    tokens = [t for t in token_pattern.findall(text.lower()) if t not in stop_words]
    if do_stem:
        tokens = [stemmer.stem(t) for t in tokens]
    return tokens

# ========== 3. Build postings & raw counts ==========
postings: Dict[str, List[int]] = defaultdict(list)
doc_term_counts: List[Counter] = []

for doc_id, text in enumerate(doc_texts):
    cnt = Counter(tokenize(text))
    doc_term_counts.append(cnt)
    for term in cnt:
        postings[term].append(doc_id)

# ========== 4. Document frequencies & IDF ==========
dfreq: Dict[str, int] = {term: len(docs) for term, docs in postings.items()}

def idf(term: str) -> float:
    df_t = dfreq.get(term, 0)
    return math.log10(N / df_t) if df_t > 0 else 0.0

idf_index: Dict[str, float] = {term: idf(term) for term in postings.keys()}

# ========== 5. TF with log scaling + TF-IDF ==========
def tf(freq: int) -> float:
    return 1.0 + math.log10(freq) if freq > 0 else 0.0

doc_tfidf: List[Dict[str, float]] = []
doc_norms: List[float] = []

for cnt in doc_term_counts:
    vec: Dict[str, float] = {}
    for term, f in cnt.items():
        w = tf(f) * idf_index.get(term, 0.0)
        if w != 0.0:
            vec[term] = w
    norm = math.sqrt(sum(w*w for w in vec.values()))
    doc_tfidf.append(vec)
    doc_norms.append(norm)

# ========== 6. Query vector ==========
def build_query_vector(query: str) -> Tuple[Dict[str, float], float, List[str]]:
    q_counts = Counter(tokenize(query))
    q_vec: Dict[str, float] = {}
    used_terms: List[str] = []
    for term, f in q_counts.items():
        if term in idf_index:
            w = tf(f) * idf_index[term]
            if w != 0.0:
                q_vec[term] = w
                used_terms.append(term)
    q_norm = math.sqrt(sum(w*w for w in q_vec.values()))
    return q_vec, q_norm, used_terms

# ========== 7. Scoring with cosine similarity ==========
def score_documents(query: str, top_k: int = 5):
    if not query.strip():
        return []
    
    q_vec, q_norm, q_terms = build_query_vector(query)
    if q_norm == 0.0:
        return []

    candidate_docs = set()
    for t in q_terms:
        candidate_docs.update(postings.get(t, []))

    results = []
    for doc_id in candidate_docs:
        d_vec = doc_tfidf[doc_id]
        d_norm = doc_norms[doc_id]
        if d_norm == 0.0:
            continue
        dot = sum(q_vec[t] * d_vec.get(t, 0.0) for t in q_vec)
        if dot > 0.0:
            sim = dot / (q_norm * d_norm)
            results.append((doc_id, sim))

    results.sort(key=lambda x: x[1], reverse=True)
    return results[:top_k]

# ========== 8. Search function (returns DataFrame) ==========
def search(query: str, top_k: int = 5) -> pd.DataFrame:
    results = score_documents(query, top_k=top_k)
    if not results:
        print("No matching documents found.")
        return pd.DataFrame(columns=["Document", "Score"])
    return pd.DataFrame(
        [(doc_names[doc_id], score) for doc_id, score in results],
        columns=["Document", "Score"]
    )

# ========== 9. Demo ==========
if __name__ == "__main__":
    query = input("Enter your search query: ")
    df_results = search(query)
    print("\nTop matching documents:\n", df_results)


LookupError: 
**********************************************************************
  Resource [93mstopwords[0m not found.
  Please use the NLTK Downloader to obtain the resource:

  [31m>>> import nltk
  >>> nltk.download('stopwords')
  [0m
  For more information see: https://www.nltk.org/data.html

  Attempted to load [93mcorpora/stopwords[0m

  Searched in:
    - '/Users/chirathwijeweera/nltk_data'
    - '/opt/anaconda3/nltk_data'
    - '/opt/anaconda3/share/nltk_data'
    - '/opt/anaconda3/lib/nltk_data'
    - '/usr/share/nltk_data'
    - '/usr/local/share/nltk_data'
    - '/usr/lib/nltk_data'
    - '/usr/local/lib/nltk_data'
**********************************************************************


In [3]:
# Simple TF-IDF Retrieval System (Mac Desktop Version)

import math
import re
from collections import Counter, defaultdict
from typing import Dict, List, Tuple
import pandas as pd
import nltk
import os

# ========== 0. Ensure NLTK resources are downloaded ==========
try:
    from nltk.corpus import stopwords
    stop_words = set(stopwords.words("english"))
except LookupError:
    print("Downloading NLTK stopwords...")
    nltk.download("stopwords")
    from nltk.corpus import stopwords
    stop_words = set(stopwords.words("english"))

try:
    nltk.data.find('tokenizers/punkt')
except LookupError:
    print("Downloading NLTK punkt tokenizer...")
    nltk.download("punkt")

from nltk.stem import PorterStemmer

# ========== 1. Load the CSV ==========
CSV_PATH = "/Users/chirath/Desktop/documents.csv"

if not os.path.exists(CSV_PATH):
    raise FileNotFoundError(f"CSV file not found at {CSV_PATH}")

df = pd.read_csv(CSV_PATH)

if df.shape[1] < 2:
    raise ValueError("CSV must have at least two columns: name, content")

doc_names = df.iloc[:, 0].astype(str).tolist()
doc_texts = df.iloc[:, 1].astype(str).tolist()
N = len(doc_texts)

# ========== 2. Tokenizer ==========
token_pattern = re.compile(r"[a-z]+")
stemmer = PorterStemmer()

def tokenize(text: str, do_stem: bool = True) -> List[str]:
    tokens = [t for t in token_pattern.findall(text.lower()) if t not in stop_words]
    if do_stem:
        tokens = [stemmer.stem(t) for t in tokens]
    return tokens

# ========== 3. Build postings & raw counts ==========
postings: Dict[str, List[int]] = defaultdict(list)
doc_term_counts: List[Counter] = []

for doc_id, text in enumerate(doc_texts):
    cnt = Counter(tokenize(text))
    doc_term_counts.append(cnt)
    for term in cnt:
        postings[term].append(doc_id)

# ========== 4. Document frequencies & IDF ==========
dfreq: Dict[str, int] = {term: len(docs) for term, docs in postings.items()}

def idf(term: str) -> float:
    df_t = dfreq.get(term, 0)
    return math.log10(N / df_t) if df_t > 0 else 0.0

idf_index: Dict[str, float] = {term: idf(term) for term in postings.keys()}

# ========== 5. TF with log scaling + TF-IDF ==========
def tf(freq: int) -> float:
    return 1.0 + math.log10(freq) if freq > 0 else 0.0

doc_tfidf: List[Dict[str, float]] = []
doc_norms: List[float] = []

for cnt in doc_term_counts:
    vec: Dict[str, float] = {}
    for term, f in cnt.items():
        w = tf(f) * idf_index.get(term, 0.0)
        if w != 0.0:
            vec[term] = w
    norm = math.sqrt(sum(w*w for w in vec.values()))
    doc_tfidf.append(vec)
    doc_norms.append(norm)

# ========== 6. Query vector ==========
def build_query_vector(query: str) -> Tuple[Dict[str, float], float, List[str]]:
    q_counts = Counter(tokenize(query))
    q_vec: Dict[str, float] = {}
    used_terms: List[str] = []
    for term, f in q_counts.items():
        if term in idf_index:
            w = tf(f) * idf_index[term]
            if w != 0.0:
                q_vec[term] = w
                used_terms.append(term)
    q_norm = math.sqrt(sum(w*w for w in q_vec.values()))
    return q_vec, q_norm, used_terms

# ========== 7. Scoring with cosine similarity ==========
def score_documents(query: str, top_k: int = 5):
    if not query.strip():
        return []
    
    q_vec, q_norm, q_terms = build_query_vector(query)
    if q_norm == 0.0:
        return []

    candidate_docs = set()
    for t in q_terms:
        candidate_docs.update(postings.get(t, []))

    results = []
    for doc_id in candidate_docs:
        d_vec = doc_tfidf[doc_id]
        d_norm = doc_norms[doc_id]
        if d_norm == 0.0:
            continue
        dot = sum(q_vec[t] * d_vec.get(t, 0.0) for t in q_vec)
        if dot > 0.0:
            sim = dot / (q_norm * d_norm)
            results.append((doc_id, sim))

    results.sort(key=lambda x: x[1], reverse=True)
    return results[:top_k]

# ========== 8. Search function (returns DataFrame) ==========
def search(query: str, top_k: int = 5) -> pd.DataFrame:
    results = score_documents(query, top_k=top_k)
    if not results:
        print("No matching documents found.")
        return pd.DataFrame(columns=["Document", "Score"])
    return pd.DataFrame(
        [(doc_names[doc_id], score) for doc_id, score in results],
        columns=["Document", "Score"]
    )

# ========== 9. Demo ==========
if __name__ == "__main__":
    query = input("Enter your search query: ")
    df_results = search(query)
    print("\nTop matching documents:\n", df_results)

Downloading NLTK stopwords...


[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/chirathwijeweera/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package punkt to
[nltk_data]     /Users/chirathwijeweera/nltk_data...


Downloading NLTK punkt tokenizer...


[nltk_data]   Unzipping tokenizers/punkt.zip.


FileNotFoundError: CSV file not found at /Users/chirath/Desktop/documents.csv

In [5]:
# Simple TF-IDF Retrieval System (Mac Desktop Version)

import math
import re
from collections import Counter, defaultdict
from typing import Dict, List, Tuple
import pandas as pd
import nltk
import os

# ========== 0. Ensure NLTK resources are downloaded ==========
try:
    from nltk.corpus import stopwords
    stop_words = set(stopwords.words("english"))
except LookupError:
    print("Downloading NLTK stopwords...")
    nltk.download("stopwords")
    from nltk.corpus import stopwords
    stop_words = set(stopwords.words("english"))

try:
    nltk.data.find('tokenizers/punkt')
except LookupError:
    print("Downloading NLTK punkt tokenizer...")
    nltk.download("punkt")

from nltk.stem import PorterStemmer

# ========== 1. Load the CSV ==========
CSV_PATH = "/Users/chirathwijeweera/Desktop/documents.csv"

if not os.path.exists(CSV_PATH):
    raise FileNotFoundError(f"CSV file not found at {CSV_PATH}")

df = pd.read_csv(CSV_PATH)

if df.shape[1] < 2:
    raise ValueError("CSV must have at least two columns: name, content")

doc_names = df.iloc[:, 0].astype(str).tolist()
doc_texts = df.iloc[:, 1].astype(str).tolist()
N = len(doc_texts)

# ========== 2. Tokenizer ==========
token_pattern = re.compile(r"[a-z]+")
stemmer = PorterStemmer()

def tokenize(text: str, do_stem: bool = True) -> List[str]:
    tokens = [t for t in token_pattern.findall(text.lower()) if t not in stop_words]
    if do_stem:
        tokens = [stemmer.stem(t) for t in tokens]
    return tokens

# ========== 3. Build postings & raw counts ==========
postings: Dict[str, List[int]] = defaultdict(list)
doc_term_counts: List[Counter] = []

for doc_id, text in enumerate(doc_texts):
    cnt = Counter(tokenize(text))
    doc_term_counts.append(cnt)
    for term in cnt:
        postings[term].append(doc_id)

# ========== 4. Document frequencies & IDF ==========
dfreq: Dict[str, int] = {term: len(docs) for term, docs in postings.items()}

def idf(term: str) -> float:
    df_t = dfreq.get(term, 0)
    return math.log10(N / df_t) if df_t > 0 else 0.0

idf_index: Dict[str, float] = {term: idf(term) for term in postings.keys()}

# ========== 5. TF with log scaling + TF-IDF ==========
def tf(freq: int) -> float:
    return 1.0 + math.log10(freq) if freq > 0 else 0.0

doc_tfidf: List[Dict[str, float]] = []
doc_norms: List[float] = []

for cnt in doc_term_counts:
    vec: Dict[str, float] = {}
    for term, f in cnt.items():
        w = tf(f) * idf_index.get(term, 0.0)
        if w != 0.0:
            vec[term] = w
    norm = math.sqrt(sum(w*w for w in vec.values()))
    doc_tfidf.append(vec)
    doc_norms.append(norm)

# ========== 6. Query vector ==========
def build_query_vector(query: str) -> Tuple[Dict[str, float], float, List[str]]:
    q_counts = Counter(tokenize(query))
    q_vec: Dict[str, float] = {}
    used_terms: List[str] = []
    for term, f in q_counts.items():
        if term in idf_index:
            w = tf(f) * idf_index[term]
            if w != 0.0:
                q_vec[term] = w
                used_terms.append(term)
    q_norm = math.sqrt(sum(w*w for w in q_vec.values()))
    return q_vec, q_norm, used_terms

# ========== 7. Scoring with cosine similarity ==========
def score_documents(query: str, top_k: int = 5):
    if not query.strip():
        return []
    
    q_vec, q_norm, q_terms = build_query_vector(query)
    if q_norm == 0.0:
        return []

    candidate_docs = set()
    for t in q_terms:
        candidate_docs.update(postings.get(t, []))

    results = []
    for doc_id in candidate_docs:
        d_vec = doc_tfidf[doc_id]
        d_norm = doc_norms[doc_id]
        if d_norm == 0.0:
            continue
        dot = sum(q_vec[t] * d_vec.get(t, 0.0) for t in q_vec)
        if dot > 0.0:
            sim = dot / (q_norm * d_norm)
            results.append((doc_id, sim))

    results.sort(key=lambda x: x[1], reverse=True)
    return results[:top_k]

# ========== 8. Search function (returns DataFrame) ==========
def search(query: str, top_k: int = 5) -> pd.DataFrame:
    results = score_documents(query, top_k=top_k)
    if not results:
        print("No matching documents found.")
        return pd.DataFrame(columns=["Document", "Score"])
    return pd.DataFrame(
        [(doc_names[doc_id], score) for doc_id, score in results],
        columns=["Document", "Score"]
    )

# ========== 9. Demo ==========
if __name__ == "__main__":
    query = input("Enter your search query: ")
    df_results = search(query)
    print("\nTop matching documents:\n", df_results)

Enter your search query:  machine learning


No matching documents found.

Top matching documents:
 Empty DataFrame
Columns: [Document, Score]
Index: []


In [10]:
# Simple TF-IDF Retrieval System (Offline, Mac Desktop Version)

import math
import re
from collections import Counter, defaultdict
from typing import Dict, List, Tuple
import pandas as pd
import os
from nltk.stem import PorterStemmer

# ========== 0. Built-in stopwords list ==========
stop_words = set("""
a about above after again against all am an and any are aren't as at be because been 
before being below between both but by can't cannot could couldn't did didn't do does 
doesn't doing don't down during each few for from further had hadn't has hasn't have 
haven't having he he'd he'll he's her here here's hers herself him himself his how 
how's i i'd i'll i'm i've if in into is isn't it it's its itself let's me more most 
mustn't my myself no nor not of off on once only or other ought our ours ourselves out 
over own same shan't she she'd she'll she's should shouldn't so some such than that 
that's the their theirs them themselves then there there's these they they'd they'll 
they're they've this those through to too under until up very was wasn't we we'd we'll 
we're we've were weren't what what's when when's where where's which while who who's 
whom why why's with won't would wouldn't you you'd you'll you're you've your yours 
yourself yourselves
""".split())

# ========== 1. Load the CSV ==========
CSV_PATH = "/Users/chirathwijeweera/Desktop/documents.csv"

if not os.path.exists(CSV_PATH):
    raise FileNotFoundError(f"CSV file not found at {CSV_PATH}")

df = pd.read_csv(CSV_PATH)

if df.shape[1] < 2:
    raise ValueError("CSV must have at least two columns: name, content")

doc_names = df.iloc[:, 0].astype(str).tolist()
doc_texts = df.iloc[:, 1].astype(str).tolist()
N = len(doc_texts)

# ========== 2. Tokenizer ==========
token_pattern = re.compile(r"[a-z]+")
stemmer = PorterStemmer()

def tokenize(text: str, do_stem: bool = True) -> List[str]:
    tokens = [t for t in token_pattern.findall(text.lower()) if t not in stop_words]
    if do_stem:
        tokens = [stemmer.stem(t) for t in tokens]
    return tokens

# ========== 3. Build postings & raw counts ==========
postings: Dict[str, List[int]] = defaultdict(list)
doc_term_counts: List[Counter] = []

for doc_id, text in enumerate(doc_texts):
    cnt = Counter(tokenize(text))
    doc_term_counts.append(cnt)
    for term in cnt:
        postings[term].append(doc_id)

# ========== 4. Document frequencies & IDF ==========
dfreq: Dict[str, int] = {term: len(docs) for term, docs in postings.items()}

def idf(term: str) -> float:
    df_t = dfreq.get(term, 0)
    return math.log10(N / df_t) if df_t > 0 else 0.0

idf_index: Dict[str, float] = {term: idf(term) for term in postings.keys()}

# ========== 5. TF with log scaling + TF-IDF ==========
def tf(freq: int) -> float:
    return 1.0 + math.log10(freq) if freq > 0 else 0.0

doc_tfidf: List[Dict[str, float]] = []
doc_norms: List[float] = []

for cnt in doc_term_counts:
    vec: Dict[str, float] = {}
    for term, f in cnt.items():
        w = tf(f) * idf_index.get(term, 0.0)
        if w != 0.0:
            vec[term] = w
    norm = math.sqrt(sum(w*w for w in vec.values()))
    doc_tfidf.append(vec)
    doc_norms.append(norm)

# ========== 6. Query vector ==========
def build_query_vector(query: str) -> Tuple[Dict[str, float], float, List[str]]:
    q_counts = Counter(tokenize(query))
    q_vec: Dict[str, float] = {}
    used_terms: List[str] = []
    for term, f in q_counts.items():
        if term in idf_index:
            w = tf(f) * idf_index[term]
            if w != 0.0:
                q_vec[term] = w
                used_terms.append(term)
    q_norm = math.sqrt(sum(w*w for w in q_vec.values()))
    return q_vec, q_norm, used_terms

# ========== 7. Scoring with cosine similarity ==========
def score_documents(query: str, top_k: int = 5):
    if not query.strip():
        return []
    
    q_vec, q_norm, q_terms = build_query_vector(query)
    if q_norm == 0.0:
        return []

    candidate_docs = set()
    for t in q_terms:
        candidate_docs.update(postings.get(t, []))

    results = []
    for doc_id in candidate_docs:
        d_vec = doc_tfidf[doc_id]
        d_norm = doc_norms[doc_id]
        if d_norm == 0.0:
            continue
        dot = sum(q_vec[t] * d_vec.get(t, 0.0) for t in q_vec)
        if dot > 0.0:
            sim = dot / (q_norm * d_norm)
            results.append((doc_id, sim))

    results.sort(key=lambda x: x[1], reverse=True)
    return results[:top_k]

# ========== 8. Search function (returns DataFrame) ==========
def search(query: str, top_k: int = 5) -> pd.DataFrame:
    results = score_documents(query, top_k=top_k)
    if not results:
        print("No matching documents found.")
        return pd.DataFrame(columns=["Document", "Score"])
    return pd.DataFrame(
        [(doc_names[doc_id], score) for doc_id, score in results],
        columns=["Document", "Score"]
    )

# ========== 9. Demo ==========
if __name__ == "__main__":
    query = input("Enter your search query: ")
    df_results = search(query)
    print("\nTop matching documents:\n", df_results)

Enter your search query:  history



Top matching documents:
                   Document     Score
0  The Story Behind Banksy  0.153318


In [14]:
# ================================================
# Offline TF-IDF Document Retrieval System
# Author: Your Name
# ================================================

import math
import re
from collections import Counter, defaultdict
from typing import Dict, List, Tuple
import pandas as pd
import os
from nltk.stem import PorterStemmer

# ========== 0. Built-in English stopwords ==========
stop_words = set("""
a about above after again against all am an and any are aren't as at be because been 
before being below between both but by can't cannot could couldn't did didn't do does 
doesn't doing don't down during each few for from further had hadn't has hasn't have 
haven't having he he'd he'll he's her here here's hers herself him himself his how 
how's i i'd i'll i'm i've if in into is isn't it it's its itself let's me more most 
mustn't my myself no nor not of off on once only or other ought our ours ourselves out 
over own same shan't she she'd she'll she's should shouldn't so some such than that 
that's the their theirs them themselves then there there's these they they'd they'll 
they're they've this those through to too under until up very was wasn't we we'd we'll 
we're we've were weren't what what's when when's where where's which while who who's 
whom why why's with won't would wouldn't you you'd you'll you're you've your yours 
yourself yourselves
""".split())

# ========== 1. Load the CSV ==========
CSV_PATH = "/Users/chirathwijeweera/Desktop/documents.csv"

if not os.path.exists(CSV_PATH):
    raise FileNotFoundError(f"CSV file not found at {CSV_PATH}")

df = pd.read_csv(CSV_PATH)

if df.shape[1] < 2:
    raise ValueError("CSV must have at least two columns: name, content")

doc_names = df.iloc[:, 0].astype(str).tolist()
doc_texts = df.iloc[:, 1].astype(str).tolist()
N = len(doc_texts)

# ========== 2. Tokenizer ==========
token_pattern = re.compile(r"[a-z]+")
stemmer = PorterStemmer()

def tokenize(text: str, do_stem: bool = True) -> List[str]:
    tokens = [t for t in token_pattern.findall(text.lower()) if t not in stop_words]
    if do_stem:
        tokens = [stemmer.stem(t) for t in tokens]
    return tokens

# ========== 3. Build postings & raw counts ==========
postings: Dict[str, List[int]] = defaultdict(list)
doc_term_counts: List[Counter] = []

for doc_id, text in enumerate(doc_texts):
    cnt = Counter(tokenize(text))
    doc_term_counts.append(cnt)
    for term in cnt:
        postings[term].append(doc_id)

# ========== 4. Document frequencies & IDF ==========
dfreq: Dict[str, int] = {term: len(docs) for term, docs in postings.items()}

def idf(term: str) -> float:
    df_t = dfreq.get(term, 0)
    return math.log10(N / df_t) if df_t > 0 else 0.0

idf_index: Dict[str, float] = {term: idf(term) for term in postings.keys()}

# ========== 5. TF with log scaling + TF-IDF ==========
def tf(freq: int) -> float:
    return 1.0 + math.log10(freq) if freq > 0 else 0.0

doc_tfidf: List[Dict[str, float]] = []
doc_norms: List[float] = []

for cnt in doc_term_counts:
    vec: Dict[str, float] = {}
    for term, f in cnt.items():
        w = tf(f) * idf_index.get(term, 0.0)
        if w != 0.0:
            vec[term] = w
    norm = math.sqrt(sum(w*w for w in vec.values()))
    doc_tfidf.append(vec)
    doc_norms.append(norm)

# ========== 6. Query vector ==========
def build_query_vector(query: str) -> Tuple[Dict[str, float], float, List[str]]:
    q_counts = Counter(tokenize(query))
    q_vec: Dict[str, float] = {}
    used_terms: List[str] = []
    for term, f in q_counts.items():
        if term in idf_index:
            w = tf(f) * idf_index[term]
            if w != 0.0:
                q_vec[term] = w
                used_terms.append(term)
    q_norm = math.sqrt(sum(w*w for w in q_vec.values()))
    return q_vec, q_norm, used_terms

# ========== 7. Scoring with cosine similarity ==========
def score_documents(query: str, top_k: int = 5):
    if not query.strip():
        return []
    
    q_vec, q_norm, q_terms = build_query_vector(query)
    if q_norm == 0.0:
        return []

    candidate_docs = set()
    for t in q_terms:
        candidate_docs.update(postings.get(t, []))

    results = []
    for doc_id in candidate_docs:
        d_vec = doc_tfidf[doc_id]
        d_norm = doc_norms[doc_id]
        if d_norm == 0.0:
            continue
        dot = sum(q_vec[t] * d_vec.get(t, 0.0) for t in q_vec)
        if dot > 0.0:
            sim = dot / (q_norm * d_norm)
            results.append((doc_id, sim))

    results.sort(key=lambda x: x[1], reverse=True)
    return results[:top_k]

# ========== 8. Search function ==========
def search(query: str, top_k: int = 5) -> pd.DataFrame:
    results = score_documents(query, top_k=top_k)
    if not results:
        print("No matching documents found.")
        return pd.DataFrame(columns=["Document", "Score"])
    return pd.DataFrame(
        [(doc_names[doc_id], score) for doc_id, score in results],
        columns=["Document", "Score"]
    )

# ========== 9. Demo ==========
if __name__ == "__main__":
    query = input("Enter your search query: ")
    df_results = search(query)
    print("\nTop matching documents:\n", df_results)

Enter your search query:  machine


No matching documents found.

Top matching documents:
 Empty DataFrame
Columns: [Document, Score]
Index: []
