In [1]:
! pip install rank_bm25
! pip install faiss-cpu
! pip install sentence-transformers

Collecting rank_bm25
  Downloading rank_bm25-0.2.2-py3-none-any.whl.metadata (3.2 kB)
Downloading rank_bm25-0.2.2-py3-none-any.whl (8.6 kB)
Installing collected packages: rank_bm25
Successfully installed rank_bm25-0.2.2
[31mERROR: Could not find a version that satisfies the requirement faiss (from versions: none)[0m[31m
[0m[31mERROR: No matching distribution found for faiss[0m[31m
Collecting nvidia-cuda-nvrtc-cu12==12.4.127 (from torch>=1.11.0->sentence-transformers)
  Downloading nvidia_cuda_nvrtc_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-runtime-cu12==12.4.127 (from torch>=1.11.0->sentence-transformers)
  Downloading nvidia_cuda_runtime_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-cupti-cu12==12.4.127 (from torch>=1.11.0->sentence-transformers)
  Downloading nvidia_cuda_cupti_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cudnn-cu12==9.1.0.70 (from to

# 1. Getting the text archive and chunking it

In [2]:
# Raw passage used as the retrieval corpus (an encyclopedia-style summary of the film Interstellar).
# It still contains bracketed citation markers such as [4] and [5], and its final sentence has
# no trailing period.
text = """
Interstellar is a 2014 epic science fiction film co-written, directed, and produced by Christopher Nolan.
It stars Matthew McConaughey, Anne Hathaway, Jessica Chastain, Bill Irwin, Ellen Burstyn, Matt Damon, and Michael Caine.
Set in a dystopian future where humanity is struggling to survive, the film follows a group of astronauts who travel through a wormhole near Saturn in search of a new home for mankind.

Brothers Christopher and Jonathan Nolan wrote the screenplay, which had its origins in a script Jonathan developed in 2007.
Caltech theoretical physicist and 2017 Nobel laureate in Physics[4] Kip Thorne was an executive producer, acted as a scientific consultant, and wrote a tie-in book, The Science of Interstellar.
Cinematographer Hoyte van Hoytema shot it on 35 mm movie film in the Panavision anamorphic format and IMAX 70 mm.
Principal photography began in late 2013 and took place in Alberta, Iceland, and Los Angeles.
Interstellar uses extensive practical and miniature effects and the company Double Negative created additional digital effects.

Interstellar premiered on October 26, 2014, in Los Angeles.
In the United States, it was first released on film stock, expanding to venues using digital projectors.
The film had a worldwide gross over $677 million (and $773 million with subsequent re-releases), making it the tenth-highest grossing film of 2014.
It received acclaim for its performances, direction, screenplay, musical score, visual effects, ambition, themes, and emotional weight.
It has also received praise from many astronomers for its scientific accuracy and portrayal of theoretical astrophysics. Since its premiere, Interstellar gained a cult following,[5] and now is regarded by many sci-fi experts as one of the best science-fiction films of all time.
Interstellar was nominated for five awards at the 87th Academy Awards, winning Best Visual Effects, and received numerous other accolades"""

# Split the passage into sentence-level chunks on '.'.
# (Naive, but adequate here: the passage contains no decimals or dotted abbreviations.)
raw_chunks = text.split('.')

# Trim surrounding spaces/newlines and drop any chunk that ends up empty
# (e.g. the piece after a trailing period), so no blank document is indexed downstream.
texts = [chunk for chunk in (piece.strip(' \n') for piece in raw_chunks) if chunk]

print(len(texts))

15


# Sparse Retrieval

## 1- TF-IDF

In [4]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.metrics.pairwise import cosine_similarity


# TF-IDF settings gathered in one place so they are easy to inspect and tweak
tfidf_params = {
    'stop_words': 'english',  # drop scikit-learn's built-in English stop words
    'ngram_range': (1, 2),    # index unigrams and bigrams
    'max_df': 0.5,            # upper document-frequency cut-off
    'min_df': 2,              # lower document-frequency cut-off
    'sublinear_tf': True,     # dampen raw term counts
}
vectorizer = TfidfVectorizer(**tfidf_params)

# Learn the vocabulary / IDF weights and build the chunk-by-term matrix in one pass
tfidf_train = vectorizer.fit_transform(texts)

In [5]:
# Keyword extraction: the (up to) five highest-weighted TF-IDF terms of every chunk
feature_names = vectorizer.get_feature_names_out()
for idx, vec in enumerate(tfidf_train):
    row = vec.toarray().flatten()
    top_idx = row.argsort()[-5:][::-1]
    # Skip zero-weight terms so chunks with fewer than five indexed terms are not padded
    keywords = [(feature_names[i], row[i]) for i in top_idx if row[i] > 0]
    print(f"Doc {idx} hot keywords:", [(word, score.item()) for word, score in keywords])

# Lexical search: rank chunks by cosine similarity between TF-IDF vectors.
# (This matches on shared terms only; it is not a semantic search.)
query = "how precise was the science ?"
q_vec = vectorizer.transform([query])
sims = cosine_similarity(q_vec, tfidf_train)[0]

# Keep the five best chunks, but drop those with zero similarity: they share no indexed
# term with the query and would otherwise appear as hits in arbitrary order.
top_docs = [idx for idx in sims.argsort()[-5:][::-1] if sims[idx] > 0]

print("\nSearch results:")
if not top_docs:
    print("(no chunk shares an indexed term with the query)")
for idx in top_docs:
    print(f"Doc {idx}: {texts[idx]}")

Doc 0 hot keywords: [('science fiction', 0.38900698689685276), ('fiction', 0.38900698689685276), ('christopher', 0.38900698689685276), ('nolan', 0.38900698689685276), ('science', 0.34715533301401397)]
Doc 1 hot keywords: []
Doc 2 hot keywords: [('film', 1.0)]
Doc 3 hot keywords: [('wrote', 0.5), ('screenplay', 0.5), ('nolan', 0.5), ('christopher', 0.5)]
Doc 4 hot keywords: [('wrote', 0.4843269747278795), ('theoretical', 0.4843269747278795), ('scientific', 0.4843269747278795), ('science', 0.4322202373293343), ('interstellar', 0.33085920131650237)]
Doc 5 hot keywords: [('film', 1.0)]
Doc 6 hot keywords: [('los', 0.5773502691896258), ('los angeles', 0.5773502691896258), ('angeles', 0.5773502691896258)]
Doc 7 hot keywords: [('effects', 0.7802965467309698), ('digital', 0.516414625538216), ('interstellar', 0.352779298839866)]
Doc 8 hot keywords: [('los', 0.4843269747278795), ('los angeles', 0.4843269747278795), ('angeles', 0.4843269747278795), ('2014', 0.4322202373293343), ('interstellar', 0

## 2- BM25

##### BM25 search (lexical search) #####


In [6]:
from rank_bm25 import BM25Okapi
from sklearn.feature_extraction import _stop_words
import string


def bm25_tokenizer(text):
    """Turn a string into a list of lowercase tokens for BM25.

    The text is lowercased and split on whitespace; each piece has leading and
    trailing punctuation stripped. Pieces that become empty, or that appear in
    scikit-learn's English stop-word list, are discarded.
    """
    stripped_words = (word.strip(string.punctuation) for word in text.lower().split())
    return [
        word
        for word in stripped_words
        if word and word not in _stop_words.ENGLISH_STOP_WORDS
    ]

In [7]:
from tqdm import tqdm

# Tokenise every chunk (tqdm reports progress), then build the BM25 index over the token lists
tokenized_corpus = [bm25_tokenizer(passage) for passage in tqdm(texts)]
bm25 = BM25Okapi(tokenized_corpus)

100%|██████████| 15/15 [00:00<00:00, 32214.32it/s]


In [8]:
import numpy as np


def keyword_search(query, top_k=5, num_candidates=15):
    """Run a BM25 (lexical) search over the chunked corpus and print the best hits.

    Args:
        query: Free-text question; tokenised with ``bm25_tokenizer``.
        top_k: Number of hits to print and return.
        num_candidates: Size of the candidate pool taken from the BM25 scores before
            sorting. Values larger than the corpus are clamped to the corpus size.

    Returns:
        A list of at most ``top_k`` dicts ``{'corpus_id': ..., 'score': ...}``,
        sorted by descending BM25 score.
    """
    print("Input question:", query)

    bm25_scores = bm25.get_scores(bm25_tokenizer(query))

    # np.argpartition raises if the pool is larger than the score array, and a pool of 0
    # would make the `[-0:]` slice select everything, so clamp and special-case it.
    num_candidates = min(num_candidates, len(bm25_scores))
    if num_candidates > 0:
        top_n = np.argpartition(bm25_scores, -num_candidates)[-num_candidates:]
    else:
        top_n = []

    bm25_hits = [{'corpus_id': idx, 'score': bm25_scores[idx]} for idx in top_n]
    bm25_hits = sorted(bm25_hits, key=lambda x: x['score'], reverse=True)

    # Report the requested cut-off instead of a hard-coded "5"
    print(f"Top-{top_k} lexical search (BM25) hits")
    for hit in bm25_hits[0:top_k]:
        print("\t{:.3f}\t{}".format(hit['score'], texts[hit['corpus_id']].replace("\n", " ")))

    return bm25_hits[:top_k]



# Example query; as the cell's last expression, the returned hit list is also displayed
keyword_search(query = "how precise was the science ?")

Input question: how precise was the science ?
Top-5 lexical search (BM25) hits
	1.789	Interstellar is a 2014 epic science fiction film co-written, directed, and produced by Christopher Nolan
	1.373	Caltech theoretical physicist and 2017 Nobel laureate in Physics[4] Kip Thorne was an executive producer, acted as a scientific consultant, and wrote a tie-in book, The Science of Interstellar
	0.000	It stars Matthew McConaughey, Anne Hathaway, Jessica Chastain, Bill Irwin, Ellen Burstyn, Matt Damon, and Michael Caine
	0.000	Set in a dystopian future where humanity is struggling to survive, the film follows a group of astronauts who travel through a wormhole near Saturn in search of a new home for mankind
	0.000	Brothers Christopher and Jonathan Nolan wrote the screenplay, which had its origins in a script Jonathan developed in 2007


[{'corpus_id': np.int64(0), 'score': np.float64(1.788604950756303)},
 {'corpus_id': np.int64(4), 'score': np.float64(1.372650311045535)},
 {'corpus_id': np.int64(1), 'score': np.float64(0.0)},
 {'corpus_id': np.int64(2), 'score': np.float64(0.0)},
 {'corpus_id': np.int64(3), 'score': np.float64(0.0)}]

#  Dense Retrieval

In [11]:
import numpy as np
from sentence_transformers import SentenceTransformer
import faiss
import pandas as pd

# Sentence-embedding model used for both the corpus and the queries
model = SentenceTransformer('all-MiniLM-L6-v2')

# Encode every chunk as a unit-length vector (normalize_embeddings=True)
embeds = model.encode(texts, normalize_embeddings=True, convert_to_numpy=True)
print("Corpus embeddings shape:", embeds.shape)

# Flat L2 FAISS index sized to the embedding width, filled with the float32 corpus vectors
dim = embeds.shape[1]
corpus_vectors = embeds.astype('float32')
index = faiss.IndexFlatL2(dim)
index.add(corpus_vectors)


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md: 0.00B [00:00, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

Corpus embeddings shape: (15, 384)


In [14]:
#  Define search function

def search(query, number_of_results=5):
    """Dense nearest-neighbour search over the embedded corpus.

    Args:
        query: Free-text question; embedded with the same model as the corpus.
        number_of_results: Maximum number of neighbours to return. Requests larger
            than the index are clamped to the number of indexed chunks.

    Returns:
        A DataFrame with one row per neighbour: ``text`` (the chunk) and
        ``distance`` (as reported by the FAISS ``index``), in the order returned
        by the index.
    """
    # Never ask for more neighbours than are indexed: FAISS pads missing results with
    # label -1, which would silently be looked up as the *last* chunk.
    k = min(number_of_results, index.ntotal)

    if k > 0:
        # Embed the query
        q_embed = model.encode([query], convert_to_numpy=True, normalize_embeddings=True)[0]

        # Retrieve nearest neighbours
        distances, idxs = index.search(q_embed.reshape(1, -1).astype('float32'), k)

        # Defensive: drop any padded (-1) labels
        keep = idxs[0] >= 0
        hit_ids = idxs[0][keep]
        hit_distances = distances[0][keep]
    else:
        hit_ids, hit_distances = [], []

    # Format into a DataFrame
    results = pd.DataFrame({
        'text': [texts[i] for i in hit_ids],
        'distance': hit_distances
    })
    print(f"Query: “{query}”\nNearest neighbors:")
    return results



# Run the dense search for a sample question and print the resulting table
query = "how precise was the science"
results = search(query, number_of_results=5)
print(results)


Query: “how precise was the science”
Nearest neighbors:
                                                text  distance
0  It has also received praise from many astronom...  1.048788
1  Caltech theoretical physicist and 2017 Nobel l...  1.333017
2  Interstellar uses extensive practical and mini...  1.581987
3  Since its premiere, Interstellar gained a cult...  1.584958
4  Cinematographer Hoyte van Hoytema shot it on 3...  1.712463
