In [None]:
import ir_datasets

import numpy as np
import os
import gzip
import re

from sklearn.feature_extraction.text import TfidfVectorizer,  ENGLISH_STOP_WORDS
from sklearn.metrics.pairwise import cosine_similarity
from rank_bm25 import BM25Okapi  # lightweight BM25 implementation

In [None]:
# Handle for the "trec-robust04" collection in ir_datasets; the example query
# further down is read from it via dataset.queries_iter().
dataset = ir_datasets.load("trec-robust04")

#### Features

Overlap between query and title/body

In [None]:
def keyword_overlap_title_unique(query, doc):
    """Count the distinct query words that also occur in the document title.

    Both ``query.title`` and ``doc.title`` are lower-cased and split into runs
    of word characters before comparison. A document whose title is missing
    or empty contributes an overlap of 0.
    """
    wanted = set(re.findall(r'\w+', query.title.lower()))
    if not doc.title:
        return 0
    present = set(re.findall(r'\w+', doc.title.lower()))
    return len(wanted & present)

def keyword_overlap_title_total(query, doc):
    """Count every occurrence of every query word in the document title.

    Tokens are lower-cased runs of word characters. Repeated query words are
    counted once per repetition, and each occurrence in the title counts
    separately. A missing or empty document title yields 0.
    """
    query_tokens = re.findall(r'\w+', query.title.lower())
    if doc.title:
        title_tokens = re.findall(r'\w+', doc.title.lower())
    else:
        title_tokens = []
    return sum(title_tokens.count(token) for token in query_tokens)

In [None]:
def keyword_overlap_body_unique(query, doc):
    """Count the distinct query words that also occur in the document body.

    ``query.title`` and ``doc.text`` are lower-cased and split into runs of
    word characters; the result is the size of the intersection of the two
    token sets.
    """
    wanted = set(re.findall(r'\w+', query.title.lower()))
    present = set(re.findall(r'\w+', doc.text.lower()))
    return len(wanted & present)

def keyword_overlap_body_total(query, doc):
    """Count every occurrence of every query word in the document body.

    Tokens are lower-cased runs of word characters taken from ``query.title``
    and ``doc.text``. Repeated query words are counted once per repetition,
    and each occurrence in the body counts separately.
    """
    query_tokens = re.findall(r'\w+', query.title.lower())
    body_tokens = re.findall(r'\w+', doc.text.lower())
    return sum(body_tokens.count(token) for token in query_tokens)

In [None]:
def keyword_overlap_title_stopwords(query, doc):
    """Count distinct non-stopword query words that also occur in the document title.

    ``query.title`` and ``doc.title`` are lower-cased and split into runs of
    word characters; English stop words (``ENGLISH_STOP_WORDS``) are removed
    from both sides before intersecting.

    A document whose title is missing or empty yields 0, matching the
    behaviour of the other title-overlap functions in this notebook.
    """
    query_terms = set(re.findall(r'\w+', query.title.lower()))
    # Documents may have no title at all (None); treat that as "no terms"
    # instead of failing on None.lower().
    doc_terms = set(re.findall(r'\w+', doc.title.lower())) if doc.title else set()

    # Remove stopwords
    query_terms = query_terms.difference(ENGLISH_STOP_WORDS)
    doc_terms = doc_terms.difference(ENGLISH_STOP_WORDS)

    return len(query_terms.intersection(doc_terms))

def keyword_overlap_body_stopwords(query, doc):
    """Count distinct non-stopword query words that also occur in the document body.

    ``query.title`` and ``doc.text`` are lower-cased and split into runs of
    word characters; English stop words (``ENGLISH_STOP_WORDS``) are dropped
    from both token sets before they are intersected.
    """
    stop_words = set(ENGLISH_STOP_WORDS)

    query_words = set(re.findall(r'\w+', query.title.lower()))
    body_words = set(re.findall(r'\w+', doc.text.lower()))

    # Keep only the content-bearing words on each side.
    query_words = query_words - stop_words
    body_words = body_words - stop_words

    return len(query_words & body_words)

Absolute lengths

In [None]:
def query_length(query):
    """Return the number of word tokens (runs of word characters) in ``query.title``."""
    return sum(1 for _ in re.finditer(r'\w+', query.title))

def document_length(doc):
    """Return the number of word tokens (runs of word characters) in ``doc.text``."""
    return sum(1 for _ in re.finditer(r'\w+', doc.text))

Cosine similarity

In [None]:
def tfidf_cosine_similarity_body(query, doc, vectorizer):
    """Return the cosine similarity between a query and a document body.

    Both ``query.title`` and ``doc.text`` are projected with the supplied
    ``vectorizer`` (via its ``transform`` method), and the single similarity
    value between the two resulting vectors is returned as a Python float.
    """
    query_vector = vectorizer.transform([query.title])
    body_vector = vectorizer.transform([doc.text])

    # cosine_similarity returns a matrix; with one row on each side the only
    # entry sits at position [0][0].
    similarity_matrix = cosine_similarity(query_vector, body_vector)
    return float(similarity_matrix[0][0])

BM25 score

In [None]:
def bm25_score(query, bm25_model):
    """Score a query against every document known to ``bm25_model``.

    ``query.title`` is lower-cased and split into runs of word characters;
    the token list is handed to ``bm25_model.get_scores`` and its result is
    returned unchanged (bm25 values for the query for all documents, in the
    order of the tokenized corpus the model was built from).
    """
    lowered_title = query.title.lower()
    return bm25_model.get_scores(re.findall(r'\w+', lowered_title))

#### Using the features (example for query 1)

Formatting the FBIS dataset:

In [None]:
base_path = "data/FBIS" #put correct path here

# Gather every <DOC>...</DOC> record from the gzip archives found directly in
# base_path, visiting the archives in sorted file-name order.
doc_pattern = re.compile(r"<DOC>.*?</DOC>", re.DOTALL)
gz_names = [name for name in sorted(os.listdir(base_path)) if name.endswith(".gz")]

raw_contents = []
for gz_name in gz_names:
    archive_path = os.path.join(base_path, gz_name)
    with gzip.open(archive_path, 'rt', encoding='latin-1') as handle:
        raw_contents += doc_pattern.findall(handle.read())

print(f"Amount of documents: {len(raw_contents)}\n")

In [None]:
from dataclasses import dataclass

@dataclass
class Document:
    """One parsed document record from the FBIS files.

    Instances are created by the parsing cell of this notebook and expose the
    ``title`` and ``text`` attributes that the feature functions read.
    """
    # Stripped contents of the <DOCNO> tag; the parsing cell stores None when
    # the tag is not found.
    id: str
    # Stripped contents of the <TI> tag; the parsing cell stores None when the
    # tag is not found.
    title: str
    # Stripped contents of the <TEXT> block(s), joined with newlines.
    text: str
    # Stripped contents of the <DATE1> tag; the parsing cell stores None when
    # the tag is not found.
    date: str
    # The complete, unmodified <DOC>...</DOC> record the fields were parsed from.
    raw: str

In [None]:
# Parse each raw <DOC>...</DOC> record into a Document.
structured_docs = []

for raw_doc in raw_contents:
    docno_match = re.search(r"<DOCNO>(.*?)</DOCNO>", raw_doc)
    docno = docno_match.group(1).strip() if docno_match else None

    # Title is optional; it stays None when the record has no <TI> tag.
    title_match = re.search(r"<TI>(.*?)</TI>", raw_doc, re.DOTALL | re.IGNORECASE)
    title = title_match.group(1).strip() if title_match else None

    # A record may contain several <TEXT> blocks; join them with newlines.
    # When there is none, the join yields an empty string (never None), so
    # that doc.text is always a str as declared on Document and the feature
    # functions, the TF-IDF vectorizer and the BM25 tokenisation can call
    # string methods on it safely.
    text_blocks = re.findall(r"<TEXT>(.*?)</TEXT>", raw_doc, re.DOTALL | re.IGNORECASE)
    text = "\n".join(t.strip() for t in text_blocks)

    date = re.search(r"<DATE1>(.*?)</DATE1>", raw_doc, re.DOTALL)
    date = date.group(1).strip() if date else None

    doc = Document(id=docno, raw=raw_doc, title=title, text=text, date=date)
    structured_docs.append(doc)

# Show the first parsed document as a sanity check.
n=0
print(structured_docs[n].id)
print(structured_docs[n].title)
print(structured_docs[n].date)
print(structured_docs[n].text)

Example query and docs

In [None]:
# Take the first query of the dataset as the running example.
query1 = next(dataset.queries_iter())
print(query1.query_id)
print(query1.title)
print(query1.description)

# Number of distinct query terms, tokenised the same way as the
# keyword_overlap_*_unique functions (lower-cased runs of word characters,
# de-duplicated). The cells below compare unique-overlap counts against this
# value, so it has to use the same tokenisation: a plain whitespace split
# miscounts queries that contain punctuation, hyphens or repeated words.
len_query1 = len(set(re.findall(r'\w+', query1.title.lower())))

# A few documents to try the per-document features on.
doc1 = structured_docs[0]
doc2 = structured_docs[1]
doc3 = structured_docs[2]

test_docs = [doc1, doc2, doc3]

Overlapping words

In [None]:
# Print the title of every document whose title shares at least
# len_query1 - 1 distinct terms with query1.
min_overlap = len_query1 - 1
for doc in structured_docs:
    if keyword_overlap_title_unique(query1, doc) >= min_overlap:
        print(doc.title)


In [None]:
# Print every document whose unique body overlap with query1 equals len_query1.
body_overlaps = ((keyword_overlap_body_unique(query1, doc), doc) for doc in structured_docs)
for num, doc in body_overlaps:
    if num != len_query1:
        continue
    print(f"#{num} - {doc.title}")


In [None]:
#print the first document with more than `amount` overlapping terms
amount = 100
for doc in structured_docs:
    # keyword_overlap_body_total takes exactly (query, doc); the threshold is
    # applied here rather than passed into the function.
    num = keyword_overlap_body_total(query1, doc)
    if num > amount:
        print("Doc ID: ", doc.id)
        print("Title: ", doc.title)
        print("Amount: ", num)
        print("Text: \n", doc.text)
        break # only print the first one with more than `amount`

Absolute lengths

In [None]:
# Token count of the example query, followed by the token count of each test document.
query1_length = query_length(query1)
print(query1_length)

test_doc_lengths = [document_length(doc) for doc in test_docs]
for length in test_doc_lengths:
    print(length)

Cosine similarity

In [None]:
# Fit a TF-IDF vocabulary (English stop words removed) on the bodies of all parsed documents.
corpus_texts = [doc.text for doc in structured_docs]
vectorizer = TfidfVectorizer(stop_words='english')
vectorizer.fit(corpus_texts)

In [None]:
#print all with a cosine similarity > 0.4
for idx, doc in enumerate(structured_docs):
    # Progress marker every 1000 documents.
    if not idx % 1000:
        print(idx)

    cos = tfidf_cosine_similarity_body(query1, doc, vectorizer)
    if not cos > 0.4:
        continue

    print(f"cos: {cos}", f"doc id: {doc.id}", f"doc title: {doc.title}", "-"*40, sep="\n")

BM25 score

In [None]:
# Tokenise every document body (lower-cased runs of word characters) for BM25.
all_docs = structured_docs
all_text = [doc.text for doc in all_docs]

tokenized_corpus = []
for text in all_text:
    tokenized_corpus.append(re.findall(r'\w+', text.lower()))

In [None]:
# Build the BM25 model from the tokenised document bodies; the scores it
# returns later are in the same order as tokenized_corpus.
bm25_model = BM25Okapi(tokenized_corpus)

In [None]:
# Container for per-document results; it is populated by the ranking cell below.
scored_documents = []
# One BM25 score per document for the example query, in tokenized_corpus order.
all_bm25_scores = bm25_score(query1, bm25_model)

In [None]:
#showing the top 5 most similar ones according to bm25:
# Build the list from scratch on every run: appending to a list created in
# another cell would duplicate entries each time this cell is re-executed.
scored_documents = [
    {'doc': doc_object, 'bm25 score': score}
    for doc_object, score in zip(structured_docs, all_bm25_scores)
]

ranked_results = sorted(scored_documents, key=lambda x: x['bm25 score'], reverse=True)

# Slicing (instead of indexing 0..4) also works when fewer than five
# documents are available.
for result in ranked_results[:5]:
    print(result['bm25 score'])
    print(result['doc'].id)
    print(result['doc'].title)
    print(result['doc'].text)