<a href="https://colab.research.google.com/github/giuliocapecchi/IR_project/blob/main/IR_project.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Install the third-party packages used by the notebook.
# NOTE(review): pandas and joblib are imported in later cells but are not listed here — confirm the runtime already provides them.
%pip install torch matplotlib nltk tqdm gdown ir_datasets humanize 

# 1. Download and prepare the collection

In [1]:
# chosen_collection can be one of ["vaswani", "MSMARCO"]
# It selects which collection the following cells download, index and query.

chosen_collection = "vaswani"

In [None]:
import gdown
import ir_datasets
import pandas as pd
import os

# Reject anything other than the two supported collections up front.
if chosen_collection not in ("vaswani", "MSMARCO"):
    raise ValueError("chosen_collection must be one of ['vaswani', 'MSMARCO']")

if chosen_collection == "vaswani":
    # Load the collection through ir_datasets and materialise it as a DataFrame.
    vaswani_dataset = ir_datasets.load("vaswani")
    docs = list(vaswani_dataset.docs_iter())
    df = pd.DataFrame(docs)
    # Shift the ids down by one and store them back as strings.
    zero_based_ids = df['doc_id'].astype(int) - 1
    df['doc_id'] = zero_based_ids.astype(str)

elif chosen_collection == "MSMARCO":
    url_collection = 'https://drive.google.com/uc?id=1_wXJjiwdgc9Kpt7o7atP8oWe-U4Z56hn'
    url_stats = 'https://drive.google.com/uc?id=1ruBLKGkKvfFlPMdjVUGkR09_f-528fEz'
    url_lex = 'https://drive.google.com/uc?id=151vGXMMwslHM-Vv8vKRATNLazyAXDM1A'
    url_inv = 'https://drive.google.com/uc?id=1UUL0SkjFq4V9tNGiabRo27GfH3XsUuyk'
    url_doc = 'https://drive.google.com/uc?id=1p-AChVgbUN4nIzFO55-pm8CtCAghobWC'

    # The raw collection is fetched only when it is not already on disk.
    if not os.path.exists('collection.tsv'):
        gdown.download(url_collection, 'collection.tsv', quiet=False)

    # Pre-built index components: each one is downloaded only if missing.
    os.makedirs('./pickles', exist_ok=True)
    for pickle_url, pickle_path in (
        (url_stats, './pickles/stats.pkl'),
        (url_lex, './pickles/lex.pkl'),
        (url_inv, './pickles/inv.pkl'),
        (url_doc, './pickles/doc.pkl'),
    ):
        if not os.path.exists(pickle_path):
            gdown.download(pickle_url, pickle_path, quiet=False)

    # Tab-separated file without a header row: first column is the id, second the text.
    df = pd.read_csv('collection.tsv', sep='\t', header=None, names=['doc_id', 'text'])

In [None]:
# let's not truncate Pandas output too much
pd.set_option('display.max_colwidth', 50) # TODO: set this to 150


print(df.head(5)) # prints the first 5 rows of the collection

In [None]:
import re
import string
import nltk


# English stopword list and Porter stemmer shared by every preprocess() call.
nltk.download("stopwords", quiet=True)
STOPWORDS = set(nltk.corpus.stopwords.words("english"))
STEMMER = nltk.stem.PorterStemmer()

def preprocess(s):
    """Turn a raw string into a list of stemmed, stopword-free tokens.

    Steps, in order: lowercase; replace '&' with ' and '; map curly quotes,
    acute accents and dashes/hyphens to an apostrophe; drop every '.' that is
    not followed by a digit or by the start of a word (so a dotted acronym
    such as 'u.s.a.' collapses to 'usa'); turn all remaining ASCII punctuation
    into spaces; collapse whitespace; split; remove stopwords; stem.

    Args:
        s: the text to process.

    Returns:
        The list of stemmed tokens, in their original order.
    """
    text = s.lower().replace("&", " and ")
    text = re.sub(r"[‘’´“”–-]", "'", text)
    text = re.sub(r"\.(?!(\S[^. ])|\d)", "", text)
    # Each punctuation character becomes a single space so neighbouring words stay separated.
    punctuation_to_space = str.maketrans(string.punctuation, " " * len(string.punctuation))
    text = text.translate(punctuation_to_space)
    words = re.sub(r"\s+", " ", text).strip().split()
    # Stopwords are filtered on the surface form, before stemming.
    return [STEMMER.stem(word) for word in words if word not in STOPWORDS]

In [None]:
import time

def profile(f):
    """Decorator that prints how long each call to ``f`` takes.

    The elapsed time is printed as ``<function name> (<milliseconds> ms)``
    after the call returns; nothing is printed if ``f`` raises.

    Args:
        f: the callable to time.

    Returns:
        A wrapper with the same signature and metadata as ``f`` that returns
        whatever ``f`` returns.
    """
    import functools  # local import keeps this cell self-contained

    @functools.wraps(f)  # preserve __name__/__doc__ of the wrapped function
    def f_timer(*args, **kwargs):
        # perf_counter is monotonic, so the measurement cannot go negative
        # if the system clock is adjusted during the call.
        start = time.perf_counter()
        result = f(*args, **kwargs)
        ms = (time.perf_counter() - start) * 1000
        print(f"{f.__name__} ({ms:.3f} ms)")
        return result
    return f_timer

In [None]:
import pickle
import humanize
import os

def print_pickled_size(var_name, var):
    """Print the size ``var`` occupies once pickled.

    The object is pickled into an anonymous temporary file that is removed
    automatically, even if pickling fails. No directory is created or deleted
    in the working directory, and ``var_name`` is used only as a label.

    Args:
        var_name: label shown in the printed message.
        var: any picklable object.
    """
    import tempfile  # local import keeps this cell self-contained

    with tempfile.TemporaryFile() as scratch:
        pickle.dump(var, scratch)
        # The write position after the dump equals the number of bytes written.
        size_in_bytes = scratch.tell()
    print(f'{var_name} requires {humanize.naturalsize(size_in_bytes)}')


def vbyte_encode(number):
    """Encode a non-negative integer as a variable-length byte string.

    The value is emitted 7 bits at a time, least-significant group first.
    The high bit (0x80) is set only on the LAST byte, marking the end of the
    number; all preceding bytes have it cleared.

    Args:
        number: the non-negative integer to encode.

    Returns:
        The encoded ``bytes``.

    Raises:
        ValueError: if ``number`` is negative (the shift loop would otherwise
            never terminate, because ``-1 >> 7 == -1``).
    """
    if number < 0:
        raise ValueError("vbyte_encode only supports non-negative integers")
    bytes_list = bytearray()
    while True:
        byte = number & 0x7F  # take the 7 least-significant bits (0x7F = 0111 1111)
        number >>= 7  # drop the bits just consumed
        if number:
            bytes_list.append(byte)  # more groups follow: high bit stays 0
        else:
            bytes_list.append(0x80 | byte)  # last group: set the terminator bit (0x80 = 1000 0000)
            break
    return bytes(bytes_list)

def vbyte_decode(bytes_seq):
    """Decode the first variable-byte number found in ``bytes_seq``.

    Each byte contributes its low 7 bits, least-significant group first.
    Decoding stops at the first byte whose high bit (0x80) is set; any bytes
    after it are ignored. If no byte carries the high bit, the value
    accumulated so far is returned (0 for an empty sequence).
    """
    value = 0
    shift = 0
    for chunk in bytes_seq:
        value |= (chunk & 0x7F) << shift
        if chunk & 0x80:
            # terminator bit: this was the last group of the number
            return value
        shift += 7
    return value

def decode_concatenated_vbyte(encoded_bytes):
    """Decode a byte string holding several variable-byte numbers back to back.

    A byte with the high bit (0x80) set closes the current number. Trailing
    bytes that are never closed by such a byte are discarded.

    Returns:
        The decoded integers, in the order they appear in ``encoded_bytes``.
    """
    values = []
    partial, shift = 0, 0
    for chunk in encoded_bytes:
        # Every byte contributes its low 7 bits at the current offset.
        partial |= (chunk & 0x7F) << shift
        if chunk & 0x80:
            # Terminator bit: emit the number and start a new one.
            values.append(partial)
            partial, shift = 0, 0
        else:
            shift += 7
    return values

In [None]:
# from collections import namedtuple


# class MSMarcoDataset:
#     """
#     This class that takes the dataframe we created before with columns 'docno' and 'text', and creates a list of namedtuples
#     """
#     def __init__(self, df):
#         self.docs = [Document(row.doc_id, row.text) for row in df.itertuples()]

#     def docs_iter(self):
#         return iter(self.docs)

#     def docs_count(self):
#         return len(self.docs)

# Document = namedtuple('Document', ['doc_id', 'text']) # must define what a document is


# from collections import Counter
# from tqdm.auto import tqdm

# @profile
# def old_build_index(dataset):
#     lexicon = {}
#     doc_index = []
#     inv_d, inv_f = {}, {}
#     termid = 0

#     num_docs = 0
#     total_dl = 0
#     total_toks = 0
#     for docid, doc in tqdm(enumerate(dataset.docs_iter()), desc='Indexing', total=dataset.docs_count()):
#         tokens = preprocess(doc.text)
#         #print(tokens)
#         token_tf = Counter(tokens)
#         for token, tf in token_tf.items():
#             if token not in lexicon:
#                 lexicon[token] = [termid, 0, 0]
#                 inv_d[termid], inv_f[termid] =  [], []
#                 termid += 1
#             token_id = lexicon[token][0] # prendo il termid
#             inv_d[token_id].append(docid) # aggiungo il docid alla lista dei docid in cui compare il termine
#             inv_f[token_id].append(tf) # aggiungo il tf alla lista dei tf in cui compare il termine
#             lexicon[token][1] += 1 # incremento il df
#             lexicon[token][2] += tf # tf è quanto compare il termine nel documento
#         doclen = len(tokens)
#         doc_index.append((str(doc.doc_id), doclen))
#         total_dl += doclen
#         num_docs += 1

#     # Compress the inv_d and inv_f lists
#     for term, (termid, df, _) in lexicon.items():
#         # Compress the docids
#         encoded_list = [vbyte_encode(x) for x in inv_d[termid]]
#         concatenated_encoded = b''.join(encoded_list)
#         assert inv_d[termid] == decode_concatenated_vbyte(concatenated_encoded), "Compression/Decompression mismatch!"
#         inv_d[termid] = concatenated_encoded
#         # Compress the frequencies
#         encoded_list = [vbyte_encode(x) for x in inv_f[termid]]
#         concatenated_encoded = b''.join(encoded_list)
#         assert inv_f[termid] == decode_concatenated_vbyte(concatenated_encoded), "Compression/Decompression mismatch!"
#         inv_f[termid] = concatenated_encoded
        

    
#     stats = {
#         'num_docs': 1 + docid, # docid starts from 0
#         'num_terms': len(lexicon),
#         'num_tokens': total_dl,
#     }
#     return lexicon, {'docids': inv_d, 'freqs': inv_f}, doc_index, stats


# dataset = MSMarcoDataset(df)

# expected_lexicon, expected_inv, expected_doc_index, expected_stats = old_build_index(dataset)

In [None]:
import pandas as pd
from collections import Counter
from joblib import Parallel, delayed
from tqdm import tqdm

def compress_and_encode(termid, doc_ids, freqs):
    """Variable-byte encode one term's docid list and frequency list.

    Each list is encoded value by value, concatenated, and then decoded again
    to assert that the round trip reproduces the input.

    Returns:
        ``(termid, encoded_doc_ids, encoded_freqs)``; ``termid`` is passed
        through unchanged.
    """
    def pack(values):
        # Encode, then verify the blob decodes back to the same list.
        blob = b''.join(map(vbyte_encode, values))
        assert values == decode_concatenated_vbyte(blob), "Compression/Decompression mismatch!"
        return blob

    return termid, pack(doc_ids), pack(freqs)

def process_batch(batch):
    """Tokenise every document of a DataFrame slice.

    Args:
        batch: DataFrame with ``doc_id`` and ``text`` columns.

    Returns:
        One entry per row, in row order: ``(docid, Counter of tokens, number
        of tokens)``, or ``None`` when tokenising that row raised.
    """
    processed = []
    for row in batch.itertuples(index=False):
        docid = int(row.doc_id)
        try:
            tokens = preprocess(row.text)
            entry = (docid, Counter(tokens), len(tokens))
        except Exception as e:
            # Keep a placeholder so the caller can see that a document was dropped.
            print(f"Error processing document {docid}: {e}")
            entry = None
        processed.append(entry)
    return processed

def build_index(df, batch_size=10000):
    """Build the lexicon, compressed inverted lists, document index and stats.

    Documents are tokenised in parallel in slices of ``batch_size`` rows, the
    postings are accumulated sequentially in row order, and finally every
    posting list is variable-byte encoded in parallel.

    Args:
        df: DataFrame with ``doc_id`` and ``text`` columns.
        batch_size: number of rows handed to each ``process_batch`` call.

    Returns:
        A 4-tuple ``(lexicon, inv, doc_index, stats)`` where
        ``lexicon`` maps token -> ``[termid, document frequency, total term frequency]``,
        ``inv`` is ``{'docids': {termid: bytes}, 'freqs': {termid: bytes}}``,
        ``doc_index`` is a list of ``(docid as str, document length)`` and
        ``stats`` holds ``num_docs``, ``num_terms`` and ``num_tokens``.
    """
    lexicon = {}
    doc_index = []
    inv_d, inv_f = {}, {}
    termid = 0  # next term id to assign; ids are handed out in first-seen order

    num_docs = 0
    total_dl = 0

    batches = [df.iloc[i:i + batch_size] for i in range(0, len(df), batch_size)]
    # Drop this function's own reference to df.
    # NOTE(review): the caller's reference and the slices in `batches` still exist, so this alone may not release memory.
    df = None

    num_cores = -1  # utilize all available cores

    results = Parallel(n_jobs=num_cores)(
        delayed(process_batch)(batch) for batch in tqdm(batches, desc="Processing documents")
    )

    # now we can iterate over the results (one list per batch, one entry per document)
    for result in results:
        for r in result:
            if r is None:
                # NOTE(review): a skipped document gets no doc_index entry, so later doc_index
                # positions no longer line up with docids; PostingListIterator.score indexes
                # the document index by docid — confirm this cannot happen in practice.
                print("Skipping document due to error")
                continue

            docid, token_tf, doclen = r

            for token, tf in token_tf.items():
                if token not in lexicon:
                    # first occurrence of the term: allocate its id and empty posting lists
                    lexicon[token] = [termid, 0, 0]
                    inv_d[termid], inv_f[termid] = [], []
                    termid += 1
                token_id = lexicon[token][0]
                inv_d[token_id].append(docid)
                inv_f[token_id].append(tf)
                lexicon[token][1] += 1   # one more document contains the term
                lexicon[token][2] += tf  # occurrences of the term in this document

            doc_index.append((str(docid), doclen))
            total_dl += doclen
            num_docs += 1

    # parallel compression of inv_d and inv_f lists
    # each work item is (token, termid, docid list, freq list); the token itself is not passed on
    compress_data = [(token, lexicon[token][0], inv_d[lexicon[token][0]], inv_f[lexicon[token][0]]) for token in lexicon.keys()]

    compressed_results = Parallel(n_jobs=num_cores)(
        delayed(compress_and_encode)(data[1], data[2], data[3]) for data in tqdm(compress_data, desc="Compressing data")
    )

    # replace the plain lists with their encoded bytes (this loop reuses the name `termid`)
    for termid, compressed_docs, compressed_freqs in compressed_results:
        inv_d[termid] = compressed_docs
        inv_f[termid] = compressed_freqs

    stats = {
        'num_docs': num_docs,
        'num_terms': len(lexicon),
        'num_tokens': total_dl,
    }
    return lexicon, {'docids': inv_d, 'freqs': inv_f}, doc_index, stats

In [None]:
import bisect
import math

class InvertedIndex:
    """Read-only view over the structures produced by the index builder.

    Attributes:
        lexicon: mapping token -> sequence whose first element is the termid.
        inv: ``{'docids': {termid: bytes}, 'freqs': {termid: bytes}}`` with
            variable-byte encoded posting lists.
        doc: document index, indexed by docid; element ``[1]`` of each entry
            is used as the document length when scoring.
        stats: dictionary of collection statistics (``num_docs`` is read here).
    """

    class PostingListIterator:
        """Cursor over one term's decoded posting list."""

        def __init__(self, docids, freqs, doc):
            self.docids = docids
            self.freqs = freqs
            self.pos = 0
            self.doc = doc

        def docid(self):
            """Return the current docid, or ``math.inf`` once the list is exhausted."""
            if self.is_end_list():
                return math.inf
            return self.docids[self.pos]

        def score(self):
            """Return term frequency divided by document length for the current posting.

            Returns ``math.inf`` once the list is exhausted.
            """
            if self.is_end_list():
                return math.inf
            # The document index is addressed by docid, so docids must match its positions.
            return self.freqs[self.pos]/self.doc[self.docid()][1]

        def next(self, target=None):
            """Advance the cursor.

            With no ``target`` the cursor moves one posting forward (no-op at
            the end of the list). With a ``target`` it moves forward to the
            first posting whose docid is >= ``target``, or to the end of the
            list if there is none; it never moves backwards.

            Assumes ``docids`` is sorted in ascending order.
            """
            # `is None` rather than truthiness: docid 0 is a legitimate target.
            if target is None:
                if not self.is_end_list():
                    self.pos += 1
            elif target > self.docid():
                # Binary search from the current position; lands on the first
                # docid >= target even when target itself is not in the list.
                self.pos = bisect.bisect_left(self.docids, target, self.pos)

        def is_end_list(self):
            """Return True when the cursor has moved past the last posting."""
            return self.pos == len(self.docids)

        def len(self):
            """Return the total number of postings in the list."""
            return len(self.docids)

    def __init__(self, lex, inv, doc, stats):
        self.lexicon = lex
        self.inv = inv
        self.doc = doc
        self.stats = stats
        # Former attribute name, kept as an alias for code that reads it.
        self.stat = stats

    def num_docs(self):
        """Return the number of indexed documents recorded in the stats."""
        return self.stats['num_docs']

    def get_posting(self, termid):
        """Decode the posting list of ``termid`` and return an iterator over it."""
        # Extract the encoded docids and freqs
        docids_encoded = self.inv['docids'][termid]
        freqs_encoded = self.inv['freqs'][termid]
        # Decode the docids and freqs before returning the iterator
        docids = decode_concatenated_vbyte(docids_encoded)
        freqs = decode_concatenated_vbyte(freqs_encoded)

        return InvertedIndex.PostingListIterator(docids, freqs, self.doc)

    def get_termids(self, tokens):
        """Map tokens to termids, silently dropping tokens absent from the lexicon."""
        return [self.lexicon[token][0] for token in tokens if token in self.lexicon]

    def get_postings(self, termids):
        """Return one posting-list iterator per termid, in the given order."""
        return [self.get_posting(termid) for termid in termids]

Now build the index for the chosen collection. It is built only if a pickled version of its components does not already exist:

In [None]:
import pickle
import os

# If the 'pickles' directory does not exist, we first create it
os.makedirs('./pickles', exist_ok=True)

# NOTE: pickle.load can execute arbitrary code; only load pickle files from a source you trust.
# NOTE(review): the pickle paths do not depend on chosen_collection, so files left over
# from a different collection would be loaded as-is — confirm before switching collection.
try: # try to open the pickled files, else build the index
    with open('./pickles/lex.pkl', 'rb') as f:
        lex = pickle.load(f)
    with open('./pickles/inv.pkl', 'rb') as f:
        inv = pickle.load(f)
    with open('./pickles/doc.pkl', 'rb') as f:
        doc = pickle.load(f)
    with open('./pickles/stats.pkl', 'rb') as f:
        stats = pickle.load(f)
    print("Index loaded from pickles")

# `Exception` (not a bare except) so that interrupting a long load does not
# silently fall through into a full index rebuild.
except Exception as exc:
    print(f"Could not load the pickled index ({exc!r}); building it from the collection")
    lex, inv, doc, stats = build_index(df)

    # pickle lex, inv, doc, stats
    with open('./pickles/lex.pkl', 'wb') as f:
        pickle.dump(lex, f)

    with open('./pickles/inv.pkl','wb') as f:
        pickle.dump(inv, f)

    with open('./pickles/doc.pkl', 'wb') as f:
        pickle.dump(doc, f)

    with open('./pickles/stats.pkl', 'wb') as f:
        pickle.dump(stats, f)

    print("Index built and pickled")


# Report how much space each component takes once pickled
print_pickled_size('lex', lex)
print_pickled_size('inv', inv)
print_pickled_size('doc', doc)
print_pickled_size('stats', stats)

In [None]:
# try:
#     assert lex == expected_lexicon, "Lexicon does not match expected"
    
#     # Ordinare le liste prima di confrontarle
#     # assert sorted(inv['docids']) == sorted(expected_inv['docids']), "Inverted document index does not match expected"
#     # assert sorted(inv['freqs']) == sorted(expected_inv['freqs']), "Inverted frequencies do not match expected"
    
#     assert inv['docids'] == expected_inv['docids'], "Inverted document index does not match expected"
#     assert inv['freqs'] == expected_inv['freqs'], "Inverted frequencies do not match expected"
    
    
    
#     # Ordinare l'indice dei documenti se è una lista
#     assert doc == expected_doc_index, "Document index does not match expected"
    
#     # Confronto per le statistiche, se sono dizionari o strutture simili
#     assert stats == expected_stats, "Stats do not match expected"
    
# except AssertionError as e:
#     print(e)

In [None]:
def print_index(lex, inv, doc, stats): 
    """Print a small sample of every index component.

    Shows the first 5 lexicon entries, the first 5 postings (docids and
    frequencies) of the 5 smallest termids, the first 5 document-index
    entries and the stats dictionary. The docid and frequency samples use
    the same termids, so the two printed dictionaries line up.

    Args:
        lex: lexicon dictionary.
        inv: ``{'docids': {termid: bytes}, 'freqs': {termid: bytes}}``.
        doc: document index (sliceable sequence).
        stats: statistics dictionary.
    """
    print("**********************************************************************************\nFirst 5 elements of lexicon:")
    print(list(lex.items())[:5])

    # Sort only the termids (not the encoded posting lists) and reuse them for both samples.
    sample_termids = sorted(inv['docids'])[:5]

    print("\nFirst 5 elements of inv:\n\tFirst 5 elements of inverted index docids:")
    print("\t",{k: decode_concatenated_vbyte(inv['docids'][k])[:5] for k in sample_termids})
    print("\tFirst 5 elements of inverted index freqs:")
    print("\t",{k: decode_concatenated_vbyte(inv['freqs'][k])[:5] for k in sample_termids})

    print("\nFirst 5 elements of document index:")
    print(doc[:5])

    print("\nStats:")
    print(stats,"\n**********************************************************************************")

# Show a sample of the index that was just loaded or built
print_index(lex, inv, doc, stats)
#print_index(expected_lexicon, expected_inv, expected_doc_index, expected_stats)

We have all the elements necessary to build the Inverted Index

In [None]:
# Wrap the four index components in a single InvertedIndex object
inv_index = InvertedIndex(lex, inv, doc, stats)

# Report the pickled size of the whole index object
print_pickled_size('inv_index', inv_index)

---

# 2. Download and prepare queries

We also load the queries for the chosen dataset.

In [None]:
if chosen_collection == "MSMARCO":
    if not os.path.exists('queries.tar.gz'):
        url = 'https://msmarco.z22.web.core.windows.net/msmarcoranking/queries.tar.gz'
        gdown.download(url, 'queries.tar.gz', quiet=False)
        !tar -xzf queries.tar.gz
    queries = pd.read_csv('queries.eval.tsv', sep='\t', header=None)
    print("Number of queries: ",len(queries))
elif chosen_collection == "vaswani":
    # Converte le query in un DataFrame
    queries = pd.DataFrame(vaswani_dataset.queries_iter())
    queries.columns = ['qid', 'query']
    print(queries.head(2))
    print("Number of queries: ",len(list(vaswani_dataset.queries_iter()))) 

In [None]:
# Preview the first 5 queries
print(queries.head(5))

In [None]:
from collections import namedtuple


# A query is an immutable (query_id, text) pair.
Query = namedtuple('Query', ['query_id', 'text'])


class MSMarcoQueries:
    """In-memory list of ``Query`` tuples built from a DataFrame.

    The DataFrame must expose ``query_id`` and ``text`` columns; one ``Query``
    is created per row, in row order.
    """

    def __init__(self, df):
        self.queries = [
            Query(query_id=row.query_id, text=row.text)
            for row in df.itertuples(index=False)
        ]

    def queries_iter(self):
        """Return a fresh iterator over the stored queries."""
        return iter(self.queries)

    def queries_count(self):
        """Return how many queries are stored."""
        return len(self.queries)


# Rename the columns to the attribute names MSMarcoQueries reads from each row
queries.columns = ['query_id', 'text']
queries_dataset = MSMarcoQueries(queries)
print("The number of queries is: ", queries_dataset.queries_count())

Let's prepare the functions necessary to perform TAAT and DAAT query processing

First, we need a TopQueue class, which stores the top K (score, docid) tuples using a heap.

In [None]:
import heapq

class TopQueue:
    """Bounded min-heap keeping the ``k`` best ``(score, docid)`` pairs.

    Only scores strictly greater than ``threshold`` are admitted. Once the
    queue holds ``k`` items, the threshold is raised to the lowest score
    currently in the queue.

    Args:
        k: maximum number of items kept.
        threshold: initial admission threshold.
    """

    def __init__(self, k=10, threshold=0.0):
        self.queue = []
        self.k = k
        self.threshold = threshold

    def size(self):
        """Return the number of items currently stored."""
        return len(self.queue)

    def would_enter(self, score):
        """Return True if ``score`` is strictly above the current threshold."""
        return score > self.threshold

    def clear(self, new_threshold=None):
        """Empty the queue, optionally resetting the threshold.

        When ``new_threshold`` is omitted the current threshold is kept.
        """
        self.queue = []
        # `is not None` so that an explicit 0.0 really resets the threshold.
        if new_threshold is not None:
            self.threshold = new_threshold

    def __repr__(self):
        return f'<{self.size()} items, th={self.threshold} {self.queue}>'

    def insert(self, docid, score):
        """Try to add ``(score, docid)``; return True if it was admitted."""
        if not self.would_enter(score):
            return False
        if self.size() >= self.k:
            # Queue is full: evict the lowest-scoring item in the same operation.
            heapq.heapreplace(self.queue, (score, docid))
        else:
            heapq.heappush(self.queue, (score, docid))
        if self.size() >= self.k:
            # queue[0] is the smallest score kept; anything not above it cannot enter.
            self.threshold = max(self.threshold, self.queue[0][0])
        return True

### TAAT

In [None]:
from collections import defaultdict

def taat(postings, k=10):
    """Term-at-a-time scoring.

    Each posting list is consumed completely, one after the other, adding its
    scores into a per-document accumulator; the ``k`` best accumulated scores
    are then selected.

    Args:
        postings: posting-list iterators (consumed by this call).
        k: number of results to keep.

    Returns:
        ``(score, docid)`` tuples sorted in descending order.
    """
    accumulators = defaultdict(float)
    for plist in postings:
        while True:
            docid = plist.docid()
            if docid == math.inf:
                # this list is exhausted
                break
            accumulators[docid] += plist.score()
            plist.next()

    best = TopQueue(k)
    for docid, total in accumulators.items():
        best.insert(docid, total)

    ranked = list(best.queue)
    ranked.sort(reverse=True)
    return ranked

def query_process(query, index):
    """Run one raw query string against ``index`` with TAAT.

    The query is preprocessed, duplicate terms are removed, and the posting
    lists of the terms known to the index are scored with ``taat``.
    """
    unique_terms = set(preprocess(query))
    term_postings = index.get_postings(index.get_termids(unique_terms))
    return taat(term_postings)

### DAAT

In [None]:
import math

def min_docid(postings):
    """Return the smallest current docid among the non-exhausted posting lists.

    Returns ``math.inf`` when ``postings`` is empty or every list is exhausted.
    """
    live_docids = (p.docid() for p in postings if not p.is_end_list())
    return min(live_docids, default=math.inf)

def daat(postings, k=10):
    """Document-at-a-time scoring.

    All posting lists are traversed in parallel. At each step the smallest
    docid still pending across the lists is scored by summing the scores of
    every list positioned on it, and those lists are advanced.

    Args:
        postings: posting-list iterators (consumed by this call).
        k: number of results to keep.

    Returns:
        ``(score, docid)`` tuples sorted in descending order.
    """
    top = TopQueue(k)
    current_docid = min_docid(postings)
    while current_docid != math.inf:
        score = 0
        next_docid = math.inf
        for posting in postings:
            if posting.docid() == current_docid:
                score += posting.score()
                posting.next()
            if not posting.is_end_list():
                # The next document to score is the SMALLEST pending docid over
                # all lists; taking the last list's docid would skip documents.
                next_docid = min(next_docid, posting.docid())
        top.insert(current_docid, score)
        current_docid = next_docid
    return sorted(top.queue, reverse=True)

def query_process(query, index):
    """Run one raw query string against ``index`` with DAAT.

    The query is preprocessed, duplicate terms are removed, and the posting
    lists of the terms known to the index are scored with ``daat``.
    """
    unique_terms = set(preprocess(query))
    term_postings = index.get_postings(index.get_termids(unique_terms))
    return daat(term_postings)

In [None]:
@profile
def query_processing(queries_iter, fn):
    """Run every query through ``fn`` and discard the results.

    Each query text is preprocessed, its terms are resolved against the
    global ``inv_index``, and the resulting posting lists are passed to
    ``fn``. Nothing is returned; the ``profile`` decorator prints the total
    elapsed time.

    Args:
        queries_iter: iterable of objects with a ``text`` attribute.
        fn: callable taking a list of posting-list iterators.
    """
    for entry in queries_iter:
        terms = preprocess(entry.text)
        term_postings = inv_index.get_postings(inv_index.get_termids(terms))
        fn(term_postings)

In [None]:
# Time TAAT over all queries
query_processing(queries_dataset.queries_iter(), taat)

In [None]:
# Time DAAT over all queries
query_processing(queries_dataset.queries_iter(), daat)