<a href="https://colab.research.google.com/github/giuliocapecchi/IR_project/blob/main/IR_project.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [43]:
#%pip install torch matplotlib nltk tqdm gdown ir_datasets humanize

# 1. Download and prepare the collection

In [44]:
# chosen_collection can be one of ["vaswani", "msmarco"]
# (the download cells raise a ValueError for any other value)

chosen_collection = "vaswani"

In [45]:
import gdown
import ir_datasets
import pandas as pd
import os

# Fail fast on a collection name this notebook does not handle.
if chosen_collection not in ["vaswani", "msmarco"]:
    raise ValueError("chosen_collection must be one of ['vaswani', 'msmarco']")

if chosen_collection == "msmarco":

    os.makedirs('./collection/msmarco', exist_ok=True)

    url_collection = 'https://drive.google.com/uc?id=1_wXJjiwdgc9Kpt7o7atP8oWe-U4Z56hn'
    
    # Download the collection only when it is not already on disk.
    if not os.path.exists('./collection/msmarco/MSMARCO.tsv'):
        gdown.download(url_collection, './collection/msmarco/MSMARCO.tsv', quiet=False)
    
    # NOTE: the triple-quoted text below is a bare string literal used as a block
    # comment: it is evaluated and discarded, so none of these downloads ever run
    # (url_stats, url_lex, url_inv and url_doc are not defined in this cell).
    """os.makedirs('./pickles', exist_ok=True)
    if not os.path.exists('./pickles/stats.pkl'):
        gdown.download(url_stats, './pickles/stats.pkl', quiet=False)
    if not os.path.exists('./pickles/lex.pkl'):
        gdown.download(url_lex, './pickles/lex.pkl', quiet=False)
    if not os.path.exists('./pickles/inv.pkl'):
        gdown.download(url_inv, './pickles/inv.pkl', quiet=False)
    if not os.path.exists('./pickles/doc.pkl'):
        gdown.download(url_doc, './pickles/doc.pkl', quiet=False)"""

elif chosen_collection == "vaswani":
    os.makedirs('./collection/vaswani', exist_ok=True)

    # NOTE(review): the dataset is loaded and the DataFrame is built on every run,
    # even when the TSV already exists; `vaswani_dataset` is reused by the queries cell.
    vaswani_dataset = ir_datasets.load(chosen_collection)
    docs = list(vaswani_dataset.docs_iter())
    df = pd.DataFrame(docs)
    # shift every doc id down by one (presumably to make the ids 0-based - confirm)
    df['doc_id'] = (df['doc_id'].astype(int) - 1).astype(str)
    # remove the \n characters from every document
    df['text'] = df['text'].str.replace('\n', ' ')
    # write the TSV only once: tab-separated, no header row, no index column
    if not os.path.exists('./collection/vaswani/vaswani.tsv'):
        df.to_csv('./collection/vaswani/vaswani.tsv', sep='\t', header=False, index=False)

Standard text preprocessing, using the *PyStemmer* library for stemming.

In [46]:
import re
import string
import nltk
import Stemmer # PyStemmer


# Make sure the NLTK stop-word corpus is available, then load the English list
# into a set so that preprocess() can test membership cheaply.
nltk.download("stopwords", quiet=True)
STOPWORDS = set(nltk.corpus.stopwords.words("english"))
# Single shared PyStemmer instance for English, used by preprocess().
STEMMER = Stemmer.Stemmer('english')
# stemmer = nltk.stem.PorterStemmer().stem # much slower than PyStemmer


# Built once: preprocess() runs on every document and every query, so rebuilding
# the translation table (and re-resolving the patterns) per call is wasted work.
_QUOTE_DASH_RE = re.compile(r"[‘’´“”–-]")
_DOT_RE = re.compile(r"\.(?!(\S[^. ])|\d)")
_WHITESPACE_RE = re.compile(r"\s+")
_PUNCTUATION_TABLE = str.maketrans(string.punctuation, " " * len(string.punctuation))


def preprocess(s):
    """Turn raw text into a list of stemmed, stop-word-free tokens.

    Steps, in order: lowercase; replace ``&`` with `` and ``; map typographic
    quotes and dashes/hyphens to an apostrophe; drop acronym and sentence dots;
    replace every remaining punctuation character with a space; collapse
    whitespace; remove tokens found in ``STOPWORDS``; stem with ``STEMMER``.

    Args:
        s: the text to process (a ``str``).

    Returns:
        The list returned by ``STEMMER.stemWords`` for the surviving tokens
        (empty for empty or punctuation-only input).
    """
    # lowercasing
    s = s.lower()
    # ampersand and special chars: "&" becomes " and ", quotes/dashes become "'"
    s = _QUOTE_DASH_RE.sub("'", s.replace("&", " and "))
    # dots: a dot is KEPT only when it is followed by a digit or by two characters
    # that look like the start of a word (e.g. "3.14", "example.com"); every other
    # dot is deleted, which glues acronyms together ("u.s.a." -> "usa")
    s = _DOT_RE.sub("", s)
    # remove punctuation (each punctuation character becomes a space)
    s = s.translate(_PUNCTUATION_TABLE)
    # strip whitespaces
    s = _WHITESPACE_RE.sub(" ", s).strip()
    # tokenisation + stop-word removal
    tokens = [t for t in s.split() if t not in STOPWORDS]
    # stemming
    return STEMMER.stemWords(tokens)

In [47]:
import time

def profile(f):
    """Decorator that prints how long each call to ``f`` took, in milliseconds.

    The wrapper forwards all positional and keyword arguments, returns whatever
    ``f`` returns, and prints ``"<name> (<elapsed> ms)"`` after every call.
    ``functools.wraps`` keeps the wrapped function's name and docstring visible
    on the returned wrapper.

    Args:
        f: the callable to time.

    Returns:
        A wrapper with the same call signature as ``f``.
    """
    import functools  # local import so the cell does not depend on another cell's imports

    @functools.wraps(f)
    def f_timer(*args, **kwargs):
        # perf_counter is monotonic and high-resolution; time.time can jump
        # when the system clock is adjusted.
        start = time.perf_counter()
        result = f(*args, **kwargs)
        ms = (time.perf_counter() - start) * 1000
        print(f"{f.__name__} ({ms:.3f} ms)")
        return result
    return f_timer

In [48]:
import pickle
import humanize
import os
from tqdm import tqdm

def print_pickled_size(var_name, var):
    """Print the size ``var`` would occupy once pickled, in human-readable form.

    The object is pickled into a write-only sink that only counts bytes, so
    nothing is written to disk and the pickled bytes are never held in memory.
    The count is the same number of bytes ``pickle.dump`` would write to a file.

    Args:
        var_name: label used in the printed message.
        var: any picklable object.
    """
    class _ByteCounter:
        """File-like sink: discards what is written, remembers how much."""

        def __init__(self):
            self.size = 0

        def write(self, chunk):
            # nbytes is correct for any bytes-like object the pickler hands over
            self.size += memoryview(chunk).nbytes

    counter = _ByteCounter()
    pickle.dump(var, counter)
    print(f'{var_name} requires {humanize.naturalsize(counter.size)}')


def vbyte_encode(number):
    """Encode a non-negative integer as a variable-byte sequence.

    The number is split into 7-bit groups, least-significant group first.
    Every byte carries one group in its low 7 bits; the high bit (0x80) is set
    only on the LAST byte, where it marks the end of the number.

    Args:
        number: the integer to encode; must be >= 0.

    Returns:
        The encoded ``bytes`` (at least one byte).

    Raises:
        ValueError: if ``number`` is negative. A right shift of a negative int
            never reaches 0, so the loop below would otherwise never terminate.
    """
    if number < 0:
        raise ValueError(f"vbyte_encode only supports non-negative integers, got {number}")

    bytes_list = bytearray()
    while True:
        byte = number & 0x7F  # take the 7 least-significant bits -> 0111 1111 = 0x7F
        number >>= 7  # shift right by 7 bits
        if number:
            bytes_list.append(byte)  # more groups follow: high bit left clear
        else:
            bytes_list.append(0x80 | byte)  # last group: set the terminator bit, 0x80 = 1000 0000
            break
    return bytes(bytes_list)

def vbyte_decode(bytes_seq):
    """Decode the first variable-byte number found in ``bytes_seq``.

    Bytes are read in order, each contributing its low 7 bits at an increasing
    shift; reading stops at the first byte whose high bit (0x80) is set. Any
    bytes after that one are ignored. If no byte has the high bit set, the
    value accumulated from all bytes is returned (0 for an empty sequence).
    """
    value = 0
    shift = 0
    for current in bytes_seq:
        value |= (current & 0x7F) << shift
        if current & 0x80:
            # terminator bit: the number is complete
            break
        shift += 7
    return value

def decode_concatenated_vbyte(encoded_bytes):
    """Decode a run of back-to-back variable-byte numbers into a list of ints.

    Each byte adds its low 7 bits to the number being assembled; a byte with
    the high bit (0x80) set closes the current number and starts a new one.
    Trailing bytes that never reach a terminator byte are dropped.
    """
    values = []
    pending, shift = 0, 0

    for current in encoded_bytes:
        pending |= (current & 0x7F) << shift
        if current & 0x80:
            # terminator bit found: the number is complete, start the next one
            values.append(pending)
            pending, shift = 0, 0
        else:
            # more 7-bit groups follow for this number
            shift += 7

    return values

#------------------------------------------------------------------------------------------------------------------------------------------------------------------#

def compress_index(lexicon, inv_d, inv_f):
    """Variable-byte encode every posting list of the index.

    For each term in ``lexicon`` the docid list (``inv_d[termid]``) and the
    frequency list (``inv_f[termid]``) are encoded value by value with
    ``vbyte_encode`` and concatenated into one ``bytearray`` each. Every encoded
    list is decoded again and asserted equal to its source as a sanity check.

    Returns:
        ``(compressed_inv_d, compressed_inv_f)``: two dicts mapping termid to
        the encoded ``bytearray``.
    """
    def _encode_list(values):
        # Concatenate the encoding of each value, then verify the round trip.
        blob = bytearray()
        for value in values:
            blob.extend(vbyte_encode(value))
        assert decode_concatenated_vbyte(blob) == values
        return blob

    compressed_inv_d = {}
    compressed_inv_f = {}

    # Only the termid (first lexicon field) is needed to find the posting lists.
    for termid, _, _ in tqdm(lexicon.values(), desc="Compressing lists", unit="term"):
        compressed_inv_d[termid] = _encode_list(inv_d[termid])
        compressed_inv_f[termid] = _encode_list(inv_f[termid])

    return compressed_inv_d, compressed_inv_f

## Functions to build the inverted index

In [49]:
import pandas as pd
from collections import Counter
from tqdm.auto import tqdm

def build_index(filepath, batch_size=10000):
    """Build an in-memory inverted index from a ``doc_id<TAB>text`` file.

    The file is read line by line; each document is tokenised with
    ``preprocess`` and its postings are appended in file order.

    Args:
        filepath: path to a UTF-8 TSV file with one document per line
            (``doc_id``, a tab, then the text). ``doc_id`` must parse as ``int``.
        batch_size: how many lines are read between two progress-bar refreshes.
            It no longer affects how documents are processed.

    Returns:
        A 4-tuple ``(lexicon, inv, doc_index, stats)``:
          * ``lexicon``: token -> ``[termid, document frequency, total term frequency]``
          * ``inv``: ``{'docids': {termid: [doc_id, ...]}, 'freqs': {termid: [tf, ...]}}``
          * ``doc_index``: list of ``(str(doc_id), document length in tokens)``
          * ``stats``: ``{'num_docs', 'num_terms', 'num_tokens'}``

    Raises:
        ValueError: if a non-blank line does not start with an integer doc id.
    """
    # Count the lines for the progress bar; the handle is closed right away.
    with open(filepath, 'r', encoding='utf-8') as file:
        total_documents = sum(1 for _ in file)

    lexicon = {}
    inv_d = {}
    inv_f = {}
    doc_index = []
    total_dl = 0
    num_docs = 0
    termid = 0
    pending = 0  # lines read since the last progress-bar refresh

    with open(filepath, 'r', encoding='utf-8') as file:
        with tqdm(total=total_documents, desc="Processing documents", unit="doc") as pbar:
            for line in file:
                # Strip only the line terminator: a plain strip() would also eat
                # the tab of a document whose text is empty and break the split.
                line = line.rstrip('\r\n')

                if line.strip():  # blank lines carry no document
                    doc_id, _, text = line.partition('\t')
                    doc_id = int(doc_id)
                    tokens = preprocess(text)
                    token_tf = Counter(tokens)

                    for token, tf in token_tf.items():
                        if token not in lexicon:
                            lexicon[token] = [termid, 0, 0]  # termid, df, tf
                            inv_d[termid], inv_f[termid] = [], []  # docids, freqs
                            termid += 1
                        entry = lexicon[token]
                        inv_d[entry[0]].append(doc_id)  # add doc_id to the term's posting list
                        inv_f[entry[0]].append(tf)  # term frequency in this document
                        entry[1] += 1  # increment document frequency (df)
                        entry[2] += tf  # increment total term frequency (tf)

                    doclen = len(tokens)
                    doc_index.append((str(doc_id), doclen))
                    total_dl += doclen
                    num_docs += 1

                pending += 1
                if pending >= batch_size:
                    pbar.update(pending)
                    pending = 0

            # account for the lines read after the last full batch
            if pending:
                pbar.update(pending)

    stats = {
        'num_docs': num_docs,
        'num_terms': len(lexicon),
        'num_tokens': total_dl,
    }
    return lexicon, {'docids': inv_d, 'freqs': inv_f}, doc_index, stats

In [50]:
import math
import bisect


class InvertedIndex:
    """In-memory inverted index: lexicon, posting lists, document index and stats."""

    class PostingListIterator:
        """Cursor over one term's posting list (parallel docid / frequency lists)."""

        def __init__(self, docids, freqs, doc):
            self.docids = docids
            self.freqs = freqs
            self.pos = 0
            self.doc = doc

        def docid(self):
            """Return the docid under the cursor, or ``math.inf`` past the end."""
            if self.is_end_list():
                return math.inf
            return self.docids[self.pos]

        def score(self):
            """Return term frequency divided by document length for the current posting.

            Returns ``math.inf`` past the end of the list.
            """
            if self.is_end_list():
                return math.inf
            # self.doc is indexed BY docid, so this assumes the document index
            # holds entry i for docid i - TODO confirm for every collection.
            return self.freqs[self.pos]/self.doc[self.docid()][1]

        def next(self, target=None):
            """Advance the cursor.

            With no ``target`` the cursor moves one posting forward (no-op at the
            end). With a ``target`` docid it jumps to the first posting whose
            docid is >= ``target``, never moving backwards.
            """
            # `is None`, not truthiness: docid 0 is a legitimate target.
            if target is None:
                if not self.is_end_list():
                    self.pos += 1
            elif target > self.docid():
                # bisect needs docids in ascending order - presumably guaranteed
                # by the order in which the index was built; verify.
                self.pos = bisect.bisect_left(self.docids, target, self.pos)

        def is_end_list(self):
            """Return True once the cursor has moved past the last posting."""
            return self.pos == len(self.docids)

        def len(self):
            """Return the number of postings in the list."""
            return len(self.docids)

    def __init__(self, lex, inv, doc, stats):
        self.lexicon = lex
        self.inv = inv
        self.doc = doc
        self.stats = stats

    def num_docs(self):
        """Return the number of indexed documents recorded in the stats."""
        return self.stats['num_docs']

    def get_posting(self, termid):
        """Return a fresh iterator over the posting list of ``termid``."""
        return InvertedIndex.PostingListIterator(self.inv['docids'][termid], self.inv['freqs'][termid], self.doc)

    def get_termids(self, tokens):
        """Map tokens to termids, silently skipping tokens not in the lexicon."""
        return [self.lexicon[token][0] for token in tokens if token in self.lexicon]

    def get_postings(self, termids):
        """Return one fresh posting-list iterator per termid, in the given order."""
        return [self.get_posting(termid) for termid in termids]

In [51]:
# import cProfile
# import pstats

# cProfile.run("build_index('./vaswani.tsv')", "output.prof")
# p = pstats.Stats("output.prof")
# p.sort_stats("cumtime").print_stats(10)
# os.remove("output.prof")

## Building the index on the chosen collection 

Now build the index for the chosen collection. The index is built only if a pickled version of it does not already exist:

In [52]:
import pickle
import os

# If the 'pickles' directory does not exist, we first create it
os.makedirs('./pickles', exist_ok=True)

# Path of the collection TSV. The MSMARCO file is downloaded as 'MSMARCO.tsv'
# (upper case) inside the lower-case 'msmarco' folder, so it cannot be derived
# from chosen_collection alone on a case-sensitive filesystem.
collection_path = {
    "msmarco": './collection/msmarco/MSMARCO.tsv',
    "vaswani": './collection/vaswani/vaswani.tsv',
}.get(chosen_collection, './collection/' + chosen_collection + '/' + chosen_collection + '.tsv')

# chosen_collection is validated against the lower-case names, so the cache
# branch must test "msmarco" (the previous "MSMARCO" test could never be true).
if chosen_collection == "msmarco":
    try: # try to open the pickled files, else build the index
        # NOTE: pickle.load can execute arbitrary code; only load a cache file
        # that this notebook wrote itself.
        with open('./pickles/inv_index.pkl', 'rb') as f:
            inv_index = pickle.load(f)

    except FileNotFoundError:
        lex, inv, doc, stats = build_index(collection_path)

        # Save the lexicon, inverted lists, and document index to disk
        with open('./pickles/lex.pkl', 'wb') as f:
            pickle.dump(lex, f)
        with open('./pickles/inv.pkl', 'wb') as f:
            pickle.dump(inv, f)
        with open('./pickles/doc.pkl', 'wb') as f:
            pickle.dump(doc, f)
        with open('./pickles/stats.pkl', 'wb') as f:
            pickle.dump(stats, f)

        # Compress the inverted lists
        #inv['docids'], inv['freqs'] = compress_index(lex, inv['docids'], inv['freqs'])

        inv_index = InvertedIndex(lex, inv, doc, stats)
        with open('./pickles/inv_index.pkl', 'wb') as f:
            pickle.dump(inv_index, f)
else:
    # Small collections are rebuilt on every run instead of being cached.
    lex, inv, doc, stats = build_index(collection_path)
    inv_index = InvertedIndex(lex, inv, doc, stats)


print(f"Numero di documenti: {inv_index.num_docs()}")

Processing documents:   0%|          | 0/11429 [00:00<?, ?doc/s]

Numero di documenti: 11429


In [53]:
#print_pickled_size('inv_index', inv_index)

# 2. Download and prepare queries

In [54]:
import gzip

# Fail fast on a collection name this notebook does not handle.
if chosen_collection not in ["vaswani", "msmarco"]:
    raise ValueError("chosen_collection must be one of ['vaswani', 'msmarco']")

if chosen_collection == "msmarco":
    # Queries: download the gzipped TSV once, decompress it, and drop the archive.
    if not os.path.exists('./collection/msmarco/msmarco-test2019-queries.tsv'):
        url = 'https://msmarco.z22.web.core.windows.net/msmarcoranking/msmarco-test2019-queries.tsv.gz'
        gdown.download(url, './collection/msmarco/msmarco-test2019-queries.tsv.gz', quiet=False)
        with gzip.open('./collection/msmarco/msmarco-test2019-queries.tsv.gz', 'rt') as f_in:
            with open('./collection/msmarco/msmarco-test2019-queries.tsv', 'w') as f_out:
                f_out.write(f_in.read())
        os.remove('./collection/msmarco/msmarco-test2019-queries.tsv.gz') # delete the compressed file
    # The file has no header row; name the two columns explicitly.
    queries = pd.read_csv('./collection/msmarco/msmarco-test2019-queries.tsv', sep='\t', header=None)
    queries.columns = ['qid', 'query']
    print("Number of queries: ",len(queries))

    # Relevance judgments: space-separated file, downloaded once.
    if not os.path.exists('./collection/msmarco/msmarco-test2019-qrels.txt'):
        url = 'https://trec.nist.gov/data/deep/2019qrels-pass.txt'
        gdown.download(url, './collection/msmarco/msmarco-test2019-qrels.txt', quiet=False)
    qrels = pd.read_csv('./collection/msmarco/msmarco-test2019-qrels.txt', sep=' ', header=None)
    qrels.columns = ['qid', 'Q0', 'docid', 'rating']
    print("Number of relevance judgments: ",len(qrels))


elif chosen_collection == "vaswani":
    # Relies on `vaswani_dataset` created by the collection-download cell.
    queries = pd.DataFrame(vaswani_dataset.queries_iter())
    queries.columns = ['qid', 'query']
    print("Number of queries: ",len(list(vaswani_dataset.queries_iter()))) 
    # Persist the queries once (tab-separated, with a header row).
    if not os.path.exists('./collection/vaswani/vaswani-queries.tsv'):
        queries.to_csv('./collection/vaswani/vaswani-queries.tsv', sep='\t', header=True, index=False)
    # NOTE(review): the collection cell shifts vaswani doc ids down by one before
    # writing the TSV, while the qrels docids here are kept as provided - confirm
    # that any evaluation code accounts for that offset.
    qrels = pd.DataFrame(vaswani_dataset.qrels_iter()) 
    qrels.columns = ['qid', 'docid', 'relevance', 'iteration']
    if not os.path.exists('./collection/vaswani/vaswani-qrels.txt'):
        qrels.to_csv('./collection/vaswani/vaswani-qrels.txt', sep='\t', header=True, index=False)
    print("Number of relevance judgments: ",len(list(vaswani_dataset.qrels_iter())))

Number of queries:  93
Number of relevance judgments:  2083


In [55]:
from collections import namedtuple


# Lightweight immutable record for a single query.
Query = namedtuple('Query', ['query_id', 'text'])


class QueriesDataset:
    """Collection of ``Query`` records built from a DataFrame.

    Args:
        df: a frame whose rows expose ``query_id`` and ``text`` attributes
            through ``itertuples()``.
    """

    def __init__(self, df):
        self.queries = [Query(row.query_id, row.text) for row in df.itertuples()]
        # id -> Query lookup. Ids are normalised to str so that 1 and '1' find
        # the same query; on duplicate ids the first occurrence wins.
        self._by_id = {}
        for query in self.queries:
            self._by_id.setdefault(str(query.query_id), query)

    def queries_iter(self):
        """Return a fresh iterator over all queries, in frame order."""
        return iter(self.queries)

    def queries_count(self):
        """Return the number of queries."""
        return len(self.queries)

    def get_query(self, query_id):
        """Return the query whose id is ``query_id``.

        The lookup is by query id, not by position in the list.

        Raises:
            KeyError: if no query has that id.
        """
        return self._by_id[str(query_id)]
# Rename the columns to the attribute names QueriesDataset reads from each row.
queries.columns = ['query_id', 'text']
queries_dataset = QueriesDataset(queries)
print("The number of queries is: ", queries_dataset.queries_count())

The number of queries is:  93


Let's prepare the functions necessary to perform TAAT and DAAT query processing.

First, we need a `TopQueue` class, which stores the top K (score, docid) tuples using a heap.

In [56]:
import heapq

class TopQueue:
    """Bounded min-heap that keeps the ``k`` best ``(score, docid)`` tuples.

    ``threshold`` is the score a candidate must strictly exceed to be accepted.
    Once the queue holds ``k`` items it is raised to the smallest score kept.
    """

    def __init__(self, k=10, threshold=0.0):
        self.queue = []
        self.k = k
        self.threshold = threshold

    def size(self):
        """Return the number of items currently stored."""
        return len(self.queue)

    def would_enter(self, score):
        """Return True if ``score`` is strictly above the current threshold."""
        return score > self.threshold

    def clear(self, new_threshold=None):
        """Empty the queue; optionally reset the threshold.

        The threshold is left unchanged when ``new_threshold`` is None.
        """
        self.queue = []
        # `is not None`, not truthiness: 0.0 is a valid threshold to reset to.
        if new_threshold is not None:
            self.threshold = new_threshold

    def __repr__(self):
        return f'<{self.size()} items, th={self.threshold} {self.queue}>'

    def insert(self, docid, score):
        """Offer ``(score, docid)`` to the queue.

        Returns:
            True if the item was stored, False if its score did not exceed the
            threshold.
        """
        if score > self.threshold:
            if self.size() >= self.k:
                # full: drop the current minimum and push the new item
                heapq.heapreplace(self.queue, (score, docid))
            else:
                heapq.heappush(self.queue, (score, docid))
            if self.size() >= self.k:
                # queue[0] is the smallest score kept; it becomes the new bar
                self.threshold = max(self.threshold, self.queue[0][0])
            return True
        return False

### TAAT

In [57]:
from collections import defaultdict
from functools import lru_cache


def taat(postings, k=10):
    """Term-at-a-time scoring.

    Each posting list is consumed completely, one after the other, and every
    posting's score is added to an accumulator keyed by docid. The accumulated
    totals are then offered to a ``TopQueue`` of size ``k``.

    Returns:
        The queue's ``(score, docid)`` tuples sorted in descending order.
    """
    accumulators = defaultdict(float)
    for plist in postings:
        while True:
            docid = plist.docid()
            if docid == math.inf:
                # docid() reports infinity once the list is exhausted
                break
            accumulators[docid] += plist.score()
            plist.next()

    best = TopQueue(k)
    for docid, total in accumulators.items():
        best.insert(docid, total)

    ranked = list(best.queue)
    ranked.sort(reverse=True)
    return ranked


@lru_cache(maxsize=128)
def query_process(query, index):
    """Answer ``query`` against ``index`` with term-at-a-time scoring.

    The query text is preprocessed, duplicate tokens are dropped, and the
    posting lists of the known terms are handed to ``taat``. Results are
    memoised per ``(query, index)`` pair by ``lru_cache``.

    NOTE: the DAAT section defines another ``query_process``; when the cells
    run in order, that later definition replaces this one.
    """
    unique_tokens = set(preprocess(query))
    term_ids = index.get_termids(unique_tokens)
    return taat(index.get_postings(term_ids))

### DAAT

In [58]:
import math

def min_docid(postings):
    """Return the smallest current docid among the posting lists not yet exhausted.

    Returns ``math.inf`` when ``postings`` is empty or every list is exhausted.
    """
    live_docids = (plist.docid() for plist in postings if not plist.is_end_list())
    return min(live_docids, default=math.inf)

def daat(postings, k=10):
    """Document-at-a-time scoring.

    All posting lists are walked in parallel. At each step the smallest pending
    docid is scored by summing the scores of every list positioned on it, those
    lists are advanced, and the total is offered to a ``TopQueue`` of size ``k``.

    Args:
        postings: posting-list iterators exposing ``docid()``, ``score()``,
            ``next()`` and ``is_end_list()``. They are consumed by this call.
        k: number of results to keep.

    Returns:
        The queue's ``(score, docid)`` tuples sorted in descending order.
    """
    top = TopQueue(k)
    current_docid = min_docid(postings)
    while current_docid != math.inf:
        score = 0
        next_docid = math.inf
        for posting in postings:
            if posting.docid() == current_docid:
                score += posting.score()
                posting.next()
            if not posting.is_end_list():
                # Keep the SMALLEST pending docid across all lists. Taking the
                # last list's docid instead would visit documents out of order
                # and could score one document twice with partial sums.
                next_docid = min(next_docid, posting.docid())
        top.insert(current_docid, score)
        current_docid = next_docid
    return sorted(top.queue, reverse=True)

def query_process(query, index):
    """Answer ``query`` against ``index`` with document-at-a-time scoring.

    The query text is preprocessed, duplicate tokens are dropped, and the
    posting lists of the known terms are handed to ``daat``.

    NOTE: this redefines the ``query_process`` declared in the TAAT section.
    """
    unique_tokens = set(preprocess(query))
    term_ids = index.get_termids(unique_tokens)
    return daat(index.get_postings(term_ids))

In [59]:
from tqdm import tqdm


def query_processing(queries_iter, fn):
    """Run every query through the scoring function ``fn`` (e.g. ``taat`` or ``daat``).

    Each query's text is preprocessed and resolved against the global
    ``inv_index``; the resulting posting lists are passed to ``fn``. The
    rankings are discarded and nothing is returned: the tqdm bar is the only
    output. The bar's total comes from the global ``queries_dataset``.
    """
    progress = tqdm(
        queries_iter,
        desc="Processing queries",
        total=queries_dataset.queries_count(),
        unit="query",
    )
    for q in progress:
        term_ids = inv_index.get_termids(preprocess(q.text))
        fn(inv_index.get_postings(term_ids))

In [None]:
# import cProfile
# import pstats

# cProfile.run("query_processing(queries_dataset.queries_iter(), taat)", "result.prof")
# p = pstats.Stats("./perfm/result.prof")
# p.sort_stats("cumtime").print_stats(25)

In [61]:
# Run TAAT over every query; the rankings are discarded, only the progress bar is shown.
query_processing(queries_dataset.queries_iter(), taat)

Processing queries: 100%|██████████| 93/93 [00:00<00:00, 174.37query/s]


In [62]:
# Run DAAT over every query; the rankings are discarded, only the progress bar is shown.
query_processing(queries_dataset.queries_iter(), daat)

Processing queries: 100%|██████████| 93/93 [00:01<00:00, 46.84query/s]
