<a href="https://colab.research.google.com/github/giuliocapecchi/IR_project/blob/main/IR_project.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!pip install torch matplotlib nltk tqdm gdown

# 1. Download the collection

Con il modulo `gdown` si possono scaricare file da Google Drive: ho quindi caricato la collection sul mio Drive e la scarico da lì (ci vogliono circa 30 secondi – 1 minuto).

In [None]:
import gdown
from pathlib import Path

# Google Drive copy of the passage collection (a download of roughly 3 GB).
url = 'https://drive.google.com/uc?id=1_wXJjiwdgc9Kpt7o7atP8oWe-U4Z56hn'
collection_path = Path('collection.tsv')

# Skip the download when the file is already on disk, so that re-running the
# notebook does not fetch the whole collection again.
# NOTE: an interrupted download leaves a partial file behind; delete
# 'collection.tsv' manually to force a fresh copy.
if collection_path.exists():
    print(f"{collection_path} already present, skipping download")
else:
    gdown.download(url, str(collection_path), quiet=False)

Downloading...
From (original): https://drive.google.com/uc?id=1_wXJjiwdgc9Kpt7o7atP8oWe-U4Z56hn
From (redirected): https://drive.google.com/uc?id=1_wXJjiwdgc9Kpt7o7atP8oWe-U4Z56hn&confirm=t&uuid=571c2b83-1cbe-45c6-9c06-74e2cd1d30c6
To: /content/collection.tsv
100%|██████████| 3.06G/3.06G [00:29<00:00, 104MB/s]


'collection.tsv'

In [None]:
"""
read 'collection.tsv' file and prepare it for data manipulation
the file is organized in the following way:
<pid>\t<text>\n
where <pid> is the passage id and <text> is the passage text
"""
import pandas as pd

# The file has no header row: every line is "<pid>\t<text>", so both columns
# come back unnamed (0 and 1) and are labelled in a later cell.
df = pd.read_csv(
    'collection.tsv',
    header=None,
    sep='\t',
)

In [None]:
# Maximum number of characters pandas prints per column, so that long passages
# are not truncated too aggressively in the output below.
pd.set_option('display.max_colwidth', 50) # author's note: set this to 150
# Name the two unnamed columns read from the TSV file.
df.columns = ['doc_id', 'text']
print(df.head(2)) # head(N) returns the first N rows

   doc_id                                               text
0       0  The presence of communication amid scientific ...
1       1  The Manhattan Project and its atomic bomb help...


In [None]:
import re
import string
import nltk

nltk.download("stopwords", quiet=True)

# Typographic quotes and dashes mapped onto their plain ASCII counterparts.
_CHAR_NORMALISATION = str.maketrans("‘’´“”–-", "'''\"\"--")
# Every ASCII punctuation character is replaced by a single space.
_PUNCT_TO_SPACE = str.maketrans(string.punctuation, " " * len(string.punctuation))
# A dot that is followed neither by a digit nor by
# "<non-space char><char other than dot/space>".
_DROPPABLE_DOT = re.compile(r"\.(?!(\S[^. ])|\d)")
# Filled on the first preprocess() call, so that the stopword list is loaded
# and the stemmer is built once instead of once per document.
_NLTK_RESOURCES = {}


def _nltk_resources():
    """Return the cached ``(stopword set, stem function)`` pair, building it once."""
    if not _NLTK_RESOURCES:
        # frozenset: constant-time membership tests instead of scanning a list.
        _NLTK_RESOURCES["stopwords"] = frozenset(nltk.corpus.stopwords.words('english'))
        _NLTK_RESOURCES["stem"] = nltk.stem.PorterStemmer().stem
    return _NLTK_RESOURCES["stopwords"], _NLTK_RESOURCES["stem"]


def preprocess(s):
    """Turn a raw text string into a list of stemmed, stopword-free tokens.

    Steps, in order: lowercase, expand "&" to " and ", normalise typographic
    quotes/dashes, drop selected dots, replace remaining punctuation with
    spaces, split on whitespace, remove English stopwords, Porter-stem.

    Args:
        s: the text to process (a ``str``).

    Returns:
        list[str]: the processed tokens, in their original order.
    """
    stopwords, stem = _nltk_resources()

    s = s.lower()
    s = s.replace("&", " and ")
    s = s.translate(_CHAR_NORMALISATION)
    # Dots inside/after acronyms and at the end of sentences are deleted
    # outright ("u.s.a." -> "usa"); dots kept here (e.g. in "3.5" or
    # "example.com") are turned into spaces by the punctuation step below.
    s = _DROPPABLE_DOT.sub("", s)
    s = s.translate(_PUNCT_TO_SPACE)
    # split() without arguments already discards leading/trailing whitespace
    # and collapses runs of whitespace, so no explicit clean-up is needed.
    return [stem(token) for token in s.split() if token not in stopwords]

In [None]:
import time

def profile(f):
    """Decorator that prints how long each call to ``f`` takes.

    After every successful call a line ``"<function name> (<elapsed> ms)"`` is
    printed, with the elapsed time in milliseconds to three decimals.

    Args:
        f: the callable to time.

    Returns:
        A wrapper that forwards all arguments to ``f`` and returns its result.
    """
    import functools  # local import keeps this cell self-contained

    @functools.wraps(f)  # keep f's __name__ / __doc__ on the wrapper
    def f_timer(*args, **kwargs):
        # perf_counter is monotonic and high-resolution, unlike time.time(),
        # which can jump when the system clock is adjusted.
        start = time.perf_counter()
        result = f(*args, **kwargs)
        elapsed_ms = (time.perf_counter() - start) * 1000
        print(f"{f.__name__} ({elapsed_ms:.3f} ms)")
        return result
    return f_timer

In [None]:
from collections import Counter
from tqdm.auto import tqdm

@profile
def build_index(dataset):
    """Build an in-memory inverted index over all documents of ``dataset``.

    Args:
        dataset: object exposing ``docs_iter()`` (iterable of documents with
            ``doc_id`` and ``text`` attributes) and ``docs_count()``.

    Returns:
        tuple: ``(lexicon, inverted, doc_index, stats)`` where

        * ``lexicon`` maps token -> ``[termid, document frequency,
          total occurrences in the collection]``;
        * ``inverted`` is ``{'docids': {termid: [docid, ...]},
          'freqs': {termid: [tf, ...]}}`` with parallel lists per term;
        * ``doc_index`` is a list of ``(str(doc_id), length in tokens)``,
          one entry per document in iteration order;
        * ``stats`` holds ``num_docs``, ``num_terms`` and ``num_tokens``.
    """
    lexicon = {}
    doc_index = []
    inv_d, inv_f = {}, {}
    termid = 0

    num_docs = 0
    total_dl = 0
    for docid, doc in tqdm(enumerate(dataset.docs_iter()), desc='Indexing', total=dataset.docs_count()):
        tokens = preprocess(doc.text)
        token_tf = Counter(tokens)
        for token, tf in token_tf.items():
            if token not in lexicon:
                # First sighting of this term: assign the next free term id.
                lexicon[token] = [termid, 0, 0]
                inv_d[termid], inv_f[termid] = [], []
                termid += 1
            token_id = lexicon[token][0]   # term id of this token
            inv_d[token_id].append(docid)  # posting: document containing the term
            inv_f[token_id].append(tf)     # posting: term frequency in that document
            lexicon[token][1] += 1         # document frequency
            lexicon[token][2] += tf        # total occurrences across the collection
        doclen = len(tokens)
        doc_index.append((str(doc.doc_id), doclen))
        total_dl += doclen
        num_docs += 1

    stats = {
        # Counted explicitly rather than derived from the last loop index, so
        # an empty dataset yields 0 instead of failing on an unbound variable.
        'num_docs': num_docs,
        'num_terms': len(lexicon),
        'num_tokens': total_dl,
    }
    return lexicon, {'docids': inv_d, 'freqs': inv_f}, doc_index, stats

In [None]:
"""
This class takes the DataFrame created above (columns 'doc_id' and 'text') and turns it into a list of `Document` namedtuples.
"""
from collections import namedtuple


# A document is an (id, text) pair. Defined before the class that builds them.
Document = namedtuple('Document', ['doc_id', 'text'])


class MSMarcoDataset:
    """In-memory document collection built from a pandas DataFrame.

    The DataFrame must have the columns ``doc_id`` and ``text``; each row is
    converted into a ``Document`` namedtuple and kept in ``self.docs``.
    """

    def __init__(self, df):
        """Materialise every row of ``df`` as a ``Document``."""
        # index=False: the DataFrame index is never read, so there is no need
        # to build it into every row tuple.
        self.docs = [Document(row.doc_id, row.text) for row in df.itertuples(index=False)]

    def docs_iter(self):
        """Return a fresh iterator over the stored documents."""
        return iter(self.docs)

    def docs_count(self):
        """Return the number of stored documents."""
        return len(self.docs)

In [None]:
# Smoke test: wrap three tiny hand-written documents in an MSMarcoDataset,
# show them, then index them.
test_docs = [
    Document(1, "A school"),
    Document(2, "Another example."),
    Document(3, "This is a house."),
]
test_dataset = MSMarcoDataset(
    pd.DataFrame(test_docs, columns=['doc_id', 'text'])
)

# One document per output line, in the order the dataset yields them.
print(*test_dataset.docs_iter(), sep='\n')

lex, inv, doc, stats = build_index(test_dataset)


Document(doc_id=1, text='A school')
Document(doc_id=2, text='Another example.')
Document(doc_id=3, text='This is a house.')


Indexing:   0%|          | 0/3 [00:00<?, ?it/s]

build_index (52.832 ms)


In [None]:
# Dump lexicon, postings, document index and statistics, one per line.
print(lex, inv, doc, stats, sep='\n')

{'school': [0, 1, 1], 'anoth': [1, 1, 1], 'exampl': [2, 1, 1], 'hous': [3, 1, 1]}
{'docids': {0: [0], 1: [1], 2: [1], 3: [2]}, 'freqs': {0: [1], 1: [1], 2: [1], 3: [1]}}
[('1', 1), ('2', 2), ('3', 1)]
{'num_docs': 3, 'num_terms': 4, 'num_tokens': 4}


In [None]:
# Keep only the first 10 rows of the collection (presumably to keep indexing
# fast during development - see the TODO).
# NOTE: this rebinds `df`, so the full collection is no longer reachable
# through that name after this cell has run.
df = df.head(10) # TODO : REMOVE THIS

# Wrap the (truncated) DataFrame and build the index over it.
dataset = MSMarcoDataset(df)
lex, inv, doc, stats = build_index(dataset)

Indexing:   0%|          | 0/10 [00:00<?, ?it/s]

build_index (61.623 ms)


In [None]:
# Dump lexicon, postings, document index and statistics, one per line.
print(lex, inv, doc, stats, sep='\n')

{'presenc': [0, 1, 1], 'commun': [1, 1, 1], 'amid': [2, 1, 1], 'scientif': [3, 1, 2], 'mind': [4, 1, 1], 'equal': [5, 1, 1], 'import': [6, 1, 1], 'success': [7, 2, 3], 'manhattan': [8, 10, 13], 'project': [9, 10, 18], 'intellect': [10, 1, 1], 'cloud': [11, 1, 1], 'hang': [12, 1, 1], 'impress': [13, 1, 1], 'achiev': [14, 1, 1], 'atom': [15, 7, 9], 'research': [16, 2, 2], 'engin': [17, 3, 3], 'truli': [18, 1, 1], 'meant': [19, 1, 1], 'hundr': [20, 1, 1], 'thousand': [21, 1, 1], 'innoc': [22, 1, 1], 'live': [23, 1, 1], 'obliter': [24, 1, 1], 'bomb': [25, 7, 7], 'help': [26, 1, 1], 'bring': [27, 1, 1], 'end': [28, 2, 2], 'world': [29, 5, 5], 'war': [30, 4, 4], 'ii': [31, 4, 4], 'legaci': [32, 1, 1], 'peac': [33, 1, 1], 'use': [34, 1, 1], 'energi': [35, 1, 1], 'continu': [36, 1, 1], 'impact': [37, 1, 1], 'histori': [38, 2, 4], 'scienc': [39, 1, 1], 'essay': [40, 1, 1], 'see': [41, 1, 1], 'make': [42, 1, 2], 'possibl': [43, 1, 1], 'would': [44, 1, 1], 'forev': [45, 1, 2], 'chang': [46, 1, 1]