In [1]:
import os
from pathlib import Path

import numpy as np
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from nltk.tokenize import word_tokenize
import nltk
from collections import Counter
import pandas as pd
from typing import List
import xml.etree.ElementTree as ET

# Fetch the NLTK resources used further down: the tokenizer data for
# word_tokenize and the English stop-word list.
nltk.download('punkt_tab')
nltk.download('stopwords')

[nltk_data] Downloading package punkt_tab to
[nltk_data]     /home/gustavo/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /home/gustavo/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [2]:
def precision(relevance_query: List[int]):
    """Return the fraction of retrieved results that are relevant.

    Args:
        relevance_query: Binary relevance flags (1 = relevant, 0 = not), one
            per retrieved result.

    Returns:
        ``sum(relevance_query) / len(relevance_query)``, or ``0.0`` when
        nothing was retrieved.
    """
    # An empty result list would otherwise divide by zero.
    if len(relevance_query) == 0:
        return 0.0
    return sum(relevance_query) / len(relevance_query)

In [3]:
# Sanity check: 2 of the 4 results are relevant.
precision([1, 0, 0, 1])

0.5

In [4]:
def precision_at_k(relevance_query: List[int], k: int):
    """Return the precision over the first ``k`` retrieved results.

    Args:
        relevance_query: Binary relevance flags in ranking order.
        k: Cut-off rank; must be non-negative.

    Returns:
        The fraction of relevant results among the first ``k``. When fewer
        than ``k`` results exist, the denominator is the number of results
        actually available. Returns ``0.0`` when that slice is empty
        (``k == 0`` or no results).

    Raises:
        ValueError: If ``k`` is negative (a negative slice bound would
            silently drop results from the end of the ranking instead).
    """
    if k < 0:
        raise ValueError(f"k must be non-negative, got {k}")
    top_k = relevance_query[:k]
    if len(top_k) == 0:
        return 0.0
    return sum(top_k) / len(top_k)

In [5]:
# Sanity check: 1 relevant result among the top 2.
precision_at_k([1, 0, 0, 1], 2)

0.5

In [6]:
def recall_at_k(relevance_query: List[int], k: int, num_relevant_docs: int):
    """Return the recall over the first ``k`` retrieved results.

    Args:
        relevance_query: Binary relevance flags in ranking order.
        k: Cut-off rank; must be non-negative.
        num_relevant_docs: Total number of relevant documents for the query;
            must be non-negative.

    Returns:
        Relevant results found in the first ``k`` divided by
        ``num_relevant_docs``, or ``0.0`` when ``num_relevant_docs`` is zero.

    Raises:
        ValueError: If ``k`` or ``num_relevant_docs`` is negative.
    """
    if k < 0:
        raise ValueError(f"k must be non-negative, got {k}")
    if num_relevant_docs < 0:
        raise ValueError(f"num_relevant_docs must be non-negative, got {num_relevant_docs}")
    # With nothing to find there is nothing to recall; avoid dividing by zero.
    if num_relevant_docs == 0:
        return 0.0
    return sum(relevance_query[:k]) / num_relevant_docs

In [7]:
# Sanity check: 1 relevant result in the top 2, out of 4 relevant documents.
recall_at_k([1, 0, 0, 1], 2, 4)

0.25

In [8]:
def average_precision(relevance_query: List[int]):
    """Return the average precision of a single ranked result list.

    The precision is sampled at every rank holding a relevant result and the
    samples are averaged over the number of relevant results present in the
    ranking.

    Args:
        relevance_query: Relevance flags in ranking order; any truthy value
            counts as relevant.

    Returns:
        The average precision, or ``0.0`` when the ranking contains no
        relevant result (including an empty ranking).
    """
    cumulative_precision = 0.0
    relevant_count = 0
    for rank, relevance in enumerate(relevance_query, 1):
        if relevance:
            relevant_count += 1
            # Precision at this rank: relevant hits so far / results seen so far.
            cumulative_precision += relevant_count / rank
    # No relevant hit means there is no precision sample to average.
    if relevant_count == 0:
        return 0.0
    return cumulative_precision / relevant_count


In [9]:
# Sanity check on a ranking whose first relevant hit is at rank 2.
average_precision([0, 1, 0, 1, 1, 1, 1])

0.5961904761904762

In [10]:
def mean_average_precision(relevance_queries: List[List[int]]):
    """Return the mean of ``average_precision`` over several queries.

    Args:
        relevance_queries: One ranked list of relevance flags per query.

    Returns:
        The arithmetic mean of the per-query average precision, or ``0.0``
        when no queries are given.
    """
    # Without queries there is nothing to average; avoid dividing by zero.
    if len(relevance_queries) == 0:
        return 0.0
    total = sum(average_precision(relevance_query) for relevance_query in relevance_queries)
    return total / len(relevance_queries)


In [11]:
# Sanity check: mean of the average precision of two three-result rankings.
mean_average_precision([[1, 0, 1], [0, 1, 1]])

0.7083333333333333

In [12]:
def discounted_cumulative_gain(relevance_query: List[int], k: int):
    """Return the discounted cumulative gain of the first ``k`` results.

    Each graded relevance is divided by ``log2(rank)``; ranks 1 and 2 both use
    a discount of ``log2(2) == 1``, so the first two results count in full.

    Args:
        relevance_query: Graded relevance scores in ranking order.
        k: Number of top results to accumulate.

    Returns:
        The accumulated discounted gain (``0`` when the slice is empty).
    """
    total_gain = 0
    for rank, gain in enumerate(relevance_query[:k], start=1):
        # Clamp the rank to 2 so that rank 1 is not divided by log2(1) == 0.
        discount = np.log2(max(rank, 2))
        total_gain = total_gain + gain / discount
    return total_gain

In [13]:
# Sanity check: DCG over the top 6 graded relevance scores.
discounted_cumulative_gain([4, 4, 3, 0, 0, 1, 3, 3, 3, 0], 6)

10.279642067948915

In [14]:
def normalized_discounted_cumulative_gain(relevance_query: List[int], k: int):
    """Return the DCG of a ranking divided by the DCG of its ideal reordering.

    The ideal ranking is the same relevance scores sorted in descending order.

    Args:
        relevance_query: Graded relevance scores in ranking order. The input
            is not modified.
        k: Number of top results to evaluate.

    Returns:
        ``DCG@k / ideal DCG@k``, or ``0.0`` when the ideal DCG is zero (for
        example all scores are zero, the ranking is empty, or ``k == 0``).
    """
    # sorted() builds a new list, so the caller's ranking is left untouched
    # and any sequence (not only a list) is accepted.
    ideal_ranking = sorted(relevance_query, reverse=True)
    ideal_dcg = discounted_cumulative_gain(ideal_ranking, k)
    # A zero ideal DCG would produce nan (or ZeroDivisionError for plain ints).
    if ideal_dcg == 0:
        return 0.0
    return discounted_cumulative_gain(relevance_query, k) / ideal_dcg

In [15]:
# NOTE(review): the second grade here is 24, while the DCG example in this
# notebook uses 4 at that position — possibly a typo; confirm the intended input.
normalized_discounted_cumulative_gain([4, 24, 3, 0, 0, 1, 3, 3, 3, 3, 0], 6)

0.8946463131104694

In [16]:
def token_preprocessing(tokens: List[str]):
    """Normalise tokens: lowercase, drop stop words and non-words, then stem.

    Args:
        tokens: Raw tokens, e.g. the output of ``word_tokenize``.

    Returns:
        The Porter stems of the tokens that survive filtering, in their
        original order. A token is kept when its lowercase form is not an
        English stop word and starts with an ASCII letter ``a``-``z``.
    """
    stop_words = set(stopwords.words('english'))
    stemmer = PorterStemmer()
    stems = []
    for token in tokens:
        word = token.lower()
        if word in stop_words:
            continue
        # word[:1] instead of word[0]: an empty token yields '' and is
        # filtered out here rather than raising IndexError.
        if not 'a' <= word[:1] <= 'z':
            continue
        stems.append(stemmer.stem(word))
    return stems


def text_preprocessing(text: str):
    """Tokenise ``text`` with NLTK and normalise the tokens.

    Args:
        text: Raw text to process.

    Returns:
        The result of ``token_preprocessing`` applied to the tokens produced
        by ``word_tokenize``.
    """
    return token_preprocessing(word_tokenize(text))



In [17]:
# Sanity check: preprocess a sample sentence and inspect the resulting stems.
text_preprocessing(
    "William Beaumont: Physiology of digestion Image Source.  On November 21, 1785, US-American surgeon William Beaumont was born.")

['william',
 'beaumont',
 'physiolog',
 'digest',
 'imag',
 'sourc',
 'novemb',
 'us-american',
 'surgeon',
 'william',
 'beaumont',
 'born']

In [18]:
class Document:
    """A named text together with the counts of its preprocessed terms.

    Attributes are set in ``__init__``: ``text`` (the raw text), ``name``
    (the identifier, also used as the string form of the object) and
    ``term_counts`` (a pandas Series mapping each preprocessed term to the
    number of times it occurs in the text).
    """

    def __init__(self, text: str, name: str = 'nameless'):
        self.name = name
        self.text = text
        # Tally how often each term produced by text_preprocessing occurs.
        term_frequencies = Counter(text_preprocessing(text))
        self.term_counts = pd.Series(
            list(term_frequencies.values()),
            index=list(term_frequencies.keys()),
        )

    def __str__(self):
        return self.name

    def __repr__(self):
        # Show the name in containers and notebook output as well.
        return str(self)


In [19]:
def load_docs(docs_folder_path: Path):
    """Load every ``.naf`` file of a folder as a ``Document``.

    Each file is parsed as XML, the text of its first ``<raw>`` element
    becomes the document text, and the second dot-separated component of the
    file name becomes the document name.

    Args:
        docs_folder_path: Folder containing the ``.naf`` files.

    Returns:
        A list of ``Document`` objects, ordered by file name.

    Raises:
        ValueError: If a file contains no ``<raw>`` element.
    """
    # os.listdir returns entries in arbitrary order; sorting keeps the
    # document order (and everything derived from it) reproducible.
    files = sorted(f for f in os.listdir(docs_folder_path) if f.endswith('.naf'))
    docs = []
    for file in files:
        root = ET.parse(os.path.join(docs_folder_path, file)).getroot()
        raw_element = root.find('.//raw')
        if raw_element is None:
            raise ValueError(f"{file}: no <raw> element found")
        # An empty <raw/> element has text None; treat it as empty text.
        raw_text = raw_element.text or ''
        # NOTE(review): takes the second dot-separated part of the file name;
        # presumably names look like "<prefix>.<id>.naf" — confirm.
        name = file.split('.')[1]
        docs.append(Document(raw_text, name))
    return docs

In [20]:
# Load and preprocess the document collection.
all_docs = load_docs(Path("./data/docs-raw-texts"))

In [21]:
class BSII:
    """Boolean search over an inverted index.

    ``inverse_index`` maps each term to the set of documents whose
    ``term_counts`` index contains that term.
    """

    def __init__(self, docs: List["Document"]):
        """Build the inverted index for ``docs``.

        Args:
            docs: Documents to index; each must expose ``term_counts`` (an
                object whose ``index`` lists the document's terms).
        """
        self.docs = docs
        self.inverse_index = {}
        for doc in self.docs:
            for term in doc.term_counts.index:
                self.inverse_index.setdefault(term, set()).add(doc)

    def search(self, query_document: "Document" = None, excluded_query_document: "Document" = None):
        """Return the documents matching a boolean query.

        Args:
            query_document: Documents containing ANY term of this query are
                selected. When ``None``, every indexed document is selected.
            excluded_query_document: Documents containing ANY term of this
                query are removed from the selection. Ignored when ``None``.

        Returns:
            A new set with the matching documents; terms unknown to the index
            match nothing.
        """
        if query_document is None:
            relevant_docs = set(self.docs)
        else:
            relevant_docs = set()
            for term in query_document.term_counts.index:
                relevant_docs.update(self.inverse_index.get(term, ()))
        if excluded_query_document is not None:
            for term in excluded_query_document.term_counts.index:
                relevant_docs.difference_update(self.inverse_index.get(term, ()))
        return relevant_docs

    def evaluate_search(self, queries: List["Document"], output_path: Path):
        """Run every query and write the matches to ``output_path``.

        One line is written per query:
        ``<query name>\\t<comma-separated document names>``.

        Args:
            queries: Query documents to run through ``search``.
            output_path: File to (over)write with the results.
        """
        with open(output_path, 'w') as output_file:
            for query in queries:
                relevant_docs = self.search(query_document=query)
                # Sets have no stable iteration order; sort the names so the
                # result file is identical from one run to the next.
                doc_names = sorted(doc.name for doc in relevant_docs)
                output_file.write(f"{query.name}\t{','.join(doc_names)}\n")


In [22]:
# Build the index, then look for documents matching "Physiology" but not "Swiss".
bsii = BSII(all_docs)
bsii.search(query_document=Document('Physiology'), excluded_query_document=Document('Swiss'))

{d001, d046, d062, d120, d133, d191, d261, d286, d294, d314}

In [23]:
# Load the queries with the same loader used for the document collection.
all_queries = load_docs(Path("./data/queries-raw-texts"))

In [24]:
# NOTE(review): the output name says "AND", but BSII.search selects documents
# matching ANY query term — confirm which semantics is intended.
bsii.evaluate_search(all_queries, 'data/BSII-AND-queries_result')

In [25]:


class RRDV:
    def __init__(self, docs: List[Document]):
        self.docs = docs
        self.term_counts = pd.DataFrame({
            doc.name: doc.term_counts for doc in self.docs
        })
        self.term_counts.fillna(0, inplace=True)
        self.document_count = (self.term_counts >= 1).sum(axis=1)
        self.idf = np.log10(len(self.docs) / self.document_count)
        self.tfidf = np.log10(1 + self.term_counts).mul(self.idf, axis=0)

    @staticmethod
    def cosine_similarity(tfidf_doc_1: pd.Series, tfidf_doc_2: pd.Series | pd.DataFrame):
        return np.dot(tfidf_doc_1, tfidf_doc_2) / (
                np.linalg.norm(tfidf_doc_1) * np.linalg.norm(tfidf_doc_2, axis=0))

    def search(self, query_document: Document, min_similarity: int = 0):
        in_vocab_term_counts = query_document.term_counts[query_document.term_counts.index.isin(self.idf.index)]
        query_tfidf = (np.log10(1 + in_vocab_term_counts) * self.idf).fillna(0)
        results = pd.DataFrame({'similarity': self.cosine_similarity(query_tfidf, self.tfidf), 'doc': self.docs},
                               index=self.tfidf.columns)
        results.sort_values(by='similarity', ascending=False, inplace=True)
        results = results[results['similarity'] > min_similarity]
        return results

    def evaluate_search(self, queries: List[Document], output_path: Path):
        with open(output_path, 'w') as output_file:
            for query in queries:
                relevant_docs = self.search(query_document=query)    
                result_texts = [f'{doc_name}:{row.similarity}' for doc_name, row in relevant_docs.iterrows()]
                output_file.write(f"{query.name}\t{','.join(result_texts)}\n")



In [26]:
# Build the TF-IDF ranked-retrieval model over the document collection.
rrdv = RRDV(all_docs)

In [27]:
# Sanity check: a document compared with itself has similarity 1.
rrdv.cosine_similarity(rrdv.tfidf[all_docs[0].name], rrdv.tfidf[all_docs[0].name])

1.0

In [28]:
# Sanity check: comparing one document against the whole matrix yields one
# score per document.
rrdv.cosine_similarity(rrdv.tfidf[all_docs[1].name], rrdv.tfidf).shape

(331,)

In [29]:
# Rank every query against the collection and write the scored results to disk.
rrdv.evaluate_search(all_queries, output_path=Path("./data/RRDV-consultas_resultado"))