In [97]:
import pandas as pd
import numpy as np
from nltk.corpus import stopwords, wordnet
from nltk.tokenize import word_tokenize
import re
import os
import nltk
# Fetch the NLTK resources used further down: WordNet (wordnet.synsets in the
# query expansion), Punkt (word_tokenize) and the English stop-word list.
# nltk.download returns False when a resource could not be fetched.
nltk.download('wordnet')
nltk.download('punkt')
nltk.download('stopwords')

[nltk_data] Error loading wordnet: <urlopen error [SSL:
[nltk_data]     CERTIFICATE_VERIFY_FAILED] certificate verify failed:
[nltk_data]     unable to get local issuer certificate (_ssl.c:1020)>
[nltk_data] Error loading punkt: <urlopen error [SSL:
[nltk_data]     CERTIFICATE_VERIFY_FAILED] certificate verify failed:
[nltk_data]     unable to get local issuer certificate (_ssl.c:1020)>
[nltk_data] Error loading stopwords: <urlopen error [SSL:
[nltk_data]     CERTIFICATE_VERIFY_FAILED] certificate verify failed:
[nltk_data]     unable to get local issuer certificate (_ssl.c:1020)>


False

<div style="background:#ff6242;padding:20px;color:#ffffff;margin-top:10px;">
<b>El propósito de esta asignación es que el estudiante ponga en práctica la construcción de sistemas de recuperación de información basados en el modelo clásico de RI Vector Space Model, así como también que evalúe la efectividad de estos modelos mediante el uso de una colección de referencia (Benchmark).
<br />
<br />
Para esta práctica estará utilizando el siguiente repositorio: https://github.com/oussbenk/cranfield-trec-dataset. En el mismo encontrarán los archivos cran.all.1400.xml, cran.qry.xml y cranqrel.trec.txt:
<ul>
<li>cran.all.1400.xml contiene 1,400 resúmenes de artículos científicos.</li>
<li>cran.qry.xml contiene 225 términos que representan consultas.</li>
<li>cranqrel.trec.txt contiene los juicios de relevancia a dichas consultas.</li>
    </ul>  
<br />
<br />
Estudie en detalle la estructura y el contenido de este conjunto de documentos provistos antes de comenzar.    
<br />
<br />
En este trabajo, aparte del código, debe proveer una interpretación para cada tarea y un análisis para cada resultado obtenido que así lo amerite.</b>
</div>


## 1. Ejercicio 1
### Puntuación máxima de la tarea: 3 puntos
#### Limpieza y preparación de los datos, utilizando distintas técnicas de las ya vistas en clases. Para esta tarea utilizará el archivo cran.all.1400.xml, específicamente sus columnas title y text.


In [98]:
def wrap_trec_file(file_path, encoding='utf-8'):
    """Read a file and return its content wrapped in a single ``<root>`` element.

    Wrapping lets a file made of sibling elements be parsed as one XML
    document.

    Args:
        file_path: Path of the file to read.
        encoding: Text encoding used to decode the file. It is set explicitly
            so the result does not depend on the platform's default encoding.

    Returns:
        The file content as a string, enclosed in ``<root>...</root>``.
    """
    with open(file_path, 'r', encoding=encoding) as f:
        xml_content = f.read()
    return '<root>' + xml_content + '</root>'

In [99]:
import xml.etree.ElementTree as ElementTree
def load_documents(file_path='cranfield-trec-dataset/cran.all.1400.xml'):
    """Load the document collection into a DataFrame, one row per document.

    Args:
        file_path: Path of the collection file. It is read through
            ``wrap_trec_file`` so that its elements sit under one root.

    Returns:
        A DataFrame with the columns ``docno``, ``title``, ``author``,
        ``bib`` and ``text``. Every value is a stripped string; a field that
        is missing or empty in the file becomes ``''``.
    """
    fields = ['docno', 'title', 'author', 'bib', 'text']
    root = ElementTree.fromstring(wrap_trec_file(file_path))
    # findtext gives None for an absent child and '' for an empty one, so
    # `or ''` covers both cases before stripping.
    records = [
        {field: (doc.findtext(field) or '').strip() for field in fields}
        for doc in root
    ]
    # Build the frame once instead of concatenating row by row inside the loop.
    return pd.DataFrame(records, columns=fields)
# Load the document collection once; `docs` is reused by the cells below.
docs = load_documents()

In [100]:
def load_queries(query_file):
    """Read the query file and return its ids and texts as parallel lists.

    Each ``<top>`` element under the document root contributes one query:
    its ``<num>`` child is the id and its ``<title>`` child is the text.

    Args:
        query_file: Path (or file object) of an XML file accepted by
            ``ElementTree.parse``.

    Returns:
        A ``(query_ids, queries)`` tuple of two lists of equal length. Both
        the ids and the texts are stripped strings; a missing or empty
        ``<num>``/``<title>`` yields ``''`` rather than ``None`` or an
        exception, so the two lists always stay aligned.
    """
    root = ElementTree.parse(query_file).getroot()
    query_ids = []
    queries = []
    for topic in root.findall('top'):
        # findtext gives None for an absent child and '' for an empty one.
        query_ids.append((topic.findtext('num') or '').strip())
        queries.append((topic.findtext('title') or '').strip())
    return query_ids, queries
# Parallel lists: query_ids[k] is the identifier of queries[k].
query_ids, queries = load_queries('cranfield-trec-dataset/cran.qry.xml')

In [101]:
import spacy
# Shared spaCy pipeline; the cleaning functions below use it for lemmatization (token.lemma_).
nlp = spacy.load("en_core_web_sm")

In [102]:
def data_cleaning(dataframe, columns=('title', 'text')):
    """Normalize text columns into lists of lemmas.

    Each cell of the selected columns goes through the same steps:
    lower-casing, removal of every character that is neither a word
    character nor whitespace, NLTK tokenization, English stop-word removal
    and spaCy lemmatization.

    Args:
        dataframe: Frame whose ``columns`` hold strings. It is modified in
            place.
        columns: Names of the columns to clean. Defaults to the two columns
            this function always processed, ``title`` and ``text``.

    Returns:
        The same ``dataframe`` object, with each selected column replaced by
        lists of lemma strings.

    Note:
        The function is not idempotent: after one call the columns hold
        lists, so calling it again on the same frame raises.
    """
    stop_words = set(stopwords.words('english'))

    def _to_lemmas(raw_text):
        """Run the whole cleaning pipeline on one string."""
        no_punctuation = re.sub(r'[^\w\s]', '', raw_text.lower())
        kept_tokens = [word for word in word_tokenize(no_punctuation) if word not in stop_words]
        # spaCy re-tokenizes the joined string; only its lemmas are kept.
        return [token.lemma_ for token in nlp(' '.join(kept_tokens))]

    for column in columns:
        dataframe[column] = dataframe[column].apply(_to_lemmas)
    return dataframe
# Replace `docs` with its cleaned version: 'title' and 'text' now hold lists of lemmas.
docs = data_cleaning(docs)

## 2. Ejercicio 2 y 3: Modelos y Evaluación


In [104]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

def vectorize(docs, queries):
    """Build TF-IDF vectors for the documents and for the queries.

    The vectorizer is fitted on ``docs`` only; ``queries`` are then mapped
    with the fitted vocabulary and weights.

    Args:
        docs: Iterable of document strings.
        queries: Iterable of query strings.

    Returns:
        A ``(doc_vectors, query_vectors, vectorizer)`` tuple.
    """
    tfidf = TfidfVectorizer(stop_words='english', max_df=0.85)
    doc_matrix = tfidf.fit_transform(docs)
    query_matrix = tfidf.transform(queries)
    return doc_matrix, query_matrix, tfidf

def rank_documents(doc_vectors, query_vector, doc_ids):
    """Rank all documents by cosine similarity to one query.

    Args:
        doc_vectors: Matrix with one row per document.
        query_vector: The query representation (expected to be a single row,
            since the similarity matrix is flattened into one score list).
        doc_ids: Sequence of document identifiers, in the same order as the
            rows of ``doc_vectors``. May be a list, array or pandas Series.

    Returns:
        A list of ``(doc_id, similarity)`` tuples, highest similarity first.
    """
    similarities = cosine_similarity(query_vector, doc_vectors).flatten()
    ranked_indices = np.argsort(similarities)[::-1]
    # ranked_indices are row positions. Indexing a pandas Series with an
    # integer is a label lookup, which only matches positions for a default
    # 0..n-1 index, so look the ids up in a positional array instead.
    ids_by_position = np.asarray(doc_ids, dtype=object)
    return [(ids_by_position[i], similarities[i]) for i in ranked_indices]

def save_trec_results(query_id, ranked_docs, run_name, output_file):
    """Append the best 100 results of one query to a run file.

    One line is written per result, in the form
    ``<query_id> Q0 <doc_id> <rank> <score> <run_name>``, where the rank
    starts at 1 and the score is formatted with four decimals.

    Args:
        query_id: Identifier of the query.
        ranked_docs: Sequence of ``(doc_id, score)`` pairs, best first.
        run_name: Tag written in the last column of every line.
        output_file: Path of the file to append to.
    """
    top_hits = ranked_docs[:100]
    with open(output_file, 'a') as handle:
        handle.writelines(
            f"{query_id} Q0 {doc_id} {position} {score:.4f} {run_name}\n"
            for position, (doc_id, score) in enumerate(top_hits, start=1)
        )

def expand_query_with_synsets(query):
    """Expand a query with the lemma names of each word's first WordNet synset.

    Every whitespace-separated word of ``query`` is kept, in order. After
    each word, the lemma names of its first synset are added (underscores
    replaced by spaces) unless that name is already in the expansion built
    so far.

    Args:
        query: Query string; it is split on whitespace.

    Returns:
        The expanded terms joined by single spaces.
    """
    terms = []
    for token in query.split():
        # The original word is always kept, even if it was added earlier.
        terms.append(token)
        senses = wordnet.synsets(token)
        if not senses:
            continue
        for lemma in senses[0].lemmas():
            candidate = lemma.name().replace('_', ' ')
            if candidate in terms:
                continue
            terms.append(candidate)
    return ' '.join(terms)

def clean_and_expand_queries(queries):
    """Clean every query and expand it with WordNet lemma names.

    Each query is lower-cased, stripped of every character that is neither
    a word character nor whitespace, tokenized, filtered against the
    English stop-word list and lemmatized with spaCy. The result is then
    passed to ``expand_query_with_synsets``.

    Args:
        queries: Iterable of query strings. A ``None`` entry (a query with
            no text) is treated as an empty query instead of raising.

    Returns:
        A list of expanded query strings, one per input query and in the
        same order.
    """
    cleaned_queries = []
    stop_words = set(stopwords.words('english'))
    for q in queries:
        # Keep one output per input so results stay aligned with the query
        # ids, even when a query has no text.
        q = (q or '').lower()
        q = re.sub(r'[^\w\s]', '', q)
        tokens = word_tokenize(q)
        q_words = [word for word in tokens if word not in stop_words]
        lemmatized = [token.lemma_ for token in nlp(' '.join(q_words))]
        final_query = ' '.join(lemmatized)
        expanded_query = expand_query_with_synsets(final_query)
        cleaned_queries.append(expanded_query)
    return cleaned_queries

# Pre-process and expand every query once; both retrieval models below reuse the result.
expanded_queries = clean_and_expand_queries(queries)

In [105]:
def run_retrieval_model(doc_texts, queries, query_ids, doc_ids, run_name, output_file):
    """Rank every query against one document representation and save the run.

    Args:
        doc_texts: One string per document.
        queries: One string per query.
        query_ids: Identifiers aligned with ``queries``.
        doc_ids: Document identifiers aligned with ``doc_texts``.
        run_name: Tag written on every line of the run file.
        output_file: Path of the run file. An existing file is removed
            first, because ``save_trec_results`` appends.

    Returns:
        A ``(doc_vectors, query_vectors)`` tuple as produced by ``vectorize``.
    """
    doc_vectors, query_vectors, _ = vectorize(doc_texts, queries)
    if os.path.exists(output_file):
        os.remove(output_file)
    for qid, query_vector in zip(query_ids, query_vectors):
        ranked_docs = rank_documents(doc_vectors, query_vector, doc_ids)
        save_trec_results(qid, ranked_docs, run_name, output_file)
    return doc_vectors, query_vectors


# Model 1: title only (with query expansion)
docs_title = docs['title'].apply(lambda x: ' '.join(x) if isinstance(x, list) else x)
output_file_title = "trec_results_title.txt"
doc_vectors_title, query_vectors_title = run_retrieval_model(
    docs_title, expanded_queries, query_ids, docs['docno'], "title_expanded", output_file_title
)

# Model 2: title + text (with query expansion)
docs['title_text'] = docs.apply(lambda row: ' '.join(row['title']) + ' ' + ' '.join(row['text']), axis=1)
docs_title_text = docs['title_text']
output_file_tt = "trec_results_title_text.txt"
doc_vectors_tt, query_vectors_tt = run_retrieval_model(
    docs_title_text, expanded_queries, query_ids, docs['docno'], "title_text_expanded", output_file_tt
)