## Recuperación de Información
# Práctica 1 &mdash; Motores de búsqueda
### Autores:

Óscar Calvet \\
Enrique Ernesto de Alvear

# 1. Funciones de ranking no supervisado.

Dada una pequeña colección "de juguete", calcular el ranking de búsqueda para varias consultas según las siguientes funciones de ranking no supervisado:

<ol type="a">
<li> Coseno TF-IDF (modelo vectorial).
<li> BM25.
<li> Query likelihood.
</ol>

### Definir las funciones de ranking a continuación.

In [None]:
from math import sqrt, log2
import numpy as np
class VSM:
  """Vector-space ranker: TF-IDF weighted documents scored by cosine against the query.

  freqvector: forward index, dict url -> (word -> raw frequency).
  docfreqs:   dict word -> number of documents containing the word.

  Query terms carry weight 1 per occurrence in `q`; since the query norm is the
  same for every document it is left out, which does not change the ordering.
  """

  def __init__(self, freqvector, docfreqs):
    self.freqvector = freqvector
    self.docfreqs = docfreqs

  def search(self, q):
    """Return [(url, score)] sorted by decreasing score; documents scoring 0 are dropped."""
    ranking = []
    for url in self.freqvector:
      norm = self.module(url)
      # A document without any weighted term cannot match (and would divide by zero).
      if norm == 0:
        continue
      cos = self.dotproduct(url, q) / norm
      if cos > 0:
        ranking.append((url, cos))
    ranking.sort(key=lambda pair: pair[1], reverse=True)
    return ranking

  def dotproduct(self, url, q):
    """Sum of the TF-IDF weights in document `url` of the words of query `q`."""
    return sum(self.tf(word, url) * self.idf(word) for word in q)

  def module(self, url):
    """Euclidean norm of the TF-IDF vector of document `url`.

    The norm is taken over the same TF-IDF weights used in `dotproduct`, so the
    quotient of both is an actual cosine (up to the constant query norm).
    """
    return sqrt(sum((self.tf(word, url) * self.idf(word)) ** 2 for word in self.freqvector[url]))

  def tf(self, word, url):
    """Log-scaled term frequency: 1 + log2(freq), or 0 when the word is absent."""
    freq = self.freqvector[url].get(word, 0)
    return 1 + log2(freq) if freq > 0 else 0

  def idf(self, word):
    """Smoothed inverse document frequency: log2((|D| + 1) / (df + 0.5)).

    Words that are not in the collection get 0 instead of raising KeyError, so
    queries may contain out-of-vocabulary terms.
    """
    df = self.docfreqs.get(word, 0)
    if df <= 0:
      return 0
    return log2((len(self.freqvector) + 1) / (df + 0.5))


# Nota: cuando una palabra aparece en más de la mitad de la colección, resulta un score BM25 negativo.
# Una forma de evitarlo es tomar un número mínimo de documentos como |D|, es decir usar por ejemplo
# |D| = max(20, len(freqvector))

class BM25:
  """BM25 ranker over an in-memory forward index.

  freqvector: dict url -> (word -> raw frequency).
  docfreqs:   dict word -> number of documents containing the word.
  b:          document-length normalisation strength.
  k:          term-frequency saturation parameter.
  """

  def __init__(self, freqvector, docfreqs, b, k):
    self.freqvector = freqvector
    self.docfreqs = docfreqs
    self.b = b
    self.k = k
    # Document lengths are needed for every (document, term) pair: compute them once.
    self._doclens = {d: sum(freqs.values()) for d, freqs in freqvector.items()}
    # Average document length; 0 for an empty collection instead of dividing by zero.
    self.avg = sum(self._doclens.values()) / len(self._doclens) if self._doclens else 0

  def search(self, q):
    """Return [(url, score)] for every document, sorted by decreasing BM25 score."""
    # A floor of 20 documents keeps the RSJ weight positive in tiny collections.
    n_docs = max(20, len(self.freqvector))

    def RSJ(w):
      # Out-of-vocabulary query words have document frequency 0 rather than raising KeyError.
      df = self.docfreqs.get(w, 0)
      return log2((n_docs - df + 0.5) / (df + 0.5))

    # One RSJ weight per distinct query term, shared by all documents.
    rsj = {w: RSJ(w) for w in set(q)}

    ranking = []
    for d, freqs in self.freqvector.items():
      doclen = self._doclens[d]
      # An empty document has length ratio 0; otherwise self.avg is necessarily > 0.
      ratio = doclen / self.avg if doclen else 0
      length_norm = self.k * (1 - self.b + self.b * ratio)
      score = 0.0
      for word in q:
        freq = freqs.get(word, 0)
        # Absent terms contribute nothing.
        if freq > 0:
          score += freq * (self.k + 1) * rsj[word] / (length_norm + freq)
      ranking.append((d, score))

    ranking.sort(key=lambda pair: pair[1], reverse=True)
    return ranking

class QLD:
  """Query-likelihood ranker with Dirichlet smoothing.

  freqvector: dict url -> (word -> raw frequency).
  wordfreqs:  dict word -> total number of occurrences in the collection.
  mu:         Dirichlet prior; larger values pull scores towards the collection model.
  """

  def __init__(self, freqvector, wordfreqs, mu):
    self.freqvector = freqvector
    self.wordfreqs = wordfreqs
    self.mu = mu

  def search(self, q):
    """Return [(url, score)] for every document, sorted by decreasing query likelihood.

    The score is the product over the query words of
    (freq(w, d) + mu * p(w)) / (|d| + mu), where p(w) is the collection model.
    """
    # Lengths are computed once per search instead of once per (document, term) pair.
    doclens = {d: sum(freqs.values()) for d, freqs in self.freqvector.items()}
    collection_len = sum(doclens.values())

    def p(w):
      # Collection language model, read from the precomputed collection frequencies.
      # Unknown words (and an empty collection) get probability 0.
      if not collection_len:
        return 0
      return self.wordfreqs.get(w, 0) / collection_len

    # One background probability per distinct query term, shared by all documents.
    background = {w: p(w) for w in set(q)}

    ranking = []
    for d, freqs in self.freqvector.items():
      score = 1
      for w in q:
        score *= (freqs.get(w, 0) + self.mu * background[w]) / (doclens[d] + self.mu)
      ranking.append((d, score))
    ranking.sort(key=lambda pair: pair[1], reverse=True)
    return ranking

### Programa de prueba

##### 1.  Colección

In [None]:
from urllib.request import urlopen
from bs4 import BeautifulSoup
from collections import Counter
import re

# The collection: a small list of web URLs.
urls = ["https://en.wikipedia.org/wiki/Age_of_Enlightenment",
        "https://en.wikipedia.org/wiki/Rationalism",
        "https://en.wikipedia.org/wiki/Scientific_Revolution",
        "https://en.wikipedia.org/wiki/French_Revolution",
        "https://en.wikipedia.org/wiki/Winner%27s_curse",
        "https://en.wikipedia.org/wiki/Simpson%27s_paradox",
        "https://en.wikipedia.org/wiki/Friendship_paradox",
        "https://en.wikipedia.org/wiki/Condorcet_paradox",
        "https://en.wikipedia.org/wiki/Paradox_of_value",
        "https://en.wikipedia.org/wiki/Ship_of_Theseus"
       ]

def _fetch_text(url):
  """Download `url` and return its lower-cased text with the HTML markup removed."""
  # The context manager closes the HTTP response even if reading fails.
  with urlopen(url) as response:
    html = response.read()
  return BeautifulSoup(html, "lxml").text.lower()

# Read the documents and strip the HTML markup.
texts = [_fetch_text(url) for url in urls]

# An ad-hoc stopword list ("the" is listed twice, which is harmless for membership tests).
stoplist = ["also", "could", "p", "pp", "th", "however", "one", "two", "many", "i", "de", "la", "me", "my", "myself", "the", "we", "our", "ours", "ourselves", "you", "your", "yours", "yourself", "yourselves", "he", "him", "his", "himself", "she", "her", "hers", "herself", "it", "its", "itself", "they", "them", "their", "theirs", "themselves", "what", "which", "who", "whom", "this", "that", "these", "those", "am", "is", "are", "was", "were", "be", "been", "being", "have", "has", "had", "having", "do", "does", "did", "doing", "a", "an", "the", "and", "but", "if", "or", "because", "as", "until", "while", "of", "at", "by", "for", "with", "about", "against", "between", "into", "through", "during", "before", "after", "above", "below", "to", "from", "up", "down", "in", "out", "on", "off", "over", "under", "again", "further", "then", "once", "here", "there", "when", "where", "why", "how", "all", "any", "both", "each", "few", "more", "most", "other", "some", "such", "no", "nor", "not", "only", "own", "same", "so", "than", "too", "very", "s", "t", "can", "will", "just", "don", "should", "now"]

##### 2.  Extracción y construcción de bag of words

In [None]:
# We abstract away the real indexing details and use a very simplified text handling.

# Membership tests against a set are O(1); the original list would be scanned for every token.
_stopwords = set(stoplist)
# A token is either a run of letters or a run of digits.
_token_pattern = re.compile(r"[^\W\d_]+|\d+")

# Frequency vector for every document in the collection, built with collections.Counter.
# Each document text is split into words and Counter produces a word:frequency dictionary.
# The result is a dictionary url -> word -> count (a so-called "forward index").
freqvector = {
    url: Counter(word for word in _token_pattern.findall(text) if word not in _stopwords)
    for url, text in zip(urls, texts)
}

# Vocabulary: the set of all words that appear in the documents of the collection.
vocabulary = set()
for counts in freqvector.values(): vocabulary.update(counts)

# A single pass over the forward index accumulates both statistics:
#  - document frequency: number of documents that contain each word;
#  - total frequency: total number of occurrences of each word in the collection.
_docfreqs = Counter()
_wordfreqs = Counter()
for counts in freqvector.values():
  _docfreqs.update(counts.keys())
  _wordfreqs.update(counts)

# Plain dicts, so that looking up an unknown word still raises KeyError as before.
docfreqs = dict(_docfreqs)
wordfreqs = dict(_wordfreqs)

##### 3.  Consultas de prueba

In [None]:
# Run three test queries through every ranker.
test_queries = [['descartes', 'montesquieu'], ['thought', 'experiment', 'identity'], ['market', 'paradox']]
for q in test_queries:
  print('\n------------------------------')
  print('Query:', q)
  # Rankers are rebuilt for each query and paired with the heading printed above their results.
  rankers = (
      ('\nModelo vectorial', VSM(freqvector, docfreqs)),
      ('\nBM25', BM25(freqvector, docfreqs,  b=0.5, k=1)),
      # mu could be tuned to change how much the presence of a query word in a document matters.
      ('\nQuery likelihood + Dirichlet', QLD(freqvector, wordfreqs, mu=100)),
  )
  for heading, ranker in rankers:
    print(heading)
    for url, score in ranker.search(q):
      print(score, url)


------------------------------
Query: ['descartes', 'montesquieu']

Modelo vectorial
0.0446170285368405 https://en.wikipedia.org/wiki/Rationalism
0.02937428899853574 https://en.wikipedia.org/wiki/Age_of_Enlightenment
0.018109133380976798 https://en.wikipedia.org/wiki/Scientific_Revolution
0.01715528918441345 https://en.wikipedia.org/wiki/Ship_of_Theseus
0.011727729675312673 https://en.wikipedia.org/wiki/French_Revolution

BM25
7.948698678858268 https://en.wikipedia.org/wiki/Age_of_Enlightenment
3.645918143325567 https://en.wikipedia.org/wiki/French_Revolution
3.6370779842800314 https://en.wikipedia.org/wiki/Rationalism
3.0861571954139713 https://en.wikipedia.org/wiki/Scientific_Revolution
2.232796011063497 https://en.wikipedia.org/wiki/Ship_of_Theseus
0.0 https://en.wikipedia.org/wiki/Winner%27s_curse
0.0 https://en.wikipedia.org/wiki/Simpson%27s_paradox
0.0 https://en.wikipedia.org/wiki/Friendship_paradox
0.0 https://en.wikipedia.org/wiki/Condorcet_paradox
0.0 https://en.wikipedia.or

# 2. Búsqueda con funciones no supervisadas implementadas en la librería PyTerrier.

In [None]:
# Utilizamos la librería de motor de búsqueda PyTerrier.
!pip install python-terrier
!pip install ir-measures

import pyterrier as pt
from pyterrier.measures import *
# Initialise PyTerrier only when it has not been started yet, so the cell can be re-run.
if not pt.started():
  pt.init()
# 'ERROR' is handed to pt.logging; presumably this limits the log output to errors.
pt.logging('ERROR')

Collecting python-terrier
  Downloading python-terrier-0.10.0.tar.gz (107 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m107.6/107.6 kB[0m [31m2.7 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting wget (from python-terrier)
  Downloading wget-3.2.zip (10 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting pyjnius>=1.4.2 (from python-terrier)
  Downloading pyjnius-1.6.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.5 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.5/1.5 MB[0m [31m10.5 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting matchpy (from python-terrier)
  Downloading matchpy-0.5.5-py3-none-any.whl (69 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m69.6/69.6 kB[0m [31m5.1 MB/s[0m eta [36m0:00:00[0m
Collecting deprecated (from python-terrier)
  Downloading Deprecated-1.2.14-py2.py3-none-any.whl (9.6 kB)
Collecting chest (from pyt

PyTerrier 0.10.0 has loaded Terrier 5.8 (built by craigm on 2023-11-01 18:05) and terrier-helper 0.0.8



### Ejemplo

In [None]:
def printsearch(name, model, q):
  """Print `name` as a heading followed by the ranking `model` returns for query `q`.

  Only the 'score' and 'docno' columns of the result dataframe are shown,
  without the index column.
  """
  heading = '\n' + name
  print(heading)
  results = model.search(q)
  # Keep just the two columns of interest, in this order.
  table = results[['score', 'docno']]
  print(table.to_string(index=False))

def eval(names, models, queries, qrels, metrics, sort=None, baseline=None):
  """Run `models` over `queries`, compute `metrics` against `qrels` and print the table.

  names:    labels for the rankers, in the same order as `models`.
  sort:     metric whose column (named str(sort)) orders the table, best first.
            When omitted (None or an empty list/tuple) the table is printed in
            the order produced by pt.Experiment.
  baseline: index of the ranker the others are compared against; when given,
            pt.Experiment adds p-values and the number of queries won/lost.

  NOTE: the name shadows the builtin eval(); it is kept because other cells call it.
  """
  # pt.Experiment runs the rankers over a batch of queries and computes the metrics.
  results = pt.Experiment(models, queries, qrels, metrics, names, baseline=baseline)
  # With no sort metric there is no column to sort by; the old default ([]) made
  # sort_values look for a column literally named '[]' and fail.
  no_sort = sort is None or (isinstance(sort, (list, tuple)) and not sort)
  if not no_sort:
    results = results.sort_values(str(sort), ascending=False)
  print(results.to_string(index=False))

# Load the "Vaswani" collection that ships with PyTerrier.
dataset = pt.get_dataset('vaswani')
# Alternative: reuse the pre-built index with pt.IndexFactory.of(dataset.get_index()).
# Here the corpus is indexed locally under ./index instead.
indexer = pt.IterDictIndexer('./index')
index_ref = indexer.index(dataset.get_corpus_iter())
index = pt.IndexFactory.of(index_ref)
queries = dataset.get_topics()
qrels = dataset.get_qrels()

# Pick an arbitrary query of the dataset to try the rankers on.
q = queries.loc[queries.qid == '5', 'query'].values[0]

# Build the unsupervised rankers: VSM, BM25, QLD and two DFR models.
vsm, bm25, qld, pl2, dph = [
    pt.BatchRetrieve(index, wmodel=wmodel)
    for wmodel in ('TF_IDF', 'BM25', 'DirichletLM', 'PL2', 'DPH')
]
print('Query:', q)
# The PyTerrier "%n" operator asks the model for just the top n of the ranking.
for title, ranker in (('Modelo vectorial', vsm), ('BM25', bm25), ('Query likelihood Dirichlet', qld)):
  printsearch(title, ranker%10, q)

# Compute several metrics.
ranker_names = ['Modelo vectorial', 'BM25', 'Query likelihood Dirichlet', 'DFR Poisson Laplace', 'DFR DPH']
top50 = [ranker%50 for ranker in (vsm, bm25, qld, pl2, dph)]
eval(ranker_names, top50, queries, qrels, [P@10, nDCG, nDCG@10, RR], baseline=0, sort=nDCG@10)

Downloading vaswani corpus to /root/.pyterrier/corpora/vaswani/corpus


doc-text.trec:   0%|          | 0.00/0.99M [00:00<?, ?iB/s]

Downloading vaswani topics to /root/.pyterrier/corpora/vaswani/query-text.trec


query-text.trec:   0%|          | 0.00/3.05k [00:00<?, ?iB/s]

Downloading vaswani qrels to /root/.pyterrier/corpora/vaswani/qrels


qrels:   0%|          | 0.00/6.63k [00:00<?, ?iB/s]

Query: use of programs in engineering testing of computers

Modelo vectorial
    score docno
14.442821  1586
12.833733 11429
10.506225  7875
 9.889541  3559
 9.809790  2290
 9.737880  5130
 9.486759  4307
 9.451995  4308
 9.403773  9165
 9.344073 10156

BM25
    score docno
26.192890  1586
23.252472 11429
18.996809  7875
18.066157  3559
17.749907  2290
17.594194  5130
17.129366  4307
17.090606  4308
17.045979  9165
16.895468 10156

Query likelihood Dirichlet
   score docno
3.305896  1586
3.287824  2373
3.002635 11429
2.937968  3559
2.570817  4307
2.495430  4709
2.434525   276
2.379995  5538
2.292910  7875
2.283731  4308
                      name       RR     P@10     nDCG  nDCG@10  RR +  RR -  RR p-value  P@10 +  P@10 -  P@10 p-value  nDCG +  nDCG -  nDCG p-value  nDCG@10 +  nDCG@10 -  nDCG@10 p-value
                      BM25 0.725126 0.352688 0.442753 0.446609  13.0   3.0    0.026092     2.0     8.0  5.734926e-02    38.0    41.0  2.942729e-01       16.0       23.0     6.300102e-01


### Ejercicio: probar más modelos no supervisados y métricas de PyTerrier

La lista completa de rankers de PyTerrier está disponible en http://terrier.org/docs/current/javadoc/org/terrier/matching/models/package-summary.html.

La lista de métricas en https://pyterrier.readthedocs.io/en/latest/experiments.html.

Hemos probado los modelos BB2, DFR_BM25, DFRee, DLH13, IFB2 y hemos añadido las métricas MAP (Mean Average Precision), la precisión para los 5 primeros resultados (P@5), el normalized Discounted Cumulative Gain para los 5 primeros resultados (nDCG@5), el recall para los 10 primeros (R@10), el recall para los 5 primeros (R@5) y el número de documentos relevantes recuperados (NumRelRet)

In [None]:
# Additional Terrier weighting models; each one is labelled with its own wmodel name.
extra_rankers = {
    wmodel: pt.BatchRetrieve(index, wmodel=wmodel)
    for wmodel in ('BB2', 'DFR_BM25', 'DFRee', 'DLH13', 'IFB2')
}
bb2 = extra_rankers['BB2']
dfr_bm25 = extra_rankers['DFR_BM25']
dfree = extra_rankers['DFRee']
dlh13 = extra_rankers['DLH13']
ifb2 = extra_rankers['IFB2']

# Rankers from the example first, then the new ones; the first ranker is the baseline.
all_names = ['Modelo vectorial', 'BM25', 'Query likelihood Dirichlet', 'DFR Poisson Laplace', 'DFR DPH']
all_names += list(extra_rankers)
all_models = [ranker%50 for ranker in (vsm, bm25, qld, pl2, dph, bb2, dfr_bm25, dfree, dlh13, ifb2)]
eval(all_names, all_models,
     queries, qrels, [P@10, nDCG, nDCG@10, RR, AP, NumRelRet,R@10, R@5], baseline=0, sort=nDCG@10)

                      name  NumRet(rel=1)       AP       RR     P@10      R@5     R@10     nDCG  nDCG@10  NumRet(rel=1) +  NumRet(rel=1) -  NumRet(rel=1) p-value  AP +  AP -   AP p-value  RR +  RR -  RR p-value  P@10 +  P@10 -  P@10 p-value  R@5 +  R@5 -  R@5 p-value  R@10 +  R@10 -  R@10 p-value  nDCG +  nDCG -  nDCG p-value  nDCG@10 +  nDCG@10 -  nDCG@10 p-value
                       BB2       9.709677 0.254397 0.716245 0.380645 0.162723 0.226286 0.450609 0.463713             30.0             12.0           4.336078e-03  59.0  25.0 7.814678e-04  12.0   9.0    0.272563    23.0     6.0  2.154786e-03   13.0   10.0 9.377337e-01    23.0     6.0      0.218458    60.0    24.0  8.166929e-03       44.0       24.0     1.960054e-03
                      IFB2       9.559140 0.253610 0.708480 0.376344 0.165677 0.223418 0.448382 0.458562             24.0             14.0           3.898300e-02  56.0  28.0 1.687820e-04  12.0   7.0    0.520833    19.0     6.0  1.064146e-02    9.0   10.0 4.461871e-0

# 3. Learning to rank con características de texto.

### Ejemplo

In [None]:
import numpy as np
import lightgbm as lgb

# Feature vector definition; BM25 is the "first stage ranking" that filters the candidates.
ltr_features = ['SAMPLE', 'WMODEL:DirichletLM', 'WMODEL:PL2']
fsr = pt.FeaturesBatchRetrieve(index, controls = {'wmodel': 'BM25'}, features=ltr_features)

# The supervised model is the GBDT LambdaMART implemented in LightGBM.
lmart_params = dict(
    task='train',
    min_data_in_leaf=2,
    max_depth=4,
    num_leaves=2**4,
    objective='lambdarank',
    metric='ndcg',
    importance_type='gain',
    reg_lambda=0.01,
    n_estimators=10,
    verbose=-1,
    random_state=0, # For reproducibility
)
lmart = lgb.LGBMRanker(**lmart_params)

# Plug the candidate filter, with the feature vector it produces, into the supervised model.
ltr = fsr >> pt.ltr.apply_learned_model(lmart, form='ltr')

# Partition the data (the queries) into training (60%), validation (20%) and test (20%).
np.random.seed(0) # For reproducibility
n_queries = len(queries)
split_points = [int(.6*n_queries), int(.8*n_queries)]
train, validation, test = np.split(queries, split_points)

# Train with the relevance judgements (qrels). The qrels also cover the test queries, but
# fit is only given the training and validation queries.
ltr.fit(train, qrels, validation, qrels)

# Run the trained model on one query and look at its output.
printsearch('BM25 + LambdaMART', ltr%10, q)

# Evaluate and compare on the test queries.
compared_names = ['Modelo vectorial', 'BM25', 'Query likelihood Dirichlet', 'DFR Poisson Laplace', 'DFR DPH', 'BM25 + LambdaMART']
compared_models = [ranker%50 for ranker in (vsm, bm25, qld, pl2, dph, ltr)]
eval(compared_names, compared_models,
     test, qrels, [P@10, nDCG, nDCG@10, RR], sort=nDCG@10, baseline=0)


BM25 + LambdaMART
   score docno
1.314159  1586
0.971874 11429
0.947643  7875
0.947643  3559
0.947643  2290
0.947643  5130
0.819120  2511
0.819120  2927
0.819120  2675
0.790350  3039
                      name       RR     P@10     nDCG  nDCG@10  RR +  RR -  RR p-value  P@10 +  P@10 -  P@10 p-value  nDCG +  nDCG -  nDCG p-value  nDCG@10 +  nDCG@10 -  nDCG@10 p-value
                      BM25 0.595052 0.252632 0.347720 0.306262   2.0   1.0    0.606020     1.0     0.0      0.330565     7.0     8.0      0.865871        3.0        4.0         0.569843
          Modelo vectorial 0.596739 0.247368 0.347220 0.304047   NaN   NaN         NaN     NaN     NaN           NaN     NaN     NaN           NaN        NaN        NaN              NaN
         BM25 + LambdaMART 0.548089 0.263158 0.328672 0.297730   4.0   6.0    0.392923     6.0     2.0      0.380004     4.0    13.0      0.067863        7.0        8.0         0.716729
       DFR Poisson Laplace 0.568381 0.242105 0.329377 0.296378   3.0   2

### Ejercicio: variaciones en learning to rank.

Explorar la configuración de parámetros de LambdaMART.

Probar otros métodos learning to rank de Terrier.

Opcional: probar otras características funcionales.

Intentar conseguir al menos una solución que mejore a todas las del ejemplo anterior.

In [None]:
# Ejemplo de características funcionales custom.
import re
def _features(row):
  """Return two custom features for the document in `row`.

  The document text is read from the 'text' entry of the index metadata using
  row['docid']. The features are its length in characters and its number of
  tokens (runs of letters or runs of digits).
  """
  text = index.getMetaIndex().getItem('text', row['docid'])
  n_chars = len(text)
  n_tokens = sum(1 for _ in re.finditer(r"[^\W\d_]+|\d+", text))
  return np.array([n_chars, n_tokens])

# Terrier weighting-model features combined (**) with the custom document features.
text_features = pt.FeaturesBatchRetrieve(index, ['SAMPLE', 'WMODEL:DirichletLM', 'WMODEL:PL2'])
custom_features = pt.apply.doc_features(_features)
# ** binds tighter than >>, so the feature union is what receives the BM25 candidates.
extended_fsr = bm25 >> (text_features ** custom_features)
extended_ltr = extended_fsr >> pt.ltr.apply_learned_model(lmart, form="ltr")
extended_ltr.fit(train, qrels, validation, qrels)
printsearch('Extended BM25 + LambdaMART', extended_ltr%10, q)
eval(['Extended BM25 + LambdaMART'], [extended_ltr%50],
     test, qrels, [P@10, nDCG, nDCG@10, RR], sort=nDCG@10)



Extended BM25 + LambdaMART
   score docno
1.116621  1586
1.041099 11429
0.847516  7875
0.822619  3559
0.822619  2290
0.822619  5130
0.741470  9165
0.683661  2511
0.683661  2927
0.683661  2675
                      name     P@10     nDCG  nDCG@10       RR
Extended BM25 + LambdaMART 0.247368 0.323922  0.28667 0.527251


Para la exploración de parámetros vamos a variar los siguientes atributos:
- num_leaves
- max_depth
- n_estimators
- learning_rate

In [None]:
# Hyper-parameter exploration: number of leaves per tree.
# Partition the data (the queries) into training (60%), validation (20%) and test (20%).
np.random.seed(0) # For reproducibility
n_queries = len(queries)
train, validation, test = np.split(queries, [int(.6*n_queries), int(.8*n_queries)])
num_leaves = [2, 3, 5, 8, 13, 21, 34, 55, 89, 144, 233, 377, 610]
names, models = [], []
for leaves in num_leaves:
  # BM25 provides the first-stage candidates.
  fsr = pt.FeaturesBatchRetrieve(index, controls = {'wmodel': 'BM25'}, features=['SAMPLE', 'WMODEL:DirichletLM', 'WMODEL:PL2'])
  # Base model; only num_leaves changes between iterations.
  lmart_params = dict(
      task='train',
      min_data_in_leaf=2,
      max_depth=4,
      num_leaves=leaves,
      objective='lambdarank',
      metric='ndcg',
      importance_type='gain',
      reg_lambda=0.01,
      n_estimators=10,
      verbose=-1,
      random_state=0, # For reproducibility
  )
  lmart = lgb.LGBMRanker(**lmart_params)
  ltr = fsr >> pt.ltr.apply_learned_model(lmart, form='ltr')
  ltr.fit(train, qrels, validation, qrels)
  names.append('leafs ' + str(leaves))
  models.append(ltr%50)
eval(names, models, test, qrels, [P@10, nDCG, nDCG@10, RR], sort=nDCG@10)

     name     P@10     nDCG  nDCG@10       RR
 leafs 21 0.263158 0.328672 0.297730 0.548089
 leafs 34 0.263158 0.328672 0.297730 0.548089
 leafs 55 0.263158 0.328672 0.297730 0.548089
 leafs 89 0.263158 0.328672 0.297730 0.548089
leafs 144 0.263158 0.328672 0.297730 0.548089
leafs 233 0.263158 0.328672 0.297730 0.548089
leafs 377 0.263158 0.328672 0.297730 0.548089
leafs 610 0.263158 0.328672 0.297730 0.548089
  leafs 3 0.263158 0.332836 0.293776 0.530117
 leafs 13 0.263158 0.329922 0.293629 0.517126
  leafs 5 0.257895 0.327597 0.291187 0.511160
  leafs 8 0.257895 0.319955 0.288054 0.513264
  leafs 2 0.247368 0.297557 0.238358 0.341352


Observamos que la nDCG es bastante similar entre todos los modelos. Parece que el número de hojas por árbol no es un parámetro que afecte mucho al rendimiento, salvo cuando es muy pequeño: con 2 hojas la nDCG@10 cae a 0.238.

In [None]:
# Hyper-parameter exploration: maximum tree depth.
# Partition the data (the queries) into training (60%), validation (20%) and test (20%).
np.random.seed(0) # For reproducibility
n_queries = len(queries)
train, validation, test = np.split(queries, [int(.6*n_queries), int(.8*n_queries)])
max_depth = [0, 1, 2, 3, 5, 8, 13, 21, 34, 55, 89, 144, 233, 377, 610]
names, models = [], []
for depth in max_depth:
  # BM25 provides the first-stage candidates.
  fsr = pt.FeaturesBatchRetrieve(index, controls = {'wmodel': 'BM25'}, features=['SAMPLE', 'WMODEL:DirichletLM', 'WMODEL:PL2'])
  # Base model; only max_depth changes between iterations.
  lmart_params = dict(
      task='train',
      min_data_in_leaf=2,
      max_depth=depth,
      num_leaves=2**4,
      objective='lambdarank',
      metric='ndcg',
      importance_type='gain',
      reg_lambda=0.01,
      n_estimators=10,
      verbose=-1,
      random_state=0, # For reproducibility
  )
  lmart = lgb.LGBMRanker(**lmart_params)
  ltr = fsr >> pt.ltr.apply_learned_model(lmart, form='ltr')
  ltr.fit(train, qrels, validation, qrels)
  names.append('depth ' + str(depth))
  models.append(ltr%50)
eval(names, models, test, qrels, [P@10, nDCG, nDCG@10, RR], sort=nDCG@10)

     name     P@10     nDCG  nDCG@10       RR
  depth 2 0.263158 0.327876 0.293226 0.524100
  depth 3 0.257895 0.323235 0.288000 0.518774
  depth 8 0.247368 0.302079 0.281040 0.499434
  depth 0 0.236842 0.316500 0.279310 0.533117
 depth 13 0.236842 0.316500 0.279310 0.533117
 depth 21 0.236842 0.316500 0.279310 0.533117
 depth 34 0.236842 0.316500 0.279310 0.533117
 depth 55 0.236842 0.316500 0.279310 0.533117
 depth 89 0.236842 0.316500 0.279310 0.533117
depth 144 0.236842 0.316500 0.279310 0.533117
depth 233 0.236842 0.316500 0.279310 0.533117
depth 377 0.236842 0.316500 0.279310 0.533117
depth 610 0.236842 0.316500 0.279310 0.533117
  depth 5 0.236842 0.322978 0.278434 0.543915
  depth 1 0.247368 0.297557 0.238358 0.341352


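Observamos que aumentar la profundidad de los árboles más allá de 2 resulta perjudicial para el rendimiento del modelo; con profundidad 1, en cambio, el modelo es demasiado simple y obtiene el peor resultado.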

In [None]:
# Hyper-parameter exploration: number of estimators (boosted trees).
# Partition the data (the queries) into training (60%), validation (20%) and test (20%).
np.random.seed(0) # For reproducibility
n_queries = len(queries)
train, validation, test = np.split(queries, [int(.6*n_queries), int(.8*n_queries)])
n_estimators = [20, 50, 100, 200, 500, 1000]
names, models = [], []
for n_trees in n_estimators:
  # BM25 provides the first-stage candidates.
  fsr = pt.FeaturesBatchRetrieve(index, controls = {'wmodel': 'BM25'}, features=['SAMPLE', 'WMODEL:DirichletLM', 'WMODEL:PL2'])
  # Base model; only n_estimators changes between iterations.
  lmart_params = dict(
      task='train',
      min_data_in_leaf=2,
      max_depth=4,
      num_leaves=2**4,
      objective='lambdarank',
      metric='ndcg',
      importance_type='gain',
      reg_lambda=0.01,
      n_estimators=n_trees,
      verbose=-1,
      random_state=0, # For reproducibility
  )
  lmart = lgb.LGBMRanker(**lmart_params)
  ltr = fsr >> pt.ltr.apply_learned_model(lmart, form='ltr')
  ltr.fit(train, qrels, validation, qrels)
  names.append('n_est ' + str(n_trees))
  models.append(ltr%50)
eval(names, models, test, qrels, [P@10, nDCG, nDCG@10, RR], sort=nDCG@10)

      name     P@10     nDCG  nDCG@10       RR
  n_est 20 0.257895 0.325740 0.285309 0.522974
 n_est 100 0.252632 0.315988 0.266970 0.441862
  n_est 50 0.231579 0.315387 0.255714 0.455963
 n_est 200 0.210526 0.307483 0.240431 0.457506
 n_est 500 0.200000 0.288669 0.222714 0.425666
n_est 1000 0.152632 0.265119 0.192050 0.437955


Aumentar el número de estimadores tampoco mejora el rendimiento; de hecho lo empeora progresivamente: la nDCG@10 baja de 0.285 con 20 estimadores a 0.192 con 1000.

In [None]:
# Hyper-parameter exploration: learning rate.
# Partition the data (the queries) into training (60%), validation (20%) and test (20%).
np.random.seed(0) # For reproducibility
n_queries = len(queries)
train, validation, test = np.split(queries, [int(.6*n_queries), int(.8*n_queries)])
names, models = [], []
learning_rate = [0.01, 0.02 ,0.05, 0.1, 0.2]
for rate in learning_rate:
  # BM25 provides the first-stage candidates.
  fsr = pt.FeaturesBatchRetrieve(index, controls = {'wmodel': 'BM25'}, features=['SAMPLE', 'WMODEL:DirichletLM', 'WMODEL:PL2'])
  # Base model; only learning_rate changes between iterations.
  lmart_params = dict(
      task='train',
      min_data_in_leaf=2,
      max_depth=4,
      num_leaves=2**4,
      objective='lambdarank',
      metric='ndcg',
      importance_type='gain',
      reg_lambda=0.01,
      n_estimators=10,
      verbose=-1,
      learning_rate=rate,
      random_state=0, # For reproducibility
  )
  lmart = lgb.LGBMRanker(**lmart_params)
  ltr = fsr >> pt.ltr.apply_learned_model(lmart, form='ltr')
  ltr.fit(train, qrels, validation, qrels)
  names.append('lr ' + str(rate))
  models.append(ltr%50)
eval(names, models, test, qrels, [P@10, nDCG, nDCG@10, RR], sort=nDCG@10)

   name     P@10     nDCG  nDCG@10       RR
 lr 0.1 0.263158 0.328672 0.297730 0.548089
lr 0.05 0.263158 0.331144 0.295564 0.524123
lr 0.02 0.257895 0.320864 0.288798 0.519593
lr 0.01 0.247368 0.326757 0.282578 0.516538
 lr 0.2 0.231579 0.330366 0.275283 0.521553


Los valores intermedios del learning rate (0.05 y 0.1) obtienen la mejor nDCG@10; tanto los valores más pequeños (0.01 y 0.02) como el más grande (0.2) empeoran el rendimiento.

Terminamos haciendo un grid de un subconjunto de todos los hiperparámetros que hemos analizado.

In [None]:
# Hyper-parameter exploration: grid over a subset of the values analysed so far.
# Partition the data (the queries) into training (60%), validation (20%) and test (20%).
np.random.seed(0) # For reproducibility
n_queries = len(queries)
train, validation, test = np.split(queries, [int(.6*n_queries), int(.8*n_queries)])
num_leaves = [5, 8,10]
max_depth = [2, 3, 5]
n_estimators = [20, 50, 100]
learning_rate = [0.01, 0.05, 0.1]
# Every combination, with num_leaves varying slowest and learning_rate fastest.
grid = [(l, m, n, lr)
        for l in num_leaves
        for m in max_depth
        for n in n_estimators
        for lr in learning_rate]
names, models = [], []
for l, m, n, lr in grid:
  # BM25 provides the first-stage candidates.
  fsr = pt.FeaturesBatchRetrieve(index, controls = {'wmodel': 'BM25'}, features=['SAMPLE', 'WMODEL:DirichletLM', 'WMODEL:PL2'])
  # Base model with the four explored hyper-parameters plugged in.
  lmart_params = dict(
      task='train',
      min_data_in_leaf=2,
      max_depth=m,
      num_leaves=l,
      objective='lambdarank',
      metric='ndcg',
      importance_type='gain',
      reg_lambda=0.01,
      n_estimators=n,
      verbose=-1,
      learning_rate=lr,
      random_state=0, # For reproducibility
  )
  lmart = lgb.LGBMRanker(**lmart_params)
  ltr = fsr >> pt.ltr.apply_learned_model(lmart, form='ltr')
  ltr.fit(train, qrels, validation, qrels)
  names.append(f'l{l} m{m} n{n} lr{lr}')
  models.append(ltr%50)
eval(names, models, test, qrels, [P@10, nDCG, nDCG@10, RR], sort=nDCG@10)

                  name     P@10     nDCG  nDCG@10       RR
l 10 m 2 n 100 lr 0.05 0.268421 0.333629 0.310755 0.572830
 l 8 m 2 n 100 lr 0.05 0.268421 0.333629 0.310755 0.572830
 l 5 m 2 n 100 lr 0.05 0.268421 0.333629 0.310755 0.572830
  l 10 m 2 n 50 lr 0.1 0.268421 0.330636 0.306130 0.544838
   l 8 m 2 n 50 lr 0.1 0.268421 0.330636 0.306130 0.544838
   l 5 m 2 n 50 lr 0.1 0.268421 0.330636 0.306130 0.544838
 l 10 m 5 n 20 lr 0.01 0.263158 0.324123 0.304445 0.593837
  l 5 m 2 n 50 lr 0.05 0.263158 0.335012 0.301662 0.546159
  l 8 m 2 n 50 lr 0.05 0.263158 0.335012 0.301662 0.546159
 l 10 m 2 n 50 lr 0.05 0.263158 0.335012 0.301662 0.546159
  l 8 m 2 n 20 lr 0.05 0.273684 0.329880 0.300790 0.529785
 l 10 m 2 n 20 lr 0.05 0.273684 0.329880 0.300790 0.529785
  l 5 m 2 n 20 lr 0.05 0.273684 0.329880 0.300790 0.529785
  l 10 m 2 n 20 lr 0.1 0.263158 0.334098 0.300189 0.546159
   l 5 m 2 n 20 lr 0.1 0.263158 0.334098 0.300189 0.546159
   l 8 m 2 n 20 lr 0.1 0.263158 0.334098 0.300189 0.5461

En este caso hemos obtenido un modelo mejor que el BM25 + LambdaMART y superior a BM25 en la métrica nDCG@10.

Para usar otros métodos de learning to rank vamos a usar la colección de modelos de sklearn y otros predictores como baseline.

In [None]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.svm import SVR

def _fit_bm25_regressor(regressor):
  """Fit `regressor` as a re-ranker of BM25 candidates; return (first stage, fitted pipeline)."""
  first_stage = pt.FeaturesBatchRetrieve(index, controls = {'wmodel': 'BM25'}, features=['SAMPLE', 'WMODEL:DirichletLM', 'WMODEL:PL2'])
  pipeline = first_stage >> pt.ltr.apply_learned_model(regressor)
  pipeline.fit(train, qrels, validation, qrels)
  return first_stage, pipeline

# Random forest
rf = RandomForestRegressor(n_estimators=200)
fsr, rf_pipe = _fit_bm25_regressor(rf)

# SVM with a Gaussian (RBF) kernel
svr_rbf = SVR(kernel="rbf", gamma=0.1, epsilon=0.1)
fsr, rbf_pipe = _fit_bm25_regressor(svr_rbf)

# SVM with a linear kernel
svr_lin = SVR(kernel="linear", gamma="auto")
fsr, lin_pipe = _fit_bm25_regressor(svr_lin)

# `ltr` is whatever LambdaMART pipeline was trained last in the previous cells.
bm25_names = ['BM25','BM25 + LambdaMART' ,'BM25 + Random Forest', 'BM25 + SVR rbf', 'BM25 + SVR lin']
bm25_models = [ranker%50 for ranker in (bm25, ltr, rf_pipe, rbf_pipe, lin_pipe)]
eval(bm25_names, bm25_models,
     test, qrels, [P@10, nDCG, nDCG@10, RR], sort=nDCG@10, baseline=0)

                name       RR     P@10     nDCG  nDCG@10  RR +  RR -  RR p-value  P@10 +  P@10 -  P@10 p-value  nDCG +  nDCG -  nDCG p-value  nDCG@10 +  nDCG@10 -  nDCG@10 p-value
                BM25 0.595052 0.252632 0.347720 0.306262   NaN   NaN         NaN     NaN     NaN           NaN     NaN     NaN           NaN        NaN        NaN              NaN
      BM25 + SVR lin 0.593235 0.247368 0.339905 0.303418   2.0   2.0    0.519599     0.0     1.0      0.330565     5.0    11.0      0.070252        6.0        4.0         0.437400
   BM25 + LambdaMART 0.548089 0.263158 0.328672 0.297730   4.0   5.0    0.407554     5.0     2.0      0.541631     5.0    12.0      0.063186        8.0        7.0         0.631685
BM25 + Random Forest 0.444523 0.205263 0.254686 0.241619   3.0   8.0    0.067365     5.0     8.0      0.119603     3.0    15.0      0.001426        5.0       10.0         0.093339
      BM25 + SVR rbf 0.434068 0.173684 0.167361 0.211559   5.0   6.0    0.062023     2.0     9.0    

In [None]:
#Probamos con BB2

def _fit_bb2_regressor(regressor):
  """Fit `regressor` as a re-ranker of BB2 candidates; return (first stage, fitted pipeline)."""
  first_stage = pt.FeaturesBatchRetrieve(index, controls = {'wmodel': 'BB2'}, features=['SAMPLE', 'WMODEL:BM25' , 'WMODEL:DirichletLM', 'WMODEL:PL2'])
  pipeline = first_stage >> pt.ltr.apply_learned_model(regressor)
  pipeline.fit(train, qrels, validation, qrels)
  return first_stage, pipeline

# Random forest
rf = RandomForestRegressor(n_estimators=200)
fsr, rf_pipe = _fit_bb2_regressor(rf)

# SVM with a Gaussian (RBF) kernel
svr_rbf = SVR(kernel="rbf", gamma=0.1, epsilon=0.1)
fsr, rbf_pipe = _fit_bb2_regressor(svr_rbf)

# SVM with a linear kernel
svr_lin = SVR(kernel="linear", gamma="auto")
fsr, lin_pipe = _fit_bb2_regressor(svr_lin)

# NOTE(review): `ltr` is not retrained in this cell; in the cells shown it is built on BM25
# candidates, so the 'BB2 + LambdaMART' label looks inaccurate - confirm which model is meant.
bb2_names = ['BB2','BB2 + LambdaMART' ,'BB2 + Random Forest', 'BB2 + SVR rbf', 'BB2 + SVR lin']
bb2_models = [ranker%50 for ranker in (bb2, ltr, rf_pipe, rbf_pipe, lin_pipe)]
eval(bb2_names, bb2_models,
     test, qrels, [P@10, nDCG, nDCG@10, RR], sort=nDCG@10, baseline=0)

               name       RR     P@10     nDCG  nDCG@10  RR +  RR -  RR p-value  P@10 +  P@10 -  P@10 p-value  nDCG +  nDCG -  nDCG p-value  nDCG@10 +  nDCG@10 -  nDCG@10 p-value
                BB2 0.595052 0.252632 0.347720 0.306262   NaN   NaN         NaN     NaN     NaN           NaN     NaN     NaN           NaN        NaN        NaN              NaN
   BB2 + LambdaMART 0.548089 0.263158 0.328672 0.297730   4.0   5.0    0.407554     5.0     2.0      0.541631     5.0    12.0      0.063186        8.0        7.0         0.631685
      BB2 + SVR lin 0.465507 0.189474 0.265397 0.226269   4.0   7.0    0.161507     2.0     9.0      0.014002     5.0    13.0      0.010527        4.0       11.0         0.023514
BB2 + Random Forest 0.364167 0.200000 0.241777 0.201543   3.0  11.0    0.044433     3.0     9.0      0.115986     3.0    14.0      0.002438        3.0       14.0         0.013263
      BB2 + SVR rbf 0.452924 0.147368 0.157309 0.194610   3.0   8.0    0.106800     1.0    10.0      0.01

Pasamos a añadir características funcionales sobre los documentos y las queries para tratar de aportar más datos a los modelos. En particular, definimos una función que elimina algunas palabras vacías de las queries y añadimos cuatro features: la longitud en caracteres de la cadena de términos del documento, su número de términos, la fracción de palabras de la query presentes en el documento y la longitud de la query. Para comprobar su rendimiento lo probamos con un modelo que combina BM25 y LambdaMART.

In [None]:
# Ejemplo de características funcionales custom.
import re

def getTerms(docid):
  """Return the terms of document ``docid`` as a single string.

  The terms are read from the direct index of the module-level ``index``:
  every posting of the document is mapped to its lexicon key. Each key is
  followed by one space, so a non-empty result ends with a trailing space.

  Args:
    docid: internal document id passed to ``getDocumentEntry``.

  Returns:
    The concatenated terms, or ``""`` when the document has no postings.
  """
  di = index.getDirectIndex()
  doi = index.getDocumentIndex()
  lex = index.getLexicon()
  postings = di.getPostings(doi.getDocumentEntry(docid))
  # NB: postings will be null if the document is empty; iterating it would raise.
  if postings is None:
    return ""
  # Build the string in one pass instead of repeated `+=` concatenation.
  return "".join(
    lex.getLexiconEntry(posting.getId()).getKey() + ' ' for posting in postings
  )

stops=set(["and", "the", "of", "use", "in", "on", "by"])
def _remove_stops(q):
    terms = q["query"].split(" ")
    terms = [t for t in terms if not t in stops ]
    return " ".join(terms)

def _features(row):
  """Compute four hand-crafted query/document features for one result row.

  Args:
    row: mapping-like row providing ``row['docid']`` and ``row['query']``.

  Returns:
    A float numpy array ``[f1, f2, f3, f4]`` where
      f1 is the character length of the document's term string (``getTerms``),
      f2 is the number of alphabetic or numeric tokens in that string,
      f3 is the fraction of distinct query words present in the document
         (0.0 when the query has no words),
      f4 is the character length of the query.
  """
  content = getTerms(row['docid'])
  myq = row['query']
  query_terms = set(myq.split())
  common = query_terms & set(content.split())
  # len() always returns an int, so f1, f2 and f4 need no NaN handling.
  f1 = len(content)
  f2 = len(re.findall(r"[^\W\d_]+|\d+", content))
  # A query with no words (e.g. after stop-word removal) would divide by zero;
  # integer division raises instead of producing NaN, so guard it explicitly.
  # The 0.0 literal keeps the returned array's dtype float, as in the normal case.
  f3 = len(common) / len(query_terms) if query_terms else 0.0
  f4 = len(myq)
  return np.array([f1, f2, f3, f4])


# Pipeline: strip stop words from the query, retrieve candidates with `bm25`, then compute features.
# `**` binds tighter than `>>`, so the FeaturesBatchRetrieve features and the custom `_features`
# are combined first and the combination is then chained after `bm25`.
extended_fsr = pt.apply.query(_remove_stops) >> bm25 >> pt.FeaturesBatchRetrieve(index, ['SAMPLE', 'WMODEL:DirichletLM', 'WMODEL:PL2']) ** pt.apply.doc_features(_features)
# Re-rank the candidates with the `lmart` learner (defined in another cell).
extended_ltr = extended_fsr >> pt.ltr.apply_learned_model(lmart, form="ltr")
extended_ltr.fit(train, qrels, validation, qrels)
# Show the top 10 results for the query `q`, then evaluate the top 50 on the test topics.
printsearch('Extended BM25 + LambdaMART', extended_ltr%10, q)
eval(['Extended BM25 + LambdaMART'], [extended_ltr%50], test, qrels, [P@10, nDCG, nDCG@10, RR], sort=nDCG@10)


Extended BM25 + LambdaMART
   score docno
2.389805  1586
2.101079 11429
0.802714  3559
0.759680  7875
0.705736  5130
0.682963  2290
0.572797  9165
0.501424  4307
0.447563  5429
0.379482  3374
                      name     P@10     nDCG  nDCG@10       RR
Extended BM25 + LambdaMART 0.268421 0.341857  0.31164 0.575856


Para acabar esta sección creamos varios modelos extendidos mediante las features y árboles LambdaMART con los mejores hiperparámetros que encontramos en la sección. A continuación, comparamos su rendimiento con los de la primera tabla del apartado.

In [None]:
import numpy as np
import lightgbm as lgb

# We define the feature vector and the weighting model used as the first filter of
# candidates ("first stage ranking").

# The learned stage is the GBDT LambdaMART model implemented in LightGBM.
def _new_lmart():
  """Return a fresh, unfitted LambdaMART ranker with this section's hyperparameters.

  Each pipeline needs its own instance: `pt.ltr.apply_learned_model` wraps the
  learner object it is given and `fit` trains that object in place, so sharing one
  ranker between two pipelines lets the second `fit` overwrite the first model.
  """
  return lgb.LGBMRanker(
      task='train',
      min_data_in_leaf=2,
      max_depth=2,
      num_leaves=100,
      objective='lambdarank',
      metric='ndcg',
      importance_type='gain',
      reg_lambda=0.01,
      n_estimators=100,
      learning_rate = 0.05,
      verbose=-1,
      random_state=0 # For reproducibility
      )


# `lmart` keeps its name for other cells; it belongs to the BM25-based pipeline.
lmart = _new_lmart()

# Extended pipeline based on BM25 + LambdaMART
extended_fsr = pt.apply.query(_remove_stops) >> bm25 >> pt.FeaturesBatchRetrieve(index, ['SAMPLE', 'WMODEL:DirichletLM', 'WMODEL:PL2', 'WMODEL:BB2']) ** pt.apply.doc_features(_features)
extended_ltr = extended_fsr >> pt.ltr.apply_learned_model(lmart, form="ltr")
extended_ltr.fit(train, qrels, validation, qrels)

# Extended pipeline based on BB2 + LambdaMART, trained on its own ranker instance
bb2 = pt.BatchRetrieve(index, wmodel='BB2')
extended_fsr_2 = pt.apply.query(_remove_stops) >> bb2 >> pt.FeaturesBatchRetrieve(index, ['SAMPLE', 'WMODEL:DirichletLM', 'WMODEL:PL2', 'WMODEL:BM25']) ** pt.apply.doc_features(_features)
lmart_2 = _new_lmart()
extended_ltr_2 = extended_fsr_2 >> pt.ltr.apply_learned_model(lmart_2, form="ltr")
extended_ltr_2.fit(train, qrels, validation, qrels)

# Evaluate and compare; baseline=0 makes the first listed system the reference.
eval(['Modelo vectorial', 'BM25', 'Query likelihood Dirichlet', 'DFR Poisson Laplace', 'DFR DPH', 'Extendido BM25 + LambdaMART', 'Extendido BB2 + LambdaMART'],
     [vsm%50, bm25%50, qld%50, pl2%50, dph%50, extended_ltr%50,extended_ltr_2%50],
     test, qrels, [P@10, nDCG, nDCG@10, RR], sort=nDCG@10, baseline=0)

                       name       RR     P@10     nDCG  nDCG@10  RR +  RR -  RR p-value  P@10 +  P@10 -  P@10 p-value  nDCG +  nDCG -  nDCG p-value  nDCG@10 +  nDCG@10 -  nDCG@10 p-value
 Extendido BB2 + LambdaMART 0.612711 0.268421 0.363847 0.318225   5.0   4.0    0.841426     5.0     2.0      0.214558    11.0     7.0      0.206751        7.0        7.0         0.398563
Extendido BM25 + LambdaMART 0.524027 0.289474 0.325914 0.311056   6.0   4.0    0.393571    10.0     3.0      0.041861     8.0    10.0      0.171089        9.0        7.0         0.772009
                       BM25 0.595052 0.252632 0.347720 0.306262   2.0   1.0    0.606020     1.0     0.0      0.330565     7.0     8.0      0.865871        3.0        4.0         0.569843
           Modelo vectorial 0.596739 0.247368 0.347220 0.304047   NaN   NaN         NaN     NaN     NaN           NaN     NaN     NaN           NaN        NaN        NaN              NaN
        DFR Poisson Laplace 0.568381 0.242105 0.329377 0.296378  

Como podemos comprobar, los dos modelos nuevos son ligeramente mejores que el resto.

# 4. Opcional: dense retrieval.

### Ejemplo: obtención y visualización de embeddings.

In [None]:
# Extracción de embeddings y embeddings pre-entrenados.
!pip install --upgrade --ignore-installed gensim
import gensim
from gensim import corpora
from gensim.utils import simple_preprocess
from gensim.models import Word2Vec
from gensim.models import KeyedVectors
import numpy as np

# Embeddings for the toy collection of exercise 1 (reusing the "texts" variable).
# Each text is tokenised with simple_preprocess and the words in `stoplist` are dropped.
words = [[word for word in simple_preprocess(text) if word not in stoplist] for text in texts]
# Train Word2Vec on the tokenised texts and keep only its word vectors (`.wv`).
model = Word2Vec(words).wv

# Download (`!` is IPython shell syntax) and load the pre-trained GoogleNews word2vec vectors.
!wget https://huggingface.co/LoganKilpatrick/GoogleNews-vectors-negative300/resolve/main/GoogleNews-vectors-negative300.bin.gz
premodel = gensim.models.KeyedVectors.load_word2vec_format('GoogleNews-vectors-negative300.bin.gz', binary=True)

# Compare the 20 nearest neighbours of "paradox" under both embeddings.
print(model.most_similar(['paradox'], topn=20))
print(premodel.most_similar(['paradox'], topn=20))

def reduce_dimensions(model):
  """Project the vectors of ``model`` to two dimensions with t-SNE.

  Args:
    model: object exposing ``vectors`` (the embedding matrix) and
      ``index_to_key`` (the label of each row).

  Returns:
    A tuple ``(x_vals, y_vals, labels)``: two lists holding the first and second
    t-SNE coordinate of every vector, and a numpy array with the labels.
  """
  from sklearn.manifold import TSNE
  # The return statement reads exactly two coordinates, so the target is 2-D.
  num_dimensions = 2
  tsne = TSNE(n_components=num_dimensions, random_state=0)
  vectors = tsne.fit_transform(np.asarray(model.vectors))
  return list(vectors[:, 0]), list(vectors[:, 1]), np.asarray(model.index_to_key)

def plot(x_vals, y_vals, labels, num_labels=25):
  """Scatter-plot 2-D points and annotate a random sample of them.

  Args:
    x_vals: x coordinate of every point.
    y_vals: y coordinate of every point.
    labels: label of every point, indexed like ``x_vals`` and ``y_vals``.
    num_labels: maximum number of points to annotate (default 25).
  """
  import matplotlib.pyplot as plt
  import random
  plt.figure(figsize=(12, 12))
  plt.scatter(x_vals, y_vals, facecolors='none', edgecolors='b', linewidth=.5, s=80, alpha=.5)
  # random.sample raises ValueError when asked for more items than the population
  # holds, so never request more annotations than there are points.
  sample_size = min(num_labels, len(labels))
  for i in random.sample(range(len(labels)), sample_size):
    plt.annotate(labels[i], (x_vals[i], y_vals[i]))

# Reduce the `model` embeddings to 2-D and draw them.
plot(*reduce_dimensions(model))

### Ejercicio: dense retrieval con embeddings.

Definir una función de ranking no supervisado basada en embeddings. Utilizar las implementaciones de gensim y la colección de juguete del ejercicio 1, u otras opciones a elección del estudiante.

La idea que se nos ha ocurrido para utilizar los embeddings en el ranking es obtener la similitud entre la query de entrada y todos los documentos, y ordenar estos resultados de mayor a menor. Tomamos como vector de embeddings de un documento o de una query la media de los embeddings de cada palabra presente en el texto. Una vez obtenidos estos vectores, los comparamos usando la similitud del coseno. Tenemos dos colecciones de embeddings con las que trabajar: la general de Google y la generada a partir de los propios textos.

Presentamos el código de esta implementación:

In [None]:
#Precalculamos el embedding medio de cada documento

def get_mean_emb(text, embeding):
  """Return the mean embedding vector of the tokens of ``text``.

  ``text`` is tokenised with ``simple_preprocess``; tokens for which
  ``embeding[token]`` raises ``KeyError`` are ignored. If no token has a vector,
  the mean is taken over an empty list, which numpy evaluates to NaN.
  """
  found = []
  for token in simple_preprocess(text):
    try:
      found.append(embeding[token])
    except KeyError:
      # Token has no vector in this embedding: leave it out of the mean.
      continue
  return np.mean(found, axis=0)

# Mean embedding of every document in `texts`, computed once per embedding:
# `model` (trained on the collection) and `premodel` (pre-trained GoogleNews vectors).
embeding_model = [get_mean_emb(text, model) for text in texts]
embeding_premodel = [get_mean_emb(text, premodel) for text in texts]

In [None]:
#Definimos la funcion de ranking dado un embedding

def rank_emb(query, embeding, mean_emb):
  """Rank the documents by cosine similarity between mean embeddings.

  Args:
    query: query text; its mean embedding is computed with ``get_mean_emb``.
    embeding: word-embedding lookup used for the query.
    mean_emb: precomputed mean embedding of every document. Entry ``i`` is paired
      with the ``i``-th key of the module-level ``freqvector``
      (assumes both follow the same document order — TODO confirm).

  Returns:
    A list of ``(url, similarity)`` tuples sorted by decreasing similarity.
    A similarity that evaluates to NaN is reported as 0.
  """
  v_q = get_mean_emb(query, embeding)
  # Build the URL list once; the original rebuilt it on every iteration.
  urls = list(freqvector)
  result = []
  for i, v_d in enumerate(mean_emb):
    csim = np.dot(v_q, v_d) / (np.linalg.norm(v_q) * np.linalg.norm(v_d))
    # NaN appears when either mean embedding is NaN or has zero norm.
    if np.isnan(np.sum(csim)):
      csim = 0
    result.append((urls[i], csim))
  result.sort(key=lambda x: x[1], reverse=True)
  return result

In [None]:
# Probamos tres consultas.
# Try three queries; each one is a list of words joined into a single string for rank_emb.
for q in [['descartes', 'montesquieu'], ['thought', 'experiment', 'identity'], ['market', 'paradox']]:
  print('\n------------------------------')
  print('Query:', q)
  print('\nEmbedding de los textos')
  # Ranking with the embeddings trained on the collection itself.
  for url, score in rank_emb(" ".join(q), model, embeding_model):
    print(score, url)
  print('\nEmbedding de Google')
  # Ranking with the pre-trained GoogleNews vectors.
  for url, score in rank_emb(" ".join(q), premodel, embeding_premodel):
    print(score, url)


------------------------------
Query: ['descartes', 'montesquieu']

Embedding de los textos
0.99974203 https://en.wikipedia.org/wiki/Condorcet_paradox
0.9997304 https://en.wikipedia.org/wiki/Age_of_Enlightenment
0.99972713 https://en.wikipedia.org/wiki/Friendship_paradox
0.9997251 https://en.wikipedia.org/wiki/Paradox_of_value
0.9997225 https://en.wikipedia.org/wiki/Simpson%27s_paradox
0.99972093 https://en.wikipedia.org/wiki/French_Revolution
0.99970716 https://en.wikipedia.org/wiki/Winner%27s_curse
0.99970347 https://en.wikipedia.org/wiki/Rationalism
0.9996895 https://en.wikipedia.org/wiki/Scientific_Revolution
0.9996765 https://en.wikipedia.org/wiki/Ship_of_Theseus

Embedding de Google
0 https://en.wikipedia.org/wiki/Age_of_Enlightenment
0 https://en.wikipedia.org/wiki/Rationalism
0 https://en.wikipedia.org/wiki/Scientific_Revolution
0 https://en.wikipedia.org/wiki/French_Revolution
0 https://en.wikipedia.org/wiki/Winner%27s_curse
0 https://en.wikipedia.org/wiki/Simpson%27s_paradox

Los resultados son coherentes con los embeddings utilizados: en el caso de los extraídos de los propios textos, las similitudes son cercanas a uno en todos los casos, y en el caso de los embeddings de Google obtenemos similitudes más bajas. En la primera query obtenemos similitudes de cero, ya que el embedding no contiene los nombres propios presentes en la consulta.

# 5. Opcional: deep learning to rank.

Aplicar un modelo de deep learning sobre alguno de los conjuntos de datos de los ejercicios anteriores. Por ejemplo, utilizar alguno de los modelos disponibles en PyTerrier.

Usamos la implementación de sklearn de redes neuronales y la aplicamos a un pipeline de PyTerrier con los datos del ejercicio 2.

In [None]:
from sklearn.neural_network import MLPRegressor

# Simple model: 1 hidden layer of 100 neurons, BM25 as first stage

regr = MLPRegressor(random_state=1, max_iter=500)
# Stop-word removal >> BM25 candidates >> (retrieval features ** custom `_features`).
pipenn1_100 = pt.apply.query(_remove_stops) >> bm25 >> pt.FeaturesBatchRetrieve(index, ['SAMPLE', 'WMODEL:DirichletLM', 'WMODEL:PL2']) ** pt.apply.doc_features(_features)
pipenn1_100 = pipenn1_100 >> pt.ltr.apply_learned_model(regr)
pipenn1_100.fit(train, qrels, validation, qrels)

# Simple model: 1 hidden layer of 100 neurons, BB2 as first stage
# `regr` is rebound to a new MLPRegressor, so this pipeline gets its own model.
regr = MLPRegressor(random_state=1, max_iter=500)
pipenn2_100 = pt.apply.query(_remove_stops) >> bb2 >> pt.FeaturesBatchRetrieve(index, ['SAMPLE', 'WMODEL:DirichletLM', 'WMODEL:PL2']) ** pt.apply.doc_features(_features)
pipenn2_100 = pipenn2_100 >> pt.ltr.apply_learned_model(regr)
pipenn2_100.fit(train, qrels, validation, qrels)

In [None]:
# Model with hidden layers of 120, 130, 200 and 100 neurons, BM25 as first stage

regr = MLPRegressor(hidden_layer_sizes = [120, 130, 200, 100],random_state=1, max_iter=500)
pipenn1_200 = pt.apply.query(_remove_stops) >> bm25 >> pt.FeaturesBatchRetrieve(index, ['SAMPLE', 'WMODEL:DirichletLM', 'WMODEL:PL2']) ** pt.apply.doc_features(_features)
pipenn1_200 = pipenn1_200 >> pt.ltr.apply_learned_model(regr)
pipenn1_200.fit(train, qrels, validation, qrels)

# Model with hidden layers of 120, 130, 200 and 100 neurons, BB2 as first stage
# `regr` is rebound to a new MLPRegressor, so this pipeline gets its own model.
regr = MLPRegressor(hidden_layer_sizes = [120, 130, 200, 100], random_state=1, max_iter=500)
pipenn2_200 = pt.apply.query(_remove_stops) >> bb2 >> pt.FeaturesBatchRetrieve(index, ['SAMPLE', 'WMODEL:DirichletLM', 'WMODEL:PL2']) ** pt.apply.doc_features(_features)
pipenn2_200 = pipenn2_200 >> pt.ltr.apply_learned_model(regr)
pipenn2_200.fit(train, qrels, validation, qrels)


In [None]:
# Evaluate and compare the unsupervised rankers with the four neural re-rankers,
# each cut off at rank 50. baseline=0 makes the first listed system the reference.
eval(['Modelo vectorial', 'BM25', 'Query likelihood Dirichlet', 'DFR Poisson Laplace', 'DFR DPH',
       'Simple NN + BM25', 'Simple NN + BB2', 'Complex NN + BM25', 'Complex NN + BB2'],
     [vsm%50, bm25%50, qld%50, pl2%50, dph%50, pipenn1_100%50, pipenn2_100%50,pipenn1_200%50,pipenn2_200%50],
     test, qrels, [P@10, nDCG, nDCG@10, RR], sort=nDCG@10, baseline=0)

                      name       RR     P@10     nDCG  nDCG@10  RR +  RR -  RR p-value  P@10 +  P@10 -  P@10 p-value  nDCG +  nDCG -  nDCG p-value  nDCG@10 +  nDCG@10 -  nDCG@10 p-value
                      BM25 0.595052 0.252632 0.347720 0.306262   2.0   1.0    0.606020     1.0     0.0      0.330565     7.0     8.0      0.865871        3.0        4.0         0.569843
          Modelo vectorial 0.596739 0.247368 0.347220 0.304047   NaN   NaN         NaN     NaN     NaN           NaN     NaN     NaN           NaN        NaN        NaN              NaN
       DFR Poisson Laplace 0.568381 0.242105 0.329377 0.296378   3.0   2.0    0.299321     1.0     2.0      0.577753     5.0    12.0      0.011144        6.0        4.0         0.514222
                   DFR DPH 0.558020 0.247368 0.325935 0.288743   4.0   3.0    0.527410     5.0     4.0      1.000000     7.0    10.0      0.131491        7.0        9.0         0.538287
          Complex NN + BB2 0.516455 0.242105 0.310922 0.284684   6.0  

Los resultados no son malos, pero no superan a los modelos de tipo LambdaMART.