## Evaluation

In [82]:
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer
import numpy as np
import urllib.request
from bs4 import BeautifulSoup
import re

In [83]:
def get_boat_poems(url='https://discoverpoetry.com/poems/poems-about-ships/', parser='html.parser'):
  """
  Download the poem listing page and return one excerpt string per poem.

  Parameters
  ----------
  url : str
      Page to scrape. Defaults to the "poems about ships" listing.
  parser : str
      Parser name handed to BeautifulSoup. Naming it explicitly keeps the
      result independent of which optional parsers happen to be installed.

  Returns
  -------
  list of str
      For every <article class="poem-listing"> element, in page order, the
      text captured between '<p class="ExcerptText">' and '</blockquote>'.
      The capture is taken from the serialized HTML, so it may still contain
      markup.

  Raises
  ------
  ValueError
      If a poem listing does not contain the expected excerpt markup. Failing
      loudly (rather than skipping) keeps list positions aligned with the
      page order.
  """
  # the context manager closes the HTTP response even if read() fails
  with urllib.request.urlopen(url) as response:
    contents = response.read()
  soup = BeautifulSoup(contents, parser)

  # DOTALL lets '.' span the line breaks inside a poem
  excerpt_pattern = re.compile('<p class="ExcerptText">(.*?)</blockquote>', re.DOTALL)

  poems = []
  for position, poem_html in enumerate(soup.find_all('article', {'class': 'poem-listing'})):
    match = excerpt_pattern.search(str(poem_html))
    if match is None:
      raise ValueError('no excerpt found in poem listing at position {}'.format(position))
    poems.append(match.group(1))
  return poems
# parse html

In [84]:
# The easiest way to read the poems is on the source page: https://discoverpoetry.com/poems/poems-about-ships/
# Document ids in this notebook are 0-based: id = (poem's position on the webpage) - 1
corpus = get_boat_poems()  # list of poem excerpt strings, one per poem listing

In [85]:
# Vectorize the corpus into raw term counts and extract the vocabulary.
# stop_words='english' uses scikit-learn's built-in English stop word list.
vectorizer = CountVectorizer(stop_words='english')
# fit_transform learns the vocabulary and returns the document-term count matrix
documents_vectorized = vectorizer.fit_transform(corpus)
# array of term strings; used below as the column labels of the count table
vocabulary = vectorizer.get_feature_names_out()

In [86]:
# display the learned vocabulary terms
vocabulary

array(['_three', 'abbot', 'aberbrothok', ..., 'young', 'youth',
       'youthful'], dtype=object)

In [87]:
# shape of the document-term matrix is (number of documents, number of terms)
print ('We have a {} document corpus with a {} term vocabulary'.format(*documents_vectorized.shape))

We have a 25 document corpus with a 1496 term vocabulary


In [88]:
# Dense view of the count matrix: one row per document, one column per vocabulary term
df = pd.DataFrame(documents_vectorized.toarray(), columns=vocabulary)
# the default integer row labels serve as document ids
doc_ids = df.index.values
# preview the first five documents
df[:5]

Unnamed: 0,_three,abbot,aberbrothok,aboard,ache,ached,aching,action,adieu,adrift,...,years,yeast,yes,yesterday,yielded,yon,yonder,young,youth,youthful
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,1,0,0,1,0,0,0,0,0,0
2,0,0,0,0,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [89]:
def BM25_IDF_df(df, k_1=1.2, b=0.8):
  """
  Turn a document-term count table into BM25-IDF weights.

  Each cell becomes  idf(t) * ((k_1 + 1) * tf) / (k_1 * ((1 - b) + b * dl / avgdl) + tf)
  where tf is the raw count, dl the document length (row sum), avgdl the mean
  document length and idf(t) = -log(df_t / N).

  Parameters
  ----------
  df : pandas.DataFrame
      Raw term counts, one row per document and one column per term.
  k_1 : float
      Term-frequency saturation parameter (default 1.2).
  b : float
      Document-length normalisation strength (default 0.8).

  Returns
  -------
  pandas.DataFrame
      BM25-IDF weights with the same row index and column labels as `df`.
  """
  counts = df.to_numpy(dtype=float)
  N = counts.shape[0]

  # document frequency: number of documents that contain each term
  dfs = (counts > 0).sum(axis=0)
  # a term that occurs in no document would give -log(0) = inf and then
  # 0 * inf = NaN in the final product, so such terms keep an idf of 0
  idfs = np.zeros(counts.shape[1])
  seen = dfs > 0
  idfs[seen] = -np.log(dfs[seen] / N)

  # document lengths and the per-document length normalisation term
  dls = counts.sum(axis=1)
  avgdl = np.mean(dls)
  length_norm = (k_1 * ((1 - b) + b * (dls / avgdl))).reshape(N, 1)

  BM25_tf = ((k_1 + 1) * counts) / (length_norm + counts)

  # label the result from the input frame rather than from a notebook global,
  # so the function works for any count table
  return pd.DataFrame(BM25_tf * idfs, columns=df.columns, index=df.index)


In [90]:
# BM25-IDF weight for every (document, term) pair
bm25_df = BM25_IDF_df(df) # a dataframe with BM25-idf weights
# preview the first five documents
bm25_df[:5]

Unnamed: 0,_three,abbot,aberbrothok,aboard,ache,ached,aching,action,adieu,adrift,...,years,yeast,yes,yesterday,yielded,yon,yonder,young,youth,youthful
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,3.691558,0.0,0.0,3.098937,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,3.439434,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [91]:
# To evaluate a search engine over this data we need two things:
# 1. Queries
# 2. Relevance judgements

# QUERIES: dictionary of {query_id: query}; enumerate assigns the ids 0, 1, ... in list order
queries = dict(enumerate([
    'ship wreck',
    'sail sailing ship',
    # 'little boat'
    
]))

# RELEVANCE JUDGEMENTS: list of (query_id, document_id, judgement) tuples,
# where judgement is 0 or 1 and 1 = relevant.
# Documents that do not appear here for a query are unjudged.
qrels = [
         # judgements for query 0 ('ship wreck')
         (0, 9, 1),
         (0,11,1),
         (0,12,1),
         (0,13,0),
         (0,14,0),
         (0,0,0),
         (0,24,0),
         (0,17,0),
         (0,3,0),
         (0,4,0),

         # judgements for query 1 ('sail sailing ship')
         (1, 0, 0),
         (1, 1, 0),
         (1, 2, 1),
         (1, 3, 1),
         (1, 5, 0),
         (1, 7, 1),
         (1, 21, 0),
         (1, 23, 1),
        
]


In [92]:
# According to the relevance judgements, is the document entitled 'The Ship is Ready' relevant to the query 'sail sailing ship'?

# What about the document 'The Wind and the Sea'? Is it relevant to the query 'ship wreck' according to our relevance judgements?

In [93]:
def retrieve_ranking(query, bm25_df):
  """
  Score every document against a query and return them best-first.

  Parameters
  ----------
  query : str
      Whitespace-separated query terms. Terms are matched against the column
      labels of `bm25_df` exactly as written (no lowercasing or stemming).
  bm25_df : pandas.DataFrame
      Term weights, one row per document and one column per term.

  Returns
  -------
  list of (document_id, score) tuples
      One entry per row of `bm25_df`, sorted by descending score. The score
      is the sum of the document's weights over the query terms; a term that
      is repeated in the query is counted once per occurrence.
  """
  # split() with no argument collapses runs of whitespace, so a double space
  # cannot produce an empty-string "term"; terms that are not columns of
  # bm25_df are dropped (they contribute nothing) instead of raising KeyError
  q_terms = [term for term in query.split() if term in bm25_df.columns]
  q_terms_only = bm25_df[q_terms]
  score_q_d = q_terms_only.sum(axis=1)
  return sorted(zip(bm25_df.index.values, score_q_d.values), key=lambda tup: tup[1], reverse=True)

In [96]:
def precision_at_k(query_id, k=5):
    """
    Precision of the top-k ranking for one query, measured on judged documents.

    The top `k` documents returned by `retrieve_ranking` are compared with the
    relevance judgements in `qrels`:
      * tp = judged-relevant documents (judgement 1) found in the top k
      * fp = judged-non-relevant documents (judgement 0) found in the top k
    Documents in the top k that have no judgement for this query are ignored,
    so precision is tp / (tp + fp), not tp / k.

    Parameters
    ----------
    query_id : key of `queries`
    k : int
        Rank cutoff (default 5).

    Returns
    -------
    (tp, fp, precision) : (int, int, float)
        precision is 0.0 when no judged document appears in the top k.
    """
    doc_ranking = retrieve_ranking(queries[query_id], bm25_df)
    # keep only the document ids; a set gives constant-time membership tests
    retrieved = {doc[0] for doc in doc_ranking[:k]}

    tp = 0
    fp = 0
    for q_id, doc_id, judgement in qrels:
        if q_id != query_id or doc_id not in retrieved:
            continue
        if judgement == 1:
            tp += 1
        elif judgement == 0:
            fp += 1

    # guard: with no judged document in the top k the ratio is undefined
    judged = tp + fp
    precision = tp / judged if judged else 0.0
    return tp, fp, precision

In [97]:
# (tp, fp, precision) for query 0 at the default cutoff k=5
precision_at_k(0)

(1, 1, 0.5)

In [24]:
# first five relevance judgements, as (query_id, document_id, judgement)
qrels[:5]

[(0, 9, 1), (0, 11, 1), (0, 12, 1), (0, 13, 0), (0, 14, 0)]

In [72]:
def f1_score_at_k(query_id, k=5):
    """
    F1 score of the top-k ranking for one query, measured on judged documents.

    precision = tp / (tp + fp) over the judged documents in the top k
    recall    = tp / (number of documents judged relevant for the query)
    F1        = 2 * precision * recall / (precision + recall)

    Documents in the top k without a judgement for this query are ignored.

    Parameters
    ----------
    query_id : key of `queries`
    k : int
        Rank cutoff (default 5).

    Returns
    -------
    float
        The F1 score; 0.0 when no relevant document is retrieved in the top k.
    """
    doc_ranking = retrieve_ranking(queries[query_id], bm25_df)
    # keep only the document ids; a set gives constant-time membership tests
    retrieved = {doc[0] for doc in doc_ranking[:k]}

    rel_docs = [qrel[1] for qrel in qrels if (qrel[0], qrel[2]) == (query_id, 1)]
    non_rel_docs = [qrel[1] for qrel in qrels if (qrel[0], qrel[2]) == (query_id, 0)]

    tp = sum(doc in retrieved for doc in rel_docs)
    fp = sum(doc in retrieved for doc in non_rel_docs)

    # with tp == 0 at least one of the three divisions below would be 0/0
    # (no judged docs retrieved, no relevant docs at all, or precision +
    # recall == 0); F1 is defined as 0 in all of those cases
    if tp == 0:
        return 0.0

    precision = tp / (tp + fp)
    recall = tp / len(rel_docs)
    return (2 * precision * recall) / (precision + recall)

# F1 score for query 0 at the default cutoff k=5
f1_score_at_k(0)

0.4

In [98]:
# Retrieve a ranking for every query and report its accuracy metrics
k = 5  # rank cutoff shared by both metrics
for query_id, query in queries.items():
    # precision_at_k also returns the raw tp / fp counts; only precision is printed
    tp, fp, precision = precision_at_k(query_id, k=k)
    f1_score = f1_score_at_k(query_id, k=k)
    print('retrieved query "{}" with Precision@{} = {} and F1-score = {}'.format(query, k, precision, f1_score))


retrieved query "ship wreck" with Precision@5 = 0.5 and F1-score = 0.4
retrieved query "sail sailing ship" with Precision@5 = 1.0 and F1-score = 0.6666666666666666


In [None]:
## EXTRA - If you finish early.

# Calculate normalized dcg (ndcg) at k using 
# sklearn library: https://scikit-learn.org/stable/modules/generated/sklearn.metrics.ndcg_score.html
from sklearn.metrics import ndcg_score

# the two things you need are y_true and y_score
# the first comes from qrels and the second from the document rankings
# both inputs are numpy arrays
# for (n_samples, n_labels) in the documentation, here n_labels = 1