# Indexing + Document Retrieval
## Libraries and useful functions

In [81]:
# Import necessary libraries
from sentence_transformers import SentenceTransformer
import numpy as np
import scipy.sparse
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.metrics.pairwise import cosine_similarity, euclidean_distances
import csv

In [82]:
# loading data
def get_queries():
    return [open("./cranfield/q/" + str(q) + ".txt").read() for q in range(1, 226)]

def get_documents():
    return [open("./cranfield/d/" + str(d + 1) + ".txt").read() for d in range(1400)]

def get_relevant_documents(query_id):
    res = []
    with open("./cranfield/r/{}.txt".format(query_id)) as f:
        for line in f.readlines():
            res.append(int(line))
    return res

def get_data(query):
    data = list(get_documents())
    data.append(query)
    return data
    

In [83]:
# distance measures
def euclidean(data, length):
    sim = np.array(euclidean_distances(data[length], data[0:length])[0])
    return sim.argsort()+1

def cosine(data, length):
    sim = np.array(cosine_similarity(data[length], data[0:length])[0])
    return sim.argsort()[::-1]+1

In [85]:
# score functions
def calculate_precision(retrieved_documents, relevant_documents):
    relevant_count = 0
    for doc in retrieved_documents:
        if doc in relevant_documents:
            relevant_count += 1

    return relevant_count / len(retrieved_documents)

def calculate_recall(retrieved_documents, relevant_documents):
    relevant_count = 0
    for doc in retrieved_documents:
        if doc in relevant_documents:
            relevant_count += 1

    return relevant_count / len(relevant_documents)

def calculate_fmeasure(precision, recall):
    if precision == 0 and recall == 0:
        return 0

    return 2 * (precision * recall) / (precision + recall)

## Implementation of Binary, Term frequency and TFIDF weighting schemas

In [84]:
# weighting schemas
def binary(query):
    data = get_data(query)

    vectorizer = CountVectorizer(binary=True)
    count_array = vectorizer.fit_transform(data)
    
    return euclidean(count_array, len(data)-1), cosine(count_array, len(data)-1)

def term_frequency(query):
    data = get_data(query)
    
    tf_vectorizer = CountVectorizer()
    tf_matrix = tf_vectorizer.fit_transform(data)
    
    sum_x = tf_matrix.sum(1)
    new_tf_matrix = tf_matrix.multiply(1 / sum_x)
    new_tf_matrix = scipy.sparse.csc_matrix(new_tf_matrix)
    
    return euclidean(new_tf_matrix, len(data)-1), cosine(new_tf_matrix, len(data)-1)

def tfidf(query):
    data = get_data(query)
    
    tfidf_vectorizer = TfidfVectorizer()
    tfidf_matrix = tfidf_vectorizer.fit_transform(data)
    
    return euclidean(tfidf_matrix, len(data)-1), cosine(tfidf_matrix, len(data)-1)

In [86]:
# global variables
results = []
N = 15 # limit to top N


# set up csv file
header = ['Euclidean binary precision',
'Euclidean binary recall',
'Euclidean binary F-measure',
'Euclidean Term Frequency precision',
'Euclidean Term Frequency recall',
'Euclidean Term Frequency F-measure',
'Euclidean TF-IDF precision',
'Euclidean TF-IDF recall',
'Euclidean TF-IDF F-measure',
'Cosine binary precision',
'Cosine binary recall',
'Cosine binary F-measure',
'Cosine Term Frequency precision',
'Cosine Term Frequency recall',
'Cosine Term Frequency F-measure',
'Cosine TF-IDF precision',
'Cosine TF-IDF recall',
'Cosine TF-IDF F-measure', '\n']

with open('standard_output.csv', 'w') as csvFile:
    writer = csv.writer(csvFile)
    writer.writerow(header)
    csvFile.close()   

# perform task for each query
query_id = 1   
queries = get_queries()
for query in queries:
    binary_result = binary(query)
    tf_result = term_frequency(query)
    tfidf_result = tfidf(query)

    relevant_docs = get_relevant_documents(query_id)

    e_binary_precision = calculate_precision(binary_result[0][:N], relevant_docs)
    e_binary_recall = calculate_recall(binary_result[0][:N], relevant_docs)
    e_binary_fmeasure = calculate_fmeasure(e_binary_precision, e_binary_recall)
    e_tf_precision = calculate_precision(tf_result[0][:N], relevant_docs)
    e_tf_recall = calculate_recall(tf_result[0][:N], relevant_docs)
    e_tf_fmeasure = calculate_fmeasure(e_tf_precision, e_tf_recall)
    e_tfidf_precision = calculate_precision(tfidf_result[0][:N], relevant_docs)
    e_tfidf_recall = calculate_recall(tfidf_result[0][:N], relevant_docs)
    e_tfidf_fmeasure = calculate_fmeasure(e_tfidf_precision, e_tfidf_recall)
    
    c_binary_precision = calculate_precision(binary_result[1][:N], relevant_docs)
    c_binary_recall = calculate_recall(binary_result[1][:N], relevant_docs)
    c_binary_fmeasure = calculate_fmeasure(c_binary_precision, c_binary_recall)
    c_tf_precision = calculate_precision(tf_result[1][:N], relevant_docs)
    c_tf_recall = calculate_recall(tf_result[1][:N], relevant_docs)
    c_tf_fmeasure = calculate_fmeasure(c_tf_precision, c_tf_recall)
    c_tfidf_precision = calculate_precision(tfidf_result[1][:N], relevant_docs)
    c_tfidf_recall = calculate_recall(tfidf_result[1][:N], relevant_docs)
    c_tfidf_fmeasure = calculate_fmeasure(c_tfidf_precision, c_tfidf_recall)
    
    res = [e_binary_precision, e_binary_recall, e_binary_fmeasure, e_tf_precision, e_tf_recall, e_tf_fmeasure, e_tfidf_precision, e_tfidf_recall,
                e_tfidf_fmeasure, c_binary_precision, c_binary_recall,  c_binary_fmeasure,  c_tf_precision, c_tf_recall, c_tf_fmeasure, 
                c_tfidf_precision, c_tfidf_recall, c_tfidf_fmeasure]
    
    with open('standard_output.csv', 'a') as csvFile:
        writer = csv.writer(csvFile)
        writer.writerow(["{:.3f}".format(float(r)) for r in res]+['\n'])
        csvFile.close()
    
    results.append(res)
    
    query_id += 1
    

  new_tf_matrix = tf_matrix.multiply(1 / sum_x)


In [87]:
# Compute average
sum_values = [0] * 19
for res in results:
    for i in range(len(res)):
        sum_values[i] += res[i]
        
print("Average scores:")
print("-" * 80)
for i in range(len(header)-1):
    print(header[i] + " = {:.3f}".format(sum_values[i] / len(queries)))


Average scores:
--------------------------------------------------------------------------------
Euclidean binary precision = 0.015
Euclidean binary recall = 0.031
Euclidean binary F-measure = 0.018
Euclidean Term Frequency precision = 0.090
Euclidean Term Frequency recall = 0.182
Euclidean Term Frequency F-measure = 0.113
Euclidean TF-IDF precision = 0.179
Euclidean TF-IDF recall = 0.378
Euclidean TF-IDF F-measure = 0.227
Cosine binary precision = 0.118
Cosine binary recall = 0.260
Cosine binary F-measure = 0.152
Cosine Term Frequency precision = 0.106
Cosine Term Frequency recall = 0.223
Cosine Term Frequency F-measure = 0.134
Cosine TF-IDF precision = 0.188
Cosine TF-IDF recall = 0.395
Cosine TF-IDF F-measure = 0.238


## Using Hugging Face sentence similarity model

In [88]:
model = SentenceTransformer('paraphrase-MiniLM-L6-v2')
huggingface_results = []

# set up csv file
header = ['Euclidean precision', 'Euclidean recall', 'Euclidean F-measure', 'Cosine precision', 'Cosine recall', 'Cosine F-measure', '\n']
with open('huggingface_output.csv', 'w') as csvFile:
    writer = csv.writer(csvFile)
    writer.writerow(header)
    csvFile.close()   

query_id = 1
for query in queries:
    print("Query number " + str(query_id), end="\r")
    relevant_docs = get_relevant_documents(query_id)
    
    # Compute embeddings
    sentences = get_data(query)
    sentence_embeddings = model.encode(sentences)
    similarities = []
    doc_id = 1
    for s in sentence_embeddings[:-1]:
        # Compute differences
        # 'similarities' is a list of tuples. Each tuple contains: euclidean distance, cosine similarity, document id
        similarities.append((euclidean_distances(s.reshape(1, -1), sentence_embeddings[-1].reshape(1, -1)),
                             cosine_similarity(s.reshape(1, -1), sentence_embeddings[-1].reshape(1, -1)),
                             doc_id))   
        doc_id += 1
    
    # Order and select N most similar documents
    euclidean_res = sorted(similarities, key=lambda x: x[0], reverse=True)
    cosine_res = sorted(similarities, key=lambda x: x[1], reverse=True)
    euclidean_retrieved_docs = [d[2] for d in euclidean_res[:N]]
    cosine_retrieved_docs = [d[2] for d in cosine_res[:N]]
    
    # Compute scores
    e_p = calculate_precision(euclidean_retrieved_docs, relevant_docs)
    e_r = calculate_recall(euclidean_retrieved_docs, relevant_docs)
    e_f = calculate_fmeasure(e_p, e_r)
    c_p = calculate_precision(cosine_retrieved_docs, relevant_docs)
    c_r = calculate_recall(cosine_retrieved_docs, relevant_docs)
    c_f = calculate_fmeasure(c_p, c_r)
    
    # Save results
    huggingface_results.append([e_p, e_r, e_f, c_p, c_r, c_f])
    with open('huggingface_output.csv', 'a') as csvFile:
        writer = csv.writer(csvFile)
        writer.writerow(["{:.3f}".format(float(r)) for r in [e_p, e_r, e_f, c_p, c_r, c_f]]+['\n'])
        csvFile.close()
    
    query_id += 1
    

Query number 225

In [90]:
# Compute average
sum_values = [0] * 6
for res in huggingface_results:
    for i in range(len(res)):
        sum_values[i] += res[i]

text = ["Euclidean Precision", "Euclidean Recall", "Euclidean F-Measure", "Cosine Precision", "Cosine Recall", "Cosine F-Measure"]
print("Average scores:")
print("-" * 80)
for t, r in zip(text, sum_values):
    print("{} = {:.3f}".format(t, r / len(queries)))


Average scores:
--------------------------------------------------------------------------------
Euclidean Precision = 0.001
Euclidean Recall = 0.001
Euclidean F-Measure = 0.001
Cosine Precision = 0.196
Cosine Recall = 0.421
Cosine F-Measure = 0.249


## Comments on results

#### Complete results can be found in the two csv files (standard_output.csv for the first implementation and huggingface_output.csv for the part using sentence similarity model from Hugging Face). The average scores can be found in the two png images (standard.png and huggingface.png). As explained during the lecture the cosine similarity measure performed way better for all the models, except for tfidf where the difference is barely noticeable. Comparing the three different 'base' models, tfidf perfomed much better than the prevoius two, both with euclidean distance and cosine similarity. We can also notice that with cosine similarity the difference between binary representation and term frequency is minimal, while with euclidean distance term frequency performed better than binary. Anyway, as I said before, tfidf perfomed better than both on them. Regarding the sentence similarity model from Hugging Face, it perfomed better than all of the previous models when we use cosine similarity, even if the difference with tfidf is negligible. On the other hand, results with euclidean distance were very poor. 

## Issues and possible extensions

#### I didn't have particular issues during this homework, but I expected better results, especially with the Hugging Face model. It also took a lot of time to do the task (almost three hours) so I expected better scores. As an extension, I would look for better models or better ways to encode the documents in order to obtain better results.