In [34]:
# SETUP
import matplotlib.pyplot as plt
from sklearn.metrics import PrecisionRecallDisplay
import numpy as np
import json
import requests
import pandas as pd

QRELS_FILE = "../query_qrels/query_1.txt"
QUERY_URL = "http://localhost:8983/solr/medicines/select?defType=lucene&indent=true&q.op=OR&q=(Antes_de_utilizar%3Aamamentar~3%20AND%20(Antes_de_utilizar%3A%22amamentação%20Não%22~4))%20OR%20(Antes_de_utilizar%3Aaleitamento%20AND%20(Antes_de_utilizar%3A%22aleitamento%20Não%22~4))%20OR%20(Antes_de_utilizar%3Agravidez%20AND%20(Antes_de_utilizar%3A%22grávida%20Não%22%5E2~4))&useParams=%3Djson&wt=json" #url query

In [35]:
# Read qrels to extract relevant documents
relevant = list(map(lambda el: el.strip(), open(QRELS_FILE).readlines()))
# Get query results from Solr instance
results = requests.get(QUERY_URL).json()['response']['docs']

display(relevant)
print(results)

['Aspifox',
 'Amlodipina Basi',
 'Dabigatrano Etexilato Stada',
 'Daflon 1000',
 'Claritromicina Azevedos',
 'Claritromicina Azevedos',
 'Agomelatina Mylan',
 'Amiodarona Mylan',
 'Colroset',
 'Clopidogrel Farmoz',
 'Arankelle',
 'Dorcilfree',
 'Efavirenz Aurobindo',
 'Clopidogrel Lumec',
 'Actiq',
 'Dexmedetomidina Kabi',
 'Dexmedetomidina Kalceks',
 'Amlodipina + Valsartan Tolife',
 'Dabigatrano Etexilato Pharmacons']

[{'Product_name': ['Dexmedetomidina Kalceks'], 'Active_substance': 'Dexmedetomidine', 'Route_of_administration': 'Intravenous Use', 'Product_authorisation_country': 'Portugal', 'Marketing_authorisation_holder': 'Kalceks', 'Pharmacovigilance_system_master_file_location': 'Latvia', 'Pharmacovigilance_enquiries_email_address': 'vigilance@grindeks.lv', 'Pharmacovigilance_enquiries_telephone_number': '+37122038850', 'Lowest_PVP': 'Not Available', 'Substancia_Ativa_DCI': 'Dexmedetomidina', 'Forma_Farmaceutica': 'Concentrado para solução para perfusão', 'Dosagem': '100 µg/ml', 'Titular_de_AIM': 'Kalceks, AS', 'Generico': 'Sim', 'Vias_de_Administracao': 'Via intravenosa', 'Grupo_de_Produto': 'Genérico', 'Numero_de_Processo': 'DK/H/2891/001/E01', 'AIM': 'Autorizado', 'Data': '17/02/2020', 'Classificacao_Quanto_a_Dispensa': 'MSRM restrita - Alínea a)', 'Duracao_do_Tratamento': 'Curta ou Média Duração', 'O_que_e_e_para_que_e_utilizado': 'O que é Dexmedetomidina Kalceks e para que é utilizado\nDex

In [None]:
# METRICS TABLE
# Define custom decorator to automatically calculate metric based on key
metrics = {}
metric = lambda f: metrics.setdefault(f.__name__, f)

@metric
def ap(results, relevant):
    """Average Precision"""
    precision_values = []
    relevant_count = 0

    for idx, doc in enumerate(results):
        if doc['Product_name'][0] in relevant:
            relevant_count += 1
            precision_at_k = relevant_count / (idx + 1)
            precision_values.append(precision_at_k)

    if not precision_values:
        return 0.0

    return sum(precision_values)/len(precision_values)

@metric
def p10(results, relevant, n=10):
    """Precision at N"""
    return len([doc for doc in results[:n] if doc['Product_name'][0] in relevant])/n

def calculate_metric(key, results, relevant):
    return metrics[key](results, relevant)

# Define metrics to be calculated
evaluation_metrics = {
    'ap': 'Average Precision',
    'p10': 'Precision at 10 (P@10)'
}

In [37]:
# Calculate all metrics and export results as LaTeX table
df = pd.DataFrame([['Metric','Value']] +
    [
        [evaluation_metrics[m], calculate_metric(m, results, relevant)]
        for m in evaluation_metrics
    ]
)

with open('results.tex','w') as tf:
    tf.write(df.to_latex())


In [None]:
# PRECISION-RECALL CURVE
# Calculate precision and recall values as we move down the ranked list
precision_values = [
    len([
        doc 
        for doc in results[:idx]
        if doc['Product_name'][0] in relevant
    ]) / idx 
    for idx, _ in enumerate(results, start=1)
]

recall_values = [
    len([
        doc for doc in results[:idx]
        if doc['Product_name'][0] in relevant
    ]) / len(relevant)
    for idx, _ in enumerate(results, start=1)
]

precision_recall_match = {k: v for k,v in zip(recall_values, precision_values)}

# Extend recall_values to include traditional steps for a better curve (0.1, 0.2 ...)
recall_values.extend([step for step in np.arange(0.1, 1.1, 0.1) if step not in recall_values])
recall_values = sorted(set(recall_values))

# Extend matching dict to include these new intermediate steps
for idx, step in enumerate(recall_values):
    if step not in precision_recall_match:
        if recall_values[idx-1] in precision_recall_match:
            precision_recall_match[step] = precision_recall_match[recall_values[idx-1]]
        else:
            precision_recall_match[step] = precision_recall_match[recall_values[idx+1]]

disp = PrecisionRecallDisplay([precision_recall_match.get(r) for r in recall_values], recall_values)
disp.plot()
plt.savefig('precision_recall.pdf')
