In [1]:
from mceclib.preprocessing import *
from mceclib.jaccard import *
from mceclib.tfidf import *
from mceclib.bm25 import *
from mceclib.evaluate import *

In [2]:
import gzip
import json
import pandas as pd

# Directory holding the gzipped JSONL dataset files; later cells reuse `path`.
path = './data/'

# Parse the corpus one JSON document per line while streaming the file, rather
# than first materialising every raw line in memory. Blank lines are skipped
# because json.loads raises on empty input.
with gzip.open(path + 'corpus.jsonl.gz', 'rt', encoding='utf-8') as file:
    data = [json.loads(line) for line in file if line.strip()]

# Keep only the `_id` and `text` columns of the parsed records.
corpus = pd.DataFrame(data)[['_id', 'text']]

In [3]:
# Sanity check: confirm the loaded corpus is a pandas DataFrame.
type(corpus)

pandas.core.frame.DataFrame

In [38]:
# Select the raw document texts (the "text" column) as a pandas Series.
corpus_text = corpus["text"]

In [41]:
# Preview the raw, unprocessed texts.
corpus_text

0       You don’t have to be vegetarian to be green. M...
1       Being vegetarian helps the environment  Becomi...
2       The key to good health is a balanced diet, not...
3       It is immoral to kill animals  As evolved huma...
4       There is a great moral difference between huma...
                              ...                        
8669    Stories about ridiculous administration costs ...
8670    We need to address the causes of poverty rathe...
8671    Sponsorship is often more about the intentions...
8672    Sponsorship is an inefficient way of giving to...
8673    Many of the organisations that run child spons...
Name: text, Length: 8674, dtype: object

In [39]:
# Sanity check: the selected column is a pandas Series.
type(corpus_text)

pandas.core.series.Series

In [42]:
# Run the project's preprocessing2text helper over every document.
# NOTE(review): judging by the preview of corpus_clean, the helper lowercases,
# drops stop words and stems the text — confirm against its implementation.
corpus_clean = corpus_text.apply(preprocessing2text)

In [43]:
# Preview the preprocessed texts (still one string per document).
corpus_clean

0       vegetarian green mani special environ creat li...
1       vegetarian help environ becom vegetarian envir...
2       key good health balanc diet meat fishfre diet ...
3       immor kill anim evolv human be moral duti infl...
4       great moral differ human anim unlik anim human...
                              ...                        
8669    stori ridicul administr cost rare often untru ...
8670    need address caus poverti rather treat symptom...
8671    sponsorship often intent donor rather need poo...
8672    sponsorship ineffici way give chariti sponsor ...
8673    mani organis run child sponsorship scheme dedi...
Name: text, Length: 8674, dtype: object

In [7]:
# Sanity check: preprocessing keeps the data as a pandas Series.
type(corpus_clean)

pandas.core.series.Series

In [44]:
# Turn each cleaned document into a list of tokens with the project's
# tokenize_text helper (the preview shows one list of strings per document).
corpus_clean_tokenized = corpus_clean.apply(tokenize_text)

In [45]:
# Preview the tokenized documents.
corpus_clean_tokenized

0       [vegetarian, green, mani, special, environ, cr...
1       [vegetarian, help, environ, becom, vegetarian,...
2       [key, good, health, balanc, diet, meat, fishfr...
3       [immor, kill, anim, evolv, human, be, moral, d...
4       [great, moral, differ, human, anim, unlik, ani...
                              ...                        
8669    [stori, ridicul, administr, cost, rare, often,...
8670    [need, address, caus, poverti, rather, treat, ...
8671    [sponsorship, often, intent, donor, rather, ne...
8672    [sponsorship, ineffici, way, give, chariti, sp...
8673    [mani, organis, run, child, sponsorship, schem...
Name: text, Length: 8674, dtype: object

In [9]:
# Sanity check: the tokenized corpus is still a pandas Series (of lists).
type(corpus_clean_tokenized)

pandas.core.series.Series

# Jaccard
* corpus_clean_tokenized
* corpus (original)

In [10]:
# Example single-term query used to try out the Jaccard search.
query_jacard = "harmful"

In [13]:
# Rank the corpus for the query with the project's search_jaccard helper.
# Inputs: the query string, the tokenized documents and the original corpus frame.
# NOTE(review): the displayed result suggests a DataFrame with id/text/scores
# columns ordered by descending score — confirm against the helper.
result_jacard = search_jaccard(query_jacard, corpus_clean_tokenized, corpus)

In [14]:
# Show the Jaccard ranking for the example query.
result_jacard

Unnamed: 0,id,text,scores
0,training-economy-befhwimsc-pro03b,It is equally likely that money is a significa...,0.100000
1,training-religion-msfhbmoi-con03b,If marriage’s main function is to protect agai...,0.071429
2,test-environment-assgbatj-pro05b,There is a moral difference between harm for t...,0.062500
3,test-science-eassgbatj-pro05b,There is a moral difference between harm for t...,0.062500
4,validation-education-egpsthwtj-pro02b,When homework does take up time in class it is...,0.062500
...,...,...,...
8669,validation-society-gfhbcimrst-con03b,Whilst the Indian government may have policies...,0.000000
8670,validation-society-gfhbcimrst-con01b,Our policy provides far more than these existi...,0.000000
8671,validation-society-gfhbcimrst-con02a,Commodifying women. Surely providing a financ...,0.000000
8672,validation-society-gfhbcimrst-con05a,Autonomy (Please note that this argument canno...,0.000000


In [15]:
# Lightweight variant that needs only the tokenized corpus.
# NOTE(review): the output is a short list of (document index, score) pairs;
# the cut-off (5 entries shown) is decided inside simple_search_jaccard — confirm.
simple_result_jacard = simple_search_jaccard(query_jacard, corpus_clean_tokenized)
simple_result_jacard

[(np.int64(3908), np.float64(0.1)),
 (np.int64(7416), np.float64(0.07142857142857142)),
 (np.int64(20), np.float64(0.0625)),
 (np.int64(8320), np.float64(0.0625)),
 (np.int64(2526), np.float64(0.0625))]

# TF-IDF
* corpus_clean
* corpus (original)

In [16]:
# Example single-term query used to try out the TF-IDF search.
query_tfidf = "harmful"

In [17]:
# Build the TF-IDF representation from the cleaned (untokenized) texts.
# build_tfidf returns two objects that the search cells below pass back in.
# NOTE(review): presumably a document-term matrix and the fitted vectorizer — confirm.
matriz_tfidf, vectorizer = build_tfidf(corpus_clean)

In [18]:
# Rank the corpus for the query with the project's search_tdifd helper, reusing
# the vectorizer and matrix built above plus the original corpus frame.
# NOTE(review): the displayed result suggests id/text/scores ordered by descending score — confirm.
result_tfidf = search_tdifd(query_tfidf, vectorizer, matriz_tfidf, corpus)

In [19]:
# Show the TF-IDF ranking for the example query.
result_tfidf

Unnamed: 0,id,text,scores
0,test-environment-assgbatj-pro05b,There is a moral difference between harm for t...,0.433718
1,test-science-eassgbatj-pro05b,There is a moral difference between harm for t...,0.433718
2,training-law-ucgllghwbg-con01b,Gambling is a harmful activity and could have ...,0.413519
3,training-health-ahgshbcsbl-pro01b,"While individual liberty is an important good,...",0.388738
4,training-free-speech-debate-nshbbhnpsb-pro03a,People have a right to blaspheme In the laws ...,0.381923
...,...,...,...
8669,validation-society-gfhbcimrst-con03b,Whilst the Indian government may have policies...,0.000000
8670,validation-society-gfhbcimrst-con01b,Our policy provides far more than these existi...,0.000000
8671,validation-society-gfhbcimrst-con02a,Commodifying women. Surely providing a financ...,0.000000
8672,validation-society-gfhbcimrst-con05a,Autonomy (Please note that this argument canno...,0.000000


In [20]:
# Lightweight variant that does not need the original corpus frame.
# NOTE(review): the output is a short list of (document index, score) pairs;
# the cut-off is decided inside simple_search_tdifd — confirm.
simple_result_tfidf = simple_search_tdifd(query_tfidf, vectorizer, matriz_tfidf)
simple_result_tfidf

[(np.int64(2526), np.float64(0.433718255790206)),
 (np.int64(20), np.float64(0.433718255790206)),
 (np.int64(5875), np.float64(0.41351934108835353)),
 (np.int64(3147), np.float64(0.38873786624641904)),
 (np.int64(3421), np.float64(0.3819232115506393))]

# BM25
* corpus_clean_tokenized
* corpus (original)

In [21]:
# Example two-term query used to try out the BM25 search.
query_bm25 = "daughter spiderman"

In [22]:
# Build the BM25 model from the tokenized documents; the search cells below reuse it.
bm25_model = build_bm25_model(corpus_clean_tokenized)

In [23]:
# Rank the corpus for the query with the project's search_bm25 helper, using the
# prebuilt model and the original corpus frame.
# NOTE(review): the displayed result suggests id/text/scores ordered by descending score — confirm.
result_bm25 = search_bm25(query_bm25, bm25_model, corpus)

In [24]:
# Show the BM25 ranking for the example query.
result_bm25

Unnamed: 0,id,text,scores
0,training-society-gfyhbprcsao-pro01a,The one child policy skews gender demographics...,9.938928
1,validation-digital-freedoms-phbphnrp-con02a,Fixating on personal lives results in infringi...,9.019144
2,training-digital-freedoms-phbplpfsbop-pro03a,No clear dividing line between public and priv...,8.761588
3,training-philosophy-lsgsgfhbpsb-con01b,It is for the individuals to decide whether th...,7.415137
4,test-free-speech-debate-radhbsshr-con04b,The painting should remain hanging as a remind...,7.369124
...,...,...,...
8669,validation-society-gfhbcimrst-con03b,Whilst the Indian government may have policies...,0.000000
8670,validation-society-gihbsosbcg-con02a,This policy breaks down important inter-govern...,0.000000
8671,validation-society-gfhbcimrst-con02a,Commodifying women. Surely providing a financ...,0.000000
8672,validation-society-gfhbcimrst-con05a,Autonomy (Please note that this argument canno...,0.000000


In [25]:
# Lightweight variant that needs only the query and the model.
# NOTE(review): the output is a short list of (document index, score) pairs;
# the cut-off is decided inside simple_search_bm25 — confirm.
simple_result_bm25 = simple_search_bm25(query_bm25, bm25_model)
simple_result_bm25

[(np.int64(7627), np.float64(9.938927915397242)),
 (np.int64(8536), np.float64(9.019144377771683)),
 (np.int64(7275), np.float64(8.76158789849452)),
 (np.int64(5379), np.float64(7.415137149987788)),
 (np.int64(395), np.float64(7.369124281079814))]

---

# Qrels

In [26]:
# Parse the query/relevance records one JSON object per line while streaming
# the file, rather than first materialising every raw line in memory. Blank
# lines are skipped because json.loads raises on empty input.
with gzip.open(path + 'train.jsonl.gz', 'rt', encoding='utf-8') as file:
    queriesdt = [json.loads(line) for line in file if line.strip()]

# Keep the document id, its text and the associated query, limited to the first
# 100 rows. Everything derived from `qrels` below (qrels_dict, MAP,
# precision/recall) is therefore evaluated on this subset only.
qrels = pd.DataFrame(queriesdt)[['_id', 'text', 'query']].head(100)
qrels

Unnamed: 0,_id,text,query
0,test-environment-aeghhgwpe-pro02b,You don’t have to be vegetarian to be green. M...,is it better to be a vegan or vegetarian?
1,test-environment-aeghhgwpe-pro02b,You don’t have to be vegetarian to be green. M...,is meat environmentally harmful
2,test-environment-aeghhgwpe-pro02b,You don’t have to be vegetarian to be green. M...,is it good to be vegetarian
3,test-environment-aeghhgwpe-pro02a,Being vegetarian helps the environment Becomi...,how does meat affect the environment
4,test-environment-aeghhgwpe-pro02a,Being vegetarian helps the environment Becomi...,what is the most important driver for deforest...
...,...,...,...
95,test-environment-assgbatj-con01a,Animals don’t have human rights Humans have l...,are humans animals
96,test-environment-assgbatj-con04b,Not every country has laws like the EU or the ...,is it bad to use animal testing
97,test-environment-assgbatj-con04b,Not every country has laws like the EU or the ...,can the us ban animal testing?
98,test-environment-assgbatj-con04b,Not every country has laws like the EU or the ...,should i ban testing animals


Diccionario de qrels

In [27]:
# Map every distinct query string to the list of document ids paired with it
# in `qrels`. Groups are visited in sorted query order.
qrels_dict = {
    query_text: list(doc_ids)
    for query_text, doc_ids in qrels.groupby('query')['_id']
}
qrels_dict

{'adn natural selection definition in the world': ['test-environment-aeghhgwpe-con03a'],
 'animal tests': ['test-environment-assgbatj-con03b'],
 'are humans animals': ['test-environment-assgbatj-con01a'],
 'are humans omnivores': ['test-environment-aeghhgwpe-con01b'],
 'are meats bad for you': ['test-environment-aeghhgwpe-pro03a'],
 'are zoonotic diseases transmitted by animal products': ['test-environment-aeghhgwpe-pro04b'],
 'benefits of vegetarian diet': ['test-environment-aeghhgwpe-pro03a'],
 'can a scientist test animals': ['test-environment-assgbatj-con05b'],
 'can the government ban animal testing': ['test-environment-assgbatj-con05a'],
 'can the us ban animal testing?': ['test-environment-assgbatj-con04b'],
 'can vegetarians eat a lot of iron': ['test-environment-aeghhgwpe-con02b'],
 'can vegetarians eat cereal': ['test-environment-aeghhgwpe-con02b'],
 'can vegetarians have iron': ['test-environment-aeghhgwpe-con02b'],
 'can we ban animal testing': ['test-environment-assgbatj-p

---

# MAP y AP

In [28]:
# Evaluate the Jaccard ranker over every query in qrels_dict. The extra keyword
# arguments are the inputs search_jaccard itself takes.
evaluacion_map_jaccard = calcular_map(
    qrels_dict,
    search_function=search_jaccard,
    corpus_preprocessing=corpus_clean_tokenized,
    corpus=corpus,
)

# Report the actual number of per-query APs instead of a hard-coded "4", which
# did not match the number of APs listed below.
# NOTE(review): assumes 'MAP' is the mean of exactly these APs — confirm in calcular_map.
num_aps = len(evaluacion_map_jaccard['APs_individuales'])
print(f"MAP de todo el sistema (promedio de {num_aps} APs): {evaluacion_map_jaccard['MAP']}\n")
print("APs individuales:")
for query, ap_score in evaluacion_map_jaccard['APs_individuales'].items():
    print(f"  '{query}': AP = {ap_score:.4f}")

MAP de todo el sistema (promedio de 4 APs): 0.28449131856565063

APs individuales:
  'adn natural selection definition in the world': AP = 0.0500
  'animal tests': AP = 0.0588
  'are humans animals': AP = 0.1000
  'are humans omnivores': AP = 0.1667
  'are meats bad for you': AP = 0.0036
  'are zoonotic diseases transmitted by animal products': AP = 0.2000
  'benefits of vegetarian diet': AP = 0.0185
  'can a scientist test animals': AP = 0.5000
  'can the government ban animal testing': AP = 0.0051
  'can the us ban animal testing?': AP = 1.0000
  'can vegetarians eat a lot of iron': AP = 0.2000
  'can vegetarians eat cereal': AP = 0.2500
  'can vegetarians have iron': AP = 0.2500
  'can we ban animal testing': AP = 0.0167
  'can you use animal testing?': AP = 0.0500
  'charles darwin on the origin of species by means of natural selection': AP = 1.0000
  'could we ban animal testing': AP = 0.5000
  'deforestation impacts animals environment': AP = 0.0022
  'do animals need to be slaug

In [29]:
# Evaluate the TF-IDF ranker over every query in qrels_dict. The extra keyword
# arguments are the inputs search_tdifd itself takes.
evaluacion_map_tfidf = calcular_map(
    qrels_dict,
    search_function=search_tdifd,
    vectorizer=vectorizer,
    tfidf_matrix=matriz_tfidf,
    corpus=corpus,
)

# Report the actual number of per-query APs instead of a hard-coded "4", which
# did not match the number of APs listed below.
# NOTE(review): assumes 'MAP' is the mean of exactly these APs — confirm in calcular_map.
num_aps = len(evaluacion_map_tfidf['APs_individuales'])
print(f"MAP de todo el sistema (promedio de {num_aps} APs): {evaluacion_map_tfidf['MAP']}\n")
print("APs individuales:")
for query, ap_score in evaluacion_map_tfidf['APs_individuales'].items():
    print(f"  '{query}': AP = {ap_score:.4f}")

MAP de todo el sistema (promedio de 4 APs): 0.34271185788501984

APs individuales:
  'adn natural selection definition in the world': AP = 0.2500
  'animal tests': AP = 0.0345
  'are humans animals': AP = 0.0625
  'are humans omnivores': AP = 1.0000
  'are meats bad for you': AP = 0.1250
  'are zoonotic diseases transmitted by animal products': AP = 1.0000
  'benefits of vegetarian diet': AP = 0.3333
  'can a scientist test animals': AP = 0.0145
  'can the government ban animal testing': AP = 0.0714
  'can the us ban animal testing?': AP = 0.1667
  'can vegetarians eat a lot of iron': AP = 1.0000
  'can vegetarians eat cereal': AP = 1.0000
  'can vegetarians have iron': AP = 1.0000
  'can we ban animal testing': AP = 0.0095
  'can you use animal testing?': AP = 0.1250
  'charles darwin on the origin of species by means of natural selection': AP = 1.0000
  'could we ban animal testing': AP = 0.5000
  'deforestation impacts animals environment': AP = 0.0149
  'do animals need to be slaug

In [31]:
# Evaluate the BM25 ranker over every query in qrels_dict. The extra keyword
# arguments are the inputs search_bm25 itself takes.
evaluacion_map_bm25 = calcular_map(
    qrels_dict,
    search_function=search_bm25,
    bm25_model=bm25_model,
    corpus=corpus,
)

# Report the actual number of per-query APs instead of a hard-coded "4", which
# did not match the number of APs listed below.
# NOTE(review): assumes 'MAP' is the mean of exactly these APs — confirm in calcular_map.
num_aps = len(evaluacion_map_bm25['APs_individuales'])
print(f"MAP de todo el sistema (promedio de {num_aps} APs): {evaluacion_map_bm25['MAP']}\n")
print("APs individuales:")
for query, ap_score in evaluacion_map_bm25['APs_individuales'].items():
    print(f"  '{query}': AP = {ap_score:.4f}")

MAP de todo el sistema (promedio de 4 APs): 0.45255116533020884

APs individuales:
  'adn natural selection definition in the world': AP = 0.3333
  'animal tests': AP = 0.0769
  'are humans animals': AP = 0.0769
  'are humans omnivores': AP = 1.0000
  'are meats bad for you': AP = 0.1667
  'are zoonotic diseases transmitted by animal products': AP = 1.0000
  'benefits of vegetarian diet': AP = 0.3333
  'can a scientist test animals': AP = 0.0357
  'can the government ban animal testing': AP = 0.0189
  'can the us ban animal testing?': AP = 0.3333
  'can vegetarians eat a lot of iron': AP = 1.0000
  'can vegetarians eat cereal': AP = 1.0000
  'can vegetarians have iron': AP = 1.0000
  'can we ban animal testing': AP = 0.0087
  'can you use animal testing?': AP = 0.0588
  'charles darwin on the origin of species by means of natural selection': AP = 1.0000
  'could we ban animal testing': AP = 0.5000
  'deforestation impacts animals environment': AP = 0.2000
  'do animals need to be slaug

---

# Precisión y Recall

In [32]:
# Re-display the query -> relevant-ids mapping to pick an example query from.
qrels_dict

{'adn natural selection definition in the world': ['test-environment-aeghhgwpe-con03a'],
 'animal tests': ['test-environment-assgbatj-con03b'],
 'are humans animals': ['test-environment-assgbatj-con01a'],
 'are humans omnivores': ['test-environment-aeghhgwpe-con01b'],
 'are meats bad for you': ['test-environment-aeghhgwpe-pro03a'],
 'are zoonotic diseases transmitted by animal products': ['test-environment-aeghhgwpe-pro04b'],
 'benefits of vegetarian diet': ['test-environment-aeghhgwpe-pro03a'],
 'can a scientist test animals': ['test-environment-assgbatj-con05b'],
 'can the government ban animal testing': ['test-environment-assgbatj-con05a'],
 'can the us ban animal testing?': ['test-environment-assgbatj-con04b'],
 'can vegetarians eat a lot of iron': ['test-environment-aeghhgwpe-con02b'],
 'can vegetarians eat cereal': ['test-environment-aeghhgwpe-con02b'],
 'can vegetarians have iron': ['test-environment-aeghhgwpe-con02b'],
 'can we ban animal testing': ['test-environment-assgbatj-p

In [46]:
# Example query shared by all three rankers in the precision/recall comparison.
query_general = 'benefits of vegetarian diet'
# Relevant document ids for that query, as a set; raises KeyError if the query
# is not a key of qrels_dict.
ids_relevantes_ejemplo = set(qrels_dict[query_general])

In [47]:
# Run all three rankers on the same query so their metrics are comparable.
# Note: this rebinds result_jacard / result_tfidf / result_bm25, which earlier
# cells bound to the rankings of the per-method example queries.
result_jacard = search_jaccard(query_general, corpus_clean_tokenized, corpus)
result_tfidf = search_tdifd(query_general, vectorizer, matriz_tfidf, corpus)
result_bm25 = search_bm25(query_general, bm25_model, corpus)

In [48]:
# Precision/recall of the Jaccard ranking at a fixed cut-off. One variable feeds
# both the metric call and the printed label, which previously read
# "Precisión@:" with no cut-off.
top_k = 10
metricas = calcular_precision_recall(result_jacard, ids_relevantes_ejemplo, k=top_k)
print(f"Precisión@{top_k}: {metricas['precision@k']:.4f}, Recall@{top_k}: {metricas['recall@k']:.4f}")

Precisión@: 0.0000, Recall@: 0.0000


In [49]:
# Precision/recall of the TF-IDF ranking at a fixed cut-off. One variable feeds
# both the metric call and the printed label, which previously read
# "Precisión@:" with no cut-off.
top_k = 10
metricas = calcular_precision_recall(result_tfidf, ids_relevantes_ejemplo, k=top_k)
print(f"Precisión@{top_k}: {metricas['precision@k']:.4f}, Recall@{top_k}: {metricas['recall@k']:.4f}")

Precisión@: 0.1000, Recall@: 1.0000


In [50]:
# Precision/recall of the BM25 ranking at a fixed cut-off. One variable feeds
# both the metric call and the printed label, which previously read
# "Precisión@:" with no cut-off.
top_k = 10
metricas = calcular_precision_recall(result_bm25, ids_relevantes_ejemplo, k=top_k)
print(f"Precisión@{top_k}: {metricas['precision@k']:.4f}, Recall@{top_k}: {metricas['recall@k']:.4f}")

Precisión@: 0.1000, Recall@: 1.0000
