In [1]:
from mceclib.preprocessing import *
from mceclib.jaccard import *
from mceclib.tfidf import *
from mceclib.bm25 import *

In [2]:
import gzip
import json
import pandas as pd

path = './data/'

# Leer archivo comprimido .jsonl.gz
with gzip.open(path + 'corpus.jsonl.gz', 'rt', encoding='utf-8') as file:
    lines = file.readlines()

# Procesar el archivo línea por línea y convertirlo en un DataFrame
data = [json.loads(line) for line in lines]

# Convertir en DataFrame
df = pd.DataFrame(data)
df[['_id','text']]

Unnamed: 0,_id,text
0,test-environment-aeghhgwpe-pro02b,You don’t have to be vegetarian to be green. M...
1,test-environment-aeghhgwpe-pro02a,Being vegetarian helps the environment Becomi...
2,test-environment-aeghhgwpe-pro03b,"The key to good health is a balanced diet, not..."
3,test-environment-aeghhgwpe-pro01a,It is immoral to kill animals As evolved huma...
4,test-environment-aeghhgwpe-pro01b,There is a great moral difference between huma...
...,...,...
8669,validation-society-fyhwscdcj-con01b,Stories about ridiculous administration costs ...
8670,validation-society-fyhwscdcj-con02a,We need to address the causes of poverty rathe...
8671,validation-society-fyhwscdcj-con03a,Sponsorship is often more about the intentions...
8672,validation-society-fyhwscdcj-con01a,Sponsorship is an inefficient way of giving to...


In [3]:
corpus_original = df[['_id','text']]

In [4]:
type(corpus_original)

pandas.core.frame.DataFrame

In [5]:
corpus_clean = corpus_original["text"].apply(preprocessing2text)
corpus_clean_df = pd.DataFrame({
    '_id': corpus_original['_id'],
    'textprocesado': corpus_clean
})

In [6]:
corpus_clean_df

Unnamed: 0,_id,textprocesado
0,test-environment-aeghhgwpe-pro02b,vegetarian green mani special environ creat li...
1,test-environment-aeghhgwpe-pro02a,vegetarian help environ becom vegetarian envir...
2,test-environment-aeghhgwpe-pro03b,key good health balanc diet meat fishfre diet ...
3,test-environment-aeghhgwpe-pro01a,immor kill anim evolv human be moral duti infl...
4,test-environment-aeghhgwpe-pro01b,great moral differ human anim unlik anim human...
...,...,...
8669,validation-society-fyhwscdcj-con01b,stori ridicul administr cost rare often untru ...
8670,validation-society-fyhwscdcj-con02a,need address caus poverti rather treat symptom...
8671,validation-society-fyhwscdcj-con03a,sponsorship often intent donor rather need poo...
8672,validation-society-fyhwscdcj-con01a,sponsorship ineffici way give chariti sponsor ...


In [7]:
corpus_clean_tokenized = corpus_clean.apply(tokenize_text)
corpus_clean_tokenized_df = pd.DataFrame({
    '_id': corpus_original['_id'],
    'tokens': corpus_clean_tokenized
})
corpus_clean_tokenized_df

Unnamed: 0,_id,tokens
0,test-environment-aeghhgwpe-pro02b,"[vegetarian, green, mani, special, environ, cr..."
1,test-environment-aeghhgwpe-pro02a,"[vegetarian, help, environ, becom, vegetarian,..."
2,test-environment-aeghhgwpe-pro03b,"[key, good, health, balanc, diet, meat, fishfr..."
3,test-environment-aeghhgwpe-pro01a,"[immor, kill, anim, evolv, human, be, moral, d..."
4,test-environment-aeghhgwpe-pro01b,"[great, moral, differ, human, anim, unlik, ani..."
...,...,...
8669,validation-society-fyhwscdcj-con01b,"[stori, ridicul, administr, cost, rare, often,..."
8670,validation-society-fyhwscdcj-con02a,"[need, address, caus, poverti, rather, treat, ..."
8671,validation-society-fyhwscdcj-con03a,"[sponsorship, often, intent, donor, rather, ne..."
8672,validation-society-fyhwscdcj-con01a,"[sponsorship, ineffici, way, give, chariti, sp..."


# Jacard
* corpus_clean_tokenized
* corpus_original

In [8]:
query_jacard = "daughter spiderman"

In [10]:
result_jacard = search_jaccard(query_jacard, corpus_clean_tokenized_df, corpus_original)

ValueError: too many values to unpack (expected 2)

In [30]:
result_jacard

Unnamed: 0,doc_index,reviews,scores
0,5379,It is for the individuals to decide whether th...,0.020000
1,395,The painting should remain hanging as a remind...,0.019231
2,5374,This argument veils the likely result of the p...,0.014925
3,7275,No clear dividing line between public and priv...,0.014925
4,8636,Abortion It is estimated that around 10 milli...,0.012987
...,...,...,...
8669,8637,Whilst the Indian government may have policies...,0.000000
8670,8655,This policy breaks down important inter-govern...,0.000000
8671,8639,Commodifying women. Surely providing a financ...,0.000000
8672,8640,Autonomy (Please note that this argument canno...,0.000000


# TF-IDF
* corpus_clean
* corpus_original

In [31]:
query_tfidf = "daughter spiderman"

In [32]:
matriz_tfidf, vectorizer = build_tfidf(corpus_clean)

In [33]:
result_tfidf = search_tdifd(query_tfidf, vectorizer, matriz_tfidf, corpus_original)

In [34]:
result_tfidf

Unnamed: 0,doc_index,reviews,scores
0,7627,The one child policy skews gender demographics...,0.208172
1,8536,Fixating on personal lives results in infringi...,0.176949
2,7275,No clear dividing line between public and priv...,0.141395
3,395,The painting should remain hanging as a remind...,0.107502
4,5379,It is for the individuals to decide whether th...,0.102044
...,...,...,...
8669,8637,Whilst the Indian government may have policies...,0.000000
8670,8655,This policy breaks down important inter-govern...,0.000000
8671,8639,Commodifying women. Surely providing a financ...,0.000000
8672,8640,Autonomy (Please note that this argument canno...,0.000000


In [1]:
simple_result_tfidf = simple_search_tdifd(query_tfidf,vectorizer, matriz_tfidf, corpus_original)
simple_result_tfidf

NameError: name 'simple_search_tdifd' is not defined

# BM25
* corpus_clean_tokenized
* corpus_original

In [35]:
query_bm25 = "daughter spiderman"

In [36]:
bm25_model = build_bm25_model(corpus_clean_tokenized)

In [37]:
result_bm25 = search_bm25(query_bm25, bm25_model, corpus_original)

In [38]:
result_bm25

Unnamed: 0,doc_index,reviews,scores
0,7627,The one child policy skews gender demographics...,9.938928
1,8536,Fixating on personal lives results in infringi...,9.019144
2,7275,No clear dividing line between public and priv...,8.761588
3,5379,It is for the individuals to decide whether th...,7.415137
4,395,The painting should remain hanging as a remind...,7.369124
...,...,...,...
8669,8637,Whilst the Indian government may have policies...,0.000000
8670,8655,This policy breaks down important inter-govern...,0.000000
8671,8639,Commodifying women. Surely providing a financ...,0.000000
8672,8640,Autonomy (Please note that this argument canno...,0.000000


# qrels

In [2]:
path = './data/'

# Leer archivo comprimido .jsonl.gz
with gzip.open(path + 'train.jsonl.gz', 'rt', encoding='utf-8') as file:
    lines = file.readlines()

# Procesar el archivo línea por línea y convertirlo en un DataFrame
queriesdt = [json.loads(line) for line in lines]

# Convertir en DataFrame
queries = pd.DataFrame(queriesdt)
queries[['_id','text', 'query']]


NameError: name 'gzip' is not defined

In [66]:
qrels_dict= queries[['_id','text', 'query']].groupby('query')['_id'].apply(list).to_dict()

In [68]:
qrels_dict

{"'you are an attorney'": ['test-law-lghbacpsba-pro04a'],
 '2% of non-voters say it is good to have two parties.': ['training-politics-viwvfpps-con04a'],
 "Assad's government what does it do": ['training-international-mepdwhwhwi-con04b'],
 'Jeremy Corbyn what is his political agenda': ['training-politics-yppplhbmlgl-pro03a'],
 '____ is the right of being freedomd from the government without endangering human life.': ['training-philosophy-iilepphbf-pro01b'],
 "________ is a type of assimilation that is inherently based on a nation's heritage.": ['training-culture-tphwpac-con02b'],
 '________.does not exist in the era of invention': ['training-science-cidfiphwa-pro05b'],
 'a common misunderstanding between the public and the private sector is that the government': ['training-economy-eptppppgh-pro01b'],
 'a country should not be burdened by a country': ['test-politics-pgsimhwoia-pro01b'],
 'a gay couple can be compelled by law to testify or provide evidence against one another.': ['traini