In [1]:
%load_ext autoreload
%autoreload 2

In [37]:
from collections import defaultdict
import csv
from dataclasses import dataclass
import gzip
import itertools
import os
from pathlib import Path
from pprint import pprint
import re
import sys
from typing import Optional
if '..' not in sys.path: sys.path.append('..')

import numpy as np
import pandas as pd
from tqdm import tqdm

import mteb

In [3]:
# Root directory for local data: <user home>/data.
# Path.home() is used instead of expanding '$HOME' by hand: expandvars leaves
# the literal text '$HOME' in place when that variable is not set, which would
# silently produce a relative directory named '$HOME'.
DATA_PATH = Path.home() / 'data'
# The `mteb` subdirectory of DATA_PATH.
MTEB_PATH = DATA_PATH / 'mteb'

In [4]:
# Fetch every task whose type is 'Retrieval' and print them one per line.
tasks = mteb.get_tasks(task_types=['Retrieval'])
for retrieval_task in tasks:
    print(retrieval_task)

CodeEditSearchRetrieval(name='CodeEditSearchRetrieval', languages=['c', 'c++', 'go', '...'])
CodeSearchNetRetrieval(name='CodeSearchNetRetrieval', languages=['go', 'java', 'javascript', '...'])
DanFeverRetrieval(name='DanFeverRetrieval', languages=['dan'])
TV2Nordretrieval(name='TV2Nordretrieval', languages=['dan'])
TwitterHjerneRetrieval(name='TwitterHjerneRetrieval', languages=['dan'])
GerDaLIR(name='GerDaLIR', languages=['deu'])
GerDaLIRSmall(name='GerDaLIRSmall', languages=['deu'])
GermanDPR(name='GermanDPR', languages=['deu'])
GermanGovServiceRetrieval(name='GermanGovServiceRetrieval', languages=['deu'])
GermanQuADRetrieval(name='GermanQuAD-Retrieval', languages=['deu'])
LegalQuAD(name='LegalQuAD', languages=['deu'])
GreekCivicsQA(name='GreekCivicsQA', languages=['ell'])
AILACasedocs(name='AILACasedocs', languages=['eng'])
AILAStatutes(name='AILAStatutes', languages=['eng'])
AlphaNLI(name='AlphaNLI', languages=['eng'])
ARCChallenge(name='ARCChallenge', languages=['eng'])
ArguAna(n

In [5]:
# Look up the task registered under the name 'FEVER'; as the last expression
# of the cell, the returned object is displayed.
mteb.get_task('FEVER')

FEVER(name='FEVER', languages=['eng'])

In [6]:
from mteb import FEVER

In [7]:
# Instantiate the FEVER task class directly and display its repr.
fever = FEVER()
fever

FEVER(name='FEVER', languages=['eng'])

In [8]:
# Load the task's data. The cells below read `fever.corpus`, `fever.queries`
# and `fever.relevant_docs`, so this presumably has to run before them.
fever.load_data()

Map:   0%|          | 0/140085 [00:00<?, ? examples/s]

Map:   0%|          | 0/8079 [00:00<?, ? examples/s]

Map:   0%|          | 0/7937 [00:00<?, ? examples/s]

In [9]:
# Overview of the corpus: list the splits, then for each split print the
# number of documents and the first two documents as a preview.
print('splits:', list(fever.corpus.keys()))
for split, split_docs in fever.corpus.items():
    print(f'{split}. Docs: {len(split_docs)}')
    for doc_id in itertools.islice(split_docs.keys(), 2):
        print(f'{split}. {doc_id}. {split_docs[doc_id]}')

splits: ['train', 'dev', 'test']
train. samples: 5416568
train. 1928_in_association_football. {'title': '1928 in association football', 'text': 'The following are the football ( soccer ) events of the year 1928 throughout the world .'}
train. 1986_NBA_Finals. {'title': '1986 NBA Finals', 'text': "The 1986 NBA Finals was the championship round of the 1985 -- 86 NBA season . It pitted the Eastern Conference champion Boston Celtics against the Western Conference champion Houston Rockets , in a rematch of the 1981 Finals ( only Allen Leavell and Robert Reid remained from the Rockets ' 1981 team ) . The Celtics defeated the Rockets four games to two to win their 16th NBA championship . The championship would be the Celtics ' last until the 2008 NBA Finals . Larry Bird was named the Finals MVP .   On another note , this series marked the first time the `` NBA Finals '' branding was officially used , as they dropped the `` NBA World Championship Series '' branding which had been in use since 

In [2]:
# Replace the 'dev' and 'test' corpus entries with None (the keys stay).
# NOTE(review): in top-to-bottom order this runs before the later cells that
# compare fever.corpus['train'] with the 'dev'/'test' entries; once those are
# None, the comparisons no longer compare corpora.
fever.corpus['dev'] = fever.corpus['test'] = None

NameError: name 'fever' is not defined

In [18]:
# For each split, print the number of queries and the first two queries.
for split, split_queries in fever.queries.items():
    print(f'{split}. Queries: {len(split_queries)}')
    for qid in itertools.islice(split_queries.keys(), 2):
        print(f'{split}. {qid}. {split_queries[qid]}')

train. Queries: 109810
train. 75397. Nikolaj Coster-Waldau worked with the Fox Broadcasting Company.
train. 150448. Roman Atwood is a content creator.
dev. Queries: 6666
dev. 137334. Fox 2000 Pictures released the film Soul Food.
dev. 111897. Telemundo is a English-language television network.
test. Queries: 6666
test. 163803. Ukrainian Soviet Socialist Republic was a founding participant of the UN.
test. 70041. 2 Hearts is a musical composition by Minogue.


In [20]:
# For each split, print how many queries have relevance judgements and show
# the judgements of the first two queries.
for split, split_reldocs in fever.relevant_docs.items():
    print(f'{split}. Reldocs: {len(split_reldocs)}')
    for qid in itertools.islice(split_reldocs.keys(), 2):
        print(f'{split}. {qid}. {split_reldocs[qid]}')

train. Reldocs: 109810
train. 75397. {'Fox_Broadcasting_Company': 1, 'Nikolaj_Coster-Waldau': 1}
train. 150448. {'Roman_Atwood': 1}
dev. Reldocs: 6666
dev. 137334. {'Soul_Food_(film)': 1}
dev. 111897. {'Telemundo': 1, 'Hispanic_and_Latino_Americans': 1}
test. Reldocs: 6666
test. 163803. {'Ukrainian_Soviet_Socialist_Republic': 1, 'United_Nations': 1}
test. 70041. {'2_Hearts_(Kylie_Minogue_song)': 1}


In [1]:
# Validate the relevance judgements of every split:
#   * each referenced document id, after removing quote wrapping, must be a
#     key of the 'train' corpus;
#   * track the largest number of documents attached to a single query and
#     the set of distinct relevance values.
for split in ('train', 'dev', 'test'):
    reldocs = fever.relevant_docs[split]
    max_sz, vals = 0, set()
    for reldoc in reldocs.values():
        if len(reldoc) > max_sz:
            max_sz = len(reldoc)
        for docid_src, val in reldoc.items():
            # Ids wrapped in double quotes are unwrapped and any doubled
            # inner quote is collapsed to a single one before the lookup.
            is_quoted = docid_src[0] == '"' and docid_src[-1] == '"'
            docid = docid_src[1:-1].replace('""', '"') if is_quoted else docid_src
            assert docid in fever.corpus['train'], f'`{docid_src}` -> `{docid}`'
            vals.add(val)
    print(f'{split}. max_sz: {max_sz}. vals: {vals}')

NameError: name 'fever' is not defined

In [39]:
def proc_docid(docid: str) -> str:
    """Remove quote wrapping from a document id.

    When ``docid`` starts and ends with a double quote, the two outer quotes
    are dropped and every doubled inner quote (two consecutive double-quote
    characters) is collapsed to a single one. Any other id is returned
    unchanged.

    Args:
        docid: Raw document id.

    Returns:
        The unwrapped id, or ``docid`` itself when it is not quote-wrapped.
    """
    # Require at least two characters so the opening and closing quote are
    # distinct: an empty id must not raise IndexError, and a lone quote
    # character must not be turned into an empty string.
    if len(docid) >= 2 and docid[0] == '"' and docid[-1] == '"':
        return docid[1:-1].replace('""', '"')
    return docid

# Find corpus ids that collide once quote wrapping is removed.
docs = fever.corpus['train']

# First pass: count how many raw ids map to each processed id.
cnt = defaultdict(int)
for raw_id in docs:
    cnt[proc_docid(raw_id)] += 1

# Second pass: report every raw id whose processed form is shared, together
# with the document stored under it.
for docid in docs:
    docid_p = proc_docid(docid)
    if cnt[docid_p] <= 1:
        continue
    print(f'docid = `{docid}`. docid_p = `{docid_p}`')
    print(docs[docid])


"Thrive" Thrive
{'title': '"Thrive"', 'text': 'Thrive is a 2015 short documentary film about Matthew Whitaker , a piano prodigy who has been blind since birth . The film was directed and produced by Paul Szynol and has played at festivals worldwide , including North America , Europe , and Asia . The film includes appearances by Jonathan Batiste , Dr Lonnie Smith , and Chad Smith .'}
"It's_Alright" It's_Alright
{'title': '"It\'s Alright"', 'text': "It 's Alright , performed by Chanté Moore . A music video was made for the original/remix versions of the song ."}
"Awesome" Awesome
{'title': '"Awesome"', 'text': "Self-described as `` Part band , part art collective . '' While they reject the `` rock band '' label , Lane Czaplinski , artistic director of On the Boards remarks , `` If they are not rock musicians , `` there is rock payoff . '' Czaplinski has compared them to Polyphonic Spree   `` Awesome '' began as a cabaret act thrown together by seven experienced fringe theater actors . ``

In [27]:
# Sanity check: compare two string literals that contain embedded double
# quotes and display whether Python considers them equal.
'The_"Chirping"_Crickets' == 'The_"Chirping"_Crickets'

True

In [13]:
# Identity check: are the 'train' and 'dev' corpus entries the very same object?
fever.corpus['train'] is fever.corpus['dev']

False

In [None]:
def write_docs(docs: dict[str, dict[str, str]], fpath: Path) -> pd.DataFrame:
    """Dump a corpus to a text file and return an offset index for it.

    Every document becomes one line made of four tab-separated fields
    (docidn, docid, title, text) terminated by a newline, where ``docidn`` is
    the 0-based position of the document in ``docs``.

    Args:
        docs: Mapping from document id to a dict with 'title' and 'text' keys.
        fpath: Destination file; it is created or overwritten.

    Returns:
        DataFrame with one row per document, in write order, with columns
        ``docidn`` (sequential number), ``docid`` (original id) and
        ``docoff`` (value of ``f.tell()`` taken right before the document's
        line was written).
    """
    n_docs = len(docs)
    pbar = tqdm(docs.keys(), total=n_docs, desc='Fever corpus dump', unit='doc')
    docidn = np.arange(n_docs)
    docids, docoff = [], []
    # NOTE(review): titles and texts are written verbatim; a tab or newline
    # inside either would break the one-record-per-line layout. Confirm the
    # corpus contains none.
    with open(fpath, 'w', encoding='utf-8') as f:
        for i, docid in enumerate(pbar):
            doc = docs[docid]
            title, text = doc['title'], doc['text']
            # Remember where this record starts. In text mode tell() returns
            # an opaque position meant to be passed back to seek() on a
            # text-mode handle of the same file.
            docoff.append(f.tell())
            docids.append(docid)
            f.write(f'{docidn[i]}\t{docid}\t{title}\t{text}\n')
    return pd.DataFrame({'docidn': docidn, 'docid': docids, 'docoff': docoff})

In [24]:
# Scratch experiment: write a few lines to a text file and remember the
# position reported by f.tell() just before each line is written.
fpath = DATA_PATH / 'temp' / 'lines.txt'
fpath.parent.mkdir(parents=True, exist_ok=True)
lines = [
    'abc',
    'def-hij',
    'Hi, General',
    'Nice to meet you, General',
    'Ho ho ho',
    'Glad to see you',
    'Glad to see you too',
    '',
    'mimimir',
]
offsets = []
with open(fpath, 'w', encoding='utf-8') as f:
    for line in lines:
        # Position of the start of this line in the file.
        offsets.append(f.tell())
        f.write(line + '\n')

In [25]:
# Compare each recorded offset with one predicted from the line lengths
# (length of the line plus one for the newline).
# NOTE(review): len() counts characters, so the prediction presumably only
# matches tell() while every line is ASCII and the newline is a single byte.
off_calc = 0
for text, off in zip(lines, offsets):
    print(off_calc, off)
    print(text)
    off_calc = off_calc + len(text) + 1

0 0
abc
4 4
def-hij
12 12
Hi, General
24 24
Nice to meet you, General
50 50
Ho ho ho
59 59
Glad to see you
75 75
Glad to see you too
95 95

96 96
mimimir


In [26]:
# Random-access check: visit the (line, offset) pairs in shuffled order, seek
# to each recorded offset and verify that the line read back matches.
lo = list(zip(lines, offsets))
# Fixed seed so the visiting order (and the printed output) is the same on
# every run.
np.random.default_rng(0).shuffle(lo)
with open(fpath, 'r', encoding='utf-8') as f:
    for expected, off in lo:
        f.seek(off)
        # Strip only the line terminator; a bare rstrip() would also remove
        # trailing spaces and report a false mismatch for such lines.
        actual = f.readline().rstrip('\n')
        print(off, expected, '\n\t', expected == actual)
    
    

59 Glad to see you 
	 True
12 Hi, General 
	 True
96 mimimir 
	 True
75 Glad to see you too 
	 True
0 abc 
	 True
50 Ho ho ho 
	 True
24 Nice to meet you, General 
	 True
95  
	 True
4 def-hij 
	 True


In [10]:
# for docid, doc in fever.corpus['train'].items():
#     assert '\t' not in doc['title'] and '\t' not in doc['text']

In [11]:
# Check with `==` whether the three splits hold equal corpora. The chained
# comparison evaluates (train == dev) and (dev == test).
fever.corpus['train'] == fever.corpus['dev'] == fever.corpus['test']

True

In [9]:
# Replace the 'dev' and 'test' corpus entries with None while keeping the
# keys in place (the alternative of removing the keys with `del` is
# deliberately not used here).
fever.corpus['dev'] = fever.corpus['test'] = None