In [13]:
%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [14]:
from collections import defaultdict
import csv
from dataclasses import dataclass
import gzip
import itertools
import os
from pathlib import Path
from pprint import pprint
import re
import sys
from typing import Optional
if '..' not in sys.path: sys.path.append('..')

import mteb
import numpy as np
import pandas as pd
from tqdm import tqdm

from mllm.utils.utils import write_tsv, read_tsv

# MTEB Datasets
## Common

In [15]:
# Root folder for all datasets. `Path.home()` resolves the user's home directory
# on every platform; expanding '$HOME' by hand leaves the literal string '$HOME'
# in the path whenever that variable is not set.
DATA_PATH = Path.home() / 'data'
MTEB_PATH = DATA_PATH / 'mteb'

# FEVER export location; the folder is created up front so the writer cells
# below can open their output files directly.
FEVER_PATH = DATA_PATH / 'fever'
FEVER_PATH.mkdir(parents=True, exist_ok=True)

# Base file names. The per-split query/qrels files are derived from these by
# inserting `_<split>` before the extension.
FEVER_QS_FPATH = FEVER_PATH / 'queries.tsv'
FEVER_QRELS_FPATH = FEVER_PATH / 'qrels.tsv'
FEVER_DOCS_FPATH = FEVER_PATH / 'docs.tsv'
FEVER_DOCS_OFF_FPATH = FEVER_PATH / 'docs_offsets.tsv'

In [16]:
# Fetch every MTEB task of type 'Retrieval' and print them one per line.
# `tasks` is reused by the language-count cell below.
tasks = mteb.get_tasks(task_types=['Retrieval'])
for task in tasks:
    print(task)

SadeemQuestionRetrieval(name='SadeemQuestionRetrieval', languages=['ara'])
AppsRetrieval(name='AppsRetrieval', languages=['eng', 'python'])
CodeEditSearchRetrieval(name='CodeEditSearchRetrieval', languages=['c', 'c++', 'go', '...'])
CodeFeedbackMT(name='CodeFeedbackMT', languages=['eng'])
CodeFeedbackST(name='CodeFeedbackST', languages=['eng'])
CodeSearchNetCCRetrieval(name='CodeSearchNetCCRetrieval', languages=['go', 'java', 'javascript', '...'])
CodeSearchNetRetrieval(name='CodeSearchNetRetrieval', languages=['go', 'java', 'javascript', '...'])
CodeTransOceanContestRetrieval(name='CodeTransOceanContest', languages=['c++', 'python'])
CodeTransOceanDLRetrieval(name='CodeTransOceanDL', languages=['python'])
COIRCodeSearchNetRetrieval(name='COIRCodeSearchNetRetrieval', languages=['go', 'java', 'javascript', '...'])
CosQARetrieval(name='CosQA', languages=['eng', 'python'])
StackOverflowQARetrieval(name='StackOverflowQA', languages=['eng'])
SyntheticText2SQLRetrieval(name='SyntheticText2SQ

In [5]:
# Tally how many retrieval tasks list each language, then print the
# (language, count) pairs for languages that appear in three or more tasks.
langs = defaultdict(int)
for task in tasks:
    for lang in task.languages:
        langs[lang] = langs[lang] + 1
print([pair for pair in langs.items() if pair[1] >= 3])

[('ara', 9), ('eng', 92), ('python', 8), ('go', 4), ('java', 4), ('javascript', 4), ('php', 4), ('ruby', 4), ('dan', 5), ('deu', 18), ('ell', 3), ('fra', 15), ('jpn', 13), ('kor', 9), ('ben', 6), ('fin', 5), ('hin', 10), ('ind', 4), ('ita', 5), ('nob', 3), ('pol', 18), ('por', 5), ('ron', 3), ('rus', 16), ('slk', 3), ('spa', 13), ('swe', 4), ('tam', 3), ('tel', 5), ('tha', 6), ('tur', 3), ('vie', 5), ('yor', 3), ('zho', 13), ('fas', 9), ('swa', 3), ('cmn', 10)]


## Fever

In [10]:
from mteb import FEVER

In [11]:
# Create the FEVER task object; the bare name on the last line displays its repr.
fever = FEVER()
fever

FEVER(name='FEVER', languages=['eng'])

In [12]:
# Load the dataset. The cells below read `fever.corpus`, `fever.queries` and
# `fever.relevant_docs` after this call.
fever.load_data()

Downloading readme:   0%|          | 0.00/1.46k [00:00<?, ?B/s]

Map:   0%|          | 0/7937 [00:00<?, ? examples/s]

Filter:   0%|          | 0/123142 [00:00<?, ? examples/s]

In [10]:
# Show which splits the corpus has, then the size of each split together with
# its first two documents.
print('splits:', [*fever.corpus.keys()])
for split, docs in fever.corpus.items():
    print(f'{split}. Docs: {len(docs)}')
    for doc_id in itertools.islice(docs.keys(), 2):
        doc = docs[doc_id]
        print(f'{split}. {doc_id}. {doc}')

splits: ['train', 'dev', 'test']
train. Docs: 5416568
train. 1928_in_association_football. {'title': '1928 in association football', 'text': 'The following are the football ( soccer ) events of the year 1928 throughout the world .'}
train. 1986_NBA_Finals. {'title': '1986 NBA Finals', 'text': "The 1986 NBA Finals was the championship round of the 1985 -- 86 NBA season . It pitted the Eastern Conference champion Boston Celtics against the Western Conference champion Houston Rockets , in a rematch of the 1981 Finals ( only Allen Leavell and Robert Reid remained from the Rockets ' 1981 team ) . The Celtics defeated the Rockets four games to two to win their 16th NBA championship . The championship would be the Celtics ' last until the 2008 NBA Finals . Larry Bird was named the Finals MVP .   On another note , this series marked the first time the `` NBA Finals '' branding was officially used , as they dropped the `` NBA World Championship Series '' branding which had been in use since the

In [11]:
# The 'dev' and 'test' corpora are expected to equal the 'train' corpus, so
# only the 'train' entry is kept and the other two are replaced with None.
# Splits that are already None are skipped, which keeps the cell re-runnable:
# without that guard a second execution would compare a dict against None and
# fail the assertion.
splits_to_drop = [name for name in ('dev', 'test') if fever.corpus[name] is not None]
# Verify every remaining duplicate before clearing any of them.
assert all(fever.corpus[name] == fever.corpus['train'] for name in splits_to_drop)
for name in splits_to_drop:
    fever.corpus[name] = None

In [12]:
# Per split: number of queries, followed by the first two queries together with
# the Python type of their ids.
for split, qs in fever.queries.items():
    print(f'{split}. Queries: {len(qs)}')
    for qid in itertools.islice(qs.keys(), 2):
        id_type = type(qid)
        print(f'{split}. {qid}: {id_type}. {qs[qid]}')

train. Queries: 109810
train. 75397: <class 'str'>. Nikolaj Coster-Waldau worked with the Fox Broadcasting Company.
train. 150448: <class 'str'>. Roman Atwood is a content creator.
dev. Queries: 6666
dev. 137334: <class 'str'>. Fox 2000 Pictures released the film Soul Food.
dev. 111897: <class 'str'>. Telemundo is a English-language television network.
test. Queries: 6666
test. 163803: <class 'str'>. Ukrainian Soviet Socialist Republic was a founding participant of the UN.
test. 70041: <class 'str'>. 2 Hearts is a musical composition by Minogue.


In [20]:
# Spot check of a single query. Query ids are strings (see the preview output
# above), so the key is '150448' rather than the integer 150448.
fever.queries['train']['150448']

'Roman Atwood is a content creator.'

In [24]:
# Per split: number of queries that have relevance judgements, followed by the
# first two queries and their {doc id: relevance} mappings.
for split, reldocs in fever.relevant_docs.items():
    print(f'{split}. Reldocs: {len(reldocs)}')
    for qid, judged in itertools.islice(reldocs.items(), 2):
        query = fever.queries[split][qid]
        print(f'{split}. {qid}. {query}\n    {judged}')

train. Reldocs: 109810
train. 75397. Nikolaj Coster-Waldau worked with the Fox Broadcasting Company.
    {'Fox_Broadcasting_Company': 1, 'Nikolaj_Coster-Waldau': 1}
train. 150448. Roman Atwood is a content creator.
    {'Roman_Atwood': 1}
dev. Reldocs: 6666
dev. 137334. Fox 2000 Pictures released the film Soul Food.
    {'Soul_Food_(film)': 1}
dev. 111897. Telemundo is a English-language television network.
    {'Telemundo': 1, 'Hispanic_and_Latino_Americans': 1}
test. Reldocs: 6666
test. 163803. Ukrainian Soviet Socialist Republic was a founding participant of the UN.
    {'Ukrainian_Soviet_Socialist_Republic': 1, 'United_Nations': 1}
test. 70041. 2 Hearts is a musical composition by Minogue.
    {'2_Hearts_(Kylie_Minogue_song)': 1}


In [13]:
# Sanity-check the relevance judgements of every split against the corpus:
#   * print each judged doc id that is not a corpus key as-is,
#   * confirm that such ids are found once the surrounding double quotes are
#     removed and doubled quotes ("") are collapsed to a single one,
#   * record the largest number of judged docs per query and the set of
#     relevance values that occur.
corpus = fever.corpus['train']
for split in ('train', 'dev', 'test'):
    reldocs = fever.relevant_docs[split]
    max_sz, vals = 0, set()
    for reldoc in reldocs.values():
        if len(reldoc) > max_sz:
            max_sz = len(reldoc)
        for docid_src, val in reldoc.items():
            if docid_src not in corpus:
                print(docid_src)
            docid = docid_src
            if docid[0] == '"' and docid[-1] == '"':
                docid = docid[1:-1].replace('""', '"')
            assert docid in corpus, f'`{docid_src}` -> `{docid}`'
            vals.add(val)
    print(f'{split}. max_sz: {max_sz}. vals: {vals}')

"The_""Chirping""_Crickets"
"The_""Chirping""_Crickets"
"""Heroes""_(David_Bowie_album)"
train. max_sz: 24. vals: {1}
dev. max_sz: 16. vals: {1}
test. max_sz: 15. vals: {1}


In [14]:
def write_qs(queries: dict[str, str], fpath: Path) -> pd.DataFrame:
    """Save queries as a two-column table via `write_tsv` and return that table.

    Args:
        queries: Mapping from query id (a string holding an integer) to query text.
        fpath: Destination handed to `write_tsv`.

    Returns:
        DataFrame with columns `queryid` (ids converted to int) and `query`,
        in the iteration order of `queries`.

    Raises:
        AssertionError: If a query text contains a tab character.
        ValueError: If a query id cannot be converted to int.
    """
    rows: list[tuple[int, str]] = []
    for raw_qid, text in queries.items():
        # Tabs are rejected; presumably they would clash with the TSV column separator.
        assert '\t' not in text, text
        rows.append((int(raw_qid), text))
    df = pd.DataFrame({
        'queryid': [qid for qid, _ in rows],
        'query': [text for _, text in rows],
    })
    write_tsv(df, fpath)
    return df

# Write one query file per split next to FEVER_QS_FPATH
# (queries_train.tsv, queries_dev.tsv, ...).
# `with_name` replaces only the last path component, so the result is also
# correct when the base path is relative. Joining `parent` with the stringified
# full path works only for absolute base paths, because an absolute right-hand
# operand discards the left one.
for split, qs in fever.queries.items():
    fpath = FEVER_QS_FPATH.with_name(f'{FEVER_QS_FPATH.stem}_{split}.tsv')
    print(f'Writing {len(qs)} queries into {fpath}')
    write_qs(qs, fpath)

Writing 109810 queries into /home/misha/data/fever/queries_train.tsv
Writing 6666 queries into /home/misha/data/fever/queries_dev.tsv
Writing 6666 queries into /home/misha/data/fever/queries_test.tsv


In [15]:
def write_docs(docs: dict[str, dict[str, str]], docs_fpath: Path, docs_off_fpath: Path) -> tuple[pd.DataFrame, dict[str, int]]:
    """Dump the corpus to a TSV file and record the byte offset of every record.

    Documents are written in sorted order of their string ids, one per line, as
    `<docidn>\\t<docid>\\t<title>\\t<text>\\n`, where `docidn` is the 0-based
    position of the id in that sorted order.

    Args:
        docs: Mapping from document id to a dict with 'title' and 'text' entries.
        docs_fpath: Output file for the documents (UTF-8).
        docs_off_fpath: Destination handed to `write_tsv` for the offsets table.

    Returns:
        A pair `(df_off, docid_to_num)`: `df_off` has columns `docidn` and
        `offset` (byte position where the record starts in `docs_fpath`), and
        `docid_to_num` maps each original document id to its `docidn`.

    Raises:
        AssertionError: If a title or text contains a tab character.
    """
    n_docs = len(docs)
    docids = sorted(docs.keys())
    pbar = tqdm(docids, total=n_docs, desc='Fever corpus dump', unit='doc')
    docidn = np.arange(n_docs)
    # Explicit 64-bit offsets: the platform default integer may be 32-bit, which
    # cannot address positions in files of 2 GiB or more.
    docoff = np.zeros(n_docs, dtype=np.int64)
    docid_to_num: dict[str, int] = {}
    off = 0
    # Binary mode with manual byte counting: `tell()` on a text-mode file is
    # documented as an opaque cookie rather than a byte count, and it flushes
    # the write buffer on every call. Binary mode also means '\n' is written
    # as-is on every platform.
    with open(docs_fpath, 'wb') as f:
        for i, docid in enumerate(pbar):
            # Plain Python int, matching the declared return type.
            docid_to_num[docid] = i
            doc = docs[docid]
            title, text = doc['title'], doc['text']
            # NOTE(review): only tabs are checked; a newline inside title/text
            # would still split a record across lines - confirm the data has none.
            assert '\t' not in title and '\t' not in text
            record = f'{i}\t{docid}\t{title}\t{text}\n'.encode('utf-8')
            docoff[i] = off
            f.write(record)
            off += len(record)
    df_off = pd.DataFrame({'docidn': docidn, 'offset': docoff})
    write_tsv(df_off, docs_off_fpath)
    return df_off, docid_to_num

# Dump the shared corpus once. `docid_to_num` is used below by `write_qrels`
# to translate string doc ids into numeric ones.
df_off, docid_to_num = write_docs(corpus, FEVER_DOCS_FPATH, FEVER_DOCS_OFF_FPATH)

Fever corpus dump: 100%|██████████| 5416568/5416568 [00:41<00:00, 131451.23doc/s]


In [17]:
def _resolve_docid(docid: str, docid_to_num: dict[str, int]) -> int:
    """Map a qrels doc id to its numeric id, undoing CSV-style quoting if needed."""
    # An id that exists verbatim is never rewritten.
    if docid in docid_to_num:
        return docid_to_num[docid]
    # Some qrels ids arrive CSV-escaped, e.g. `"The_""Chirping""_Crickets"`:
    # strip the surrounding quotes and collapse doubled quotes. The length
    # guard keeps empty ids and a lone `"` from being indexed or mangled.
    if len(docid) >= 2 and docid[0] == '"' and docid[-1] == '"':
        unescaped = docid[1:-1].replace('""', '"')
        if unescaped in docid_to_num:
            return docid_to_num[unescaped]
        raise KeyError(f'{docid} (also tried {unescaped})')
    raise KeyError(docid)


def write_qrels(qrels: dict[str, dict[str, int]], docid_to_num: dict[str, int], fpath: Path) -> pd.DataFrame:
    """Flatten relevance judgements into (queryid, docidn) rows and save them.

    Args:
        qrels: Mapping from query id (a string holding an integer) to a mapping
            of document id -> relevance; every relevance must be 1.
        docid_to_num: Mapping from document id to numeric document id, as
            returned by `write_docs`.
        fpath: Destination handed to `write_tsv`.

    Returns:
        DataFrame with columns `queryid` and `docidn`, one row per judged
        (query, document) pair, in iteration order.

    Raises:
        AssertionError: If a relevance value differs from 1.
        KeyError: If a document id is unknown, both verbatim and after undoing
            CSV-style quoting.
        ValueError: If a query id cannot be converted to int.
    """
    qids, dids = [], []
    for qid, reldocs in qrels.items():
        qid = int(qid)
        for docid, num in reldocs.items():
            assert num == 1
            dids.append(_resolve_docid(docid, docid_to_num))
            qids.append(qid)
    df_qrels = pd.DataFrame({'queryid': qids, 'docidn': dids})
    write_tsv(df_qrels, fpath)
    return df_qrels

# Write one qrels file per split next to FEVER_QRELS_FPATH
# (qrels_train.tsv, qrels_dev.tsv, ...).
# `with_name` replaces only the last path component, so the result is also
# correct when the base path is relative. Joining `parent` with the stringified
# full path works only for absolute base paths, because an absolute right-hand
# operand discards the left one.
for split, qrels in fever.relevant_docs.items():
    fpath = FEVER_QRELS_FPATH.with_name(f'{FEVER_QRELS_FPATH.stem}_{split}.tsv')
    print(f'Writing {len(qrels)} qrels into {fpath}')
    write_qrels(qrels, docid_to_num, fpath)

Writing 109810 qrels into /home/misha/data/fever/qrels_train.tsv
Writing 6666 qrels into /home/misha/data/fever/qrels_dev.tsv
Writing 6666 qrels into /home/misha/data/fever/qrels_test.tsv


## SadeemQuestionRetrieval

In [17]:
from mteb import SadeemQuestionRetrieval

In [18]:
# Create the SadeemQuestionRetrieval task object and print its repr.
# NOTE: `ds` is reassigned by each dataset section further down.
ds = SadeemQuestionRetrieval()
print(ds)

SadeemQuestionRetrieval(name='SadeemQuestionRetrieval', languages=['ara'])


In [19]:
# Load the dataset. The cells below read `ds.corpus` and `ds.relevant_docs`.
ds.load_data()

In [25]:
# For every corpus split: number of judged queries, followed by the first three
# queries with the ids and relevance values of their judged documents.
for split in ds.corpus.keys():
    reldocs = ds.relevant_docs[split]
    print(split, len(reldocs))
    for qid, judged in itertools.islice(reldocs.items(), 3):
        print(qid, [*judged.keys()], [*judged.values()])

test 2089
1 ['29'] [1]
2 ['50'] [1]
3 ['83'] [1]


In [44]:
# Show the corpus splits, then each split's size and a one-line preview (first
# 200 characters, newlines shown as a literal \n) of its first two documents.
print('splits:', [*ds.corpus.keys()])
for split, docs in ds.corpus.items():
    print(f'{split}. Docs: {len(docs)}')
    for doc_id in itertools.islice(docs.keys(), 2):
        body = docs[doc_id]['text']
        txt = body.replace('\n', '\\n')
        print(f'{split}. {doc_id}. {txt[:200]}')

splits: ['test']
test. Docs: 22979
test. 1. وتعد الضميرية من أهم مراكز المدينة المنورة وتقع في الجزء الجنوبي الشرقي من منطقة المدينة المنورة على بعد (90) كلم وهي تابعة لمحافظة الحناكية وتبعد عنها 166 كم . وتبعد عن محافظة مهد الذهب 85 كم
test. 2. في الاحتفال بيوم النباتات الطلابية في هلسنكي في 13 مايو 1848، تم رفع علم اتحاد طلاب هلسنكي. كان علمًا مُخيطًا من قماش الحرير الأبيض للاحتفال، مع شعار الأسد لدوقية فنلندا الكبرى محاطًا بأوراق الغار. تم


In [45]:
# First three (query id, {doc id: relevance}) pairs of the test split, taken
# lazily instead of building the full list and slicing it.
list(itertools.islice(ds.relevant_docs['test'].items(), 3))

[('1', {'29': 1}), ('2', {'50': 1}), ('3', {'83': 1})]

## StackOverflowQARetrieval

In [26]:
from mteb import StackOverflowQARetrieval

In [29]:
# Switch `ds` to the StackOverflowQARetrieval task and load its data.
ds = StackOverflowQARetrieval()
ds.load_data()

Downloading readme:   0%|          | 0.00/2.08k [00:00<?, ?B/s]

Map:   0%|          | 0/1994 [00:00<?, ? examples/s]

Filter:   0%|          | 0/19931 [00:00<?, ? examples/s]

In [30]:
# Which splits have relevance judgements.
print(list(ds.relevant_docs.keys()))


['test']


In [31]:
# Show the corpus splits, then each split's size and a one-line preview (first
# 200 characters, newlines shown as a literal \n) of its first two documents.
# Documents of this dataset are plain strings.
print('splits:', [*ds.corpus.keys()])
for split, docs in ds.corpus.items():
    print(f'{split}. Docs: {len(docs)}')
    for doc_id in itertools.islice(docs.keys(), 2):
        body = docs[doc_id]
        txt = body.replace('\n', '\\n')
        print(f'{split}. {doc_id}. {txt[:200]}')


splits: ['test']
test. Docs: 19931
test. d1.  The SearchDatabase class that SphinxSearch extends was changed from REL1_31 to REL1_32. It now requires you to define doSearchTextInDB and doSearchTitleInDB methods.\nSee REL1_31 https://doc.wikimedi
test. d2.  you have to write below way because CTE is part of the SELECT not the UPDATE\nupdate work_request \nset name = name || '_old'\n\n   where exists (\n      with wr_double as\n         (select...)\n    


In [26]:
# First three (query id, {doc id: relevance}) pairs of the test split, taken
# lazily instead of building the full list and slicing it.
list(itertools.islice(ds.relevant_docs['test'].items(), 3))

[('q17938', {'d17938': 1}),
 ('q17939', {'d17939': 1}),
 ('q17940', {'d17940': 1})]

## CodeFeedbackMT

In [32]:
from mteb import CodeFeedbackMT

In [33]:
# Switch `ds` to the CodeFeedbackMT task and load its data.
ds = CodeFeedbackMT()
ds.load_data()

Downloading readme:   0%|          | 0.00/2.01k [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/51.3M [00:00<?, ?B/s]

Generating corpus split:   0%|          | 0/66383 [00:00<?, ? examples/s]

Casting the dataset:   0%|          | 0/66383 [00:00<?, ? examples/s]

Downloading data:   0%|          | 0.00/125M [00:00<?, ?B/s]

Generating queries split:   0%|          | 0/66383 [00:00<?, ? examples/s]

Casting the dataset:   0%|          | 0/66383 [00:00<?, ? examples/s]

Downloading data:   0%|          | 0.00/603k [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/154k [00:00<?, ?B/s]

Generating train split:   0%|          | 0/53106 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/13277 [00:00<?, ? examples/s]

Casting the dataset:   0%|          | 0/13277 [00:00<?, ? examples/s]

Map:   0%|          | 0/13277 [00:00<?, ? examples/s]

Filter:   0%|          | 0/66383 [00:00<?, ? examples/s]

In [34]:
# Show the corpus splits, then each split's size and a one-line preview (first
# 200 characters, newlines shown as a literal \n) of its first two documents.
# Documents of this dataset are plain strings.
print('splits:', [*ds.corpus.keys()])
for split, docs in ds.corpus.items():
    print(f'{split}. Docs: {len(docs)}')
    for doc_id in itertools.islice(docs.keys(), 2):
        body = docs[doc_id]
        txt = body.replace('\n', '\\n')
        print(f'{split}. {doc_id}. {txt[:200]}')


splits: ['test']
test. Docs: 66383
test. c1.  Regrettably, there are no standard Python libraries available for quantum computing that could generate a simple block of code relevant to this conversation. Quantum computing requires specialized so
test. c2.  Creating a Mozilla Firefox browser add-on involves programming, specifically using JavaScript and designing with CSS. We'll outline the key steps below for creating an add-on that modifies scrollbars
