In [2]:
%load_ext autoreload
%autoreload 2

In [3]:
from collections import defaultdict
import csv
from dataclasses import dataclass
import gzip
import itertools
import os
from pathlib import Path
from pprint import pprint
import re
import sys
from typing import Optional
if '..' not in sys.path: sys.path.append('..')

import mteb
import numpy as np
import pandas as pd
from tqdm import tqdm

from mllm.utils.utils import write_tsv, read_tsv

# MTEB Datasets
## Common

In [4]:
# Root directory for all local datasets, resolved from the current user's home.
# `Path.home()` replaces a manual `$HOME` expansion: `os.path.expandvars`
# leaves the literal text `$HOME` untouched when the variable is not set, which
# would silently create a relative `$HOME/data/...` tree in the working directory.
DATA_PATH = Path.home() / 'data'
MTEB_PATH = DATA_PATH / 'mteb'
# Output directory of the FEVER dump; created up front so that the writer
# cells below can open files inside it.
FEVER_PATH = DATA_PATH / 'fever'
FEVER_PATH.mkdir(parents=True, exist_ok=True)
# Base names of the dump files. The per-split query/qrels files are derived
# from these names by the cells that write them.
FEVER_QS_FPATH = FEVER_PATH / 'queries.tsv'
FEVER_QRELS_FPATH = FEVER_PATH / 'qrels.tsv'
FEVER_DOCS_FPATH = FEVER_PATH / 'docs.tsv'
FEVER_DOCS_OFF_FPATH = FEVER_PATH / 'docs_offsets.tsv'

In [4]:
# Fetch every MTEB task whose type is 'Retrieval' and print one task per line.
# `tasks` is reused by the language statistics cell below.
tasks = mteb.get_tasks(task_types=['Retrieval'])
for task in tasks:
    print(task)

SadeemQuestionRetrieval(name='SadeemQuestionRetrieval', languages=['ara'])
AppsRetrieval(name='AppsRetrieval', languages=['eng', 'python'])
CodeEditSearchRetrieval(name='CodeEditSearchRetrieval', languages=['c', 'c++', 'go', '...'])
CodeFeedbackMT(name='CodeFeedbackMT', languages=['c', 'eng', 'python', '...'])
CodeFeedbackST(name='CodeFeedbackST', languages=['eng', 'go', 'java', '...'])
CodeSearchNetCCRetrieval(name='CodeSearchNetCCRetrieval', languages=['go', 'java', 'javascript', '...'])
CodeSearchNetRetrieval(name='CodeSearchNetRetrieval', languages=['go', 'java', 'javascript', '...'])
CodeTransOceanContestRetrieval(name='CodeTransOceanContest', languages=['c++', 'python'])
CodeTransOceanDLRetrieval(name='CodeTransOceanDL', languages=['eng'])
CosQARetrieval(name='CosQA', languages=['eng', 'python'])
StackOverflowQARetrieval(name='StackOverflowQA', languages=['eng'])
SyntheticText2SQLRetrieval(name='SyntheticText2SQL', languages=['eng', 'sql'])
DanFeverRetrieval(name='DanFeverRetriev

In [5]:
# Count in how many retrieval tasks each language occurs, then show the
# languages that are covered by at least three tasks.
langs = defaultdict(int)
for task in tasks:
    for lang in task.languages:
        langs[lang] = langs[lang] + 1
frequent_langs = [(lang, cnt) for lang, cnt in langs.items() if cnt >= 3]
print(frequent_langs)

[('ara', 7), ('eng', 81), ('python', 8), ('go', 4), ('java', 4), ('javascript', 4), ('php', 4), ('ruby', 4), ('dan', 5), ('deu', 17), ('ell', 3), ('fra', 13), ('jpn', 10), ('kor', 6), ('ben', 4), ('fin', 3), ('hin', 9), ('ita', 5), ('nob', 3), ('pol', 13), ('por', 5), ('ron', 3), ('rus', 9), ('spa', 11), ('swe', 4), ('tam', 3), ('tel', 3), ('tha', 4), ('tur', 3), ('vie', 5), ('zho', 8), ('fas', 4), ('cmn', 10)]


## Fever

In [11]:
from mteb import FEVER

In [12]:
# Instantiate the FEVER retrieval task; the bare name on the last line shows
# its repr as the cell output.
fever = FEVER()
fever

FEVER(name='FEVER', languages=['eng'])

In [14]:
# Load the task data. The cells below read `fever.corpus`, `fever.queries` and
# `fever.relevant_docs`, which are presumably populated by this call, so it
# has to run before them.
fever.load_data()

Map:   0%|          | 0/140085 [00:00<?, ? examples/s]

Map:   0%|          | 0/8079 [00:00<?, ? examples/s]

Map:   0%|          | 0/7937 [00:00<?, ? examples/s]

In [10]:
# Corpus overview: list the splits, then print the size of each split
# together with its first two documents.
print('splits:', list(fever.corpus.keys()))
for split, docs in fever.corpus.items():
    n_docs = len(docs)
    print(f'{split}. Docs: {n_docs}')
    for doc_id in itertools.islice(docs.keys(), 2):
        doc = docs[doc_id]
        print(f'{split}. {doc_id}. {doc}')

In [13]:
# All three splits are expected to carry the very same corpus; verify that and
# keep a single copy of it.
assert fever.corpus['train'] == fever.corpus['dev'] == fever.corpus['test']
# Point 'dev' and 'test' at the 'train' corpus instead of overwriting them
# with None: the duplicate objects are no longer referenced from here, while
# this cell and the corpus preview cell stay re-runnable (with None the assert
# above fails on the second run and `len(docs)` fails in the preview).
fever.corpus['dev'] = fever.corpus['train']
fever.corpus['test'] = fever.corpus['train']

AttributeError: 'FEVER' object has no attribute 'corpus'

In [12]:
# Queries overview: per split, print the number of queries and the first two
# of them together with the Python type of their ids.
for split, qs in fever.queries.items():
    print(f'{split}. Queries: {len(qs)}')
    for qid in itertools.islice(qs.keys(), 2):
        query = qs[qid]
        print(f'{split}. {qid}: {type(qid)}. {query}')

In [20]:
# Spot check: look up one training query by its id (ids are string keys).
fever.queries['train']['150448']

In [24]:
# Relevance judgements overview: per split, print how many queries have
# judgements and show the first two queries with their relevant documents.
for split, reldocs in fever.relevant_docs.items():
    print(f'{split}. Reldocs: {len(reldocs)}')
    for qid, rels in itertools.islice(reldocs.items(), 2):
        query = fever.queries[split][qid]
        print(f'{split}. {qid}. {query}\n    {rels}')

In [13]:
# Consistency check: every document id referenced by the relevance judgements
# must exist in the corpus. Also collects, per split, the largest number of
# relevant documents of a single query and the set of relevance values.
corpus = fever.corpus['train']
for split in ('train', 'dev', 'test'):
    reldocs = fever.relevant_docs[split]
    max_sz, vals = 0, set()
    for reldoc in reldocs.values():
        if len(reldoc) > max_sz:
            max_sz = len(reldoc)
        for docid_src, val in reldoc.items():
            docid = docid_src
            # Report ids that cannot be found in their raw form.
            if docid not in corpus:
                print(docid)
            # Some ids are wrapped in double quotes with inner quotes doubled;
            # undo that before the lookup.
            is_quoted = docid[0] == '"' and docid[-1] == '"'
            if is_quoted:
                docid = docid[1:-1].replace('""', '"')
            assert docid in corpus, f'`{docid_src}` -> `{docid}`'
            vals.add(val)
    print(f'{split}. max_sz: {max_sz}. vals: {vals}')

In [14]:
def write_qs(queries: dict[str, str], fpath: Path) -> pd.DataFrame:
    """Turn a query mapping into a two-column frame and hand it to `write_tsv`.

    Args:
        queries: Mapping from query id to query text. Ids must be convertible
            with `int()`; texts must not contain a tab character.
        fpath: Destination path forwarded to `write_tsv`.

    Returns:
        The frame with columns `queryid` (int) and `query` (str), in the
        iteration order of `queries`.

    Raises:
        AssertionError: If a query text contains a tab.
        ValueError: If a query id is not an integer literal.
    """
    rows = []
    for raw_qid, text in queries.items():
        # A tab inside the text would break the tab-separated layout.
        assert '\t' not in text, text
        rows.append((int(raw_qid), text))
    df = pd.DataFrame({
        'queryid': [qid for qid, _ in rows],
        'query': [text for _, text in rows],
    })
    write_tsv(df, fpath)
    return df

# Write one queries file per split, named `<stem>_<split>.tsv` and placed next
# to FEVER_QS_FPATH.
for split, qs in fever.queries.items():
    # Derive the name from the stem only. Formatting the whole path into the
    # file name and joining it onto the parent works just for absolute paths
    # (an absolute right operand replaces the left one in a `/` join); with a
    # relative data directory the directory part would be duplicated.
    fname = f'{FEVER_QS_FPATH.stem}_{split}.tsv'
    fpath = FEVER_QS_FPATH.with_name(fname)
    print(f'Writing {len(qs)} queries into {fpath}')
    write_qs(qs, fpath)

In [15]:
def write_docs(docs: dict[str, dict[str, str]], docs_fpath: Path, docs_off_fpath: Path) -> tuple[pd.DataFrame, dict[str, int]]:
    """Dump a corpus into a tab-separated file plus an offsets table.

    Documents are written in sorted order of their ids, one per line, as
    `<docidn>\\t<docid>\\t<title>\\t<text>\\n`, where `docidn` is the
    zero-based position of the document in that order. For each document the
    byte offset of the start of its line is recorded.

    The file is written in binary mode and offsets are accumulated from the
    encoded line lengths. `tell()` on a text-mode file is documented as an
    opaque number rather than a byte position and has to be called once per
    record, and text mode may translate `\\n` on some platforms; explicit byte
    counting has neither issue.

    Args:
        docs: Mapping from document id to a dict with `title` and `text` keys.
            Neither value may contain a tab character.
        docs_fpath: Destination of the documents file (UTF-8).
        docs_off_fpath: Destination path of the offsets table, forwarded to
            `write_tsv`.

    Returns:
        A tuple `(df_off, docid_to_num)`: the offsets frame with columns
        `docidn` and `offset`, and the mapping from the original document id
        to its number.

    Raises:
        AssertionError: If a title or a text contains a tab.
    """
    docids = sorted(docs.keys())
    n_docs = len(docids)
    docidn = np.arange(n_docs)
    # Explicit 64-bit offsets: the platform default `int` may be 32-bit, which
    # is too small for multi-gigabyte dumps.
    docoff = np.zeros(n_docs, dtype=np.int64)
    docid_to_num: dict[str, int] = {}
    pbar = tqdm(docids, total=n_docs, desc='Fever corpus dump', unit='doc')
    off = 0
    with open(docs_fpath, 'wb') as f:
        for i, docid in enumerate(pbar):
            doc = docs[docid]
            title, text = doc['title'], doc['text']
            assert '\t' not in title and '\t' not in text
            # Plain Python ints, as promised by the return annotation.
            docid_to_num[docid] = i
            docoff[i] = off
            line = f'{i}\t{docid}\t{title}\t{text}\n'.encode('utf-8')
            f.write(line)
            off += len(line)
    df_off = pd.DataFrame({'docidn': docidn, 'offset': docoff})
    write_tsv(df_off, docs_off_fpath)
    return df_off, docid_to_num

# Dump the corpus (`corpus` is `fever.corpus['train']`, assigned in the
# consistency-check cell above). `docid_to_num` is needed by the qrels dump.
df_off, docid_to_num = write_docs(corpus, FEVER_DOCS_FPATH, FEVER_DOCS_OFF_FPATH)

In [17]:
def write_qrels(qrels: dict[str, dict[str, int]], docid_to_num: dict[str, int], fpath: Path) -> pd.DataFrame:
    """Flatten relevance judgements into (queryid, docidn) rows and hand them to `write_tsv`.

    Args:
        qrels: Mapping from query id to a mapping `document id -> relevance`.
            Query ids must be convertible with `int()`; every relevance value
            must equal 1.
        docid_to_num: Mapping from document id to its numeric id.
        fpath: Destination path forwarded to `write_tsv`.

    Returns:
        The frame with columns `queryid` and `docidn`, one row per
        (query, relevant document) pair.

    Raises:
        AssertionError: If a relevance value differs from 1.
        KeyError: If a document id is missing from `docid_to_num`, also after
            unquoting.
    """
    qids, dids = [], []
    for qid, reldocs in qrels.items():
        qid = int(qid)
        for docid, num in reldocs.items():
            assert num == 1
            # This fix is needed to find docid in Fever corpus: some ids are
            # wrapped in double quotes with inner quotes doubled. The length
            # check keeps an empty id from raising IndexError and a lone `"`
            # from being collapsed into an empty string.
            if len(docid) >= 2 and docid[0] == '"' and docid[-1] == '"':
                docid = docid[1:-1].replace('""', '"')
            docid_num = docid_to_num[docid]
            qids.append(qid)
            dids.append(docid_num)
    df_qrels = pd.DataFrame({'queryid': qids, 'docidn': dids})
    write_tsv(df_qrels, fpath)
    return df_qrels

# Write one qrels file per split, named `<stem>_<split>.tsv` and placed next
# to FEVER_QRELS_FPATH.
for split, qrels in fever.relevant_docs.items():
    # Derive the name from the stem only. Formatting the whole path into the
    # file name and joining it onto the parent works just for absolute paths
    # (an absolute right operand replaces the left one in a `/` join); with a
    # relative data directory the directory part would be duplicated.
    fname = f'{FEVER_QRELS_FPATH.stem}_{split}.tsv'
    fpath = FEVER_QRELS_FPATH.with_name(fname)
    print(f'Writing {len(qrels)} qrels into {fpath}')
    write_qrels(qrels, docid_to_num, fpath)

## SadeemQuestionRetrieval

In [6]:
from mteb import SadeemQuestionRetrieval

In [7]:
# Instantiate the SadeemQuestionRetrieval task and print its repr.
ds = SadeemQuestionRetrieval()
print(ds)

SadeemQuestionRetrieval(name='SadeemQuestionRetrieval', languages=['ara'])


In [8]:
# Load the task data; the cells below read `ds.corpus` and `ds.relevant_docs`,
# which are presumably populated by this call.
ds.load_data()

Downloading readme:   0%|          | 0.00/614 [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/27.1k [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/9.89M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/152k [00:00<?, ?B/s]

Generating qrels split:   0%|          | 0/2089 [00:00<?, ? examples/s]

Generating corpus split:   0%|          | 0/22979 [00:00<?, ? examples/s]

Generating queries split:   0%|          | 0/2089 [00:00<?, ? examples/s]

In [9]:
# For every corpus split, print the number of relevance entries and the first
# three of them as: key, list of inner keys, list of inner values.
# NOTE(review): `doc_id` / `doc` are misleading names. They iterate over
# `ds.relevant_docs[split]`, whose keys are presumably query ids and whose
# values presumably map document ids to relevance labels (the FEVER cells use
# the same keys to index `fever.queries`). `item` is unused.
for split, item in ds.corpus.items():
    reldocs = ds.relevant_docs[split]
    print(split, len(reldocs))
    for doc_id, doc in itertools.islice(reldocs.items(), 3):
        print(doc_id, list(doc.keys()), list(doc.values()))

test 2089
1 ['29'] [1]
2 ['50'] [1]
3 ['83'] [1]


In [44]:
# Corpus overview: list the splits, then print the size of each split and the
# first 200 characters of its first two documents (newlines shown escaped).
print('splits:', list(ds.corpus.keys()))
for split, docs in ds.corpus.items():
    print(f'{split}. Docs: {len(docs)}')
    for doc_id in itertools.islice(docs.keys(), 2):
        raw_text = docs[doc_id]['text']
        txt = raw_text.replace('\n', '\\n')
        print(f'{split}. {doc_id}. {txt[:200]}')

In [45]:
# First three (key, value) pairs of the test-split relevance judgements.
list(itertools.islice(ds.relevant_docs['test'].items(), 3))

## StackOverflowQARetrieval

In [10]:
from mteb import StackOverflowQARetrieval

In [11]:
# Instantiate the StackOverflowQARetrieval task and load its data. Note that
# `ds` is rebound here: from this cell on it no longer refers to the
# SadeemQuestionRetrieval task.
ds = StackOverflowQARetrieval()
ds.load_data()

Downloading readme:   0%|          | 0.00/1.30k [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/13.4M [00:00<?, ?B/s]

Generating corpus split:   0%|          | 0/19931 [00:00<?, ? examples/s]

Casting the dataset:   0%|          | 0/19931 [00:00<?, ? examples/s]

Downloading data:   0%|          | 0.00/14.3M [00:00<?, ?B/s]

Generating queries split:   0%|          | 0/19931 [00:00<?, ? examples/s]

Casting the dataset:   0%|          | 0/19931 [00:00<?, ? examples/s]

Downloading data:   0%|          | 0.00/159k [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/23.9k [00:00<?, ?B/s]

Generating train split:   0%|          | 0/13951 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/1994 [00:00<?, ? examples/s]

Casting the dataset:   0%|          | 0/1994 [00:00<?, ? examples/s]

Map:   0%|          | 0/1994 [00:00<?, ? examples/s]

Filter:   0%|          | 0/19931 [00:00<?, ? examples/s]

In [12]:
# Show which splits have relevance judgements.
print(list(ds.relevant_docs.keys()))


['test']


In [15]:
# Corpus overview: list the splits, then print the size of each split and, for
# its first two documents, the first 50 characters of the title and the first
# 200 characters of the text (newlines shown escaped).
print('splits:', list(ds.corpus.keys()))
for split, docs in ds.corpus.items():
    print(f'{split}. Docs: {len(docs)}')
    for doc_id in itertools.islice(docs.keys(), 2):
        doc = docs[doc_id]
        title = doc['title']
        text = doc['text'].replace('\n', '\\n')
        print(f'{split}. {doc_id}. `{title[:50]}` {text[:200]}')


splits: ['test']
test. Docs: 19931
test. d1. `` The SearchDatabase class that SphinxSearch extends was changed from REL1_31 to REL1_32. It now requires you to define doSearchTextInDB and doSearchTitleInDB methods.\nSee REL1_31 https://doc.wikimedia
test. d2. `` you have to write below way because CTE is part of the SELECT not the UPDATE\nupdate work_request \nset name = name || '_old'\n\n   where exists (\n      with wr_double as\n         (select...)\n     


In [26]:
# First three (key, value) pairs of the test-split relevance judgements.
list(itertools.islice(ds.relevant_docs['test'].items(), 3))

## AILACasedocs

In [15]:
from mteb import AILACasedocs

In [16]:
from datasets import Split

# Instantiate the AILACasedocs task and load its data. The next cell reads
# `ds.corpus`; without an explicit `load_data()` call here it only works when
# the data was loaded earlier in the same kernel session, so the notebook
# would not survive a restart followed by "Run All".
ds = AILACasedocs()
ds.load_data()

In [19]:
# Corpus overview: list the splits, then print the size of each split and, for
# its first two documents, the first 50 characters of the title and the first
# 200 characters of the text (newlines shown escaped).
print('splits:', list(ds.corpus.keys()))
for split, docs in ds.corpus.items():
    print(f'{split}. Docs: {len(docs)}')
    for doc_id in itertools.islice(docs.keys(), 2):
        doc = docs[doc_id]
        title = doc['title']
        text = doc['text'].replace('\n', '\\n')
        print(f'{split}. {doc_id}. `{title[:50]}` {text[:200]}')

splits: ['test']
test. Docs: 186
test. FCIX3BYOtD. `` Central Inland Water Transport Corporation Limited and Another v Brojo Nath Ganguly and Another\nSupreme Court of India\n\n6 April 1986\nC.A. No. 4412 and 4413 of 1985\nThe Judgment was delivered by :
test. YsP6ihgIqL. `` West Bengal State Electricity Board and Others v Desh Bandhu Ghosh and Others\nSupreme Court of India\n\n26 February 1985\nCivil Appeal No. 562 of 1985\nThe Judgment was delivered by : O. Chinnappa Re


In [20]:
# Show the concrete class of the task object, including its module path.
type(ds)

mteb.tasks.Retrieval.eng.AILACasedocsRetrieval.AILACasedocs

In [22]:
from mteb.tasks.Retrieval.eng import AILACasedocsRetrieval