We upload 2 datasets: qrels and collection

* Qrels ("google_questions", published as `spanish-ir/messirve`):
    * Configs: one for each country, plus `no_country` and `full`
    * Splits: train and test for each config

* Collection ("eswiki-20240401-corpus"):
    * Only one file

NOTE: To upload a dataset to the Hub from Python, you first need to log in to your HF account: `huggingface-cli login`

In [1]:
# Switches: set to True to push the corresponding dataset to the Hugging Face Hub.
UPLOAD_CORPUS = False  # Wikipedia passage collection
UPLOAD_QRELS = True  # question -> passage relevance judgments (qrels)

### corpus

In [3]:
import datasets

In [4]:
# Read the wiki collection from the local corpus directory
# (everything found there is loaded into a single "train" split)
collection_dataset = datasets.load_dataset("../runs/corpora/eswiki-20240401-corpus")

Resolving data files:   0%|          | 0/29 [00:00<?, ?it/s]

In [5]:
# Inspect the splits, columns and row count that were loaded
collection_dataset

DatasetDict({
    train: Dataset({
        features: ['docid', 'title', 'text'],
        num_rows: 14047759
    })
})

In [6]:
import html

# Decode HTML entities (e.g. "&amp;" -> "&") in the passage text.
# batched=True hands the function a dict of column lists, so the Python call
# overhead is paid once per batch instead of once per row (~14M rows here).
collection_dataset = collection_dataset.map(
    lambda batch: {"text": [html.unescape(text) for text in batch["text"]]},
    batched=True,
)

Map:   0%|          | 0/14047759 [00:00<?, ? examples/s]

In [None]:
# Hub repository that receives the corpus.
# NOTE(review): `repo_name` is assigned again, for the qrels repository, in the "Save" section below.
repo_name = "spanish-ir/eswiki_20240401_corpus"

if UPLOAD_CORPUS:
    # Upload the corpus to the hub
    # (the local "train" split is published under the split name "corpus")
    collection_dataset["train"].push_to_hub(repo_name, split="corpus")

### qrels

In [8]:
import sys

import pandas as pd
from datasets import Dataset, Value

sys.path.append('../scripts/google_questions')
from helpers import connect_to_db


In [9]:
# Read queries-texts from DB
def fetch_questions_answers(
        cur, min_version: int = 1, match_score: float = 0.5
    ) -> pd.DataFrame:
    """Fetch the questions whose answer was matched to a corpus passage.

    Joins ``extractions`` with ``queries`` and ``htmls`` on ``id`` and keeps the
    rows that have a ``corpus_docid`` and reach both thresholds.

    Args:
        cur: open DB cursor exposing ``execute``, ``fetchall`` and ``description``.
        min_version: minimum ``match_v`` (inclusive). Coerced with ``int()``.
        match_score: minimum ``match_score`` (inclusive). Coerced with ``float()``.

    Returns:
        One row per selected extraction; the columns are named after the
        cursor's result description.

    Raises:
        ValueError, TypeError: if a threshold cannot be converted to a number.
    """
    # The thresholds are interpolated into the SQL text, so force them to plain
    # numbers first: anything else raises here instead of reaching the database.
    min_version = int(min_version)
    match_score = float(match_score)
    cur.execute(f"""
        SELECT e.id, q.country, q.question AS query
                ,e.match_score, e.corpus_docid AS docid, e.expanded_search
                ,e.answer_url, e.answer_type, h.date AS answer_date
                ,q.date AS query_date
        FROM extractions AS e
            INNER JOIN queries AS q ON e.id = q.id
            INNER JOIN htmls AS h ON e.id = h.id
        WHERE corpus_docid IS NOT NULL
            AND e.match_v >= {min_version}
            AND e.match_score >= {match_score}
        ;
    """)
    res = cur.fetchall()
    # Column names come from the cursor so they follow the SELECT aliases
    df = pd.DataFrame(res, columns=[desc[0] for desc in cur.description])
    return df

# Open the DB connection through the project helper
# NOTE(review): no close() for `conn`/`cur` is visible in this notebook — confirm they are released.
conn, cur = connect_to_db()
conn.autocommit = True

# Fetch with the loosest score threshold (0); the stricter match_score filter is applied further down
df_questions = fetch_questions_answers(cur, min_version=1, match_score=0)

In [10]:
# Look up the passage text (-> "docid_text") and the article title (-> "title_tmp")
# of every matched docid in the corpus.
df_collection = collection_dataset["train"].to_pandas()
df_collection = df_collection[["docid", "text", "title"]].drop_duplicates()
df_questions = pd.merge(df_questions, df_collection, on="docid", how="left")
# The title is only a temporary column: it is kept for the train/test split
df_questions = df_questions.rename(columns={"text": "docid_text", "title": "title_tmp"})
# The corpus objects are dropped here
del collection_dataset, df_collection

In [11]:
# Count the questions whose docid found no text in the corpus (the left merge leaves those as nulls)
df_questions["docid_text"].isnull().sum()

0

In [12]:
# Final qrels: matches scoring strictly above 0.5, restricted to the columns
# used from here on (including the temporary "title_tmp").
df_final = df_questions.query("match_score > 0.5")
df_final = df_final[[
    "country", "id", "query", "docid", "docid_text",
    "query_date", "answer_date",
    "match_score", "expanded_search", "answer_type",
    "title_tmp"
]]
# Remove queries ending with "brainly":
ends_with_brainly = df_final["query"].str.lower().str.endswith("brainly")
df_final = df_final[~ends_with_brainly].copy()
# Round the score to 4 decimals and store it as float32
df_final["match_score"] = df_final["match_score"].apply(
    lambda score: round(score, 4)
).astype("float32")
df_final = df_final.sort_values("id")

In [13]:
# Rename the "general" pseudo-country to "no_country" (the name later used as its config on the Hub):
df_final["country"] = df_final["country"].replace("general", "no_country")

In [14]:
# Split train/test (approx. 80% train, 20% test) under these rules:
# 1) An article name should be in only one of the splits
# 2) Test should preferably have match_score = 1 and expanded_search = False
#    (articles are ranked on these before the cut; nothing is hard-filtered)
# 3) A query should be in only one of the splits
# 4) The split is computed for all questions at once, then mapped back to the countries

# Start well above the 20% target: resolving train/test overlaps below only moves rows from test to train
initial_test_frac = 0.40
seed = 33

def print_stats(qids_train, qids_test):
    """Print how many queries each split holds and its share of the total.

    Args:
        qids_train: sized collection with the train query ids.
        qids_test: sized collection with the test query ids.

    Both shares are reported as 0.00% when the two collections are empty.
    """
    n_train = len(qids_train)
    n_test = len(qids_test)
    n_total = n_train + n_test
    # Guard the empty case, which would otherwise raise ZeroDivisionError
    denominator = n_total if n_total else 1
    print(f"Train queries: {n_train} ({n_train / denominator:.2%})")
    print(f"Test queries: {n_test} ({n_test / denominator:.2%})")

def train_test_split(df, aid_col, initial_test_frac, seed):
    """Split the rows into train/test so that each article lands in one split only.

    Articles (the distinct values of ``aid_col``) get a random rank drawn with
    ``seed`` and are then ordered by descending mean ``match_score`` and
    ascending mean ``expanded_search``; the random rank is meant to order
    articles that tie on both means. An article goes to test when its first row
    sits at position ``<= int(len(df) * initial_test_frac)`` in that ordering,
    otherwise to train.

    NOTE(review): the cut is inclusive (``<=``), so an article starting exactly
    at the cut position still goes to test. It is kept as is so that the split
    stays reproducible.

    Args:
        df: frame with ``aid_col``, "match_score" and "expanded_search"
            columns. It is not modified.
        aid_col: name of the article-id column.
        initial_test_frac: fraction of the rows used to place the cut.
        seed: random state used to shuffle the articles.

    Returns:
        ``(df_train, df_test)``: the rows of ``df`` plus the helper columns
        "mean_score" and "mean_expanded".
    """
    # Work on a copy: the helper columns added below must not leak into the caller's frame
    df = df.copy()
    # Random rank per article
    art_ids = df[aid_col].unique()
    random_ints = pd.Series(range(len(art_ids))).sample(frac=1, random_state=seed).reset_index(drop=True)
    title2int = dict(zip(art_ids, random_ints))
    df["aid_int"] = df[aid_col].map(title2int)
    df = df.sort_values("aid_int")
    # Per-article quality: best-matched, non-expanded articles come first
    df["mean_score"] = df.groupby(aid_col)["match_score"].transform("mean")
    df["mean_expanded"] = df.groupby(aid_col)["expanded_search"].transform("mean")
    df = df.sort_values(["mean_score", "mean_expanded"], ascending=[False, True])
    df = df.drop(columns=["aid_int"])
    # split in train and test
    n_test = int(df.shape[0] * initial_test_frac)
    # New column with row number of the first question with the same title:
    df["row_number"] = range(df.shape[0])
    df["first_row_number"] = df.groupby(aid_col)["row_number"].transform("min")
    # The whole article follows its first row, so an article is never cut in half
    df["split"] = "train"
    df.loc[df["first_row_number"] <= n_test, "split"] = "test"
    df = df.drop(columns=["row_number", "first_row_number"])
    df_train = df.query("split == 'train'").drop(columns=["split"]).copy()
    df_test = df.query("split == 'test'").drop(columns=["split"]).copy()
    return df_train, df_test


# Split articles into train and test according to article and match_score, expanded_search:
qid_col = "query"
aid_col = "title_tmp" # article id
df_tmp = df_final[[qid_col, "docid", "match_score", "expanded_search", aid_col]].copy()
df_train, df_test = train_test_split(df_tmp, aid_col, initial_test_frac, seed)


def _move_to_train(df_train, df_test, col, values):
    """Return (train, test) after moving the test rows whose ``col`` is in ``values`` to train."""
    moved = df_test[col].isin(values)
    return pd.concat([df_train, df_test[moved]]), df_test[~moved]


# Check overlap between train and test qids, aids:
qids_train, qids_test = set(df_train[qid_col]), set(df_test[qid_col])
print_stats(qids_train, qids_test)
overlap_qids = qids_train & qids_test
# Moving a query can put one of its articles in both splits, and moving an
# article can do the same to a query: alternate until nothing is shared.
while overlap_qids:
    print(f"Overlap between train and test queries: moving {len(overlap_qids)} queries from test to train")
    df_train, df_test = _move_to_train(df_train, df_test, qid_col, overlap_qids)
    overlap_aids = set(df_train[aid_col]) & set(df_test[aid_col])
    if not overlap_aids:
        break
    print(f"Overlap between train and test articles: moving {len(overlap_aids)} articles from test to train")
    df_train, df_test = _move_to_train(df_train, df_test, aid_col, overlap_aids)
    overlap_qids = set(df_train[qid_col]) & set(df_test[qid_col])
# Assert no qid overlap and no article overlap:
assert set(df_train[qid_col]).isdisjoint(set(df_test[qid_col])), "Qid overlap"
assert set(df_train[aid_col]).isdisjoint(set(df_test[aid_col])), "Article overlap"
qids_train = df_train[qid_col].unique()
qids_test = df_test[qid_col].unique()
print_stats(qids_train, qids_test)
# The article id has done its job: drop it and renumber the rows
df_train, df_test = (
    part.drop(columns=[aid_col]).reset_index(drop=True)
    for part in (df_train, df_test)
)

Train queries: 529350 (59.08%)
Test queries: 366613 (40.92%)
Overlap between train and test queries: moving 17535 queries from test to train
Overlap between train and test articles: moving 5869 articles from test to train
Overlap between train and test queries: moving 1860 queries from test to train
Overlap between train and test articles: moving 903 articles from test to train
Overlap between train and test queries: moving 228 queries from test to train
Overlap between train and test articles: moving 111 articles from test to train
Overlap between train and test queries: moving 55 queries from test to train
Overlap between train and test articles: moving 15 articles from test to train
Overlap between train and test queries: moving 3 queries from test to train
Train queries: 708373 (80.64%)
Test queries: 170055 (19.36%)


In [15]:
# Check: % of rows with match_score < 1, then % with expanded_search,
# first for train and then for test (both should be near 0 for test)
for split_df in (df_train, df_test):
    pct_imperfect = split_df["match_score"].lt(1).mean() * 100
    pct_expanded = split_df["expanded_search"].eq(True).mean() * 100
    print(f'{pct_imperfect:.4f}')
    print(f'{pct_expanded:.4f}')

29.4123
0.6409
2.4187
0.7176


In [16]:
# add split to df_final: every row gets the split its query was assigned to
query2split = {
    **dict.fromkeys(df_train[qid_col], "train"),
    **dict.fromkeys(df_test[qid_col], "test"),
}
df_final["split"] = df_final["query"].map(query2split)

In [17]:
# dset "full": every distinct (query, docid) pair found in "no_country" or in any country
is_no_country = df_final["country"] == "no_country"
df_resto = df_final[~is_no_country].copy()
df_full = (
    pd.concat([df_final[is_no_country], df_resto])
    .sort_values("id")
    .drop_duplicates(["query", "docid"])
    .reset_index(drop=True)
)
df_full["country"] = "full"
# We add a new ID for the queries in "full" because they are not unique;
# the original id is kept in "id_country" just in case
df_full["id_country"] = df_full["id"].copy()
df_full = df_full.sort_values("query")
# One new ID per query, numbered right after the largest id of the country rows
first_full_id = df_resto["id"].max() + 1
df_full["id"] = df_full.groupby("query").ngroup() + first_full_id
# Final dataset: countries + no_country + full
df_final = pd.concat([df_final, df_full]).sort_values("id").reset_index(drop=True)

In [18]:
def _count_in_both_splits(frame, col):
    """Number of distinct ``col`` values in ``frame`` paired with more than one "split" value."""
    pairs = frame[[col, "split"]].drop_duplicates()
    return (pairs[col].value_counts() > 1).sum()


# Check no title and no query in both splits in each subset (every count should be 0):
subsets = df_final["country"].unique().tolist()
for subset in subsets:
    df_subset = df_final[df_final["country"] == subset]
    n_dups_articles = _count_in_both_splits(df_subset, "title_tmp")
    n_dups_queries = _count_in_both_splits(df_subset, "query")
    print(f"{subset}: {n_dups_articles} duplicated articles, {n_dups_queries} duplicated queries")

mx: 0 duplicated articles, 0 duplicated queries
ve: 0 duplicated articles, 0 duplicated queries
pa: 0 duplicated articles, 0 duplicated queries
py: 0 duplicated articles, 0 duplicated queries
sv: 0 duplicated articles, 0 duplicated queries
pr: 0 duplicated articles, 0 duplicated queries
co: 0 duplicated articles, 0 duplicated queries
no_country: 0 duplicated articles, 0 duplicated queries
ec: 0 duplicated articles, 0 duplicated queries
bo: 0 duplicated articles, 0 duplicated queries
hn: 0 duplicated articles, 0 duplicated queries
es: 0 duplicated articles, 0 duplicated queries
cr: 0 duplicated articles, 0 duplicated queries
cl: 0 duplicated articles, 0 duplicated queries
do: 0 duplicated articles, 0 duplicated queries
uy: 0 duplicated articles, 0 duplicated queries
us: 0 duplicated articles, 0 duplicated queries
pe: 0 duplicated articles, 0 duplicated queries
gt: 0 duplicated articles, 0 duplicated queries
ar: 0 duplicated articles, 0 duplicated queries
ni: 0 duplicated articles, 0 dup

In [19]:
# Finally: check if any query from any "train" is in any "test" (across all subsets):
queries_train = set(df_final.loc[df_final["split"] == "train", "query"])
queries_test = set(df_final.loc[df_final["split"] == "test", "query"])
queries_both = queries_train & queries_test
print(len(queries_both))

0


In [20]:
# N unique queries by country and split, plus the row total; smallest test sets first:
df = (
    df_final.groupby(["country", "split"])["query"]
    .nunique()
    .unstack()
    .assign(total=lambda counts: counts.sum(axis=1))
)
df.sort_values("test")

split,test,train,total
country,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
us,5410,35853,41263
gt,6127,35658,41785
bo,6614,40650,47264
cu,6688,35514,42202
py,7015,39158,46173
hn,7228,42311,49539
ve,7259,41802,49061
sv,7274,40008,47282
ar,7291,34460,41751
cl,7295,35635,42930


In [21]:
# Check % train/test by country: each row is divided by its own total
split_counts = df_final.groupby(["country", "split"])["query"].nunique().unstack().fillna(0)
split_counts.div(split_counts.sum(axis=1), axis=0)

split,test,train
country,Unnamed: 1_level_1,Unnamed: 2_level_1
ar,0.174631,0.825369
bo,0.139937,0.860063
cl,0.169928,0.830072
co,0.156209,0.843791
cr,0.165532,0.834468
cu,0.158476,0.841524
do,0.141831,0.858169
ec,0.153699,0.846301
es,0.208365,0.791635
full,0.19359,0.80641


In [22]:
# Queries with the most positives in the "full" subset
# ("full" holds one row per distinct (query, docid) pair, so the count is the number of relevant passages):
df_final.query("country == 'full'")["query"].value_counts().head()

query
quienes realizan la fotosíntesis                                    7
hasta qué fecha gobernó el triunvirato                              7
cual es la palabra mas larga del mundo                              6
de quién se independizaron las 13 colonias                          6
ante algún desastre natural cuál es la institución especializada    6
Name: count, dtype: int64

In [30]:
# The article title was only needed for the train/test split and its checks.
# errors="ignore" keeps this cell re-runnable once the column is already gone.
df_final = df_final.drop(columns=["title_tmp"], errors="ignore")

In [31]:
# Preview the final columns
df_final.head(2)

Unnamed: 0,country,id,query,docid,docid_text,query_date,answer_date,match_score,expanded_search,answer_type,split,id_country
0,mx,4,1 kg a cuantos miligramos equivale,1595#34,"1 ""kilogramo"" es equivalente a:",2024-04-10,2024-04-17,0.6829,False,feat_snip,train,
1,mx,5,1 kilo a cuantos miligramos equivale,1595#34,"1 ""kilogramo"" es equivalente a:",2024-04-10,2024-04-17,0.6829,False,feat_snip,train,


In [32]:
# Subset names to upload, in alphabetical order (country codes plus "full" and "no_country")
countries = sorted(df_final["country"].unique().tolist())

#### Save

In [33]:
# Hub repository for the qrels
repo_name = "spanish-ir/messirve"
# Used as the file-name suffix on disk and as the branch and tag name on the Hub
version_tag = "1.1"

In [34]:
# Save qrels to disk:
# (all subsets go into one CSV; pandas infers gzip compression from the ".gz" suffix)
qrels_file = f"../runs/google_questions/qrels_{version_tag}.csv.gz"
df_final.to_csv(qrels_file, index=False)

In [35]:
# Size of the qrels file:
!du -h $qrels_file

417M	../runs/google_questions/qrels_1.1.csv.gz


In [44]:
import huggingface_hub
from pprint import pprint

# Show the branches and tags that already exist in the dataset repo
pprint(huggingface_hub.list_repo_refs(repo_name, repo_type="dataset"))

GitRefs(branches=[GitRefInfo(name='1.1',
                             ref='refs/heads/1.1',
                             target_commit='18fb3e19a3356e76c8bdf3fd1f34d3f9fb1ab9c5'),
                  GitRefInfo(name='main',
                             ref='refs/heads/main',
                             target_commit='13315db5159823daa4a0a550870d524b13699b25')],
        converts=[GitRefInfo(name='parquet',
                             ref='refs/convert/parquet',
                             target_commit='53ad717dc9a9495135b87870617d927f11a6d1b7')],
        tags=[GitRefInfo(name='1.1',
                         ref='refs/tags/1.1',
                         target_commit='18fb3e19a3356e76c8bdf3fd1f34d3f9fb1ab9c5'),
              GitRefInfo(name='1.0',
                         ref='refs/tags/1.0',
                         target_commit='13315db5159823daa4a0a550870d524b13699b25'),
              GitRefInfo(name='0.1',
                         ref='refs/tags/0.1',
                         targ

In [None]:
if UPLOAD_QRELS:
    # Create a branch for the new version.
    # exist_ok=True turns this into a no-op when the branch is already there,
    # so the cell can be re-run after a partial upload.
    huggingface_hub.create_branch(
        repo_name, branch=version_tag, repo_type="dataset", exist_ok=True
    )

In [43]:
if UPLOAD_QRELS:
    # Upload to Hugging Face Hub, one config per country:
    # NOTE time.sleep to avoid rate limit
    import time
    from collections import deque

    # Give up on a country after this many failed attempts instead of retrying forever
    max_attempts = 10
    failed_attempts = {}

    # Work on a private queue so `countries` is left intact and the cell can be re-run
    pending = deque(countries)
    while pending:
        country = pending.popleft()
        print("Uploading:", country)
        in_country = df_final["country"] == country
        # Build both splits before pushing; errors raised here are not retried
        dsets = {}
        for split in ("train", "test"):
            df_split = df_final[in_country & (df_final["split"] == split)].drop(
                columns=["country", "split"]).reset_index(drop=True)
            if country != "full":
                # "id_country" is only published in the "full" config
                df_split = df_split.drop(columns=["id_country"])
            dset = Dataset.from_pandas(df_split)
            # dates:
            for date_col in ("query_date", "answer_date"):
                dset = dset.cast_column(date_col, Value("date32"))
            dsets[split] = dset
        try:
            for split, dset in dsets.items():
                dset.push_to_hub(repo_name, country, split=split, revision=version_tag)
        except Exception as e:
            print(e)
            failed_attempts[country] = failed_attempts.get(country, 0) + 1
            if failed_attempts[country] >= max_attempts:
                raise RuntimeError(
                    f"Giving up on '{country}' after {max_attempts} failed upload attempts"
                ) from e
            print("Sleeping 60s and retrying...")
            time.sleep(60)
            # Retry this country after the remaining ones
            pending.append(country)
        else:
            print("Done:", country)
            time.sleep(30)

    # Tag current version (only reached once every config was uploaded):
    huggingface_hub.create_tag(repo_name, tag=version_tag, revision=version_tag, repo_type="dataset")


Uploading: ar


Casting the dataset:   0%|          | 0/34460 [00:00<?, ? examples/s]

Casting the dataset:   0%|          | 0/34460 [00:00<?, ? examples/s]

Casting the dataset:   0%|          | 0/7291 [00:00<?, ? examples/s]

Casting the dataset:   0%|          | 0/7291 [00:00<?, ? examples/s]

Uploading the dataset shards:   0%|          | 0/1 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/35 [00:00<?, ?ba/s]

README.md: 0.00B [00:00, ?B/s]

Uploading the dataset shards:   0%|          | 0/1 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/8 [00:00<?, ?ba/s]

README.md: 0.00B [00:00, ?B/s]

Done: ar
Uploading: bo


Casting the dataset:   0%|          | 0/40650 [00:00<?, ? examples/s]

Casting the dataset:   0%|          | 0/40650 [00:00<?, ? examples/s]

Casting the dataset:   0%|          | 0/6614 [00:00<?, ? examples/s]

Casting the dataset:   0%|          | 0/6614 [00:00<?, ? examples/s]

Uploading the dataset shards:   0%|          | 0/1 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/41 [00:00<?, ?ba/s]

README.md: 0.00B [00:00, ?B/s]

Uploading the dataset shards:   0%|          | 0/1 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/7 [00:00<?, ?ba/s]

README.md: 0.00B [00:00, ?B/s]

Done: bo
Uploading: cl


Casting the dataset:   0%|          | 0/35635 [00:00<?, ? examples/s]

Casting the dataset:   0%|          | 0/35635 [00:00<?, ? examples/s]

Casting the dataset:   0%|          | 0/7295 [00:00<?, ? examples/s]

Casting the dataset:   0%|          | 0/7295 [00:00<?, ? examples/s]

Uploading the dataset shards:   0%|          | 0/1 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/36 [00:00<?, ?ba/s]

README.md: 0.00B [00:00, ?B/s]

Uploading the dataset shards:   0%|          | 0/1 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/8 [00:00<?, ?ba/s]

README.md: 0.00B [00:00, ?B/s]

Done: cl
Uploading: co


Casting the dataset:   0%|          | 0/39724 [00:00<?, ? examples/s]

Casting the dataset:   0%|          | 0/39724 [00:00<?, ? examples/s]

Casting the dataset:   0%|          | 0/7354 [00:00<?, ? examples/s]

Casting the dataset:   0%|          | 0/7354 [00:00<?, ? examples/s]

Uploading the dataset shards:   0%|          | 0/1 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/40 [00:00<?, ?ba/s]

README.md: 0.00B [00:00, ?B/s]

Uploading the dataset shards:   0%|          | 0/1 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/8 [00:00<?, ?ba/s]

README.md: 0.00B [00:00, ?B/s]

Done: co
Uploading: cr


Casting the dataset:   0%|          | 0/38005 [00:00<?, ? examples/s]

Casting the dataset:   0%|          | 0/38005 [00:00<?, ? examples/s]

Casting the dataset:   0%|          | 0/7539 [00:00<?, ? examples/s]

Casting the dataset:   0%|          | 0/7539 [00:00<?, ? examples/s]

Uploading the dataset shards:   0%|          | 0/1 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/39 [00:00<?, ?ba/s]

README.md: 0.00B [00:00, ?B/s]

Uploading the dataset shards:   0%|          | 0/1 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/8 [00:00<?, ?ba/s]

README.md: 0.00B [00:00, ?B/s]

Done: cr
Uploading: cu


Casting the dataset:   0%|          | 0/35514 [00:00<?, ? examples/s]

Casting the dataset:   0%|          | 0/35514 [00:00<?, ? examples/s]

Casting the dataset:   0%|          | 0/6688 [00:00<?, ? examples/s]

Casting the dataset:   0%|          | 0/6688 [00:00<?, ? examples/s]

Uploading the dataset shards:   0%|          | 0/1 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/36 [00:00<?, ?ba/s]

README.md: 0.00B [00:00, ?B/s]

Uploading the dataset shards:   0%|          | 0/1 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/7 [00:00<?, ?ba/s]

README.md: 0.00B [00:00, ?B/s]

Done: cu
Uploading: do


Casting the dataset:   0%|          | 0/44224 [00:00<?, ? examples/s]

Casting the dataset:   0%|          | 0/44224 [00:00<?, ? examples/s]

Casting the dataset:   0%|          | 0/7309 [00:00<?, ? examples/s]

Casting the dataset:   0%|          | 0/7309 [00:00<?, ? examples/s]

Uploading the dataset shards:   0%|          | 0/1 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/45 [00:00<?, ?ba/s]

README.md: 0.00B [00:00, ?B/s]

Uploading the dataset shards:   0%|          | 0/1 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/8 [00:00<?, ?ba/s]

README.md: 0.00B [00:00, ?B/s]

Done: do
Uploading: ec


Casting the dataset:   0%|          | 0/43235 [00:00<?, ? examples/s]

Casting the dataset:   0%|          | 0/43235 [00:00<?, ? examples/s]

Casting the dataset:   0%|          | 0/7852 [00:00<?, ? examples/s]

Casting the dataset:   0%|          | 0/7852 [00:00<?, ? examples/s]

Uploading the dataset shards:   0%|          | 0/1 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/44 [00:00<?, ?ba/s]

README.md: 0.00B [00:00, ?B/s]

Uploading the dataset shards:   0%|          | 0/1 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/8 [00:00<?, ?ba/s]

README.md: 0.00B [00:00, ?B/s]

Done: ec
Uploading: es


Casting the dataset:   0%|          | 0/36055 [00:00<?, ? examples/s]

Casting the dataset:   0%|          | 0/36055 [00:00<?, ? examples/s]

Casting the dataset:   0%|          | 0/9490 [00:00<?, ? examples/s]

Casting the dataset:   0%|          | 0/9490 [00:00<?, ? examples/s]

Uploading the dataset shards:   0%|          | 0/1 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/37 [00:00<?, ?ba/s]

README.md: 0.00B [00:00, ?B/s]

Uploading the dataset shards:   0%|          | 0/1 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/10 [00:00<?, ?ba/s]

README.md: 0.00B [00:00, ?B/s]

Done: es
Uploading: full


Casting the dataset:   0%|          | 0/766296 [00:00<?, ? examples/s]

Casting the dataset:   0%|          | 0/766296 [00:00<?, ? examples/s]

Casting the dataset:   0%|          | 0/174078 [00:00<?, ? examples/s]

Casting the dataset:   0%|          | 0/174078 [00:00<?, ? examples/s]

Uploading the dataset shards:   0%|          | 0/1 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/767 [00:00<?, ?ba/s]

README.md: 0.00B [00:00, ?B/s]

Uploading the dataset shards:   0%|          | 0/1 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/175 [00:00<?, ?ba/s]

README.md: 0.00B [00:00, ?B/s]

Done: full
Uploading: gt


Casting the dataset:   0%|          | 0/35658 [00:00<?, ? examples/s]

Casting the dataset:   0%|          | 0/35658 [00:00<?, ? examples/s]

Casting the dataset:   0%|          | 0/6127 [00:00<?, ? examples/s]

Casting the dataset:   0%|          | 0/6127 [00:00<?, ? examples/s]

Uploading the dataset shards:   0%|          | 0/1 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/36 [00:00<?, ?ba/s]

README.md: 0.00B [00:00, ?B/s]

Uploading the dataset shards:   0%|          | 0/1 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/7 [00:00<?, ?ba/s]

README.md: 0.00B [00:00, ?B/s]

Done: gt
Uploading: hn


Casting the dataset:   0%|          | 0/42311 [00:00<?, ? examples/s]

Casting the dataset:   0%|          | 0/42311 [00:00<?, ? examples/s]

Casting the dataset:   0%|          | 0/7228 [00:00<?, ? examples/s]

Casting the dataset:   0%|          | 0/7228 [00:00<?, ? examples/s]

Uploading the dataset shards:   0%|          | 0/1 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/43 [00:00<?, ?ba/s]

README.md: 0.00B [00:00, ?B/s]

Uploading the dataset shards:   0%|          | 0/1 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/8 [00:00<?, ?ba/s]

README.md: 0.00B [00:00, ?B/s]

Done: hn
Uploading: mx


Casting the dataset:   0%|          | 0/50714 [00:00<?, ? examples/s]

Casting the dataset:   0%|          | 0/50714 [00:00<?, ? examples/s]

Casting the dataset:   0%|          | 0/10650 [00:00<?, ? examples/s]

Casting the dataset:   0%|          | 0/10650 [00:00<?, ? examples/s]

Uploading the dataset shards:   0%|          | 0/1 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/51 [00:00<?, ?ba/s]

README.md: 0.00B [00:00, ?B/s]

Uploading the dataset shards:   0%|          | 0/1 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/11 [00:00<?, ?ba/s]

README.md: 0.00B [00:00, ?B/s]

Done: mx
Uploading: ni


Casting the dataset:   0%|          | 0/44228 [00:00<?, ? examples/s]

Casting the dataset:   0%|          | 0/44228 [00:00<?, ? examples/s]

Casting the dataset:   0%|          | 0/7444 [00:00<?, ? examples/s]

Casting the dataset:   0%|          | 0/7444 [00:00<?, ? examples/s]

Uploading the dataset shards:   0%|          | 0/1 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/45 [00:00<?, ?ba/s]

README.md: 0.00B [00:00, ?B/s]

Uploading the dataset shards:   0%|          | 0/1 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/8 [00:00<?, ?ba/s]

README.md: 0.00B [00:00, ?B/s]

Done: ni
Uploading: no_country


Casting the dataset:   0%|          | 0/438190 [00:00<?, ? examples/s]

Casting the dataset:   0%|          | 0/438190 [00:00<?, ? examples/s]

Casting the dataset:   0%|          | 0/100194 [00:00<?, ? examples/s]

Casting the dataset:   0%|          | 0/100194 [00:00<?, ? examples/s]

Uploading the dataset shards:   0%|          | 0/1 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/439 [00:00<?, ?ba/s]

README.md: 0.00B [00:00, ?B/s]

Uploading the dataset shards:   0%|          | 0/1 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/101 [00:00<?, ?ba/s]

README.md: 0.00B [00:00, ?B/s]

Done: no_country
Uploading: pa


Casting the dataset:   0%|          | 0/41624 [00:00<?, ? examples/s]

Casting the dataset:   0%|          | 0/41624 [00:00<?, ? examples/s]

Casting the dataset:   0%|          | 0/7521 [00:00<?, ? examples/s]

Casting the dataset:   0%|          | 0/7521 [00:00<?, ? examples/s]

Uploading the dataset shards:   0%|          | 0/1 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/42 [00:00<?, ?ba/s]

README.md: 0.00B [00:00, ?B/s]

Uploading the dataset shards:   0%|          | 0/1 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/8 [00:00<?, ?ba/s]

README.md: 0.00B [00:00, ?B/s]

Done: pa
Uploading: pe


Casting the dataset:   0%|          | 0/41169 [00:00<?, ? examples/s]

Casting the dataset:   0%|          | 0/41169 [00:00<?, ? examples/s]

Casting the dataset:   0%|          | 0/7333 [00:00<?, ? examples/s]

Casting the dataset:   0%|          | 0/7333 [00:00<?, ? examples/s]

Uploading the dataset shards:   0%|          | 0/1 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/42 [00:00<?, ?ba/s]

README.md: 0.00B [00:00, ?B/s]

Uploading the dataset shards:   0%|          | 0/1 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/8 [00:00<?, ?ba/s]

README.md: 0.00B [00:00, ?B/s]

Done: pe
Uploading: pr


Casting the dataset:   0%|          | 0/41019 [00:00<?, ? examples/s]

Casting the dataset:   0%|          | 0/41019 [00:00<?, ? examples/s]

Casting the dataset:   0%|          | 0/7808 [00:00<?, ? examples/s]

Casting the dataset:   0%|          | 0/7808 [00:00<?, ? examples/s]

Uploading the dataset shards:   0%|          | 0/1 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/42 [00:00<?, ?ba/s]

README.md: 0.00B [00:00, ?B/s]

Uploading the dataset shards:   0%|          | 0/1 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/8 [00:00<?, ?ba/s]

README.md: 0.00B [00:00, ?B/s]

Done: pr
Uploading: py


Casting the dataset:   0%|          | 0/39158 [00:00<?, ? examples/s]

Casting the dataset:   0%|          | 0/39158 [00:00<?, ? examples/s]

Casting the dataset:   0%|          | 0/7015 [00:00<?, ? examples/s]

Casting the dataset:   0%|          | 0/7015 [00:00<?, ? examples/s]

Uploading the dataset shards:   0%|          | 0/1 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/40 [00:00<?, ?ba/s]

README.md: 0.00B [00:00, ?B/s]

Uploading the dataset shards:   0%|          | 0/1 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/8 [00:00<?, ?ba/s]

README.md: 0.00B [00:00, ?B/s]

Done: py
Uploading: sv


Casting the dataset:   0%|          | 0/40008 [00:00<?, ? examples/s]

Casting the dataset:   0%|          | 0/40008 [00:00<?, ? examples/s]

Casting the dataset:   0%|          | 0/7274 [00:00<?, ? examples/s]

Casting the dataset:   0%|          | 0/7274 [00:00<?, ? examples/s]

Uploading the dataset shards:   0%|          | 0/1 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/41 [00:00<?, ?ba/s]

README.md: 0.00B [00:00, ?B/s]

Uploading the dataset shards:   0%|          | 0/1 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/8 [00:00<?, ?ba/s]

README.md: 0.00B [00:00, ?B/s]

Done: sv
Uploading: us


Casting the dataset:   0%|          | 0/35853 [00:00<?, ? examples/s]

Casting the dataset:   0%|          | 0/35853 [00:00<?, ? examples/s]

Casting the dataset:   0%|          | 0/5410 [00:00<?, ? examples/s]

Casting the dataset:   0%|          | 0/5410 [00:00<?, ? examples/s]

Uploading the dataset shards:   0%|          | 0/1 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/36 [00:00<?, ?ba/s]

README.md: 0.00B [00:00, ?B/s]

Uploading the dataset shards:   0%|          | 0/1 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/6 [00:00<?, ?ba/s]

README.md: 0.00B [00:00, ?B/s]

Done: us
Uploading: uy


Casting the dataset:   0%|          | 0/33208 [00:00<?, ? examples/s]

Casting the dataset:   0%|          | 0/33208 [00:00<?, ? examples/s]

Casting the dataset:   0%|          | 0/7307 [00:00<?, ? examples/s]

Casting the dataset:   0%|          | 0/7307 [00:00<?, ? examples/s]

Uploading the dataset shards:   0%|          | 0/1 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/34 [00:00<?, ?ba/s]

README.md: 0.00B [00:00, ?B/s]

Uploading the dataset shards:   0%|          | 0/1 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/8 [00:00<?, ?ba/s]

README.md: 0.00B [00:00, ?B/s]

Done: uy
Uploading: ve


Casting the dataset:   0%|          | 0/41802 [00:00<?, ? examples/s]

Casting the dataset:   0%|          | 0/41802 [00:00<?, ? examples/s]

Casting the dataset:   0%|          | 0/7259 [00:00<?, ? examples/s]

Casting the dataset:   0%|          | 0/7259 [00:00<?, ? examples/s]

Uploading the dataset shards:   0%|          | 0/1 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/42 [00:00<?, ?ba/s]

README.md: 0.00B [00:00, ?B/s]

Uploading the dataset shards:   0%|          | 0/1 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/8 [00:00<?, ?ba/s]

README.md: 0.00B [00:00, ?B/s]

Done: ve
