<h1> Notebook to prepare test datasets / some of the database datasets and push them to Hugging Face <h1>

In [1]:
from datasets import load_dataset, Dataset
import pandas as pd
from tqdm import  tqdm
import jieba

<h1> OASST <h1>

In [2]:
oass_train = load_dataset("OpenAssistant/oasst2", split="train").to_pandas()
oass_valid = load_dataset("OpenAssistant/oasst2", split="validation").to_pandas()
oass_full = pd.concat([oass_train, oass_valid,])
oass_full.reset_index(drop=True, inplace=True)

Downloading readme:   0%|          | 0.00/10.6k [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/63.5M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/3.18M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/128575 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/6599 [00:00<?, ? examples/s]

In [3]:
oass_full["review_result"].value_counts()

review_result
True     129517
False      3158
Name: count, dtype: int64

In [4]:
oass_train = load_dataset("OpenAssistant/oasst2", split="train").to_pandas()
oass_valid = load_dataset("OpenAssistant/oasst2", split="validation").to_pandas()
oass_full = pd.concat([oass_train, oass_valid,])
oass_full.reset_index(drop=True, inplace=True)

needed_langs = ["en", "ar", "de", "es", "vi", "zh"]
rows = []
for lang in tqdm(needed_langs):
    print(f"Processing lang: {lang}")
    filtered_df = oass_full[(oass_full["lang"] == lang) & (oass_full["role"] == "assistant")]
    for i, answer in filtered_df.iterrows():
        query = oass_full[oass_full["message_id"] == answer["parent_id"]]["text"].iloc[0]
        rows.append([answer["lang"], answer["message_id"], answer["parent_id"], answer["user_id"], answer["created_date"], query, answer["text"], answer["review_count"]])
        
filtered_dataset = pd.DataFrame(rows, columns=["lang", "message_id", "parent_id", "user_id", "created_date", "query", "answer", "review_count"])
filtered_dataset.drop_duplicates(subset="answer", inplace=True)
filtered_dataset.reset_index(drop=True, inplace=True)

filtered_dataset["answer_len"] = [len(row["answer"].split(" ")) if row["lang"] != "zh" else len(jieba.lcut(row["answer"])) for _, row in filtered_dataset.iterrows()]
filtered_dataset = filtered_dataset[filtered_dataset["answer_len"] >= 20]
filtered_dataset.reset_index(drop=True, inplace=True)

  0%|          | 0/6 [00:00<?, ?it/s]

Processing lang: en


 17%|█▋        | 1/6 [04:04<20:20, 244.02s/it]

Processing lang: ar


 33%|███▎      | 2/6 [04:04<06:42, 100.68s/it]

Processing lang: de


 50%|█████     | 3/6 [04:25<03:13, 64.52s/it] 

Processing lang: es


 67%|██████▋   | 4/6 [06:16<02:45, 82.68s/it]

Processing lang: vi


 83%|████████▎ | 5/6 [06:17<00:53, 53.18s/it]

Processing lang: zh


100%|██████████| 6/6 [06:48<00:00, 68.14s/it]
Building prefix dict from the default dictionary ...
Dumping model to file cache /tmp/jieba.cache
Loading model cost 0.456 seconds.
Prefix dict has been built successfully.


In [5]:
hf_dataset = Dataset.from_pandas(filtered_dataset)
hf_dataset.push_to_hub("dkoterwa/oasst2_filtered")

Uploading the dataset shards:   0%|          | 0/1 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/58 [00:00<?, ?ba/s]

README.md:   0%|          | 0.00/2.58k [00:00<?, ?B/s]

CommitInfo(commit_url='https://huggingface.co/datasets/dkoterwa/oasst2_filtered/commit/d754d91215e924995a71201201b9d2921d545611', commit_message='Upload dataset', commit_description='', oid='d754d91215e924995a71201201b9d2921d545611', pr_url=None, pr_revision=None, pr_num=None)

<h1> MKQA <h1>

In [None]:
mkqa = load_dataset("mkqa", split="train").to_pandas()

In [None]:
needed_langs = ["en", "ar", "de", "es", "vi", "zh_cn"]
rows = []
for i, row in tqdm(mkqa.iterrows(), total=mkqa.shape[0]):
    for lang in needed_langs:
        rows.append([lang, row["example_id"], row["queries"][lang], row["answers"][lang][0]["text"]])
        
filtered_dataset = pd.DataFrame(rows, columns=["lang", "example_id", "query", "answer"])
filtered_dataset.dropna(inplace=True)
filtered_dataset.reset_index(drop=True, inplace=True)

In [None]:
hf_dataset = Dataset.from_pandas(filtered_dataset)
hf_dataset.push_to_hub("dkoterwa/mkqa_filtered")

<h1> MLQA <h1>

In [None]:
def download_mlqa(subset_name):
    dataset_valid = load_dataset("mlqa", subset_name, split="validation").to_pandas()
    dataset_test  = load_dataset("mlqa", subset_name, split="test").to_pandas()
    full_dataset = pd.concat([dataset_valid, dataset_test])
    full_dataset.reset_index(drop=True, inplace=True)
    return full_dataset

needed_langs = ["mlqa.en.en", "mlqa.de.de", "mlqa.ar.ar", "mlqa.es.es", "mlqa.vi.vi", "mlqa.zh.zh"]
datasets = []
for lang in tqdm(needed_langs):
    dataset = download_mlqa(lang)
    dataset["lang"] = lang.split(".")[2]
    datasets.append(dataset)
    
full_mlqa = pd.concat(datasets)
full_mlqa.reset_index(drop=True, inplace=True)
full_mlqa["answer"] = [answer_dict["text"][0] for answer_dict in full_mlqa["answers"]]
full_mlqa.drop("answers", axis=1, inplace=True)

In [None]:
hf_dataset = Dataset.from_pandas(full_mlqa)
hf_dataset.push_to_hub("dkoterwa/mlqa_filtered")

<h1> Database datasets <h1>

In [None]:
datasets = {"FreedomIntelligence/alpaca-gpt4-deutsch": "de",
            "FreedomIntelligence/alpaca-gpt4-spanish": "es",
            "FreedomIntelligence/alpaca-gpt4-chinese": "zh",
            "FreedomIntelligence/alpaca-gpt4-arabic": "ar"}

In [None]:
for dataset_name, language in datasets.items():
    dataset = load_dataset(dataset_name, split="train").to_pandas()
    dataset["answer"] = [conversation[1]["value"] for conversation in dataset["conversations"]]
    dataset["instruction"] = [conversation[0]["value"] for conversation in dataset["conversations"]]
    dataset.drop("conversations", axis=1, inplace=True)
    hf_dataset = Dataset.from_pandas(dataset)
    hf_dataset.push_to_hub(f"dkoterwa/alpaca_gpt_4_{language}")