# An enhancement of the Multi30k dataset with 90K more sentences from the Tatoeba dataset

Multi30k-dataset
@InProceedings{elliott-EtAl:2017:WMT,
  author    = {Elliott, Desmond and Frank, Stella and Barrault, Lo{\"i}c and Bougares, Fethi and Specia, Lucia},
  title     = {Findings of the Second Shared Task on Multimodal Machine Translation and Multilingual Image Description},
  booktitle = {Proceedings of the Second Conference on Machine Translation, Volume 2: Shared Task Papers},
  month     = sep,
  year      = {2017},
  address   = {Copenhagen, Denmark},
  publisher = {Association for Computational Linguistics},
  pages     = {215--233},
  url       = {http://www.aclweb.org/anthology/W17-4718},
}



CC-BY 2.0 (France) Attribution: tatoeba.org

In [53]:
import random

seed = 1234
random.seed(seed)

In [54]:
# Reading the combined file
def read_combined_file(file_name, limit=96000):
    """Read a tab-separated parallel corpus and return its two sentence columns.

    Every line is split on tab characters. The first field is taken as the
    English sentence and the second as the French sentence; any further
    fields on the line are ignored.

    Args:
        file_name: Path of the UTF-8 encoded file to read.
        limit: Number of lines (counted from the start of the file, including
            skipped ones) to consume before stopping.

    Returns:
        A ``(fr_sequence, en_sequence)`` tuple of equally long lists in which
        ``fr_sequence[k]`` and ``en_sequence[k]`` come from the same line.
    """
    fr_sequence = []
    en_sequence = []
    with open(file_name, encoding='utf-8') as file:
        for i, line in enumerate(file):
            if i == limit:
                break
            # Strip the line terminator before splitting; otherwise the last
            # field keeps its '\n', which ends up inside the French sentence
            # whenever a line has exactly two columns.
            fields = line.rstrip('\n').split('\t')
            if len(fields) < 2:
                # Blank or tab-less line: there is no sentence pair to take,
                # and indexing fields[1] would raise IndexError.
                continue
            en_sequence.append(fields[0])
            fr_sequence.append(fields[1])
    return fr_sequence, en_sequence

In [55]:
# Manually split the sentence pairs read from 'data/fra.txt' into training,
# test and validation sets by sampling disjoint sets of row indices.
data_size = 96000  # number of lines requested from the combined file
fr_data, en_data = read_combined_file('data/fra.txt', limit=data_size)
# Index the rows that were actually read: the reader stops at end of file, so
# it can return fewer than data_size pairs, and indexing past the end of
# fr_data / en_data below would raise IndexError.
data_indices = list(range(len(fr_data)))
test_rows = 3000
validation_rows = 3000

test_indices = random.sample(data_indices, test_rows)
# Sets give constant-time membership tests. The candidate list handed to
# random.sample keeps the same elements in the same order as a list-based
# filter would produce, so the sampled indices are unchanged for a given seed.
_test_lookup = set(test_indices)
validation_indices = random.sample(
    [i for i in data_indices if i not in _test_lookup],
    validation_rows,
)
_held_out = _test_lookup.union(validation_indices)
train_indices = [x for x in data_indices if x not in _held_out]

# French side of each split.
train_fr_data = [fr_data[i] for i in train_indices]
test_fr_data = [fr_data[i] for i in test_indices]
validation_fr_data = [fr_data[i] for i in validation_indices]

# English side of each split, selected with the same indices so that the
# French and English lists stay aligned row by row.
train_en_data = [en_data[i] for i in train_indices]
test_en_data = [en_data[i] for i in test_indices]
validation_en_data = [en_data[i] for i in validation_indices]

In [56]:
def load_and_extend_data(fr_file, en_file, /, fr_extension=None, en_extension=None):
    """Append the lines of two parallel text files to two lists and pair them up.

    Args:
        fr_file: Path of the UTF-8 file holding one French sentence per line.
        en_file: Path of the UTF-8 file holding one English sentence per line.
        fr_extension: List that the French lines are appended to. It is
            extended IN PLACE, so the caller's list grows. When omitted, a
            fresh empty list is used for this call.
        en_extension: List that the English lines are appended to, with the
            same in-place semantics as ``fr_extension``.

    Returns:
        A list of ``{"fr": ..., "en": ...}`` dicts built by zipping the two
        (extended) lists; if their lengths differ, zip stops at the shorter.

    Side effects:
        Prints the first four lines read from ``fr_file``.
    """
    # Use None as the default instead of []: a list default is created once
    # at definition time and shared by every call, so lines from earlier
    # calls would leak into later ones.
    if fr_extension is None:
        fr_extension = []
    if en_extension is None:
        en_extension = []

    with open(fr_file, encoding='utf-8') as file:
        fr_lines = file.read().splitlines()
    fr_extension.extend(fr_lines)
    print(fr_lines[:4])

    with open(en_file, encoding='utf-8') as file:
        en_lines = file.read().splitlines()
    en_extension.extend(en_lines)

    return [{"fr": fr, "en": en} for fr, en in zip(fr_extension, en_extension)]

# Load each pair of split files and append its lines to the matching
# French/English lists (the lists passed as *_extension are extended in
# place); the return value is the aligned list of {"fr", "en"} dicts.
train_data = load_and_extend_data(
    'data/train.fr',
    'data/train.en',
    fr_extension=train_fr_data,
    en_extension=train_en_data,
)
test_data = load_and_extend_data(
    'data/test_2018_flickr.fr',
    'data/test_2018_flickr.en',
    fr_extension=test_fr_data,
    en_extension=test_en_data,
)
val_data = load_and_extend_data(
    'data/val.fr',
    'data/val.en',
    fr_extension=validation_fr_data,
    en_extension=validation_en_data,
)



['Deux jeunes hommes blancs sont dehors près de buissons.', 'Plusieurs hommes en casque font fonctionner un système de poulies géant.', 'Une petite fille grimpe dans une maisonnette en bois.', 'Un homme dans une chemise bleue se tient sur une échelle pour nettoyer une fenêtre.']
['Un jeune homme participe à une course pendant que le sujet qui le filme sourit.', "L'homme se gratte l'arrière du cou tout en cherchant un livre dans une librairie.", 'Une personne portant des lunettes de protection et un chapeau fait de la luge.', "Une fille avec une veste rose et des galoches à fleurs descend le long d'une colline en luge."]
["Un groupe d'hommes chargent du coton dans un camion", 'Un homme dormant dans une chambre verte sur un canapé.', "Un garçon avec un casque est assis sur les épaules d'une femme.", 'Deux hommes installant une tente de pêche sur glace bleue sur un lac gelé']


In [57]:
from datasets import Dataset, DatasetDict

In [63]:
# DatasetInfo carries the per-split description/citation metadata.
from datasets import DatasetInfo

# Human-readable description attached to every split.
DATASET_DESCRIPTION = """
## Multi30k + Tatoeba French-English Translation Dataset

This dataset contains the Multi30k dataset available [here](https://github.com/multi30k/dataset), combined with an additional 90K sentences from the Tatoeba French-English translation dataset, sourced from [Tatoeba.org](https://tatoeba.org/).

### Dataset Information

Each example consists of a French sentence ("fr") and its English translation ("en").

### Data Splits

The French-English dataset has 3 splits:

| Dataset Split | Number of Instances |
|---------------|---------------------|
| Train         | 119,003             |
| Validation    | 4,017               |
| Test          | 4,074               |

### Attribution

CC-BY 2.0 (France) Attribution: [Tatoeba.org](https://tatoeba.org/) French-English dataset.
"""

# Raw string on purpose: in a normal string literal the escape \" collapses
# to a bare double quote, which would turn the LaTeX accent Lo\"{i}c into
# Lo"{i}c and corrupt the BibTeX author field.
DATASET_CITATION = r"""
@InProceedings{elliott-EtAl:2017:WMT,
  author    = {Elliott, Desmond  and  Frank, Stella  and  Barrault, Lo\"{i}c  and  Bougares, Fethi  and  Specia, Lucia},
  title     = {Findings of the Second Shared Task on Multimodal Machine Translation and Multilingual Image Description},
  booktitle = {Proceedings of the Second Conference on Machine Translation, Volume 2: Shared Task Papers},
  month     = {September},
  year      = {2017},
  address   = {Copenhagen, Denmark},
  publisher = {Association for Computational Linguistics},
  pages     = {215--233},
  url       = {http://www.aclweb.org/anthology/W17-4718}
}

CC-BY 2.0 (France) Attribution: [Tatoeba.org](https://tatoeba.org/) French-English dataset.
"""


def _split_info():
    """Return a fresh DatasetInfo holding the shared description and citation."""
    # One instance per split so that no split shares a mutable info object.
    return DatasetInfo(description=DATASET_DESCRIPTION, citation=DATASET_CITATION)


# Build one Dataset per split from the aligned French/English lists and attach
# the metadata through `info`, so each split's own info carries it.
train_dataset = Dataset.from_dict({"fr": train_fr_data, "en": train_en_data}, info=_split_info())
test_dataset = Dataset.from_dict({"fr": test_fr_data, "en": test_en_data}, info=_split_info())
val_dataset = Dataset.from_dict({"fr": validation_fr_data, "en": validation_en_data}, info=_split_info())

# Create DatasetDict
dataset = DatasetDict({
    "train": train_dataset,
    "test": test_dataset,
    "validation": val_dataset
})

# NOTE(review): these two assignments only set plain Python attributes on the
# DatasetDict object; they are kept so that code reading `dataset.description`
# or `dataset.citation` keeps working. The metadata proper lives in each
# split's DatasetInfo (set above) -- confirm against the installed `datasets`
# version how much of it push_to_hub publishes.
dataset.description = DATASET_DESCRIPTION
dataset.citation = DATASET_CITATION

# Upload all three splits to the Hugging Face Hub under this repository id.
dataset.push_to_hub(repo_id="multi30k_plus90K_fr")

Uploading the dataset shards:   0%|          | 0/1 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/119 [00:00<?, ?ba/s]

Uploading the dataset shards:   0%|          | 0/1 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/5 [00:00<?, ?ba/s]

Uploading the dataset shards:   0%|          | 0/1 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/5 [00:00<?, ?ba/s]

CommitInfo(commit_url='https://huggingface.co/datasets/kevindf/multi30k_plus90K_fr/commit/e968fc9f17b53f4982c03a3c7b257200ce02cd78', commit_message='Upload dataset', commit_description='', oid='e968fc9f17b53f4982c03a3c7b257200ce02cd78', pr_url=None, pr_revision=None, pr_num=None)