# Téléchargement de RVL-CDIP

In [1]:
!pip install datasets

Collecting datasets
  Downloading datasets-2.21.0-py3-none-any.whl.metadata (21 kB)
Collecting pyarrow>=15.0.0 (from datasets)
  Downloading pyarrow-17.0.0-cp310-cp310-manylinux_2_28_x86_64.whl.metadata (3.3 kB)
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting xxhash (from datasets)
  Downloading xxhash-3.5.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting multiprocess (from datasets)
  Downloading multiprocess-0.70.16-py310-none-any.whl.metadata (7.2 kB)
Downloading datasets-2.21.0-py3-none-any.whl (527 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m527.3/527.3 kB[0m [31m3.3 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading dill-0.3.8-py3-none-any.whl (116 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.3/116.3 kB[0m [31m7.1 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading pyarrow-17.0.0-cp310-cp310-manylinux_2_28_x86_64.whl (39.9 MB)
[2K

In [2]:
# Copyright 2022 The HuggingFace Datasets Authors and the current dataset script contributor.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

"""RVL-CDIP (Ryerson Vision Lab Complex Document Information Processing) dataset"""

# !pip install datasets
import os

import datasets
from datasets.tasks import ImageClassification


_CITATION = """\
@inproceedings{harley2015icdar,
    title = {Evaluation of Deep Convolutional Nets for Document Image Classification and Retrieval},
    author = {Adam W Harley and Alex Ufkes and Konstantinos G Derpanis},
    booktitle = {International Conference on Document Analysis and Recognition ({ICDAR})}},
    year = {2015}
}
"""


_DESCRIPTION = """\
The RVL-CDIP (Ryerson Vision Lab Complex Document Information Processing) dataset consists of 400,000 grayscale images in 16 classes, with 25,000 images per class. There are 320,000 training images, 40,000 validation images, and 40,000 test images.
"""


_HOMEPAGE = "https://www.cs.cmu.edu/~aharley/rvl-cdip/"


_LICENSE = "https://www.industrydocuments.ucsf.edu/help/copyright/"


_URLS = {
    "rvl-cdip": "https://huggingface.co/datasets/rvl_cdip/resolve/main/data/rvl-cdip.tar.gz",
}

_METADATA_URLS = {
    "train": "https://huggingface.co/datasets/rvl_cdip/resolve/main/data/train.txt",
    "test": "https://huggingface.co/datasets/rvl_cdip/resolve/main/data/test.txt",
    "val": "https://huggingface.co/datasets/rvl_cdip/resolve/main/data/val.txt",
}

_CLASSES = [
    "letter",
    "form",
    "email",
    "handwritten",
    "advertisement",
    "scientific report",
    "scientific publication",
    "specification",
    "file folder",
    "news article",
    "budget",
    "invoice",
    "presentation",
    "questionnaire",
    "resume",
    "memo",
]

_IMAGES_DIR = "images/"


class RvlCdip(datasets.GeneratorBasedBuilder):
    """Ryerson Vision Lab Complex Document Information Processing dataset."""

    VERSION = datasets.Version("1.0.0")

    def _info(self):
        return datasets.DatasetInfo(
            description=_DESCRIPTION,
            features=datasets.Features(
                {
                    "image": datasets.Image(),
                    "label": datasets.ClassLabel(names=_CLASSES),
                }
            ),
            supervised_keys=("image", "label"),
            homepage=_HOMEPAGE,
            citation=_CITATION,
            license=_LICENSE,
            task_templates=[ImageClassification(image_column="image", label_column="label")],
        )

    def _split_generators(self, dl_manager):
        archive_path = dl_manager.download(_URLS["rvl-cdip"])
        labels_path = dl_manager.download(_METADATA_URLS)

        return [
            datasets.SplitGenerator(
                name=datasets.Split.TRAIN,
                gen_kwargs={
                    "archive_iterator": dl_manager.iter_archive(archive_path),
                    "labels_filepath": labels_path["train"],
                },
            ),
            datasets.SplitGenerator(
                name=datasets.Split.TEST,
                gen_kwargs={
                    "archive_iterator": dl_manager.iter_archive(archive_path),
                    "labels_filepath": labels_path["test"],
                },
            ),
            datasets.SplitGenerator(
                name=datasets.Split.VALIDATION,
                gen_kwargs={
                    "archive_iterator": dl_manager.iter_archive(archive_path),
                    "labels_filepath": labels_path["val"],
                },
            ),
        ]

    @staticmethod
    def _get_image_to_class_map(data):
        image_to_class_id = {}
        for item in data:
            image_path, class_id = item.split(" ")
            image_path = os.path.join(_IMAGES_DIR, image_path)
            image_to_class_id[image_path] = int(class_id)

        return image_to_class_id

    def _generate_examples(self, archive_iterator, labels_filepath):

        with open(labels_filepath, encoding="utf-8") as f:
            data = f.read().splitlines()

        image_to_class_id = self._get_image_to_class_map(data)

        for file_path, file_obj in archive_iterator:
            if file_path.startswith(_IMAGES_DIR):
                if file_path in image_to_class_id:
                    class_id = image_to_class_id[file_path]
                    label = _CLASSES[class_id]
                    yield file_path, {"image": {"path": file_path, "bytes": file_obj.read()}, "label": label}


from datasets import load_dataset

# Charger le dataset RVL-CDIP
dataset = load_dataset('rvl_cdip')

# Accéder aux données d'entraînement
train_dataset = dataset['train']
print(train_dataset)

# Accéder aux données de test
test_dataset = dataset['test']
print(test_dataset)

# Accéder aux données de validation
val_dataset = dataset['validation']
print(val_dataset)


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


Downloading builder script:   0%|          | 0.00/4.94k [00:00<?, ?B/s]

Downloading metadata:   0%|          | 0.00/2.64k [00:00<?, ?B/s]

Downloading readme:   0%|          | 0.00/6.15k [00:00<?, ?B/s]

The repository for rvl_cdip contains custom code which must be executed to correctly load the dataset. You can inspect the repository content at https://hf.co/datasets/rvl_cdip.
You can avoid this prompt in future by passing the argument `trust_remote_code=True`.

Do you wish to run the custom code? [y/N] y


Downloading data:   0%|          | 0.00/38.8G [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/13.7M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/1.72M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/1.72M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/320000 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/40000 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/40000 [00:00<?, ? examples/s]

Loading dataset shards:   0%|          | 0/64 [00:00<?, ?it/s]

Dataset({
    features: ['image', 'label'],
    num_rows: 320000
})
Dataset({
    features: ['image', 'label'],
    num_rows: 40000
})
Dataset({
    features: ['image', 'label'],
    num_rows: 40000
})


# Enregistrement des 3 datasets dans le drive

In [3]:
# Enregistrer les datasets
results_dir = '/content/drive/MyDrive/formation Datascientest/RVL-CDIP/'
train_dataset.save_to_disk(os.path.join(results_dir, 'train_dataset'))
test_dataset.save_to_disk(os.path.join(results_dir, 'test_dataset'))
val_dataset.save_to_disk(os.path.join(results_dir, 'val_dataset'))


Saving the dataset (0/78 shards):   0%|          | 0/320000 [00:00<?, ? examples/s]

Saving the dataset (0/10 shards):   0%|          | 0/40000 [00:00<?, ? examples/s]

Saving the dataset (0/10 shards):   0%|          | 0/40000 [00:00<?, ? examples/s]