In [1]:
from datasets import load_dataset
from datasets import load_from_disk

In [2]:
# The bare string below is an ordinary expression statement used as a note; it
# describes how `load_dataset` interprets its `path` argument.
# NOTE(review): the wording appears to be quoted from the `load_dataset`
# docstring -- confirm it still matches the installed `datasets` version.
'''
Load a dataset

- if ``path`` is a canonical dataset on the HF Hub (ex: `glue`, `squad`)
              -> load the dataset builder from the dataset script in the github repository at huggingface/datasets
              e.g. ``'squad'`` or ``'glue'``.

- if ``path`` is a local directory (but doesn't contain a dataset script)
              -> load a generic dataset builder (csv, json, text etc.) based on the content of the directory
              e.g. ``'./path/to/directory/with/my/csv/data'``.
'''
# Keep the result in `datasets_emotion`; a later cell writes this object to disk.
datasets_emotion = load_dataset(path='dair-ai/emotion')  # Load the dair-ai/emotion dataset from Hugging Face
# Bare name as the last line so the notebook displays the loaded DatasetDict
# (the recorded output lists train / validation / test splits).
datasets_emotion

Using the latest cached version of the dataset since dair-ai/emotion couldn't be found on the Hugging Face Hub
Found the latest cached dataset configuration 'split' at C:\Users\duanm\.cache\huggingface\datasets\dair-ai___emotion\split\0.0.0\cab853a1dbdf4c42c2b3ef2173804746df8825fe (last modified on Thu Dec 19 15:18:36 2024).


DatasetDict({
    train: Dataset({
        features: ['text', 'label'],
        num_rows: 16000
    })
    validation: Dataset({
        features: ['text', 'label'],
        num_rows: 2000
    })
    test: Dataset({
        features: ['text', 'label'],
        num_rows: 2000
    })
})

In [3]:
# Fetch the `mrpc` configuration of the `glue` dataset from Hugging Face.
# The call is the cell's last expression, so the notebook displays the result.
load_dataset(
    path='glue',
    name='mrpc',
)

Using the latest cached version of the dataset since glue couldn't be found on the Hugging Face Hub
Found the latest cached dataset configuration 'mrpc' at C:\Users\duanm\.cache\huggingface\datasets\glue\mrpc\0.0.0\bcdcba79d07bc864c1c254ccfcedcce55bcc9a8c (last modified on Tue Oct 29 15:55:44 2024).


DatasetDict({
    train: Dataset({
        features: ['sentence1', 'sentence2', 'label', 'idx'],
        num_rows: 3668
    })
    validation: Dataset({
        features: ['sentence1', 'sentence2', 'label', 'idx'],
        num_rows: 408
    })
    test: Dataset({
        features: ['sentence1', 'sentence2', 'label', 'idx'],
        num_rows: 1725
    })
})

In [4]:
# Write the whole DatasetDict held in `datasets_emotion` to a directory given
# by a relative local path; the recorded progress bars show one save per split.
# Per the original note, the target may also be a remote filesystem
# (`S3FileSystem` or any `fsspec.spec.AbstractFileSystem`); that is not used here.
datasets_emotion.save_to_disk("../extra_dataset/save_datasets_DatasetDict")

Saving the dataset (0/1 shards):   0%|          | 0/16000 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/2000 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/2000 [00:00<?, ? examples/s]

In [5]:
# Read back the dataset that was previously written with `save_to_disk`,
# pointing at the same relative directory, then print it so the splits and
# row counts can be compared with the original DatasetDict.
# Per the original note, the source may also be a remote filesystem
# (`S3FileSystem` or any `fsspec.spec.AbstractFileSystem`); only a local path
# is used here.
dataset_load = load_from_disk(
    "../extra_dataset/save_datasets_DatasetDict"
)
print(dataset_load)

DatasetDict({
    train: Dataset({
        features: ['text', 'label'],
        num_rows: 16000
    })
    validation: Dataset({
        features: ['text', 'label'],
        num_rows: 2000
    })
    test: Dataset({
        features: ['text', 'label'],
        num_rows: 2000
    })
})


In [6]:
# Read a single CSV file through the 'csv' loader.
# No split name is supplied; the recorded output shows a DatasetDict whose
# only split is 'train'.
load_dataset(
    'csv',
    data_files='../extra_dataset/to_xxx/data.csv',
)  # DatasetDict

DatasetDict({
    train: Dataset({
        features: ['text', 'label'],
        num_rows: 16000
    })
})

In [7]:
# Read several CSV files at once by passing a list to `data_files`.
# The same file is listed twice here, which is why the recorded output
# reports 32000 rows under 'train' (double the single-file 16000).
load_dataset(
    'csv',
    data_files=[
        '../extra_dataset/to_xxx/data.csv',
        '../extra_dataset/to_xxx/data.csv',
    ],
)  # DatasetDict

DatasetDict({
    train: Dataset({
        features: ['text', 'label'],
        num_rows: 32000
    })
})

In [8]:
# Build train / test / validation splits by mapping split names to CSV files.
# Each dict key becomes a split in the resulting DatasetDict; a value may be
# one path or a list of paths (the 'train' split lists the file twice).
load_dataset(
    'csv',
    data_files={
        'train': [
            '../extra_dataset/to_xxx/data.csv',
            '../extra_dataset/to_xxx/data.csv',
        ],
        'test': '../extra_dataset/to_xxx/data.csv',
        'valid': '../extra_dataset/to_xxx/data.csv',
    },
)  # DatasetDict

DatasetDict({
    train: Dataset({
        features: ['text', 'label'],
        num_rows: 32000
    })
    test: Dataset({
        features: ['text', 'label'],
        num_rows: 16000
    })
    valid: Dataset({
        features: ['text', 'label'],
        num_rows: 16000
    })
})

In [9]:
# Read a single JSON file through the 'json' loader.
# No split name is supplied; the recorded output shows a DatasetDict whose
# only split is 'train'.
load_dataset(
    'json',
    data_files='../extra_dataset/to_xxx/data.json',
)  # DatasetDict


DatasetDict({
    train: Dataset({
        features: ['text', 'label'],
        num_rows: 16000
    })
})

In [10]:
# Read several JSON files at once by passing a list to `data_files`.
# The same file is listed twice here, which is why the recorded output
# reports 32000 rows under 'train' (double the single-file 16000).
load_dataset(
    'json',
    data_files=[
        '../extra_dataset/to_xxx/data.json',
        '../extra_dataset/to_xxx/data.json',
    ],
)  # DatasetDict

DatasetDict({
    train: Dataset({
        features: ['text', 'label'],
        num_rows: 32000
    })
})

In [11]:
# Build train / test / validation splits by mapping split names to JSON files.
# Each dict key becomes a split in the resulting DatasetDict; a value may be
# one path or a list of paths (the 'train' split lists the file twice).
load_dataset(
    'json',
    data_files={
        'train': [
            '../extra_dataset/to_xxx/data.json',
            '../extra_dataset/to_xxx/data.json',
        ],
        'test': '../extra_dataset/to_xxx/data.json',
        'valid': '../extra_dataset/to_xxx/data.json',
    },
)  # DatasetDict

DatasetDict({
    train: Dataset({
        features: ['text', 'label'],
        num_rows: 32000
    })
    test: Dataset({
        features: ['text', 'label'],
        num_rows: 16000
    })
    valid: Dataset({
        features: ['text', 'label'],
        num_rows: 16000
    })
})