In [4]:
from datasets import load_dataset
from datasets import load_from_disk
from datasets import DownloadConfig

In [5]:
'''
Load a dataset

- if ``path`` is a canonical dataset on the HF Hub (ex: `glue`, `squad`)
              -> load the dataset builder from the dataset script in the github repository at huggingface/datasets
              e.g. ``'squad'`` or ``'glue'``.

- if ``path`` is a local directory (but doesn't contain a dataset script)
              -> load a generic dataset builder (csv, json, text etc.) based on the content of the directory
              e.g. ``'./path/to/directory/with/my/csv/data'``.
'''
datasets_seamex = load_dataset(path='seamew/ChnSentiCorp')  # 从Hugging Face
datasets_seamex  # 类型:datasets.dataset_dict.DatasetDict

Downloading:   0%|          | 0.00/1.88k [00:00<?, ?B/s]

Using custom data configuration default


Downloading and preparing dataset chn_senti_corp/default to C:\Users\duanm\.cache\huggingface\datasets\seamew___chn_senti_corp\default\0.0.0\1f242195a37831906957a11a2985a4329167e60657c07dc95ebe266c03fdfb85...


Downloading:   0%|          | 0.00/3.03M [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/376k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/371k [00:00<?, ?B/s]

0 examples [00:00, ? examples/s]

0 examples [00:00, ? examples/s]

0 examples [00:00, ? examples/s]

Dataset chn_senti_corp downloaded and prepared to C:\Users\duanm\.cache\huggingface\datasets\seamew___chn_senti_corp\default\0.0.0\1f242195a37831906957a11a2985a4329167e60657c07dc95ebe266c03fdfb85. Subsequent calls will reuse this data.


  0%|          | 0/3 [00:00<?, ?it/s]

DatasetDict({
    train: Dataset({
        features: ['text', 'label'],
        num_rows: 9600
    })
    validation: Dataset({
        features: ['text', 'label'],
        num_rows: 1200
    })
    test: Dataset({
        features: ['text', 'label'],
        num_rows: 1200
    })
})

In [6]:
datasets_seamex['train']  # 类型:datasets.arrow_dataset.Dataset

Dataset({
    features: ['text', 'label'],
    num_rows: 9600
})

### 保存与加载

In [10]:
# Saves a dataset dict to a filesystem using either :class:`~filesystems.S3FileSystem` or ``fsspec.spec.AbstractFileSystem``.
datasets_seamex.save_to_disk("dataset")

In [11]:
# Loads a dataset that was previously saved using :meth:`Dataset.save_to_disk` from a dataset directory,
# or from a filesystem using either :class:`datasets.filesystems.S3FileSystem` or any implementation of ``fsspec.spec.AbstractFileSystem``.
dataset_load = load_from_disk("dataset")
print(dataset_load)

DatasetDict({
    train: Dataset({
        features: ['text', 'label'],
        num_rows: 9600
    })
    validation: Dataset({
        features: ['text', 'label'],
        num_rows: 1200
    })
    test: Dataset({
        features: ['text', 'label'],
        num_rows: 1200
    })
})


### 导出为其他格式

In [13]:
# Exports the dataset(即datasets.arrow_dataset.Dataset) to csv
# to_csv_kwargs: Parameters to pass to pandas's :func:`pandas.DataFrame.to_csv`
datasets_seamex['train'].to_csv('to_data/data.csv')

# Export the dataset(即datasets.arrow_dataset.Dataset) to JSON Lines or JSON.
# to_json_kwargs: Parameters to pass to pandas's `pandas.DataFrame.to_json
datasets_seamex['train'].to_json('to_data/data.json',
                                 force_ascii=False)

Creating CSV from Arrow format:   0%|          | 0/1 [00:00<?, ?ba/s]

Creating json from Arrow format:   0%|          | 0/1 [00:00<?, ?ba/s]

3202787

In [41]:
# 读取单个csv文件
load_dataset('csv', data_files='to_data/data.csv')

Using custom data configuration default-3038a9aff263dc4e
Reusing dataset csv (C:\Users\duanm\.cache\huggingface\datasets\csv\default-3038a9aff263dc4e\0.0.0\6b9057d9e23d9d8a2f05b985917a0da84d70c5dae3d22ddd8a3f22fb01c69d9e)


  0%|          | 0/1 [00:00<?, ?it/s]

DatasetDict({
    train: Dataset({
        features: ['Unnamed: 0', 'text', 'label'],
        num_rows: 9600
    })
})

In [42]:
# 读取多个csv文件
load_dataset('csv', data_files=['to_data/data.csv', 'to_data/data.csv'])

Using custom data configuration default-f061df7ddfdf13ec


Downloading and preparing dataset csv/default to C:\Users\duanm\.cache\huggingface\datasets\csv\default-f061df7ddfdf13ec\0.0.0\6b9057d9e23d9d8a2f05b985917a0da84d70c5dae3d22ddd8a3f22fb01c69d9e...


  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

Dataset csv downloaded and prepared to C:\Users\duanm\.cache\huggingface\datasets\csv\default-f061df7ddfdf13ec\0.0.0\6b9057d9e23d9d8a2f05b985917a0da84d70c5dae3d22ddd8a3f22fb01c69d9e. Subsequent calls will reuse this data.


  0%|          | 0/1 [00:00<?, ?it/s]

DatasetDict({
    train: Dataset({
        features: ['Unnamed: 0', 'text', 'label'],
        num_rows: 19200
    })
})

In [43]:
# 通过字典映射生成训练、测试、验证数据集
load_dataset('csv', data_files={'train': ['to_data/data.csv', 'to_data/data.csv'],
                                'test': 'to_data/data.csv',
                                'valid': 'to_data/data.csv'})

Using custom data configuration default-93b712f1277dbbd6


Downloading and preparing dataset csv/default to C:\Users\duanm\.cache\huggingface\datasets\csv\default-93b712f1277dbbd6\0.0.0\6b9057d9e23d9d8a2f05b985917a0da84d70c5dae3d22ddd8a3f22fb01c69d9e...


  0%|          | 0/3 [00:00<?, ?it/s]

  0%|          | 0/3 [00:00<?, ?it/s]

Dataset csv downloaded and prepared to C:\Users\duanm\.cache\huggingface\datasets\csv\default-93b712f1277dbbd6\0.0.0\6b9057d9e23d9d8a2f05b985917a0da84d70c5dae3d22ddd8a3f22fb01c69d9e. Subsequent calls will reuse this data.


  0%|          | 0/3 [00:00<?, ?it/s]

DatasetDict({
    train: Dataset({
        features: ['Unnamed: 0', 'text', 'label'],
        num_rows: 19200
    })
    test: Dataset({
        features: ['Unnamed: 0', 'text', 'label'],
        num_rows: 9600
    })
    valid: Dataset({
        features: ['Unnamed: 0', 'text', 'label'],
        num_rows: 9600
    })
})

In [40]:
# 读取单个json文件
load_dataset('json', data_files='to_data/data.json')

Using custom data configuration default-fd60eda23f43650a
Reusing dataset json (C:\Users\duanm\.cache\huggingface\datasets\json\default-fd60eda23f43650a\0.0.0\ac0ca5f5289a6cf108e706efcf040422dbbfa8e658dee6a819f20d76bb84d26b)


  0%|          | 0/1 [00:00<?, ?it/s]

DatasetDict({
    train: Dataset({
        features: ['text', 'label'],
        num_rows: 9600
    })
})

In [44]:
# 读取多个json文件
load_dataset('json', data_files=['to_data/data.json', 'to_data/data.json'])

Using custom data configuration default-a80b7c09aa2bfcb5


Downloading and preparing dataset json/default to C:\Users\duanm\.cache\huggingface\datasets\json\default-a80b7c09aa2bfcb5\0.0.0\ac0ca5f5289a6cf108e706efcf040422dbbfa8e658dee6a819f20d76bb84d26b...


  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

Dataset json downloaded and prepared to C:\Users\duanm\.cache\huggingface\datasets\json\default-a80b7c09aa2bfcb5\0.0.0\ac0ca5f5289a6cf108e706efcf040422dbbfa8e658dee6a819f20d76bb84d26b. Subsequent calls will reuse this data.


  0%|          | 0/1 [00:00<?, ?it/s]

DatasetDict({
    train: Dataset({
        features: ['text', 'label'],
        num_rows: 19200
    })
})

In [45]:
# 通过字典映射生成训练、测试、验证数据集
load_dataset('json', data_files={'train': ['to_data/data.json', 'to_data/data.json'],
                                 'test': 'to_data/data.json',
                                 'valid': 'to_data/data.json'})

Using custom data configuration default-2b072cf09a41560a


Downloading and preparing dataset json/default to C:\Users\duanm\.cache\huggingface\datasets\json\default-2b072cf09a41560a\0.0.0\ac0ca5f5289a6cf108e706efcf040422dbbfa8e658dee6a819f20d76bb84d26b...


  0%|          | 0/3 [00:00<?, ?it/s]

  0%|          | 0/3 [00:00<?, ?it/s]

Dataset json downloaded and prepared to C:\Users\duanm\.cache\huggingface\datasets\json\default-2b072cf09a41560a\0.0.0\ac0ca5f5289a6cf108e706efcf040422dbbfa8e658dee6a819f20d76bb84d26b. Subsequent calls will reuse this data.


  0%|          | 0/3 [00:00<?, ?it/s]

DatasetDict({
    train: Dataset({
        features: ['text', 'label'],
        num_rows: 19200
    })
    test: Dataset({
        features: ['text', 'label'],
        num_rows: 9600
    })
    valid: Dataset({
        features: ['text', 'label'],
        num_rows: 9600
    })
})