In [27]:
from datasets import load_dataset, DatasetDict
from datasets import Dataset
import pyarrow as pa

In [28]:
'''
Load a dataset

- if ``path`` is a canonical dataset on the HF Hub (ex: `glue`, `squad`)
              -> load the dataset builder from the dataset script in the github repository at huggingface/datasets
              e.g. ``'squad'`` or ``'glue'``.

- if ``path`` is a local directory (but doesn't contain a dataset script)
              -> load a generic dataset builder (csv, json, text etc.) based on the content of the directory
              e.g. ``'./path/to/directory/with/my/csv/data'``.
'''
dataset_all = load_dataset(path='dair-ai/emotion')  # 从Hugging Face加载dair-ai/emotion数据集
# 类型:datasets.dataset_dict.DatasetDict
dataset_all  # 类似Python字典

DatasetDict({
    train: Dataset({
        features: ['text', 'label'],
        num_rows: 16000
    })
    validation: Dataset({
        features: ['text', 'label'],
        num_rows: 2000
    })
    test: Dataset({
        features: ['text', 'label'],
        num_rows: 2000
    })
})

In [29]:
load_dataset(path='glue', name='mrpc')  # 从Hugging Face加载glue下的mrpc数据集

DatasetDict({
    train: Dataset({
        features: ['sentence1', 'sentence2', 'label', 'idx'],
        num_rows: 3668
    })
    validation: Dataset({
        features: ['sentence1', 'sentence2', 'label', 'idx'],
        num_rows: 408
    })
    test: Dataset({
        features: ['sentence1', 'sentence2', 'label', 'idx'],
        num_rows: 1725
    })
})

In [30]:
dataset_all.shape

{'train': (16000, 2), 'validation': (2000, 2), 'test': (2000, 2)}

In [31]:
dataset_all.data

{'train': MemoryMappedTable
 text: string
 label: int64
 ----
 text: [["i didnt feel humiliated","i can go from feeling so hopeless to so damned hopeful just from being around someone who cares and is awake","im grabbing a minute to post i feel greedy wrong","i am ever feeling nostalgic about the fireplace i will know that it is still on the property","i am feeling grouchy",...,"i should have been depressed but i was actually feeling inspired","i feel like not enough people my age actually think that most are pretty devastated that their s have come and gone","i get home i laze around in my pajamas feeling grouchy","i am feeling pretty homesick this weekend","i started out feeling really optimistic and driven for this paper coz it was gonna teach me the meaning and ways of being a leader"],["i need to do the best i possibly can do and even when i get out at i feel too listless to study like right now","i drove us to the car parts place and terry feels like im safe to drive again so yip

In [32]:
dataset_all.column_names

{'train': ['text', 'label'],
 'validation': ['text', 'label'],
 'test': ['text', 'label']}

In [33]:
# Remove one or several column(s) from each split in the dataset
dataset_all.remove_columns(['text'])  # train,validation,test都改变

DatasetDict({
    train: Dataset({
        features: ['label'],
        num_rows: 16000
    })
    validation: Dataset({
        features: ['label'],
        num_rows: 2000
    })
    test: Dataset({
        features: ['label'],
        num_rows: 2000
    })
})

In [34]:
train_dataset_arrow = Dataset(pa.Table.from_pydict(dataset_all['train'][:1000]))
validation_dataset_arrow = Dataset(pa.Table.from_pydict(dataset_all['validation'][:100]))
test_dataset_arrow = Dataset(pa.Table.from_pydict(dataset_all['test'][:100]))

# 从Dataset构造新的字典
new_dataset_all = DatasetDict({'train1': train_dataset_arrow,
                               'validation1': validation_dataset_arrow,
                               'test1': test_dataset_arrow})
new_dataset_all

DatasetDict({
    train1: Dataset({
        features: ['text', 'label'],
        num_rows: 1000
    })
    validation1: Dataset({
        features: ['text', 'label'],
        num_rows: 100
    })
    test1: Dataset({
        features: ['text', 'label'],
        num_rows: 100
    })
})