In [1]:
from datasets import Dataset
from datasets import load_dataset

In [2]:
# Load the 'dair-ai/emotion' dataset. No split is requested, so the object
# shown as this cell's output is a DatasetDict with train/validation/test.
dataset_all = load_dataset('dair-ai/emotion')
dataset_all

DatasetDict({
    train: Dataset({
        features: ['text', 'label'],
        num_rows: 16000
    })
    validation: Dataset({
        features: ['text', 'label'],
        num_rows: 2000
    })
    test: Dataset({
        features: ['text', 'label'],
        num_rows: 2000
    })
})

### remove_columns / rename_column

In [3]:
# Drop the listed column(s), together with their feature definitions, from
# every split (train, validation and test) of the dataset dictionary.
# The call returns a new DatasetDict, displayed below as the cell output;
# `dataset_all` itself still contains 'text' afterwards.
dataset_all.remove_columns(column_names=['text'])

DatasetDict({
    train: Dataset({
        features: ['label'],
        num_rows: 16000
    })
    validation: Dataset({
        features: ['label'],
        num_rows: 2000
    })
    test: Dataset({
        features: ['label'],
        num_rows: 2000
    })
})

In [4]:
# Rename a column in every split of the dataset dictionary; the features that
# belonged to the old column are carried over to the new name.
# The renamed DatasetDict is only displayed here, not assigned, so later
# cells keep using the original 'label' column of `dataset_all`.
dataset_all.rename_column(
    original_column_name="label",
    new_column_name="label_new",
)

DatasetDict({
    train: Dataset({
        features: ['text', 'label_new'],
        num_rows: 16000
    })
    validation: Dataset({
        features: ['text', 'label_new'],
        num_rows: 2000
    })
    test: Dataset({
        features: ['text', 'label_new'],
        num_rows: 2000
    })
})

### filter

In [5]:
# Run the predicate over every split of the dataset dictionary and keep only
# the rows for which it returns True -- here, rows whose label equals 1.
# The filtered DatasetDict is displayed as the cell output, not assigned.
dataset_all.filter(function=lambda example: example["label"] == 1)

DatasetDict({
    train: Dataset({
        features: ['text', 'label'],
        num_rows: 5362
    })
    validation: Dataset({
        features: ['text', 'label'],
        num_rows: 704
    })
    test: Dataset({
        features: ['text', 'label'],
        num_rows: 695
    })
})

### map

In [6]:
def add_prefix(example, *, prefix="Review: "):
    """Prepend a prefix to the "text" field of a single example.

    Written for non-batched ``map`` calls, where the function receives one
    row at a time as a mapping.

    Args:
        example: Mapping with a string under the key ``"text"``. It is
            modified in place.
        prefix: String placed in front of the text. Keyword-only, so that
            extra positional arguments cannot be bound to it by accident.
            Defaults to ``"Review: "``.

    Returns:
        The same ``example`` object, with its ``"text"`` value prefixed.
    """
    example["text"] = prefix + example["text"]
    return example

# Call the function on each element of every split in the dataset dictionary
# (one example at a time here, since batching is not requested); whatever the
# function returns is used to update the corresponding rows.
# The mapped DatasetDict is displayed as the cell output, not assigned.
dataset_all.map(function=add_prefix)

DatasetDict({
    train: Dataset({
        features: ['text', 'label'],
        num_rows: 16000
    })
    validation: Dataset({
        features: ['text', 'label'],
        num_rows: 2000
    })
    test: Dataset({
        features: ['text', 'label'],
        num_rows: 2000
    })
})

In [7]:
def add_prefix1(examples, *, prefix="Review: "):
    """Prepend a prefix to every entry of the "text" column of a batch.

    Batched counterpart of the single-example version: ``examples`` maps
    each column name to a list of values.

    Args:
        examples: Mapping whose ``"text"`` entry is an iterable of strings.
            It is modified in place.
        prefix: String placed in front of each text. Keyword-only, so that
            extra positional arguments cannot be bound to it by accident.
            Defaults to ``"Review: "``.

    Returns:
        The same ``examples`` object, with ``"text"`` replaced by a new list
        of prefixed strings (an empty input yields an empty list).
    """
    examples['text'] = [prefix + text for text in examples['text']]
    return examples

# For the meaning of these parameters, see the Dataset.map method.
# The function receives batches (batch_size=8) instead of single examples,
# and the 'label' column is dropped from the result shown below.
dataset_all.map(
    add_prefix1,
    batched=True,
    batch_size=8,
    remove_columns=['label'],
)

DatasetDict({
    train: Dataset({
        features: ['text'],
        num_rows: 16000
    })
    validation: Dataset({
        features: ['text'],
        num_rows: 2000
    })
    test: Dataset({
        features: ['text'],
        num_rows: 2000
    })
})