In [1]:
from datasets import load_dataset
from datasets import load_from_disk

In [2]:
'''
Load a dataset

- if ``path`` is a canonical dataset on the HF Hub (ex: `glue`, `squad`)
              -> load the dataset builder from the dataset script in the github repository at huggingface/datasets
              e.g. ``'squad'`` or ``'glue'``.

- if ``path`` is a local directory (but doesn't contain a dataset script)
              -> load a generic dataset builder (csv, json, text etc.) based on the content of the directory
              e.g. ``'./path/to/directory/with/my/csv/data'``.
'''
datasets_emotion = load_dataset(path='emotion')  # 从Hugging Face加载emotion数据集
datasets_emotion

Using custom data configuration default
Reusing dataset emotion (C:\Users\dcdmm\.cache\huggingface\datasets\emotion\default\0.0.0\348f63ca8e27b3713b6c04d723efe6d824a56fb3d1449794716c0f0296072705)


  0%|          | 0/3 [00:00<?, ?it/s]

DatasetDict({
    train: Dataset({
        features: ['text', 'label'],
        num_rows: 16000
    })
    validation: Dataset({
        features: ['text', 'label'],
        num_rows: 2000
    })
    test: Dataset({
        features: ['text', 'label'],
        num_rows: 2000
    })
})

### 保存与加载

In [26]:
# Saves a dataset dict to a filesystem using either :class:`~filesystems.S3FileSystem` or ``fsspec.spec.AbstractFileSystem``.
datasets_emotion.save_to_disk("dataset")

In [27]:
# Loads a dataset that was previously saved using :meth:`Dataset.save_to_disk` from a dataset directory,
# or from a filesystem using either :class:`datasets.filesystems.S3FileSystem` or any implementation of ``fsspec.spec.AbstractFileSystem``.
dataset_load = load_from_disk("dataset")
print(dataset_load)

DatasetDict({
    train: Dataset({
        features: ['text', 'label'],
        num_rows: 16000
    })
    validation: Dataset({
        features: ['text', 'label'],
        num_rows: 2000
    })
    test: Dataset({
        features: ['text', 'label'],
        num_rows: 2000
    })
})


### 导出为其他格式

In [28]:
# Exports the dataset(即datasets.arrow_dataset.Dataset) to csv
# to_csv_kwargs: Parameters to pass to pandas's :func:`pandas.DataFrame.to_csv`
datasets_emotion['train'].to_csv('to_data/data.csv')

# Export the dataset(即datasets.arrow_dataset.Dataset) to JSON Lines or JSON.
# to_json_kwargs: Parameters to pass to pandas's `pandas.DataFrame.to_json
datasets_emotion['train'].to_json('to_data/data.json',
                                  force_ascii=False)

Creating CSV from Arrow format:   0%|          | 0/2 [00:00<?, ?ba/s]

Creating json from Arrow format:   0%|          | 0/2 [00:00<?, ?ba/s]

1901533

In [38]:
# Returns the dataset as a :class:`pandas.DataFrame`.
datasets_emotion['train'].to_pandas()

Unnamed: 0,text,label
0,i didnt feel humiliated,0
1,i can go from feeling so hopeless to so damned...,0
2,im grabbing a minute to post i feel greedy wrong,3
3,i am ever feeling nostalgic about the fireplac...,2
4,i am feeling grouchy,3
...,...,...
15995,i just had a very brief time in the beanbag an...,0
15996,i am now turning and i feel pathetic that i am...,0
15997,i feel strong and good overall,1
15998,i feel like this was such a rude comment and i...,3


In [29]:
# 读取单个csv文件
load_dataset('csv', data_files='to_data/data.csv')

Using custom data configuration default-71cba94fda45750b


Downloading and preparing dataset csv/default to C:\Users\dcdmm\.cache\huggingface\datasets\csv\default-71cba94fda45750b\0.0.0\433e0ccc46f9880962cc2b12065189766fbb2bee57a221866138fb9203c83519...


Downloading data files:   0%|          | 0/1 [00:00<?, ?it/s]

Extracting data files:   0%|          | 0/1 [00:00<?, ?it/s]

Dataset csv downloaded and prepared to C:\Users\dcdmm\.cache\huggingface\datasets\csv\default-71cba94fda45750b\0.0.0\433e0ccc46f9880962cc2b12065189766fbb2bee57a221866138fb9203c83519. Subsequent calls will reuse this data.


  0%|          | 0/1 [00:00<?, ?it/s]

DatasetDict({
    train: Dataset({
        features: ['Unnamed: 0', 'text', 'label'],
        num_rows: 16000
    })
})

In [30]:
# 读取多个csv文件
load_dataset('csv', data_files=['to_data/data.csv', 'to_data/data.csv'])

Using custom data configuration default-56fbc80cf7cca551


Downloading and preparing dataset csv/default to C:\Users\dcdmm\.cache\huggingface\datasets\csv\default-56fbc80cf7cca551\0.0.0\433e0ccc46f9880962cc2b12065189766fbb2bee57a221866138fb9203c83519...


Downloading data files:   0%|          | 0/1 [00:00<?, ?it/s]

Extracting data files:   0%|          | 0/1 [00:00<?, ?it/s]

Dataset csv downloaded and prepared to C:\Users\dcdmm\.cache\huggingface\datasets\csv\default-56fbc80cf7cca551\0.0.0\433e0ccc46f9880962cc2b12065189766fbb2bee57a221866138fb9203c83519. Subsequent calls will reuse this data.


  0%|          | 0/1 [00:00<?, ?it/s]

DatasetDict({
    train: Dataset({
        features: ['Unnamed: 0', 'text', 'label'],
        num_rows: 32000
    })
})

In [31]:
# 通过字典映射生成训练、测试、验证数据集
load_dataset('csv', data_files={'train': ['to_data/data.csv', 'to_data/data.csv'],
                                'test': 'to_data/data.csv',
                                'valid': 'to_data/data.csv'})

Using custom data configuration default-c8169aba93be8fe1


Downloading and preparing dataset csv/default to C:\Users\dcdmm\.cache\huggingface\datasets\csv\default-c8169aba93be8fe1\0.0.0\433e0ccc46f9880962cc2b12065189766fbb2bee57a221866138fb9203c83519...


Downloading data files:   0%|          | 0/3 [00:00<?, ?it/s]

Extracting data files:   0%|          | 0/3 [00:00<?, ?it/s]

Dataset csv downloaded and prepared to C:\Users\dcdmm\.cache\huggingface\datasets\csv\default-c8169aba93be8fe1\0.0.0\433e0ccc46f9880962cc2b12065189766fbb2bee57a221866138fb9203c83519. Subsequent calls will reuse this data.


  0%|          | 0/3 [00:00<?, ?it/s]

DatasetDict({
    train: Dataset({
        features: ['Unnamed: 0', 'text', 'label'],
        num_rows: 32000
    })
    test: Dataset({
        features: ['Unnamed: 0', 'text', 'label'],
        num_rows: 16000
    })
    valid: Dataset({
        features: ['Unnamed: 0', 'text', 'label'],
        num_rows: 16000
    })
})

In [32]:
# 读取单个json文件
load_dataset('json', data_files='to_data/data.json')

Using custom data configuration default-12dfef594b53d87e


Downloading and preparing dataset json/default to C:\Users\dcdmm\.cache\huggingface\datasets\json\default-12dfef594b53d87e\0.0.0\ac0ca5f5289a6cf108e706efcf040422dbbfa8e658dee6a819f20d76bb84d26b...


Downloading data files:   0%|          | 0/1 [00:00<?, ?it/s]

Extracting data files:   0%|          | 0/1 [00:00<?, ?it/s]

Dataset json downloaded and prepared to C:\Users\dcdmm\.cache\huggingface\datasets\json\default-12dfef594b53d87e\0.0.0\ac0ca5f5289a6cf108e706efcf040422dbbfa8e658dee6a819f20d76bb84d26b. Subsequent calls will reuse this data.


  0%|          | 0/1 [00:00<?, ?it/s]

DatasetDict({
    train: Dataset({
        features: ['text', 'label'],
        num_rows: 16000
    })
})

In [33]:
# 读取多个json文件
load_dataset('json', data_files=['to_data/data.json', 'to_data/data.json'])

Using custom data configuration default-5a8306309f94e248


Downloading and preparing dataset json/default to C:\Users\dcdmm\.cache\huggingface\datasets\json\default-5a8306309f94e248\0.0.0\ac0ca5f5289a6cf108e706efcf040422dbbfa8e658dee6a819f20d76bb84d26b...


Downloading data files:   0%|          | 0/1 [00:00<?, ?it/s]

Extracting data files:   0%|          | 0/1 [00:00<?, ?it/s]

Dataset json downloaded and prepared to C:\Users\dcdmm\.cache\huggingface\datasets\json\default-5a8306309f94e248\0.0.0\ac0ca5f5289a6cf108e706efcf040422dbbfa8e658dee6a819f20d76bb84d26b. Subsequent calls will reuse this data.


  0%|          | 0/1 [00:00<?, ?it/s]

DatasetDict({
    train: Dataset({
        features: ['text', 'label'],
        num_rows: 32000
    })
})

In [34]:
# 通过字典映射生成训练、测试、验证数据集
load_dataset('json', data_files={'train': ['to_data/data.json', 'to_data/data.json'],
                                 'test': 'to_data/data.json',
                                 'valid': 'to_data/data.json'})

Using custom data configuration default-3efc3456a63357c1


Downloading and preparing dataset json/default to C:\Users\dcdmm\.cache\huggingface\datasets\json\default-3efc3456a63357c1\0.0.0\ac0ca5f5289a6cf108e706efcf040422dbbfa8e658dee6a819f20d76bb84d26b...


Downloading data files:   0%|          | 0/3 [00:00<?, ?it/s]

Extracting data files:   0%|          | 0/3 [00:00<?, ?it/s]

Dataset json downloaded and prepared to C:\Users\dcdmm\.cache\huggingface\datasets\json\default-3efc3456a63357c1\0.0.0\ac0ca5f5289a6cf108e706efcf040422dbbfa8e658dee6a819f20d76bb84d26b. Subsequent calls will reuse this data.


  0%|          | 0/3 [00:00<?, ?it/s]

DatasetDict({
    train: Dataset({
        features: ['text', 'label'],
        num_rows: 32000
    })
    test: Dataset({
        features: ['text', 'label'],
        num_rows: 16000
    })
    valid: Dataset({
        features: ['text', 'label'],
        num_rows: 16000
    })
})

In [47]:
# 使用分号分隔(与csv文件非常相似)
with open('external_data/train.txt') as f:
    for line in f.readlines()[:10]:
        print(line, end='')

i didnt feel humiliated;sadness
i can go from feeling so hopeless to so damned hopeful just from being around someone who cares and is awake;sadness
im grabbing a minute to post i feel greedy wrong;anger
i am ever feeling nostalgic about the fireplace i will know that it is still on the property;love
i am feeling grouchy;anger
ive been feeling a little burdened lately wasnt sure why that was;sadness
ive been taking or milligrams or times recommended amount and ive fallen asleep a lot faster but i also feel like so funny;surprise
i feel as confused about life as a teenager or as jaded as a year old man;fear
i have been with petronas for years i feel that petronas has performed well and made a huge profit;joy
i feel romantic too;love


In [50]:
emotions_local = load_dataset("csv",
                              data_files="external_data/train.txt",
                              sep=";",  # 指定分隔符为';'
                              names=["text", "label"])
emotions_local

Using custom data configuration default-0a7d7c33a8630fdd
Reusing dataset csv (C:\Users\dcdmm\.cache\huggingface\datasets\csv\default-0a7d7c33a8630fdd\0.0.0\433e0ccc46f9880962cc2b12065189766fbb2bee57a221866138fb9203c83519)


  0%|          | 0/1 [00:00<?, ?it/s]

DatasetDict({
    train: Dataset({
        features: ['text', 'label'],
        num_rows: 16000
    })
})