# Create a Huggingface dataset
This notebook is dedicated to show how to load the dataset using Huggingface

## Import the dataset csv files and create separate datasets

Document based dataset

In [5]:
from datasets import load_dataset, Features, Value, Sequence, ClassLabel

features = Features({
    "file_name": Value("string"),
    "Tokens": Sequence(Value("string")),
    # a list of ints (e.g. JSON array in the CSV like: "[1,2,3]")
    "ner_tags": Sequence(Value("int64")),
    "Labels": Sequence(Value("string")),
    "number_of_tokens": Value("int64"),
    "Language": Value("string"),
    "source": Value("string"),
    "Label_counts": Value("string"),
    "number_of_annotations": Value("int64"),
    "sentence_id": Value("string")
})

In [4]:
from datasets import load_dataset

dataset_files = load_dataset(
    "csv",
    data_files={
        "train": "/home/abdelmalak/Documents/FAIRagro/uc_repo/repo/pilot-uc-textmining-metadata/code/corpus_creation/train_files_corpus.csv",
        "test": "/home/abdelmalak/Documents/FAIRagro/uc_repo/repo/pilot-uc-textmining-metadata/code/corpus_creation/test_files_corpus.csv"
    },
      features=features  # optional if not standard comma
)

Generating train split: 0 examples [00:00, ? examples/s]


DatasetGenerationError: An error occurred while generating the dataset

In [2]:
dataset_files

DatasetDict({
    train: Dataset({
        features: ['file_name', 'Tokens', 'ner_tags', 'Labels', 'number_of_tokens', 'Language', 'source', 'Label_counts', 'number_of_annotations'],
        num_rows: 318
    })
    test: Dataset({
        features: ['file_name', 'Tokens', 'ner_tags', 'Labels', 'number_of_tokens', 'Language', 'source', 'Label_counts', 'number_of_annotations'],
        num_rows: 31
    })
})

Sentence based dataset

In [7]:
dataset_sentences = load_dataset(
    "csv",
    data_files={
        "train": "/home/abdelmalak/Documents/FAIRagro/uc_repo/repo/pilot-uc-textmining-metadata/code/corpus_creation/train_sentence_corpus.csv",
        "test": "/home/abdelmalak/Documents/FAIRagro/uc_repo/repo/pilot-uc-textmining-metadata/code/corpus_creation/test_sentence_corpus.csv"
    }
)

In [10]:
import json
import ast
def parse_lists(example):
    # Only parse when the raw is a string
    v = example["Labels"]
    if isinstance(v, str):
        example["Labels"] = ast.literal_eval(example["Labels"])
        example["Tokens"] = ast.literal_eval(example["Tokens"])
        example["ner_tags"] = ast.literal_eval(example["ner_tags"])   # becomes a real list
    return example

dataset_dict = dataset_sentences.map(parse_lists)
dataset_dict = dataset_dict.cast(features)  # now enforce the schema

Map: 100%|██████████| 2722/2722 [00:00<00:00, 3272.96 examples/s]
Map: 100%|██████████| 319/319 [00:00<00:00, 3682.92 examples/s]
Casting the dataset: 100%|██████████| 2722/2722 [00:00<00:00, 269622.51 examples/s]
Casting the dataset: 100%|██████████| 319/319 [00:00<00:00, 43779.30 examples/s]


In [15]:
dataset_dict['train'][0]['Labels']

['O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O']

## Save the datasets locally

In [17]:
dataset_dict.save_to_disk("my_ner_dataset/sentence_split")
#dataset_files.save_to_disk("my_ner_dataset/document_split")

Saving the dataset (1/1 shards): 100%|██████████| 2722/2722 [00:00<00:00, 202344.71 examples/s]
Saving the dataset (0/1 shards):   0%|          | 0/319 [00:00<?, ? examples/s]

Saving the dataset (1/1 shards): 100%|██████████| 319/319 [00:00<00:00, 29256.40 examples/s]


## Load and use the dataset differnet versions using Huggingface

In [12]:
# Later load by “version name”
from datasets import load_from_disk
sentence_split = load_from_disk("my_ner_dataset/sentence_split")
document_split = load_from_disk("my_ner_dataset/document_split")