# Create a Huggingface dataset
This notebook is dedicated to show how to load the dataset using Huggingface

## Import the dataset csv files and create separate datasets

Document based dataset

In [4]:
from datasets import load_dataset, Features, Value, Sequence, ClassLabel

features = Features({
    "file_name": Value("string"),
    "Tokens": Sequence(Value("string")),
    # a list of ints (e.g. JSON array in the CSV like: "[1,2,3]")
    "ner_tags": Sequence(Value("int64")),
    "Labels": Sequence(Value("string")),
    "number_of_tokens": Value("int64"),
    "Language": Value("string"),
    "source": Value("string"),
    "Label_counts": Value("string"),
    "number_of_annotations": Value("int64"),
    "sentence_id": Value("string"),
    "doi": Value("string")
})

In [2]:
from datasets import load_dataset

dataset_files = load_dataset(
    "csv",
    data_files={
        "train": "/home/abdelmalak/Documents/FAIRagro/uc_repo/repo/pilot-uc-textmining-metadata/code/corpus_creation/output/train_files_corpus.csv",
        "test": "/home/abdelmalak/Documents/FAIRagro/uc_repo/repo/pilot-uc-textmining-metadata/code/corpus_creation/output/test_files_corpus.csv"
    }
)

Generating train split: 0 examples [00:00, ? examples/s]

Generating test split: 0 examples [00:00, ? examples/s]

In [3]:
dataset_files

DatasetDict({
    train: Dataset({
        features: ['file_name', 'Tokens', 'ner_tags', 'Labels', 'number_of_tokens', 'Language', 'source', 'Label_counts', 'number_of_annotations', 'doi', 'Text'],
        num_rows: 318
    })
    test: Dataset({
        features: ['file_name', 'Tokens', 'ner_tags', 'Labels', 'number_of_tokens', 'Language', 'source', 'Label_counts', 'number_of_annotations', 'doi', 'Text'],
        num_rows: 31
    })
})

Sentence based dataset

In [5]:
dataset_sentences = load_dataset(
    "csv",
    data_files={
        "train": "/home/abdelmalak/Documents/FAIRagro/uc_repo/repo/pilot-uc-textmining-metadata/code/corpus_creation/output/train_sentence_corpus.csv",
        "test": "/home/abdelmalak/Documents/FAIRagro/uc_repo/repo/pilot-uc-textmining-metadata/code/corpus_creation/output/test_sentence_corpus.csv"
    }
)

Generating train split: 0 examples [00:00, ? examples/s]

Generating test split: 0 examples [00:00, ? examples/s]

In [9]:
import json
import ast
def parse_lists(example):
    # Only parse when the raw is a string
    v = example["Labels"]
    if isinstance(v, str):
        example["Labels"] = ast.literal_eval(example["Labels"])
        example["Tokens"] = ast.literal_eval(example["Tokens"])
        example["ner_tags"] = ast.literal_eval(example["ner_tags"])   # becomes a real list
    return example

dataset_dict = dataset_sentences.map(parse_lists)
dataset_dict = dataset_dict.cast(features)  # now enforce the schema

Map:   0%|          | 0/318 [00:00<?, ? examples/s]

ValueError: malformed node or string: None

In [7]:
dataset_dict['train'][0]['Tokens']

['Title',
 ':',
 '\n',
 '2020_Helfrich_AGEE',
 '\n\n',
 'Abstract',
 ':',
 '\n',
 'Datenmeldung',
 'nach',
 'dem',
 'EGovG.']

## Save the datasets locally

In [8]:
#dataset_dict.save_to_disk("UC_dataset_fairagro/sentence_split")


Saving the dataset (0/1 shards):   0%|          | 0/2722 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/319 [00:00<?, ? examples/s]

In [10]:
dataset_files.save_to_disk("UC_dataset_fairagro/document_split")

Saving the dataset (0/1 shards):   0%|          | 0/318 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/31 [00:00<?, ? examples/s]

## Load and use the dataset differnet versions using Huggingface

In [38]:
# Later load by “version name”
from datasets import load_from_disk
sentence_split = load_from_disk("UC_dataset_IAA/sentence_split")
#document_split = load_from_disk("my_ner_dataset/document_split")

In [39]:
for i in sentence_split["train"]['Labels']:
    if "B-soilDepth" in i:
        print(i)

In [41]:
print(len(sentence_split["train"][26]["Labels"]))

48
