# Create a Huggingface dataset
This notebook shows how to build a Hugging Face dataset from the corpus CSV files, publish it, and load it again.

## Import the dataset csv files and create separate datasets

Document based dataset

In [1]:
from datasets import load_dataset, Features, Value, Sequence, ClassLabel

# Target schema for the sentence-level corpus. It is applied with `cast` after
# the list-valued columns have been parsed from their string form.
features = Features(
    {
        # Name of the source document, e.g. '98412.txt'.
        "file_name": Value("string"),
        # Token strings of the sentence.
        "Tokens": Sequence(Value("string")),
        # Integer-encoded tags, stored as a list of ints.
        "ner_tags": Sequence(Value("int64")),
        # String labels such as 'O'.
        "Labels": Sequence(Value("string")),
        # Per-row bookkeeping columns.
        "number_of_tokens": Value("int64"),
        "Language": Value("string"),
        "source": Value("string"),
        # Kept as plain text (e.g. 'Counter()'); it is not parsed.
        "Label_counts": Value("string"),
        "number_of_annotations": Value("int64"),
        "doi": Value("string"),
        # Sentence identifier, e.g. '98412-0'.
        "sentence_id": Value("string"),
    }
)

In [2]:
from datasets import load_dataset

import os
from pathlib import Path

# Directory that holds the exported corpus CSV files. Set the
# CORPUS_DATASET_DIR environment variable to point at your own checkout;
# the fallback is the location this notebook was originally run from.
DATASET_FILES_DIR = Path(
    os.environ.get(
        "CORPUS_DATASET_DIR",
        "/home/abdelmalak/Documents/FAIRagro/uc_repo/repo/pilot-uc-textmining-metadata/code/corpus_creation/dataset_files",
    )
)

# Document-based corpus, loaded from CSV into train/test splits.
dataset_files = load_dataset(
    "csv",
    data_files={
        "train": str(DATASET_FILES_DIR / "train_files_corpus.csv"),
        "test": str(DATASET_FILES_DIR / "test_files_corpus.csv"),
    },
)

Generating train split: 0 examples [00:00, ? examples/s]

Generating test split: 0 examples [00:00, ? examples/s]

In [3]:
# Show the document-based DatasetDict: splits, column names and row counts.
dataset_files

DatasetDict({
    train: Dataset({
        features: ['file_name', 'Tokens', 'ner_tags', 'Labels', 'number_of_tokens', 'Language', 'source', 'Label_counts', 'number_of_annotations', 'doi', 'Text'],
        num_rows: 318
    })
    test: Dataset({
        features: ['file_name', 'Tokens', 'ner_tags', 'Labels', 'number_of_tokens', 'Language', 'source', 'Label_counts', 'number_of_annotations', 'doi', 'Text'],
        num_rows: 31
    })
})

Sentence based dataset

In [2]:
import os
from pathlib import Path

# Directory that holds the exported corpus CSV files. Set the
# CORPUS_DATASET_DIR environment variable to point at your own checkout;
# the fallback is the location this notebook was originally run from.
DATASET_FILES_DIR = Path(
    os.environ.get(
        "CORPUS_DATASET_DIR",
        "/home/abdelmalak/Documents/FAIRagro/uc_repo/repo/pilot-uc-textmining-metadata/code/corpus_creation/dataset_files",
    )
)

# Sentence-based corpus, loaded from CSV into train/test splits.
dataset_sentences = load_dataset(
    "csv",
    data_files={
        "train": str(DATASET_FILES_DIR / "train_sentence_corpus.csv"),
        "test": str(DATASET_FILES_DIR / "test_sentence_corpus.csv"),
    },
)

Generating train split: 0 examples [00:00, ? examples/s]

Generating test split: 0 examples [00:00, ? examples/s]

In [4]:
import json
import ast
def parse_lists(example):
    """Turn the stringified list columns of one row into real Python lists.

    ``Labels``, ``Tokens`` and ``ner_tags`` are each checked on their own: a
    value that is still a string is parsed with ``ast.literal_eval``, anything
    else is left untouched. A row in which only some of the columns are
    strings is therefore handled correctly, and running the function again on
    an already-parsed row changes nothing.

    Args:
        example: Mapping holding the columns of a single row.

    Returns:
        The same mapping, with the string-valued list columns replaced by the
        parsed Python objects.

    Raises:
        ValueError, SyntaxError: If a string value is not a valid Python
            literal (propagated from ``ast.literal_eval``).
    """
    for column in ("Labels", "Tokens", "ner_tags"):
        raw = example.get(column)
        # Only parse when the raw value is a string; parsed lists (or absent
        # columns) pass through unchanged.
        if isinstance(raw, str):
            example[column] = ast.literal_eval(raw)
    return example

# Parse the stringified list columns row by row, then enforce the declared
# schema on the parsed result.
dataset_dict = dataset_sentences.map(parse_lists).cast(features)

Map:   0%|          | 0/2722 [00:00<?, ? examples/s]

Map:   0%|          | 0/319 [00:00<?, ? examples/s]

Casting the dataset:   0%|          | 0/2722 [00:00<?, ? examples/s]

Casting the dataset:   0%|          | 0/319 [00:00<?, ? examples/s]

In [5]:
# Inspect the first training example after parsing and casting.
dataset_dict['train'][0]

{'file_name': '98412.txt',
 'Tokens': ['Title',
  ':',
  '\n',
  '2020_Helfrich_AGEE',
  '\n\n',
  'Abstract',
  ':',
  '\n',
  'Datenmeldung',
  'nach',
  'dem',
  'EGovG.'],
 'ner_tags': [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
 'Labels': ['O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O'],
 'number_of_tokens': 12,
 'Language': 'de',
 'source': 'OpenAgrar',
 'Label_counts': 'Counter()',
 'number_of_annotations': 0,
 'doi': None,
 'sentence_id': '98412-0'}

## Push the dataset to the Hugging Face Hub and save it locally

In [8]:
from huggingface_hub import notebook_login

# Log in to the Hugging Face Hub from the notebook (renders a login widget).
notebook_login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

In [11]:
# Upload the sentence-based DatasetDict to the Hub dataset repository under
# the "sentence_split" configuration.
dataset_dict.push_to_hub(
    "IT-ZBMED/Agriculture_NER_Dataset_for_FAIR_Metadata_Enrichment",
    config_name="sentence_split"
)


Uploading the dataset shards:   0%|          | 0/1 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/3 [00:00<?, ?ba/s]

Uploading the dataset shards:   0%|          | 0/1 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/1 [00:00<?, ?ba/s]

README.md: 0.00B [00:00, ?B/s]

CommitInfo(commit_url='https://huggingface.co/datasets/IT-ZBMED/Agriculture_NER_Dataset_for_FAIR_Metadata_Enrichment/commit/b4c402dd08d45c7b66423cfb247f6a06080afbf5', commit_message='Upload dataset', commit_description='', oid='b4c402dd08d45c7b66423cfb247f6a06080afbf5', pr_url=None, repo_url=RepoUrl('https://huggingface.co/datasets/IT-ZBMED/Agriculture_NER_Dataset_for_FAIR_Metadata_Enrichment', endpoint='https://huggingface.co', repo_type='dataset', repo_id='IT-ZBMED/Agriculture_NER_Dataset_for_FAIR_Metadata_Enrichment'), pr_revision=None, pr_num=None)

In [10]:
# Write a local copy of the sentence-based dataset to a relative directory.
dataset_dict.save_to_disk("Agriculture_NER_Dataset_for_FAIR_Metadata_Enrichment/sentence_split")

Saving the dataset (0/1 shards):   0%|          | 0/2722 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/319 [00:00<?, ? examples/s]

## Load and use the different dataset versions using Hugging Face

In [38]:
# Later load by “version name”
from datasets import load_from_disk
# Load a sentence-split dataset that was previously saved to disk.
# NOTE(review): this directory differs from the save_to_disk target used
# earlier in this notebook
# ("Agriculture_NER_Dataset_for_FAIR_Metadata_Enrichment/sentence_split") —
# confirm which copy is intended.
sentence_split = load_from_disk("UC_dataset_IAA/sentence_split")
#document_split = load_from_disk("my_ner_dataset/document_split")

In [39]:
# Print the label sequence of every training row that contains a
# "B-soilDepth" tag.
for sentence_labels in sentence_split["train"]['Labels']:
    if "B-soilDepth" not in sentence_labels:
        continue
    print(sentence_labels)

In [41]:
# Length of the label sequence of training example 26.
print(len(sentence_split["train"][26]["Labels"]))

48


In [1]:
from datasets import load_dataset

# Load the published dataset from the Hugging Face Hub repository.
ds = load_dataset("IT-ZBMED/Agriculture_NER_Dataset_for_FAIR_Metadata_Enrichment")

README.md: 0.00B [00:00, ?B/s]

train-00000-of-00001.parquet:   0%|          | 0.00/325k [00:00<?, ?B/s]

test-00000-of-00001.parquet:   0%|          | 0.00/49.6k [00:00<?, ?B/s]

Generating train split:   0%|          | 0/2722 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/319 [00:00<?, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['file_name', 'Tokens', 'ner_tags', 'Labels', 'number_of_tokens', 'Language', 'source', 'Label_counts', 'number_of_annotations', 'doi', 'sentence_id'],
        num_rows: 2722
    })
    test: Dataset({
        features: ['file_name', 'Tokens', 'ner_tags', 'Labels', 'number_of_tokens', 'Language', 'source', 'Label_counts', 'number_of_annotations', 'doi', 'sentence_id'],
        num_rows: 319
    })
})