# Create a HuggingFace Dataset from DocRED

In [None]:
from datasets import Dataset, DatasetDict, ClassLabel, Features, Value, Sequence
import pandas as pd
import json

In [2]:
# JSON loading function
def load_json_data(file_path):
    """Read the JSON document stored at *file_path* and return the parsed object.

    The file is decoded as UTF-8 rather than with the platform's locale
    encoding, so files containing non-ASCII tokens load identically on
    every operating system.

    Args:
        file_path: Path of the JSON file to read.

    Returns:
        The Python object produced by ``json.load`` (dict, list, ...).

    Raises:
        OSError: If the file cannot be opened.
        json.JSONDecodeError: If the content is not valid JSON.
    """
    with open(file_path, 'r', encoding='utf-8') as file:
        return json.load(file)

In [3]:
# File names of the three DocRED splits (relative to the working directory)
path_dev, path_train, path_test = 'dev.json', 'train_annotated.json', 'test.json'

# Parse each split into Python objects, in the order dev, train, test
dev_data = load_json_data(path_dev)
train_data = load_json_data(path_train)
test_data = load_json_data(path_test)

In [4]:
def extract(data):
    """Convert DocRED documents into sentence-level NER examples.

    Every sentence of every document becomes one example whose tokens are
    labelled with BIO tags derived from the document's entity mentions.

    Args:
        data: Iterable of documents. Each document is a mapping with the keys
            'sents' (a list of token lists) and 'vertexSet' (a list of
            entities, each entity being a list of mention dicts with the keys
            'sent_id', 'pos' and 'type').

    Returns:
        A dict with three parallel lists: 'id' (a running integer that starts
        at 0 on every call), 'tokens' (the sentence's token list) and
        'ner_tags' (one string tag per token).
    """
    processed_data = {'id': [], 'tokens': [], 'ner_tags': []}
    id_counter = 0
    for item in data:
        sents = item['sents']
        # Start with every token of every sentence marked as 'O' (outside).
        doc_tags = [['O'] * len(sent) for sent in sents]

        # Visit each mention exactly once and write its tags straight into the
        # sentence it belongs to, instead of re-scanning all mentions for
        # every sentence.
        for entity in item['vertexSet']:
            for mention in entity:
                sent_id = mention['sent_id']
                # Mentions that point at a non-existent sentence are ignored.
                if not 0 <= sent_id < len(sents):
                    continue
                tags = doc_tags[sent_id]
                # 'pos' is used as a half-open [start, end) token span.
                start, end = mention['pos']
                tags[start] = 'B-' + mention['type']  # first token of the entity
                for i in range(start + 1, end):
                    tags[i] = 'I-' + mention['type']  # remaining entity tokens

        # Append one example per sentence.
        for tokens, ner_tags in zip(sents, doc_tags):
            processed_data['id'].append(id_counter)
            processed_data['tokens'].append(tokens)
            processed_data['ner_tags'].append(ner_tags)
            id_counter += 1

    return processed_data

In [5]:
# Turn each loaded split into sentence-level id/tokens/ner_tags columns
dev_extract, train_extract, test_extract = map(
    extract, (dev_data, train_data, test_data)
)

In [6]:
# One DataFrame per split (columns: id, tokens, ner_tags)
dev_df, train_df, test_df = (
    pd.DataFrame.from_dict(columns)
    for columns in (dev_extract, train_extract, test_extract)
)

# Stack the splits in the order dev, train, test with a fresh row index,
# and store the ids as strings
data_df = pd.concat(
    [dev_df, train_df, test_df],
    ignore_index=True,
).astype({'id': str})

In [7]:
# Tag scheme: 'O' = outside any entity, 'B-*' = first token of an entity,
# 'I-*' = following token of an entity.
# Entity types: LOC = location, ORG = organization, PER = person,
# NUM = number, TIME = time, MISC = miscellaneous.
unique_tags = [
    'O',
    'B-LOC', 'I-LOC',
    'B-ORG', 'I-ORG',
    'B-PER', 'I-PER',
    'B-NUM', 'I-NUM',
    'B-TIME', 'I-TIME',
    'B-MISC', 'I-MISC',
]

# Lookup tables between a tag string and its position in unique_tags
tag_to_id = {label: position for position, label in enumerate(unique_tags)}
id_to_tag = dict(enumerate(unique_tags))

In [8]:
# Class-label feature whose integer ids follow the order of unique_tags
tag_feature = ClassLabel(names=list(unique_tags))

# Dataset schema: one string id, a list of string tokens and one
# class label per token
features = Features(
    {
        'id': Value('string'),  # or 'int64' if the ids are kept numeric
        'tokens': Sequence(Value('string')),
        'ner_tags': Sequence(tag_feature),
    }
)

In [9]:
# data_df holds the rows in the order dev, train, test, so every split has to
# be cut out at its own offset. Slicing each split from row 0 would hand the
# dev rows to all three datasets and leak them into train and test.
n_dev = len(dev_df.index)
n_train = len(train_df.index)
n_test = len(test_df.index)

dev_rows = data_df.iloc[:n_dev]
train_rows = data_df.iloc[n_dev:n_dev + n_train]
test_rows = data_df.iloc[n_dev + n_train:n_dev + n_train + n_test]

# Dataset.from_dict expects a mapping of column name -> list of values, so
# each slice is converted to a plain dict of lists first. The string tags are
# encoded to class ids through the ClassLabel in `features`.
train_dataset = Dataset.from_dict(train_rows.to_dict(orient='list'), features=features)
test_dataset = Dataset.from_dict(test_rows.to_dict(orient='list'), features=features)
dev_dataset = Dataset.from_dict(dev_rows.to_dict(orient='list'), features=features)

dataset_dict = DatasetDict({
    'train': train_dataset,
    'test': test_dataset,
    'validation': dev_dataset  # 'validation' is the usual Hugging Face name for the dev set
})

In [14]:
# Display the schema of the 'train' split to check the feature definitions
dataset_dict['train'].features

{'id': Value(dtype='string', id=None),
 'tokens': Sequence(feature=Value(dtype='string', id=None), length=-1, id=None),
 'ner_tags': Sequence(feature=ClassLabel(names=['O', 'B-LOC', 'I-LOC', 'B-ORG', 'I-ORG', 'B-PER', 'I-PER', 'B-NUM', 'I-NUM', 'B-TIME', 'I-TIME', 'B-MISC', 'I-MISC'], id=None), length=-1, id=None)}

# Push the created HF-dataset to the HF-Hub

In [11]:
# First log in to your Hugging Face account
# !huggingface-cli login

In [None]:
# Upload all splits to the Hub as a private dataset repository
dataset_dict.push_to_hub(
    repo_id="dennishauser/docred_ner",
    private=True,
)