### Установка библиотек

In [19]:
!pip install datasets --q

### Импорт библиотек и данных

In [20]:
import os
import re
import json
import pickle
import numpy as np
import pandas as pd
from tqdm import tqdm
from typing import List, Dict

from datasets import Dataset, DatasetDict

In [21]:
# Загрузка файлов из репозитория
!git clone https://github.com/dialogue-evaluation/RuNNE

Cloning into 'RuNNE'...
remote: Enumerating objects: 1620, done.[K
remote: Counting objects: 100% (22/22), done.[K
remote: Compressing objects: 100% (7/7), done.[K
remote: Total 1620 (delta 17), reused 15 (delta 15), pack-reused 1598[K
Receiving objects: 100% (1620/1620), 2.59 MiB | 10.57 MiB/s, done.
Resolving deltas: 100% (40/40), done.


In [22]:
folders = ["train", "test", "dev"]

### Пользовательские функции

In [40]:
# Чтение файлов
def read_file(filepath, readlines=False) -> str:
    with open(filepath, "r") as f:
        if readlines:
            txt = f.readlines()
        else:
            txt = f.read()
    return txt

# Удаление \t, \n и пробелов в начале и конце строки
def delete_symbols(text: str) -> List:
    text = re.sub("\t|\n", " ", text).strip()
    return text.split()

# Получение списка сущностей с соотвествующей позицией в тексте
def sort_labeled_data(annotation_file: List) -> List:
    df_ann = pd.DataFrame([delete_symbols(i) for i in annotation_file 
                           if ";" not in i]) 
    df_ann[2] = df_ann[2].astype("int")
    df_ann[3] = df_ann[3].astype("int")
    grouped = df_ann.groupby([1, 2])[3].min().reset_index()
    return grouped.sort_values(by=2)[[1,2,3]].values

In [41]:
def check_isalnum(text):
        return any(i.isalnum() for i in text)

def keep_only_alnum(text):
    return "".join([i if i.isalnum() else " " for i in text]).strip()

# Удаление пунктуации из размеченных данных
def drop_punct(seq: List, labels: List):
    new_seq = []
    new_labels = []
    for i in range(len(seq)):
        if seq[i].isalnum():
            new_seq.append(seq[i])
            new_labels.append(labels[i])   
    return new_seq, new_labels

def drop_duplicate_tokens(seq, labels):
    new_seq = []
    new_labels = []
    for i in range(len(seq)):
        if (i != 0) & (seq[i-1] == seq[i]):
            continue
        else:
            new_seq.append(seq[i])
            new_labels.append(labels[i])
    return new_seq, new_labels

def prepare_sequences(seqs, labels):
    clear_tokens = [keep_only_alnum(i) if check_isalnum(i) else i for i in seqs]
    d_p_tokens, d_p_labels = drop_punct(clear_tokens, labels)
    return drop_duplicate_tokens(d_p_tokens, d_p_labels)

In [42]:
def split_text_on_labeled_tokens(text: str, labels: List) -> List:
    # Размечаем часть текста по его позиции
    def chunk_text_labeling(text: str, start: int, end: int, is_ner = False):
        chunk_iter = 0
        ner_chunk = text[start: end].split()
        for part_of_chunk in ner_chunk:
            split_text.append(part_of_chunk)
            if is_ner:
                if chunk_iter == 0:
                    ner_label.append("B-"+ner)
                else:
                    ner_label.append("I-"+ner)
                chunk_iter += 1
            else:
                ner_label.append("O") 
                
    init_start = 0
    split_text = []
    ner_label = []

    for ner, start, end in labels:
        if start > init_start:
            chunk_text_labeling(text, init_start, start)        
            chunk_text_labeling(text, start, end, True)
            init_start = end
        else:
            chunk_text_labeling(text, start, end, True)
            init_start = end
            
    return split_text, ner_label

In [43]:
# Конвертация строкового значения метки в соответствующий id
def map_label_to_id(ids_dict: Dict, labels: list) -> List:
    # :param ids_dict: {"age": 0, "event": 1.....}
    # :param labels: List of entities ["age", "event", "O"....]
    return [ids_dict[i] for i in labels]

### Преобразование данных и сохранение их в формате pickle

In [54]:
for folder in folders:
    print(folder)
    base_path = f"RuNNE/data/{folder}"
    temp_folder = os.listdir(base_path)
    
    # Получение списка аннотаций к файлам
    files_with_ann = [i for i in temp_folder if ".ann" in i]

    all_sequences = []
    all_labels = []
    
    for f_ann in tqdm(files_with_ann):
        # Получаем название файла, поменяв в конце расширение
        txt_file = f_ann.replace(".ann", ".txt")
        ann = read_file(base_path +"/"+ f_ann, readlines=True)
        txt = read_file(base_path +"/"+ txt_file)
        
        # Проверяем длину файла, поскольку есть пустые в папке dev
        if len(ann) == 0:
            continue
        labels = sort_labeled_data(ann)
        
        # Разбиваем тексты на токены и размечаем каждый из них
        split_text, ner_label = split_text_on_labeled_tokens(txt, labels)
        seq_split_indexes = [i for i, v in enumerate(split_text) if v == "."]
        
        # Добавление обработанных данных к итоговому списку
        prev = 0
        for i in seq_split_indexes:
            short_text = split_text[prev: i]
            short_label = ner_label[prev: i]
            
            clear_tokens, clear_label = prepare_sequences(short_text, short_label)
            
            all_sequences.append(clear_tokens)
            all_labels.append(clear_label)
            # Не берем во внимание точки в тексте 
            prev = i+1

    # Сохраняем данные в соотвуствующие файлы train, dev, test
    df_folder = pd.DataFrame({"tokens": all_sequences, "label": all_labels})
    with open(f'{folder}_data.pickle', 'wb') as f:
        pickle.dump(df_folder, f)
    print(f"\nFor folder <{folder}> prepared <{df_folder.shape[0]}> sequences")

train


100%|██████████| 461/461 [00:03<00:00, 139.02it/s]



For folder <train> prepared <2508> sequences
test


100%|██████████| 93/93 [00:00<00:00, 160.90it/s]



For folder <test> prepared <512> sequences
dev


100%|██████████| 323/323 [00:00<00:00, 550.47it/s]


For folder <dev> prepared <536> sequences





### Создание DatasetDict

In [55]:
dsd = DatasetDict()
for folder in folders:
    with open(f'{folder}_data.pickle', 'rb') as f:
        data = pickle.load(f)
    dsd[folder] = Dataset.from_pandas(data)

In [46]:
dsd

DatasetDict({
    train: Dataset({
        features: ['text', 'label'],
        num_rows: 2508
    })
    test: Dataset({
        features: ['text', 'label'],
        num_rows: 512
    })
    dev: Dataset({
        features: ['text', 'label'],
        num_rows: 536
    })
})

### Создание словаря строковых сущностей

In [56]:
# Получаем список уникальных сущностей
for_df = []
for folder in folders:
    with open(f"{folder}_data.pickle", "rb") as f:
        for_df.append(pickle.load(f))
lbls = pd.concat(for_df)["label"].values

dd = dict()
ids = 0
for ll in lbls:
    for lbl in ll:
        if lbl not in dd:
            dd[lbl] = ids
            ids += 1

In [57]:
ll = [i for i in dd.keys() if i != "O"]            
ll_sort = (sorted(ll, key=lambda x: x.split("-")[1]))

new_dd = {k: v for v, k in enumerate(["O"] + ll_sort)}
reverse_dd = {v: k for k, v in new_dd.items()}

with open('id_to_label_map.pickle', 'wb') as f:
        pickle.dump(reverse_dd, f)

### Замена строковых меток числовыми

In [58]:
dsd_with_ids = dsd.map(
    lambda x: {"tags": [map_label_to_id(new_dd, i) for i in x["label"]]},
    batched=True, remove_columns = "label")

Map:   0%|          | 0/2508 [00:00<?, ? examples/s]

Map:   0%|          | 0/512 [00:00<?, ? examples/s]

Map:   0%|          | 0/536 [00:00<?, ? examples/s]

### Добавление датасета на HuggingFace

In [50]:
from huggingface_hub import notebook_login

notebook_login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

In [59]:
dsd_with_ids.push_to_hub("graviada/russian-ner-runne")



Pushing dataset shards to the dataset hub:   0%|          | 0/1 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/3 [00:00<?, ?ba/s]

Upload 1 LFS files:   0%|          | 0/1 [00:00<?, ?it/s]

Deleting unused files from dataset repository:   0%|          | 0/1 [00:00<?, ?it/s]



Pushing dataset shards to the dataset hub:   0%|          | 0/1 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/1 [00:00<?, ?ba/s]

Upload 1 LFS files:   0%|          | 0/1 [00:00<?, ?it/s]

Deleting unused files from dataset repository:   0%|          | 0/1 [00:00<?, ?it/s]



Pushing dataset shards to the dataset hub:   0%|          | 0/1 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/1 [00:00<?, ?ba/s]

Upload 1 LFS files:   0%|          | 0/1 [00:00<?, ?it/s]

Deleting unused files from dataset repository:   0%|          | 0/1 [00:00<?, ?it/s]

In [60]:
from datasets.load import load_dataset
data = load_dataset("graviada/russian-ner-runne")
data

Downloading readme:   0%|          | 0.00/515 [00:00<?, ?B/s]

Downloading and preparing dataset None/None to /root/.cache/huggingface/datasets/graviada___parquet/graviada--russian-ner-runne-993ff78e0f84420b/0.0.0/2a3b91fbd88a2c90d1dbbb32b460cf621d31bd5b05b934492fdef7d8d6f236ec...


Downloading data files:   0%|          | 0/3 [00:00<?, ?it/s]

Downloading data:   0%|          | 0.00/499k [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/116k [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/110k [00:00<?, ?B/s]

Extracting data files:   0%|          | 0/3 [00:00<?, ?it/s]

Generating train split:   0%|          | 0/2508 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/512 [00:00<?, ? examples/s]

Generating dev split:   0%|          | 0/536 [00:00<?, ? examples/s]

Dataset parquet downloaded and prepared to /root/.cache/huggingface/datasets/graviada___parquet/graviada--russian-ner-runne-993ff78e0f84420b/0.0.0/2a3b91fbd88a2c90d1dbbb32b460cf621d31bd5b05b934492fdef7d8d6f236ec. Subsequent calls will reuse this data.


  0%|          | 0/3 [00:00<?, ?it/s]

DatasetDict({
    train: Dataset({
        features: ['tokens', 'tags'],
        num_rows: 2508
    })
    test: Dataset({
        features: ['tokens', 'tags'],
        num_rows: 512
    })
    dev: Dataset({
        features: ['tokens', 'tags'],
        num_rows: 536
    })
})

In [61]:
data['test'][7]

{'tokens': ['Его',
  'дебютный',
  'альбом',
  'Кто',
  'ты',
  '2000',
  'разошёлся',
  'тиражом',
  'более',
  'миллиона',
  'экземпляров',
  'и',
  'входит',
  'в',
  'список',
  'самых',
  'продаваемых',
  'альбомов',
  'в',
  'России'],
 'tags': [0, 0, 0, 56, 57, 11, 0, 0, 34, 35, 0, 0, 0, 0, 0, 0, 0, 0, 0, 7]}