In [1]:
import transformers as t
import datasets
from datasets import load_dataset

import torch
import torch_geometric
from torch_geometric.data import Data

import numpy as np
import pandas as pd

import spacy

from sklearn.preprocessing import LabelEncoder

  from .autonotebook import tqdm as notebook_tqdm


In [50]:
# Hugging Face Hub repository id of the dataset used throughout this notebook.
DATASET_REPO = "FranklinWillemen/mapa_plus"
dataset = load_dataset(DATASET_REPO)

Downloading readme: 100%|██████████| 570/570 [00:00<00:00, 282kB/s]


Downloading and preparing dataset None/None to C:/Users/Frank/.cache/huggingface/datasets/FranklinWillemen___parquet/FranklinWillemen--mapa_plus-4bbc20eeb61798b9/0.0.0/2a3b91fbd88a2c90d1dbbb32b460cf621d31bd5b05b934492fdef7d8d6f236ec...


Downloading data: 100%|██████████| 326k/326k [00:00<00:00, 894kB/s]
Downloading data: 100%|██████████| 700k/700k [00:00<00:00, 1.53MB/s]]
Downloading data: 100%|██████████| 1.98M/1.98M [00:00<00:00, 2.59MB/s]
Downloading data files: 100%|██████████| 3/3 [00:05<00:00,  1.79s/it]
Extracting data files: 100%|██████████| 3/3 [00:00<00:00, 223.16it/s]
                                                                            

Dataset parquet downloaded and prepared to C:/Users/Frank/.cache/huggingface/datasets/FranklinWillemen___parquet/FranklinWillemen--mapa_plus-4bbc20eeb61798b9/0.0.0/2a3b91fbd88a2c90d1dbbb32b460cf621d31bd5b05b934492fdef7d8d6f236ec. Subsequent calls will reuse this data.


100%|██████████| 3/3 [00:00<00:00, 111.45it/s]


In [51]:
# Display the loaded DatasetDict: its splits, their feature columns and row counts.
dataset

DatasetDict({
    validation: Dataset({
        features: ['language', 'tokens', 'coarse_grained', 'fine_grained', 'pos', 'dep', 'heads'],
        num_rows: 1040
    })
    test: Dataset({
        features: ['language', 'tokens', 'coarse_grained', 'fine_grained', 'pos', 'dep', 'heads'],
        num_rows: 3081
    })
    train: Dataset({
        features: ['language', 'tokens', 'coarse_grained', 'fine_grained', 'pos', 'dep', 'heads'],
        num_rows: 8341
    })
})

In [53]:
# Materialise every split as a pandas DataFrame (one row per dataset example).
df_train, df_val, df_test = (
    pd.DataFrame(dataset[split_name])
    for split_name in ("train", "validation", "test")
)

In [54]:
def integer_encode_list(series):
    """Integer-encode a Series whose elements are lists of labels.

    A new ``LabelEncoder`` is fitted on every label found anywhere in
    ``series``; each list is then transformed on its own.

    Note: the encoder is created and fitted inside this call, so the codes
    produced by two separate calls are not guaranteed to agree with each other.

    Args:
        series: pandas Series in which every element is an iterable of labels.

    Returns:
        The result of ``series.apply`` where each element has been replaced by
        the encoder's ``transform`` output for that list.
    """
    encoder = LabelEncoder()

    # The encoder needs to see the full vocabulary, so flatten all lists first.
    encoder.fit([label for labels in series for label in labels])

    # Encode list by list so the per-row structure is preserved.
    return series.apply(encoder.transform)

In [55]:
def create_graph_instance(tokens, pos_encoded, heads, labels_encoded):
    """Build a PyTorch Geometric ``Data`` graph for one tokenised example.

    Every token becomes a node whose single feature is its integer POS code,
    and every token contributes one directed edge ``head -> token``.

    Args:
        tokens: Tokens of the example. Not needed to build the graph; the
            parameter is kept so existing callers keep working.
        pos_encoded: One integer POS code per token.
        heads: One head index per token, given as ints or int-like strings.
            NOTE(review): assumed to be 0-based node positions within the same
            example; the values are not validated here - confirm against the
            dataset.
        labels_encoded: One integer label per token (the ground truth).

    Returns:
        ``Data`` with ``x`` of shape ``[num_tokens, 1]``, ``edge_index`` of
        shape ``[2, len(heads)]`` and ``y`` of shape ``[len(labels_encoded)]``.

    Raises:
        ValueError: If an entry of ``heads`` cannot be converted with ``int``.
    """
    # Node features: a single column holding the POS code of each token.
    x = torch.tensor(pos_encoded, dtype=torch.long).view(-1, 1)

    # One (head, token) pair per token; heads may arrive as strings.
    edges = [[int(head_idx), idx] for idx, head_idx in enumerate(heads)]

    # reshape(-1, 2) is a no-op for non-empty input, but for an empty example
    # it turns the shape-[0] tensor into [0, 2], so the transpose yields the
    # [2, 0] layout PyTorch Geometric expects instead of a 1-D tensor.
    edge_index = (
        torch.tensor(edges, dtype=torch.long).reshape(-1, 2).t().contiguous()
    )

    # Per-token labels used as the node-level ground truth.
    y = torch.tensor(labels_encoded, dtype=torch.long)

    return Data(x=x, edge_index=edge_index, y=y)

In [56]:
# Encode each column ONCE over all three splits. Encoding every split with its
# own freshly fitted encoder can give the same tag different integers in
# train / validation / test whenever a split lacks one of the classes, which
# would make features and labels incomparable across splits.
SPLIT_KEYS = ['train', 'val', 'test']

pos_encoded_all = integer_encode_list(
    pd.concat([df_train['pos'], df_val['pos'], df_test['pos']], keys=SPLIT_KEYS)
)
labels_encoded_all = integer_encode_list(
    pd.concat(
        [df_train['coarse_grained'], df_val['coarse_grained'], df_test['coarse_grained']],
        keys=SPLIT_KEYS,
    )
)

# Slice the shared encoding back into per-split Series; selecting on the outer
# key restores each split's original row index and order.
pos_encoded_train = pos_encoded_all.loc['train']
labels_encoded_train = labels_encoded_all.loc['train']

pos_encoded_val = pos_encoded_all.loc['val']
labels_encoded_val = labels_encoded_all.loc['val']

pos_encoded_test = pos_encoded_all.loc['test']
labels_encoded_test = labels_encoded_all.loc['test']

In [64]:
def _graphs_for_split(frame, pos_codes, label_codes):
    """Build one graph per row of ``frame`` from its tokens, heads and the given codes."""
    rows = zip(frame['tokens'], pos_codes, frame['heads'], label_codes)
    # Each row is (tokens, pos, heads, labels), matching create_graph_instance's parameters.
    return [create_graph_instance(*row) for row in rows]


graphs_train = _graphs_for_split(df_train, pos_encoded_train, labels_encoded_train)

graphs_val = _graphs_for_split(df_val, pos_encoded_val, labels_encoded_val)

graphs_test = _graphs_for_split(df_test, pos_encoded_test, labels_encoded_test)