In [None]:
import pandas as pd

def calculate_attention_mask(seq_length, input_len):
  """Build an attention mask of `seq_length` ones followed by zeros.

  Args:
    seq_length: number of leading positions set to 1.
    input_len: target length of the mask.

  Returns:
    A list of ints. When `seq_length` is larger than `input_len` no zeros are
    appended, so the result has `seq_length` entries.
  """
  return [1]*seq_length + [0]*(input_len - seq_length)



def _rewind_to_outside_label(label, candidate, floor):
  """Return the closest index at or before `candidate` whose label is 'O'.

  The search never goes below `floor`. If no 'O' label exists in
  [floor, candidate], `candidate` is returned unchanged so the caller still
  advances (the chunk then starts inside an entity).
  """
  for idx in range(candidate, floor - 1, -1):
    if label[idx] == 'O':
      return idx
  return candidate


def prepare_dataset(annotated_bert):
  """Split (input_ids, labels) pairs into model-sized chunks.

  Sequences with at most 510 labels are emitted unchanged as a single row.
  Longer sequences are cut into overlapping windows of at most 512 tokens;
  every window after the first is prefixed with the start token / 'O' label,
  and every window except the last is suffixed with the end token / 'O' label.

  Args:
    annotated_bert: iterable of (input_id, label) pairs, both lists.
      The slicing below uses the same indices for both lists, so they are
      assumed to be aligned position by position -- TODO confirm with the
      producer (label_data).

  Returns:
    A list of rows `[input_ids, labels, attention_mask, is_last_chunk]` where
    `is_last_chunk` is 1 for the final (or only) chunk of a sequence, else 0.
  """
  data = []

  input_len = 512
  overlap_window = 50
  # Distance between consecutive window starts before boundary adjustment.
  stride = input_len - overlap_window - 2

  # presumably BERT's [CLS] / [SEP] vocabulary ids -- confirm against the tokenizer
  start_token_id = [101]
  end_token_id = [102]
  outside_label = ['O']

  for input_id, label in annotated_bert:

    if len(label) <= input_len - 2:
      # Short sequence: passed through as-is.
      # NOTE(review): the mask covers len(label) + 2 positions although no
      # tokens are added here; verify whether `label` already includes the
      # special-token positions.
      data.append([input_id,
                   label,
                   calculate_attention_mask(len(label) + 2, input_len),
                   1
                   ])
      continue

    # First window: the leading 511 positions plus a closing end token.
    data.append([input_id[0:input_len - 1] + end_token_id,
                 label[0:input_len - 1] + outside_label,
                 calculate_attention_mask(input_len, input_len),
                 0
                 ])

    start = input_len - overlap_window - 1

    # Full-size middle windows, each wrapped in start/end tokens.
    while start < len(label) - input_len + 1:

      data.append([start_token_id + input_id[start:start + input_len - 2] + end_token_id,
                   outside_label + label[start:start + input_len - 2] + outside_label,
                   calculate_attention_mask(input_len, input_len),
                   0
                   ])

      # Advance by the stride, then back off to an 'O' position so the next
      # window does not begin inside an entity. The back-off is bounded by
      # start + 1: an unbounded rewind could return to (or pass) the current
      # start and repeat the same window forever, or wrap to negative indices.
      start = _rewind_to_outside_label(label, start + stride, start + 1)

    # Last window: the remaining tail, prefixed with the start token only.
    data.append([start_token_id + input_id[start:len(label)],
                 outside_label + label[start:len(label)],
                 calculate_attention_mask(len(label) + 1 - start, input_len),
                 1
                 ])

  return data




In [None]:
import torch
import ast

class CustomDataset(torch.utils.data.Dataset):
    """Tensor-backed dataset built from a DataFrame of fixed-length rows.

    Expected columns: "input_ids", "attention_mask", "labels" (each a list of
    ints per row, or the string repr of such a list) and "is_last_chunk"
    (castable to int). All rows of a list column must have the same length,
    otherwise `torch.tensor` raises.
    """

    def __init__(self, df):
        """Convert the DataFrame columns to `torch.long` tensors.

        The input frame is left untouched; string-encoded lists are parsed
        into fresh Python lists instead of being written back to `df`.
        """

        def column_as_lists(column):
            # Cells may hold string reprs of lists instead of lists (the
            # original code handled this for "labels"); parse those with
            # literal_eval, pass real lists through.
            return [ast.literal_eval(cell) if isinstance(cell, str) else cell
                    for cell in df[column].tolist()]

        self.input_ids = torch.tensor(column_as_lists("input_ids"), dtype=torch.long)
        self.attention_mask = torch.tensor(column_as_lists("attention_mask"), dtype=torch.long)
        self.labels = torch.tensor(column_as_lists("labels"), dtype=torch.long)
        self.is_last_chunk = torch.tensor(df["is_last_chunk"].astype(int).tolist(), dtype=torch.long)

    def __len__(self):
        """Number of rows in the dataset."""
        return len(self.input_ids)

    def __getitem__(self, idx):
        """Return the tensors for row `idx` as a dict keyed by column name."""
        return {
            "input_ids": self.input_ids[idx],
            "attention_mask": self.attention_mask[idx],
            "labels": self.labels[idx],
            "is_last_chunk": self.is_last_chunk[idx]
        }



In [None]:
import csv


def map_labels_to_ids(labels, corrected_mapping):
  """Translate label strings into their integer ids.

  Tab characters are stripped from each label before it is looked up in
  `corrected_mapping`; an unknown label raises KeyError.

  NOTE(review): this function is defined again in a later cell; the later
  definition is the one in effect after a full run.
  """
  return [corrected_mapping[raw_label.replace("\t", "")] for raw_label in labels]


def get_entity_mapping(file_path):
  """Read a label -> id mapping from a two-row CSV file.

  The first row holds the label names and the second row the matching
  integer ids. Any further rows are ignored. The mapping is printed before
  it is returned.

  NOTE(review): this function is defined again in a later cell; the later
  definition is the one in effect after a full run.

  Args:
    file_path: path of the CSV file.

  Returns:
    dict mapping each label name to its int id.

  Raises:
    StopIteration: the file has fewer than two rows.
    ValueError: a value in the second row is not an integer literal.
  """
  # newline='' is what the csv module documentation requires for file objects,
  # so the reader handles line endings (including those in quoted fields) itself.
  with open(file_path, 'r', newline='') as csvfile:
      reader = csv.reader(csvfile)
      keys = next(reader)
      values = next(reader)

  entity_mapping = dict(zip(keys, values))
  corrected_mapping = {key: int(value) for key, value in entity_mapping.items()}

  print(corrected_mapping)
  return corrected_mapping



def organize_data(data, mapping_path='/content/drive/My Drive/entity_mapping.csv'):
  """Turn chunk rows into a padded DataFrame and a CustomDataset.

  NOTE(review): this function is defined again in a later cell; the later
  definition is the one in effect after a full run.

  Args:
    data: list of rows `[input_ids, labels, attention_mask, is_last_chunk]`.
    mapping_path: CSV file read by `get_entity_mapping`. Defaults to the
      location that was previously hard-coded here.

  Returns:
    A tuple `(dataset, df_copy, corrected_mapping)`: the CustomDataset built
    from the padded frame, a copy of that frame taken before the dataset is
    constructed, and the label -> id mapping.
  """
  # input_ids and labels are padded to this length; attention_mask is passed
  # through unpadded.
  padded_len = 512

  df = pd.DataFrame({
      "input_ids": [entry[0] for entry in data],
      "labels": [entry[1] for entry in data],
      "attention_mask": [entry[2] for entry in data],
      "is_last_chunk": [entry[3] for entry in data],
  })

  corrected_mapping = get_entity_mapping(mapping_path)
  # Label padding uses the id of the 'O' label.
  pad_label_id = corrected_mapping['O']

  df["labels"] = df["labels"].apply(lambda x: map_labels_to_ids(x, corrected_mapping))
  df["input_ids"] = df["input_ids"].apply(lambda x: x + ([0]*(padded_len - len(x))))
  df["labels"] = df["labels"].apply(lambda x: x + ([pad_label_id]*(padded_len - len(x))))
  df_copy = df.copy()

  dataset = CustomDataset(df)
  return dataset, df_copy, corrected_mapping








In [None]:
import csv


def map_labels_to_ids(labels, corrected_mapping):
  """Translate label strings into their integer ids.

  Tab characters are stripped from each label before it is looked up in
  `corrected_mapping`; an unknown label raises KeyError.

  NOTE(review): an earlier cell already defines a function with this name;
  this later definition replaces it at runtime.
  """
  return [corrected_mapping[raw_label.replace("\t", "")] for raw_label in labels]


def get_entity_mapping(file_path):
  """Read a label -> id mapping from a two-row CSV file.

  The first row holds the label names and the second row the matching
  integer ids. Any further rows are ignored. The mapping is printed before
  it is returned.

  NOTE(review): an earlier cell already defines a function with this name;
  this later definition replaces it at runtime.

  Args:
    file_path: path of the CSV file.

  Returns:
    dict mapping each label name to its int id.

  Raises:
    StopIteration: the file has fewer than two rows.
    ValueError: a value in the second row is not an integer literal.
  """
  # newline='' is what the csv module documentation requires for file objects,
  # so the reader handles line endings (including those in quoted fields) itself.
  with open(file_path, 'r', newline='') as csvfile:
      reader = csv.reader(csvfile)
      keys = next(reader)
      values = next(reader)

  entity_mapping = dict(zip(keys, values))
  corrected_mapping = {key: int(value) for key, value in entity_mapping.items()}

  print(corrected_mapping)
  return corrected_mapping



def organize_data(data, mapping_path='/content/drive/My Drive/entity_mapping.csv'):
  """Turn chunk rows into a padded DataFrame and a CustomDataset.

  NOTE(review): an earlier cell already defines a function with this name;
  this later definition replaces it at runtime.

  Args:
    data: list of rows `[input_ids, labels, attention_mask, is_last_chunk]`.
    mapping_path: CSV file read by `get_entity_mapping`. Defaults to the
      location that was previously hard-coded here.

  Returns:
    A tuple `(dataset, df_copy, corrected_mapping)`: the CustomDataset built
    from the padded frame, a copy of that frame taken before the dataset is
    constructed, and the label -> id mapping.
  """
  # input_ids and labels are padded to this length; attention_mask is passed
  # through unpadded.
  padded_len = 512

  df = pd.DataFrame({
      "input_ids": [entry[0] for entry in data],
      "labels": [entry[1] for entry in data],
      "attention_mask": [entry[2] for entry in data],
      "is_last_chunk": [entry[3] for entry in data],
  })

  corrected_mapping = get_entity_mapping(mapping_path)
  # Label padding uses the id of the 'O' label.
  pad_label_id = corrected_mapping['O']

  df["labels"] = df["labels"].apply(lambda x: map_labels_to_ids(x, corrected_mapping))
  df["input_ids"] = df["input_ids"].apply(lambda x: x + ([0]*(padded_len - len(x))))
  df["labels"] = df["labels"].apply(lambda x: x + ([pad_label_id]*(padded_len - len(x))))
  df_copy = df.copy()

  dataset = CustomDataset(df)
  return dataset, df_copy, corrected_mapping








In [None]:
import re

def data_preparation_pipeline(file_path, file_path_2):
  """Run the full preparation chain from an annotation JSON file to a dataset.

  Steps, in order: read_data_from_json -> adjust_entity_boundaries (using the
  notebook-level `patterns`) -> clean_data -> align_data -> label_data ->
  prepare_dataset -> organize_data. The number of annotations read is printed.

  Args:
    file_path: JSON file passed to `read_data_from_json`.
    file_path_2: currently unused; it was only consumed by the disabled
      `load_test_data_from_json(file_path, file_path_2)` variant.

  Returns:
    The `(dataset, df, corrected_mapping)` tuple produced by `organize_data`.
  """
  # The class list returned by the reader is not needed downstream.
  _classes, annotations = read_data_from_json(file_path)
  print(len(annotations))

  # The tokenizer returned by label_data is not used here.
  annotated_bert, _tokenizer = label_data(
      align_data(
          clean_data(
              adjust_entity_boundaries(patterns, annotations))))

  dataset, df, corrected_mapping = organize_data(prepare_dataset(annotated_bert))
  return dataset, df, corrected_mapping


# Input locations (absolute paths; presumably a Colab Google Drive mount -- confirm).
# Only test_data_file_path_3 and test_data_file_path_2 are passed to the
# pipeline call below; json_file_path and test_data_file_path are not
# referenced in this cell.
json_file_path = '/content/drive/My Drive/20k_batch1 1.json'
test_data_file_path = '/content/drive/My Drive/WD_(4129)/'
test_data_file_path_2 = '/content/drive/My Drive/Completed annotations - 2.27.2023/'
test_data_file_path_3 = '/content/drive/My Drive/7744_ner_v2_final.json'
# Build the dataset, its backing DataFrame and the label -> id mapping.
dataset, df, corrected_mapping = data_preparation_pipeline(test_data_file_path_3, test_data_file_path_2)


