# Create partitions for pretraining datasets and incremental datasets

## Imports and Setup

In [20]:
import sys
sys.path.append('../../../entity_typing_analysis/')
import utils

# imports
import os
import numpy as np
import json 
from tqdm import tqdm
import pandas as pd
from collections import defaultdict
from copy import deepcopy
import shutil
import random
from collections import defaultdict

# --- configuration -----------------------------------------------------------
# Dataset to partition and the default file names of its splits.
DATA = 'few_NERD'
TRAIN_DATA, DEV_DATA = 'train.json', 'dev.json'
TEST_DATA = 'test-12k.json' if DATA == 'bbn' else 'test.json'

# Where the original dataset lives and where the partitions are written.
SRC_DATA_DIR = f"/home/remote_hdd/datasets/{DATA}"
DST_DATA_DIR = f'/home/remote_hdd/datasets_for_incremental_training/{DATA}/'
ONTOLOGY_PATH = os.path.join(SRC_DATA_DIR, "all_types.txt")

# few_NERD overrides: the ontology file stays in the dataset root (so it is
# resolved BEFORE SRC_DATA_DIR is redirected), while the splits are read from
# the 'supervised_formatted' sub-directory under their '_with_fathers' names.
if DATA == 'few_NERD':
  ONTOLOGY_PATH = os.path.join(SRC_DATA_DIR, 'all_types_with_fathers.txt')
  SRC_DATA_DIR = f'{SRC_DATA_DIR}/supervised_formatted'
  TRAIN_DATA, DEV_DATA = 'train_with_fathers.json', 'dev_with_fathers.json'

# Minimum number of test examples to collect for every label.
MIN_TEST_FREQ = 30

# Seed for the global `random` generator (also reused for the shuffle below).
SEED = 1
random.seed(SEED)

## Load ontology

In [21]:
# Read the ontology file into a {type name -> integer id} mapping and keep the
# type names, in mapping order, as a plain list.
type2id = utils.load_ontology(ONTOLOGY_PATH)
types = list(type2id)

# last expression of the cell: display the mapping
type2id

{'/art': 0,
 '/art/broadcastprogram': 1,
 '/art/film': 2,
 '/art/music': 3,
 '/art/other': 4,
 '/art/painting': 5,
 '/art/writtenart': 6,
 '/building': 7,
 '/building/airport': 8,
 '/building/hospital': 9,
 '/building/hotel': 10,
 '/building/library': 11,
 '/building/other': 12,
 '/building/restaurant': 13,
 '/building/sportsfacility': 14,
 '/building/theater': 15,
 '/event': 16,
 '/event/attack_battle_war_militaryconflict': 17,
 '/event/disaster': 18,
 '/event/election': 19,
 '/event/other': 20,
 '/event/protest': 21,
 '/event/sportsevent': 22,
 '/location': 23,
 '/location/GPE': 24,
 '/location/bodiesofwater': 25,
 '/location/island': 26,
 '/location/mountain': 27,
 '/location/other': 28,
 '/location/park': 29,
 '/location/road_railway_highway_transit': 30,
 '/organization': 31,
 '/organization/company': 32,
 '/organization/education': 33,
 '/organization/government_governmentagency': 34,
 '/organization/media_newspaper': 35,
 '/organization/other': 36,
 '/organization/politicalparty

## Create partitions

In [22]:
# Split the source training file into a held-out test set and a reduced
# training set.
#
# The source file is read as JSON lines (one example per line). Examples are
# visited in a seeded random order; an example is moved to the test set as soon
# as at least one of its labels (the `y_str` field) still has fewer than
# MIN_TEST_FREQ test examples. Every other example is written to the training
# set. Because ALL labels of a selected example are counted, a label can end up
# with more than MIN_TEST_FREQ test examples.

# prepare paths (the destination files drop the '_with_fathers' / '-12k' suffixes)
src_train_path = os.path.join(SRC_DATA_DIR, TRAIN_DATA)
dst_train_path = os.path.join(DST_DATA_DIR, TRAIN_DATA.replace('_with_fathers',''))
dst_test_path = os.path.join(DST_DATA_DIR, TEST_DATA.replace('-12k',''))

os.makedirs(DST_DATA_DIR, exist_ok=True)

# number of examples placed in the test set so far, per label
freq_test = defaultdict(int)

# Read the whole source first: the shuffle needs every line in memory, and a
# missing/unreadable source must fail before any previous output is truncated.
# The encoding is explicit so decoding does not depend on the platform locale.
with open(src_train_path, 'r', encoding='utf-8') as src_train:
    lines = src_train.readlines()

# dedicated generator: the order depends only on SEED, not on global RNG state
random.Random(SEED).shuffle(lines)

# create train and test ('w' truncates, so outputs of a previous run are replaced)
with open(dst_train_path, 'w', encoding='utf-8') as dst_train, \
     open(dst_test_path, 'w', encoding='utf-8') as dst_test:
    for line in tqdm(lines):
        # read example
        example = json.loads(line)
        labels = example['y_str']
        # the example is needed by the test set if any of its labels is still
        # under quota; any() stops at the first such label
        is_test = any(freq_test[label] < MIN_TEST_FREQ for label in labels)
        if is_test:
            # update the counters of every label carried by the example
            for label in labels:
                freq_test[label] += 1
        # append the re-serialized example to the partition it belongs to
        dst_file = dst_test if is_test else dst_train
        dst_file.write(f'{json.dumps(example)}\n')

# copy ontology (last expression of the cell: shows the destination path)
shutil.copy(ONTOLOGY_PATH, os.path.join(DST_DATA_DIR, 'all_types.txt'))

100%|██████████| 340387/340387 [00:10<00:00, 32893.17it/s]


'/home/remote_hdd/datasets_for_incremental_training/few_NERD/all_types.txt'

In [23]:
# Report how many test examples were collected for each label,
# in the order the labels were first counted.
for label_name, n_examples in freq_test.items():
  print(f'{label_name} : {n_examples}')

/location : 210
/location/road_railway_highway_transit : 30
/location/GPE : 30
/product : 270
/product/ship : 30
/building : 240
/building/other : 30
/art : 180
/art/film : 30
/person : 240
/person/politician : 30
/other : 360
/other/language : 30
/product/train : 30
/person/other : 30
/location/island : 30
/other/disease : 30
/person/athlete : 30
/art/music : 30
/other/award : 30
/product/software : 30
/other/biologything : 30
/product/airplane : 30
/organization : 300
/organization/sportsleague : 30
/person/artist_author : 30
/organization/other : 30
/other/chemicalthing : 30
/event : 180
/event/sportsevent : 30
/other/god : 30
/other/astronomything : 30
/organization/media_newspaper : 30
/location/bodiesofwater : 30
/art/writtenart : 30
/organization/education : 30
/building/sportsfacility : 30
/location/other : 30
/organization/sportsteam : 30
/product/other : 30
/person/actor : 30
/location/mountain : 30
/art/other : 30
/organization/government_governmentagency : 30
/other/livingt