# Create partitions for pretraining datasets and incremental datasets

## Imports and Setup

In [7]:
import sys
sys.path.append('..')

# imports
import os
import numpy as np
import json 
from tqdm import tqdm
import pandas as pd
from collections import defaultdict
from copy import deepcopy
import shutil
import random

SEED = [0, 1, 2]
# NOTE: to obtain the X-shot from the sota, set MIN_FREQ=2*X and TRAIN_RATIO=0.5
MIN_FREQ = 40 # 10 20 40
TRAIN_RATIO = 0.5
# set main directories
DATA = 'ontonotes_shimaoka' # bbn
SCENARIO = 'complete' # ['complete', 'single_child']
TRAIN_DATA = 'train.json'
DEV_DATA = 'dev.json'
TEST_DATA = f"test{'-12k' if DATA == 'bbn' else ''}.json"
SRC_DATA_DIR = f'/home/remote_hdd/datasets_for_incremental_training/{DATA}/{SCENARIO}'
# DST_DATA_DIR = os.path.expanduser(f'./{DATA}/complete')
DST_DATA_DIR = f'/home/remote_hdd/datasets_for_incremental_training/{DATA}/{SCENARIO}_subset_{MIN_FREQ}'
DST_DATA_DIR = os.path.join(DST_DATA_DIR, 'instance_{}')
ONTOLOGY_PATH = f'/home/remote_hdd/datasets_for_incremental_training/{DATA}/all_types.txt'


## Load ontology

In [8]:
# # load ontology
# type2id = utils.load_ontology(ONTOLOGY_PATH)
# types = list(type2id.keys())
# # create hierarchy tree
# tree = utils.create_tree(ONTOLOGY_PATH)
# tree.show()

## Create subset partitions

Create train/dev partitions such that:
- types with a training frequency < MIN_FREQ are excluded from the datasets
- each incremental type has MIN_FREQ instances (example with 100: 80 train, 20 dev)

In [9]:
# iterate over incremental training dirs
for dir in tqdm(os.listdir(SRC_DATA_DIR)):
  dirpath_src = os.path.join(SRC_DATA_DIR, dir)
  if os.path.isdir(dirpath_src):
    # iterate over incremental training single partitions
    for f in os.listdir(os.path.join(SRC_DATA_DIR, dir)):
      filepath_src = os.path.join(dirpath_src, f)
      print('Processing ', filepath_src, '...')
      with open(filepath_src, 'r') as src:
        lines = src.readlines()
        # check frequency
        if len(lines) >= MIN_FREQ:
          print('Type kept')
          # create an instance of the incremental dataset for each seed
          for seed in SEED:
            print('Creating instance of the dataset for seed', seed)
            # prepare dir
            dirpath_dst = os.path.join(DST_DATA_DIR.format(seed), dir)
            if not os.path.exists(dirpath_dst):
              os.makedirs(dirpath_dst, exist_ok=True)
            
            # save subset partitions
            filepath_train_dst = os.path.join(dirpath_dst, f)
            filepath_dev_dst = os.path.join(dirpath_dst, f.replace('_train_', '_dev_'))
            with open(filepath_train_dst, 'w') as dst_train, open(filepath_dev_dst, 'w') as dst_dev:
              # IMPORTANT: reset seed
              random.seed(seed)
              random.shuffle(lines)
              idx_split = int(MIN_FREQ * TRAIN_RATIO)
              train_lines = lines[:idx_split]
              dev_lines = lines[-idx_split:]
              # save
              dst_train.writelines(train_lines)
              dst_dev.writelines(dev_lines)
        else:
          print('Type discarded')
        print()
  else:
    # copy other files
    # shutil.copyfile(dirpath_src, os.path.join(DST_DATA_DIR, dir))
    pass



  0%|          | 0/15 [00:00<?, ?it/s]

Processing  /home/remote_hdd/datasets_for_incremental_training/ontonotes_shimaoka/complete/sons_of_location_transit/incremental_train_road.json ...
Type kept
Creating instance of the dataset for seed 0


  7%|▋         | 1/15 [00:00<00:07,  1.83it/s]

Creating instance of the dataset for seed 1
Creating instance of the dataset for seed 2

Processing  /home/remote_hdd/datasets_for_incremental_training/ontonotes_shimaoka/complete/sons_of_person_artist/incremental_train_music.json ...
Type kept
Creating instance of the dataset for seed 0
Creating instance of the dataset for seed 1
Creating instance of the dataset for seed 2

Processing  /home/remote_hdd/datasets_for_incremental_training/ontonotes_shimaoka/complete/sons_of_person_artist/incremental_train_director.json ...
Type kept
Creating instance of the dataset for seed 0
Creating instance of the dataset for seed 1
Creating instance of the dataset for seed 2

Processing  /home/remote_hdd/datasets_for_incremental_training/ontonotes_shimaoka/complete/sons_of_person_artist/incremental_train_actor.json ...
Type kept
Creating instance of the dataset for seed 0
Creating instance of the dataset for seed 1
Creating instance of the dataset for seed 2

Processing  /home/remote_hdd/datasets_for

 27%|██▋       | 4/15 [00:00<00:01,  5.97it/s]

Creating instance of the dataset for seed 2

Processing  /home/remote_hdd/datasets_for_incremental_training/ontonotes_shimaoka/complete/sons_of_other_product/incremental_train_weapon.json ...
Type kept
Creating instance of the dataset for seed 0
Creating instance of the dataset for seed 1
Creating instance of the dataset for seed 2

Processing  /home/remote_hdd/datasets_for_incremental_training/ontonotes_shimaoka/complete/sons_of_other_product/incremental_train_car.json ...
Type kept
Creating instance of the dataset for seed 0
Creating instance of the dataset for seed 1
Creating instance of the dataset for seed 2

Processing  /home/remote_hdd/datasets_for_incremental_training/ontonotes_shimaoka/complete/sons_of_other_product/incremental_train_computer.json ...
Type kept
Creating instance of the dataset for seed 0
Creating instance of the dataset for seed 1
Creating instance of the dataset for seed 2

Processing  /home/remote_hdd/datasets_for_incremental_training/ontonotes_shimaoka/comp

 60%|██████    | 9/15 [00:01<00:00, 13.13it/s]

Creating instance of the dataset for seed 2

Processing  /home/remote_hdd/datasets_for_incremental_training/ontonotes_shimaoka/complete/sons_of_other_language/incremental_train_programming_language.json ...
Type kept
Creating instance of the dataset for seed 0
Creating instance of the dataset for seed 1
Creating instance of the dataset for seed 2

Processing  /home/remote_hdd/datasets_for_incremental_training/ontonotes_shimaoka/complete/sons_of_other_health/incremental_train_treatment.json ...
Type kept
Creating instance of the dataset for seed 0
Creating instance of the dataset for seed 1
Creating instance of the dataset for seed 2

Processing  /home/remote_hdd/datasets_for_incremental_training/ontonotes_shimaoka/complete/sons_of_other_health/incremental_train_malady.json ...
Type kept
Creating instance of the dataset for seed 0
Creating instance of the dataset for seed 1
Creating instance of the dataset for seed 2

Processing  /home/remote_hdd/datasets_for_incremental_training/ontono

 73%|███████▎  | 11/15 [00:01<00:00,  9.23it/s]

Creating instance of the dataset for seed 1
Creating instance of the dataset for seed 2

Processing  /home/remote_hdd/datasets_for_incremental_training/ontonotes_shimaoka/complete/sons_of_other_event/incremental_train_sports_event.json ...
Type discarded

Processing  /home/remote_hdd/datasets_for_incremental_training/ontonotes_shimaoka/complete/sons_of_other_event/incremental_train_accident.json ...
Type discarded

Processing  /home/remote_hdd/datasets_for_incremental_training/ontonotes_shimaoka/complete/sons_of_other_event/incremental_train_natural_disaster.json ...
Type kept
Creating instance of the dataset for seed 0
Creating instance of the dataset for seed 1
Creating instance of the dataset for seed 2

Processing  /home/remote_hdd/datasets_for_incremental_training/ontonotes_shimaoka/complete/sons_of_other_event/incremental_train_protest.json ...
Type kept
Creating instance of the dataset for seed 0
Creating instance of the dataset for seed 1
Creating instance of the dataset for se

100%|██████████| 15/15 [00:01<00:00,  8.81it/s]

Creating instance of the dataset for seed 2

Processing  /home/remote_hdd/datasets_for_incremental_training/ontonotes_shimaoka/complete/sons_of_other_art/incremental_train_writing.json ...
Type kept
Creating instance of the dataset for seed 0
Creating instance of the dataset for seed 1
Creating instance of the dataset for seed 2

Processing  /home/remote_hdd/datasets_for_incremental_training/ontonotes_shimaoka/complete/sons_of_location_geography/incremental_train_mountain.json ...
Type kept
Creating instance of the dataset for seed 0
Creating instance of the dataset for seed 1
Creating instance of the dataset for seed 2

Processing  /home/remote_hdd/datasets_for_incremental_training/ontonotes_shimaoka/complete/sons_of_location_geography/incremental_train_body_of_water.json ...
Type kept
Creating instance of the dataset for seed 0
Creating instance of the dataset for seed 1
Creating instance of the dataset for seed 2

Processing  /home/remote_hdd/datasets_for_incremental_training/ontono


