# Create partitions for pretraining datasets and incremental datasets

## Imports and Setup

In [1]:
import sys
sys.path.append('..')

# imports
import os
import numpy as np
import json 
from tqdm import tqdm
import pandas as pd
from collections import defaultdict
from copy import deepcopy
import shutil
import random

# --- sampling parameters -----------------------------------------------------
# One independent instance of the subset datasets is generated per seed.
SEED = [0, 1, 2]
# NOTE: to reproduce the X-shot setting from the state of the art, use
#       MIN_FREQ = 2 * X together with TRAIN_RATIO = 0.5.
MIN_FREQ = 40      # number of instances sampled per type (train + dev)
TRAIN_RATIO = 0.5  # fraction of the sampled instances that goes to train

# --- dataset selection -------------------------------------------------------
DATA = 'figer'
SCENARIO = 'complete' # ['complete', 'single_child']

# --- partition file names ----------------------------------------------------
TRAIN_DATA = 'train.json'
DEV_DATA = 'dev.json'
# bbn ships a reduced 12k test file; every other dataset uses plain test.json
TEST_DATA = 'test-12k.json' if DATA == 'bbn' else 'test.json'

# --- directories -------------------------------------------------------------
SRC_DATA_DIR = f'/home/remote_hdd/datasets_for_incremental_training/{DATA}/{SCENARIO}'
# Destination is a sibling of the source dir, suffixed with the subset size.
# The trailing 'instance_{}' is a placeholder filled in with the seed through
# str.format().
# (local alternative root: os.path.expanduser(f'./{DATA}/complete'))
DST_DATA_DIR = os.path.join(f'{SRC_DATA_DIR}_subset_{MIN_FREQ}', 'instance_{}')
ONTOLOGY_PATH = f'/home/remote_hdd/datasets_for_incremental_training/{DATA}/all_types.txt'


## Load ontology

In [42]:
# # load ontology
# type2id = utils.load_ontology(ONTOLOGY_PATH)
# types = list(type2id.keys())
# # create hierarchy tree
# tree = utils.create_tree(ONTOLOGY_PATH)
# tree.show()

## Create subset partitions

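Create train/dev partitions such that:
- types with a training frequency < MIN_FREQ are excluded from the datasets
- each kept incremental type has exactly MIN_FREQ instances, split between train and dev according to TRAIN_RATIO (e.g. MIN_FREQ=100 and TRAIN_RATIO=0.8 give 80 train and 20 dev instances)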

In [43]:
def _split_subset(lines, seed, min_freq, train_ratio):
  """Draw a seeded random subset of `lines` and split it into train/dev.

  Args:
    lines: list of candidate lines (must contain at least `min_freq` items).
    seed: value passed to `random.seed` before sampling.
    min_freq: number of lines to sample.
    train_ratio: fraction of the sampled lines assigned to the train split.

  Returns:
    A `(train_lines, dev_lines)` tuple; the train split holds the first
    `int(min_freq * train_ratio)` sampled lines, the dev split the rest.
  """
  # IMPORTANT: reset the seed so the sample depends only on (seed, lines)
  random.seed(seed)
  subset = random.sample(lines, min_freq)
  idx_split = int(min_freq * train_ratio)
  return subset[:idx_split], subset[idx_split:]


# iterate over incremental training dirs
# (`dirname`/`filename` rather than `dir`/`f`, to avoid shadowing the builtin `dir`)
for dirname in tqdm(os.listdir(SRC_DATA_DIR)):
  dirpath_src = os.path.join(SRC_DATA_DIR, dirname)
  if not os.path.isdir(dirpath_src):
    # other (top-level) files are intentionally not copied
    # shutil.copyfile(dirpath_src, os.path.join(DST_DATA_DIR, dirname))
    continue

  # iterate over incremental training single partitions
  for filename in os.listdir(dirpath_src):
    filepath_src = os.path.join(dirpath_src, filename)

    # The dev file name is derived by replacing '_train_' in the source name.
    # Without that marker both destinations would be the same path and the
    # two writers would clobber each other, so such entries are skipped
    # (as are nested directories, which cannot be opened as files).
    if '_train_' not in filename or not os.path.isfile(filepath_src):
      print('Skipping ', filepath_src, '(not a train partition file)')
      print()
      continue

    print('Processing ', filepath_src, '...')
    # read everything up front so the source is closed before writing
    with open(filepath_src, 'r') as src:
      lines = src.readlines()

    # Only the last line can lack a newline; terminate it so that shuffling
    # it into the middle of an output file cannot fuse two records together.
    if lines and not lines[-1].endswith('\n'):
      lines[-1] += '\n'

    # check frequency
    if len(lines) < MIN_FREQ:
      print('Type discarded')
      print()
      continue

    print('Type kept')
    # create an instance of the incremental dataset for each seed
    for seed in SEED:
      print('Creating instance of the dataset for seed', seed)
      # prepare dir (exist_ok makes a separate existence check unnecessary)
      dirpath_dst = os.path.join(DST_DATA_DIR.format(seed), dirname)
      os.makedirs(dirpath_dst, exist_ok=True)

      train_lines, dev_lines = _split_subset(lines, seed, MIN_FREQ, TRAIN_RATIO)

      # save subset partitions
      filepath_train_dst = os.path.join(dirpath_dst, filename)
      filepath_dev_dst = os.path.join(dirpath_dst, filename.replace('_train_', '_dev_'))
      with open(filepath_train_dst, 'w') as dst_train, open(filepath_dev_dst, 'w') as dst_dev:
        dst_train.writelines(train_lines)
        dst_dev.writelines(dev_lines)
    print()



  0%|          | 0/26 [00:00<?, ?it/s]

Processing  /home/remote_hdd/datasets_for_incremental_training/figer/complete/sons_of_organization/incremental_train_sports_team.json ...
Type kept
Creating instance of the dataset for seed 0
Creating instance of the dataset for seed 1
Creating instance of the dataset for seed 2

Processing  /home/remote_hdd/datasets_for_incremental_training/figer/complete/sons_of_organization/incremental_train_sports_league.json ...
Type kept
Creating instance of the dataset for seed 0
Creating instance of the dataset for seed 1
Creating instance of the dataset for seed 2

Processing  /home/remote_hdd/datasets_for_incremental_training/figer/complete/sons_of_organization/incremental_train_fraternity_sorority.json ...
Type kept
Creating instance of the dataset for seed 0
Creating instance of the dataset for seed 1
Creating instance of the dataset for seed 2

Processing  /home/remote_hdd/datasets_for_incremental_training/figer/complete/sons_of_organization/incremental_train_terrorist_organization.json ..

  4%|▍         | 1/26 [00:00<00:14,  1.77it/s]

Type kept
Creating instance of the dataset for seed 0
Creating instance of the dataset for seed 1
Creating instance of the dataset for seed 2

Processing  /home/remote_hdd/datasets_for_incremental_training/figer/complete/sons_of_organization/incremental_train_airline.json ...
Type kept
Creating instance of the dataset for seed 0
Creating instance of the dataset for seed 1
Creating instance of the dataset for seed 2

Processing  /home/remote_hdd/datasets_for_incremental_training/figer/complete/sons_of_person/incremental_train_musician.json ...
Type kept
Creating instance of the dataset for seed 0
Creating instance of the dataset for seed 1
Creating instance of the dataset for seed 2

Processing  /home/remote_hdd/datasets_for_incremental_training/figer/complete/sons_of_person/incremental_train_doctor.json ...
Type kept
Creating instance of the dataset for seed 0
Creating instance of the dataset for seed 1
Creating instance of the dataset for seed 2

Processing  /home/remote_hdd/datasets_

  8%|▊         | 2/26 [00:01<00:23,  1.04it/s]

Type kept
Creating instance of the dataset for seed 0
Creating instance of the dataset for seed 1
Creating instance of the dataset for seed 2

Processing  /home/remote_hdd/datasets_for_incremental_training/figer/complete/sons_of_person/incremental_train_engineer.json ...
Type kept
Creating instance of the dataset for seed 0
Creating instance of the dataset for seed 1
Creating instance of the dataset for seed 2

Processing  /home/remote_hdd/datasets_for_incremental_training/figer/complete/sons_of_living_thing/incremental_train_animal.json ...
Type kept
Creating instance of the dataset for seed 0
Creating instance of the dataset for seed 1
Creating instance of the dataset for seed 2

Processing  /home/remote_hdd/datasets_for_incremental_training/figer/complete/sons_of_internet/incremental_train_website.json ...
Type kept
Creating instance of the dataset for seed 0
Creating instance of the dataset for seed 1
Creating instance of the dataset for seed 2

Processing  /home/remote_hdd/dataset

 19%|█▉        | 5/26 [00:02<00:07,  2.87it/s]

Creating instance of the dataset for seed 1
Creating instance of the dataset for seed 2

Processing  /home/remote_hdd/datasets_for_incremental_training/figer/complete/sons_of_building/incremental_train_hotel.json ...
Type kept
Creating instance of the dataset for seed 0
Creating instance of the dataset for seed 1
Creating instance of the dataset for seed 2

Processing  /home/remote_hdd/datasets_for_incremental_training/figer/complete/sons_of_building/incremental_train_restaurant.json ...
Type kept
Creating instance of the dataset for seed 0
Creating instance of the dataset for seed 1
Creating instance of the dataset for seed 2

Processing  /home/remote_hdd/datasets_for_incremental_training/figer/complete/sons_of_building/incremental_train_hospital.json ...
Type kept
Creating instance of the dataset for seed 0
Creating instance of the dataset for seed 1
Creating instance of the dataset for seed 2

Processing  /home/remote_hdd/datasets_for_incremental_training/figer/complete/sons_of_buil

 27%|██▋       | 7/26 [00:02<00:04,  4.25it/s]

Type kept
Creating instance of the dataset for seed 0
Creating instance of the dataset for seed 1
Creating instance of the dataset for seed 2

Processing  /home/remote_hdd/datasets_for_incremental_training/figer/complete/sons_of_education/incremental_train_department.json ...
Type kept
Creating instance of the dataset for seed 0
Creating instance of the dataset for seed 1
Creating instance of the dataset for seed 2

Processing  /home/remote_hdd/datasets_for_incremental_training/figer/complete/sons_of_education/incremental_train_educational_degree.json ...
Type kept
Creating instance of the dataset for seed 0
Creating instance of the dataset for seed 1
Creating instance of the dataset for seed 2

Processing  /home/remote_hdd/datasets_for_incremental_training/figer/complete/sons_of_visual_art/incremental_train_color.json ...
Type kept
Creating instance of the dataset for seed 0
Creating instance of the dataset for seed 1
Creating instance of the dataset for seed 2

Processing  /home/remo

 42%|████▏     | 11/26 [00:02<00:01,  7.95it/s]

Creating instance of the dataset for seed 1
Creating instance of the dataset for seed 2

Processing  /home/remote_hdd/datasets_for_incremental_training/figer/complete/sons_of_people/incremental_train_ethnicity.json ...
Type kept
Creating instance of the dataset for seed 0
Creating instance of the dataset for seed 1
Creating instance of the dataset for seed 2

Processing  /home/remote_hdd/datasets_for_incremental_training/figer/complete/sons_of_geography/incremental_train_mountain.json ...
Type kept
Creating instance of the dataset for seed 0
Creating instance of the dataset for seed 1
Creating instance of the dataset for seed 2

Processing  /home/remote_hdd/datasets_for_incremental_training/figer/complete/sons_of_geography/incremental_train_glacier.json ...
Type kept
Creating instance of the dataset for seed 0
Creating instance of the dataset for seed 1
Creating instance of the dataset for seed 2

Processing  /home/remote_hdd/datasets_for_incremental_training/figer/complete/sons_of_geo

 50%|█████     | 13/26 [00:02<00:01,  8.33it/s]

Type kept
Creating instance of the dataset for seed 0
Creating instance of the dataset for seed 1
Creating instance of the dataset for seed 2

Processing  /home/remote_hdd/datasets_for_incremental_training/figer/complete/sons_of_religion/incremental_train_religion.json ...
Type kept
Creating instance of the dataset for seed 0
Creating instance of the dataset for seed 1
Creating instance of the dataset for seed 2

Processing  /home/remote_hdd/datasets_for_incremental_training/figer/complete/sons_of_event/incremental_train_election.json ...
Type kept
Creating instance of the dataset for seed 0
Creating instance of the dataset for seed 1
Creating instance of the dataset for seed 2

Processing  /home/remote_hdd/datasets_for_incremental_training/figer/complete/sons_of_event/incremental_train_terrorist_attack.json ...
Type kept
Creating instance of the dataset for seed 0
Creating instance of the dataset for seed 1
Creating instance of the dataset for seed 2

Processing  /home/remote_hdd/data

 58%|█████▊    | 15/26 [00:02<00:01,  7.75it/s]

Creating instance of the dataset for seed 2

Processing  /home/remote_hdd/datasets_for_incremental_training/figer/complete/sons_of_event/incremental_train_natural_disaster.json ...
Type kept
Creating instance of the dataset for seed 0
Creating instance of the dataset for seed 1
Creating instance of the dataset for seed 2

Processing  /home/remote_hdd/datasets_for_incremental_training/figer/complete/sons_of_event/incremental_train_military_conflict.json ...
Type kept
Creating instance of the dataset for seed 0
Creating instance of the dataset for seed 1
Creating instance of the dataset for seed 2

Processing  /home/remote_hdd/datasets_for_incremental_training/figer/complete/sons_of_event/incremental_train_protest.json ...
Type kept
Creating instance of the dataset for seed 0
Creating instance of the dataset for seed 1
Creating instance of the dataset for seed 2

Processing  /home/remote_hdd/datasets_for_incremental_training/figer/complete/sons_of_finance/incremental_train_stock_exchange

 69%|██████▉   | 18/26 [00:03<00:01,  7.75it/s]

Creating instance of the dataset for seed 2

Processing  /home/remote_hdd/datasets_for_incremental_training/figer/complete/sons_of_product/incremental_train_airplane.json ...
Type kept
Creating instance of the dataset for seed 0
Creating instance of the dataset for seed 1
Creating instance of the dataset for seed 2

Processing  /home/remote_hdd/datasets_for_incremental_training/figer/complete/sons_of_government/incremental_train_political_party.json ...
Type kept
Creating instance of the dataset for seed 0
Creating instance of the dataset for seed 1
Creating instance of the dataset for seed 2

Processing  /home/remote_hdd/datasets_for_incremental_training/figer/complete/sons_of_government/incremental_train_government.json ...


 77%|███████▋  | 20/26 [00:03<00:00,  8.03it/s]

Type kept
Creating instance of the dataset for seed 0
Creating instance of the dataset for seed 1
Creating instance of the dataset for seed 2

Processing  /home/remote_hdd/datasets_for_incremental_training/figer/complete/sons_of_transportation/incremental_train_road.json ...
Type kept
Creating instance of the dataset for seed 0
Creating instance of the dataset for seed 1
Creating instance of the dataset for seed 2

Processing  /home/remote_hdd/datasets_for_incremental_training/figer/complete/sons_of_metropolitan_transit/incremental_train_transit_line.json ...
Type kept
Creating instance of the dataset for seed 0
Creating instance of the dataset for seed 1
Creating instance of the dataset for seed 2

Processing  /home/remote_hdd/datasets_for_incremental_training/figer/complete/sons_of_broadcast/incremental_train_tv_channel.json ...
Type kept
Creating instance of the dataset for seed 0
Creating instance of the dataset for seed 1
Creating instance of the dataset for seed 2

Processing  /h

 88%|████████▊ | 23/26 [00:03<00:00, 10.19it/s]

Creating instance of the dataset for seed 2

Processing  /home/remote_hdd/datasets_for_incremental_training/figer/complete/sons_of_medicine/incremental_train_drug.json ...
Type kept
Creating instance of the dataset for seed 0
Creating instance of the dataset for seed 1
Creating instance of the dataset for seed 2

Processing  /home/remote_hdd/datasets_for_incremental_training/figer/complete/sons_of_location/incremental_train_cemetery.json ...
Type kept
Creating instance of the dataset for seed 0
Creating instance of the dataset for seed 1
Creating instance of the dataset for seed 2

Processing  /home/remote_hdd/datasets_for_incremental_training/figer/complete/sons_of_location/incremental_train_city.json ...
Type kept
Creating instance of the dataset for seed 0
Creating instance of the dataset for seed 1
Creating instance of the dataset for seed 2

Processing  /home/remote_hdd/datasets_for_incremental_training/figer/complete/sons_of_location/incremental_train_bridge.json ...
Type kept
Cr

100%|██████████| 26/26 [00:04<00:00,  5.47it/s]

Creating instance of the dataset for seed 1
Creating instance of the dataset for seed 2




