In [75]:
from os import walk
import pandas as pd
import json

# Read all MIDD data files

In [45]:

def midd_files(DIRECTORY = 'midd', n_layouts=4):
   """
   Generator on all files in the MIDD datasets (all layouts).

   Layout directories are looked up at ``{DIRECTORY}/layout{i}`` for every
   ``i`` in ``range(n_layouts)``. Only the files stored directly in each
   layout directory are listed; sub-directories are not descended into.
   A layout directory that cannot be listed (e.g. it does not exist)
   contributes no file.

   Within a layout, files are yielded in sorted name order so that the
   result is reproducible and does not depend on the order in which the
   filesystem happens to list directory entries.

   Parameters:
      DIRECTORY (str): root directory containing the ``layout{i}`` folders
      n_layouts (int): number of layout folders to scan, starting at 0

   return a generator of tuples [(str,str,int)] : (
      name of the file (everything before the first "." of the file name),
      relative path to the file,
      id of the layout of the file
      )
   """
   def all_files_in_dir(directory):
      """return the sorted names of all files stored in the specified directory"""
      # The first item produced by walk() describes `directory` itself;
      # walk() produces nothing when the directory cannot be listed,
      # in which case the default gives an empty list of files.
      _, _, filenames = next(walk(directory), (None, None, []))
      # walk() gives no ordering guarantee: sort for a deterministic output
      return sorted(filenames)

   # iterate over layouts
   for layout_id in range(n_layouts):
      layout_dir = f'{DIRECTORY}/layout{layout_id}'
      for file_name in all_files_in_dir(layout_dir):
         file_path = f"{layout_dir}/{file_name}"
         # NOTE: the name keeps only the part before the FIRST dot
         yield file_name.split(".")[0], file_path, layout_id

# Conversion of string fields to integers

In [None]:
class IdMapper():
   """
   Maps items to consecutive integer ids (0, 1, 2, ...) in first-seen order.

   Items are registered with `add_item` until `build` is called; after
   that the mapper is frozen and `add_item` raises.
   """
   def __init__(self):
      def counter(): # endless 0, 1, 2, ... sequence
         value = 0
         while True:
            yield value
            value += 1
      self.is_built = False
      self.map = {}              # item -> integer id
      self.gen_id = counter()    # source of the next unused id

   def add_item(self,item:str):
      """
      Register `item` if it is new and return its integer id.

      Raises ValueError if the mapper has already been built.
      """
      if self.is_built:
         raise ValueError('Cannot call add_item on a built IdMapper')
      if item in self.map:
         # already known: reuse the id assigned the first time
         return self.map[item]
      new_id = next(self.gen_id)
      self.map[item] = new_id
      return new_id

   def build(self):
      """
      Freeze the mapper and return the item -> id dictionary.

      Raises ValueError if called more than once.
      """
      if not self.is_built:
         self.is_built = True
         return self.map
      raise ValueError('IdMapper is already built')

class Metadata():
   """
   Collects dataset-wide metadata while records are added one at a time.

   Every tag found under a record's 'ner_tag' key is registered in an
   IdMapper. When `convert` is true, the record's tags are additionally
   replaced by their integer ids.
   """
   def __init__(self,convert=False):
      self.convert  = convert
      self.ner_tags = IdMapper()

   def add(self, record):
      """
      Register the tags of `record` and return the record.

      All values of record['ner_tag'] are passed to the tag mapper. If
      conversion is enabled, record['ner_tag'] is overwritten (on the
      given dict itself) with the list of corresponding ids.
      """
      tag_ids = [self.ner_tags.add_item(tag) for tag in record['ner_tag']]
      if self.convert:
         record['ner_tag'] = tag_ids
      return record

   def build(self):
      """Finalize the tag mapper and return the metadata dictionary."""
      tag_to_id = self.ner_tags.build()
      return {'ner_tag' : tag_to_id}

# Final steps

In [86]:
def build_dataset(convert = False):
    """
    Build the dataset records and the associated metadata.

    Each file produced by `midd_files()` is read as a CSV and turned into
    one record holding the file name, its layout id, and the contents of
    the CSV's `Tag` and `Text` columns (as lists, in row order).

    Parameters:
        convert (bool): forwarded to `Metadata`; when true, the tags of
            each record are replaced by their integer ids.

    Returns:
        tuple: (metadata dictionary returned by `Metadata.build()`,
                list of record dictionaries)
    """
    dataset = []
    metadata = Metadata(convert=convert)
    for file_name, file_path, layout_id in midd_files():
        # Tokens and tags are text. Read every cell as a string and disable
        # pandas' default NA detection: otherwise literal tokens such as
        # "NA", "N/A" or "null" (and empty cells) become float NaN, which
        # pollutes the tag mapping and is serialized as invalid-JSON NaN,
        # and all-numeric columns would be returned as numbers.
        df = pd.read_csv(file_path, dtype=str, keep_default_na=False)
        record = {
            "name": file_name,
            "layout":layout_id,
            'ner_tag':df.Tag.tolist(),
            'token':df.Text.tolist()
        }
        record = metadata.add(record)
        dataset.append(record)

    return metadata.build(),dataset


In [89]:
def write_json(meta, dataset, DIRECTORY='data'):
    """
    Write the metadata and the dataset to disk in JSON format.

    Creates `DIRECTORY` (including missing parent directories) when it
    does not exist yet, then writes two files, overwriting any existing
    ones:
        {DIRECTORY}/meta.json : the `meta` object
        {DIRECTORY}/midd.json : the `dataset` object

    Parameters:
        meta: JSON-serializable metadata
        dataset: JSON-serializable dataset
        DIRECTORY (str): output directory
    """
    import os  # function-scope import keeps this block self-contained

    # open(..., 'w') does not create missing directories
    if DIRECTORY:
        os.makedirs(DIRECTORY, exist_ok=True)
    with open(f'{DIRECTORY}/meta.json', 'w') as meta_file:  #open the file in write mode
        json.dump(meta, meta_file)
    with open(f'{DIRECTORY}/midd.json', 'w') as data_file:  #open the file in write mode
        json.dump(dataset, data_file)

In [92]:
# Export the dataset twice: once with the NER tags converted to integer ids
# (under data/converted) and once with the original tags (under data/simple)
for convert in (True, False):
    meta, dataset = build_dataset(convert=convert)
    write_json(meta, dataset, DIRECTORY="data/" + ("converted" if convert else "simple"))