In [1]:
import os
from google.colab import drive
# Mount Google Drive into the Colab filesystem at /content/drive.
drive.mount('/content/drive')
# Switch the working directory to the project's src folder; the relative
# '../dataset/...' paths used by later cells are resolved against it.
os.chdir('/content/drive/MyDrive/Colab Notebooks/src')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [2]:
import json
import pandas as pd

In [3]:
def load_json(path):
  """Read a JSON document from disk and return the decoded Python object.

  Args:
    path: Location of the JSON file to read.

  Returns:
    Whatever `json.load` produces for the file's content (dict, list, ...).

  Raises:
    OSError: If the file cannot be opened.
    json.JSONDecodeError: If the content is not valid JSON.
  """
  # Decode explicitly as UTF-8 so the result does not depend on the
  # platform's default locale encoding.
  with open(path, encoding='utf-8') as f:
    return json.load(f)

def build_dataset(data, diction):
  """Expand annotated samples into positive/negative (acronym, expansion) pairs.

  For each sample one row labelled True is produced for the annotated
  expansion, immediately followed by one row labelled False for every other
  expansion that `diction` lists for the same acronym token.

  The input samples are NOT modified: each output record is built from
  scratch rather than by adding keys to (and copying) the caller's dicts.

  Args:
    data: Iterable of dict samples. Each sample must provide 'tokens'
      (list of str), 'acronym' (index into 'tokens' selecting the acronym
      token) and 'expansion' (the correct expansion).
    diction: Mapping from acronym token to an iterable of its candidate
      expansions.

  Returns:
    pandas.DataFrame with columns
    ['acronym_token', 'expansion', 'sentence', 'label'].

  Raises:
    KeyError: If a sample lacks a required key, or its acronym token has no
      entry in `diction`.
    IndexError: If 'acronym' is out of range for 'tokens'.
  """
  columns = ['acronym_token', 'expansion', 'sentence', 'label']
  records = []
  for sample in data:
    tokens = sample['tokens']
    expansion = sample['expansion']
    acronym_token = tokens[sample['acronym']]
    sentence = ' '.join(tokens)

    # Positive example: the annotated (correct) expansion.
    records.append((acronym_token, expansion, sentence, True))

    # Negative examples: every other candidate expansion of the same acronym,
    # kept in the order the dictionary lists them.
    records.extend(
      (acronym_token, candidate, sentence, False)
      for candidate in diction[acronym_token]
      if candidate != expansion
    )

  return pd.DataFrame(records, columns=columns)


In [4]:
# Candidate expansions per acronym token, and the raw annotated dev samples.
diction = load_json('../dataset/diction.json')
data = load_json('../dataset/raw/dev.json')

# Expand every sample into one positive row plus its negative rows, then
# display the resulting frame as the cell output.
dataset = build_dataset(data, diction)
dataset

Unnamed: 0,acronym_token,expansion,sentence,label
0,FL,federated learning,"In , we investigated the FL loss function mini...",True
1,FL,flatten layer,"In , we investigated the FL loss function mini...",False
2,FL,fixated locations,"In , we investigated the FL loss function mini...",False
3,RF,random forest,The nature of the regression algorithm makes t...,True
4,RF,radio frequency,The nature of the regression algorithm makes t...,False
...,...,...,...,...
28281,LP,linear programming,"On the other hand , since is symmetric , and i...",True
28282,LP,label powerset,"On the other hand , since is symmetric , and i...",False
28283,LP,label propagation,"On the other hand , since is symmetric , and i...",False
28284,CFD,computational fluid dynamics,"Incidentally , the MAP estimate matches more c...",True


In [5]:
# Number of shuffled rows assigned to the training split; all remaining rows
# form the testing split.
split_point = 22720

# frac=1 keeps every row; random_state pins the shuffle order so the split is
# reproducible across runs.
shuffled_df = dataset.sample(frac=1, random_state=42)
training_set, testing_set = shuffled_df.iloc[:split_point], shuffled_df.iloc[split_point:]

In [7]:
# Persist both splits as parquet files, without writing the DataFrame index.
for split_frame, parquet_path in (
  (training_set, '../dataset/parquet/train.parquet'),
  (testing_set, '../dataset/parquet/test.parquet'),
):
  split_frame.to_parquet(parquet_path, index=False)