In [1]:
from src.utils import read_json
import json
import random

original_data = read_json('data/smartui/data_oos.json')

# Shuffle a copy with a fixed seed before splitting: slicing the records in
# file order would make each split inherit whatever ordering the source file
# has. The seed keeps the split identical across re-runs, and the copy leaves
# `original_data` untouched so this cell is safe to run again.
SPLIT_SEED = 42
shuffled_data = list(original_data)
random.Random(SPLIT_SEED).shuffle(shuffled_data)

# Calculate the sizes for each dataset (70% train, 20% validation, rest test)
total_size = len(shuffled_data)
train_size = int(0.7 * total_size)
valid_size = int(0.2 * total_size)
test_size = total_size - train_size - valid_size

# Split the shuffled data into train, validation, and test datasets
train_data = shuffled_data[:train_size]
valid_data = shuffled_data[train_size:train_size + valid_size]
test_data = shuffled_data[train_size + valid_size:]

# Re-shape records for export: keep the dialog, wrap the single label in a list
def convert_to_new_format(data):
    """Return a new list of ``{"dialog", "label"}`` records built from ``data``.

    Every item in ``data`` is indexed with ``"dialog"`` and ``"label"``; the
    label value is wrapped in a one-element list. Any other keys are dropped,
    and the input items themselves are not modified.
    """
    return [
        {"dialog": record["dialog"], "label": [record["label"]]}
        for record in data
    ]

# Convert every split to the export format
train_data_new_format = convert_to_new_format(train_data)
valid_data_new_format = convert_to_new_format(valid_data)
test_data_new_format = convert_to_new_format(test_data)

# Write each converted split to its own JSON file in the working directory
for out_name, records in (
    ('train.json', train_data_new_format),
    ('val.json', valid_data_new_format),
    ('test.json', test_data_new_format),
):
    with open(out_name, 'w') as f:
        json.dump(records, f, indent=4)

In [1]:
from src.utils import read_json, write_json

# Load the three dataset splits. Forward slashes are used as the separator
# because they resolve on Windows as well as on POSIX systems, whereas a
# backslash is only a path separator on Windows.
train = read_json("data/smartui/train.json")
val = read_json("data/smartui/val.json")
test = read_json("data/smartui/test.json")

In [2]:
def get_label_set(*lists):
    """Build a label -> integer id mapping from one or more datasets.

    Args:
        *lists: Any number of iterables of records. Each record is indexed
            with ``'label'`` and the resulting value is iterated to obtain
            the individual labels.

    Returns:
        dict: Every distinct label mapped to a consecutive id starting at 0.
        Ids follow first-seen order: datasets left to right, records in
        order, labels in order within a record.
    """
    label_set = {}
    for records in lists:
        for record in records:
            for label in record['label']:
                # The current size of the mapping is the next unused id;
                # setdefault leaves labels that were already seen untouched.
                label_set.setdefault(label, len(label_set))
    return label_set

In [3]:
from src.dataset import SentenceLabelDataset

# Build the label vocabulary from all three splits so that a label occurring
# only in `val` still receives an id. `val` is passed last: ids are assigned in
# first-seen order, so every label found in train/test keeps the id it had
# when only those two splits were scanned.
label_set = get_label_set(train, test, val)
train_set = SentenceLabelDataset(train, labelSet=label_set)

In [4]:
# Display the label -> id mapping returned by get_label_set
label_set

{'Firewall Rules Setup': 0,
 'Event Support Request Form': 1,
 'Work At Home Request': 2,
 'Windows Domain Service Request': 3,
 'Add/Update/Delete DNS': 4,
 'oos': 5}

In [5]:
# Inspect the first item of the training dataset (text, class, token ids,
# attention mask and numeric label, as shown in the output below)
train_set[0]

{'text': "Hey, I'm having trouble with the firewall rules configuration. Can you lend me a hand with the Firewall Rules Setup request?",
 'class': ['Firewall Rules Setup'],
 'input_ids': [101,
  4931,
  1010,
  1045,
  1005,
  1049,
  2383,
  4390,
  2007,
  1996,
  2543,
  9628,
  3513,
  9563,
  1012,
  2064,
  2017,
  18496,
  2033,
  1037,
  2192,
  2007,
  1996,
  2543,
  9628,
  3513,
  16437,
  5227,
  1029,
  102],
 'attention_mask': [1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1],
 'label': [0]}