In [6]:
import json
import os

import numpy as np
import torch
from sentence_transformers import SentenceTransformer

import custom_utils
# Sentence-embedding model; its `encode` method is used further down to embed utterances.
EMBEDDING_MODEL_NAME = 'all-MiniLM-L6-v2'
bert = SentenceTransformer(EMBEDDING_MODEL_NAME)

# When CUDA is usable, make device 0 the default device for newly created tensors.
if torch.cuda.is_available():
    torch.set_default_device(0)

In [7]:
# Load the labels (indexed by dialog id further down) from the JSON file.
with open("training_labels.json") as json_file:
    labels = json.loads(json_file.read())

# Load the nodes (utterances and their speakers) and the edges of the "training" set
# through the project helper, then unpack the three collections.
dataset = custom_utils.gather_dataset("training", combine=False)
dialogs, speakers, edges = dataset

In [8]:
# aggregate data from all dialogs
X_train_dialog, y_train, X_train_speaker, train_edge_idx, train_edge_attr = [], [], [], [], []
X_test_dialog, y_test, X_test_speaker, test_edge_idx, test_edge_attr = [], [], [], [], []


def _append_dialog(dialog_id, offset, X_dialog, X_speaker, y, edge_idx, edge_attr):
    """Append one dialog's data to the lists of a split.

    Args:
        dialog_id: key into the global `dialogs`, `speakers`, `labels` and `edges`.
        offset: number of nodes already stored in the split; added to both edge
            endpoints so that per-dialog indices become split-wide indices.
        X_dialog, X_speaker, y, edge_idx, edge_attr: the split's lists, extended in place.

    Returns:
        The split's node count after this dialog has been appended.
    """
    X_dialog.extend(dialogs[dialog_id])
    X_speaker.extend(speakers[dialog_id])
    y.extend(labels[dialog_id])
    # Each edge is indexed as (source, attribute, target).
    edge_idx.extend([e[0] + offset, e[2] + offset] for e in edges[dialog_id])
    edge_attr.extend(e[1] for e in edges[dialog_id])
    # The offset advances by the number of labels.
    # NOTE(review): this assumes one label per utterance of the dialog - confirm.
    return offset + len(labels[dialog_id])


count_train, count_test = 0, 0
# `dialog_id` (rather than `id`) keeps the builtin `id` usable in later cells.
for dialog_id in dialogs:
    # Dialogs whose id starts with 'T' are held out as the test split.
    # `startswith` also copes with an empty id instead of raising IndexError.
    if dialog_id.startswith('T'):
        count_test = _append_dialog(
            dialog_id, count_test,
            X_test_dialog, X_test_speaker, y_test, test_edge_idx, test_edge_attr,
        )
    else:
        count_train = _append_dialog(
            dialog_id, count_train,
            X_train_dialog, X_train_speaker, y_train, train_edge_idx, train_edge_attr,
        )

In [9]:
# One-hot encoding of the speaker names: the position of a name in
# `_SPEAKER_NAMES` is the index of the single 1 in its vector.
_SPEAKER_NAMES = ("PM", "ME", "UI", "ID")
switcher = {
    name: [1 if col == row else 0 for col in range(len(_SPEAKER_NAMES))]
    for row, name in enumerate(_SPEAKER_NAMES)
}

# Alternative single-column encoding tried during EDA (PM -> 1, everyone else -> 0):
# switcher = {"PM": [1], "ME": [0], "UI": [0], "ID": [0]}

In [12]:
def _encode_split(utterances, speaker_names):
    """Build the feature rows of one split.

    Args:
        utterances: list of utterance strings, embedded with `bert.encode`.
        speaker_names: list of speaker names (keys of `switcher`), one per utterance.

    Returns:
        A tuple `(utterance_emb, speaker_onehot, features)`: the embedding tensor,
        the one-hot speaker tensor, and their column-wise concatenation converted
        to a nested Python list (ready for `json.dump`).
    """
    speaker_onehot = torch.Tensor([switcher[name] for name in speaker_names])
    # Move the embeddings onto the speaker tensor's device so that `torch.cat` works.
    utterance_emb = bert.encode(
        utterances, show_progress_bar=True, convert_to_tensor=True
    ).to(speaker_onehot.device)
    # `.numpy()` raises on a CUDA tensor; `.cpu()` is a no-op for CPU tensors and
    # makes the conversion independent of the default device.
    features = torch.cat((utterance_emb, speaker_onehot), dim=1).cpu().numpy().tolist()
    return utterance_emb, speaker_onehot, features


# train formatting
X_dialog, X_speaker, X_train = _encode_split(X_train_dialog, X_train_speaker)
# edge_idx = torch.Tensor(train_edge_idx).long().transpose(0,1)
# edge_attr = bert.encode(train_edge_attr, show_progress_bar=True, convert_to_tensor=False)

# test formatting
# NOTE: this rebinds X_test_dialog / X_test_speaker from lists to tensors, so the
# cell cannot be re-run without first re-running the aggregation cell.
X_test_dialog, X_test_speaker, X_test = _encode_split(X_test_dialog, X_test_speaker)
# test_edge_idx = torch.Tensor(test_edge_idx).long().transpose(0,1)
# test_edge_attr = bert.encode(test_edge_attr, show_progress_bar=True, convert_to_tensor=False)

Batches: 100%|██████████| 1588/1588 [00:29<00:00, 53.95it/s]
Batches: 100%|██████████| 683/683 [00:11<00:00, 60.03it/s]


In [13]:
# creating json files
# Make sure the output directory exists so a fresh checkout does not fail with
# FileNotFoundError on the first `open(..., 'w')`.
os.makedirs('data', exist_ok=True)

# (destination path, object to serialise) pairs, written in this order.
_json_outputs = (
    ('data/X_train.json', X_train),
    ('data/y_train.json', y_train),
    ('data/X_test.json', X_test),
    ('data/y_test.json', y_test),
)
for _path, _payload in _json_outputs:
    with open(_path, 'w') as json_file:
        json.dump(_payload, json_file)