In [35]:
import json
import torch
import custom_utils
import numpy as np
from sentence_transformers import SentenceTransformer
bert = SentenceTransformer('all-MiniLM-L6-v2')

In [36]:
# read labels
with open("training_labels.json", "r") as json_file:
    labels = json.load(json_file)

# read nodes and edges
nodes, edges = custom_utils.gather_dataset("training")

In [37]:
# aggregate data from all dialogs
X, y, edge_idx = [], [], [] 
count = 0
for id in nodes.keys():
        X += nodes[id]
        y += labels[id]
        edge_idx += [[e[0] + count, e[1] + count] for e in edges[id]]
        count += len(labels[id])

In [62]:
from sklearn.model_selection import StratifiedShuffleSplit

# stratified split (we do it every epoch)
def split(spliter, X, y):
    train_idx, val_idx = list(spliter.split(X, y))[0]
    train_mask, val_mask = [False]*len(X), [False]*len(X)
    for idx in train_idx: train_mask[idx] = True
    for idx in val_idx: val_mask[idx] = True
    train_mask = torch.Tensor(train_mask).bool()
    val_mask = torch.Tensor(val_mask).bool()
    return train_mask, val_mask


sss = StratifiedShuffleSplit(n_splits = 1, test_size = 0.2, random_state=42)
train_mask, val_mask = split(sss, X, y)

In [40]:
# formatting
y = torch.Tensor(y).long()
edge_idx = torch.Tensor(edge_idx).long().transpose(0,1)

In [41]:
X = bert.encode(X, show_progress_bar=True, convert_to_tensor=True)

Batches: 100%|██████████| 2270/2270 [01:30<00:00, 25.11it/s]


In [42]:
num_features = X.shape[1]
num_classes = 2

In [65]:
from torch.nn import Linear
import torch.nn.functional as F

class MLP(torch.nn.Module):
    def __init__(self):
        super().__init__()
        torch.manual_seed(12345)
        self.lin1 = Linear(num_features, 300)
        self.lin2 = Linear(300, 600)
        self.lin3 = Linear(600, num_classes)

    def forward(self, x):
        x = self.lin1(x)
        x = x.relu()
        x = F.dropout(x, p=0.5, training=self.training)
        x = self.lin2(x)
        x = x.relu()
        x = F.dropout(x, p=0.5, training=self.training)
        x = self.lin3(x)
        return x

In [57]:
from torch_geometric.nn import GCNConv, SAGEConv

class GCN(torch.nn.Module):
    def __init__(self, hidden_channels):
        super().__init__()
        torch.manual_seed(1234567)
        self.conv1 = SAGEConv(num_features, hidden_channels, aggr="lstm")
        self.conv2 = SAGEConv(hidden_channels, num_classes, aggr="lstm")

    def forward(self, x, edge_index):
        x = self.conv1(x, edge_index)
        x = x.relu()
        x = F.dropout(x, p=0.5, training=self.training)
        x = self.conv2(x, edge_index)
        return x

In [66]:
model = MLP()
# model = GCN(hidden_channels=500)
criterion = torch.nn.CrossEntropyLoss()  # Define loss criterion.
optimizer = torch.optim.Adam(model.parameters(), lr=0.01, weight_decay=5e-4)  # Define optimizer.

In [67]:
def train():
      model.train()
      optimizer.zero_grad()  # Clear gradients.
      out = model(X)  # Perform a single forward pass.
      # out = model(X, edge_idx)  # Perform a single forward pass.
      loss = criterion(out[train_mask], y[train_mask])  # Compute the loss solely based on the training nodes.
      loss.backward()  # Derive gradients.
      optimizer.step()  # Update parameters based on gradients.
      return loss

def test():
      model.eval()
      out = model(X)
      # out = model(X, edge_idx)
      pred = out.argmax(dim=1)  # Use the class with highest probability.

      # TP, FP, FN
      TP = ((pred[val_mask] == 1) & (y[val_mask] == 1)).sum()
      FP = ((pred[val_mask] == 1) & (y[val_mask] == 0)).sum()
      FN = ((pred[val_mask] == 0) & (y[val_mask] == 1)).sum()

      # Calculate precision, recall, and F1 score
      precision = TP / max((TP + FP), 1e-10)  # Avoid division by zero
      recall = TP / max((TP + FN), 1e-10)  # Avoid division by zero
      f1_score = 2 * (precision * recall) / max((precision + recall), 1e-10)  # Avoid division by zero

      # Calculate accuracy
      test_correct = pred[val_mask] == y[val_mask]
      test_acc = int(test_correct.sum()) / int(val_mask.sum())

      # return criterion(model(X, edge_idx)[val_mask], y[val_mask]), f1_score
      return criterion(model(X)[val_mask], y[val_mask]), f1_score

In [68]:
for epoch in range(1, 201):
    train_mask, val_mask = split(sss, X, y)
    train_loss = train()
    val_loss, f1_score = test()
    print(f'Epoch: {epoch:03d}, Train Loss: {train_loss:.4f}, Val loss: {val_loss:.4f}, F1-score: {f1_score:.4f}')

Epoch: 001, Train Loss: 0.6664, Val loss: 0.4495, F1-score: 0.0000
Epoch: 002, Train Loss: 0.4509, Val loss: 0.6792, F1-score: 0.0000
Epoch: 003, Train Loss: 0.6797, Val loss: 0.4001, F1-score: 0.0000
Epoch: 004, Train Loss: 0.4008, Val loss: 0.4560, F1-score: 0.0000
Epoch: 005, Train Loss: 0.4542, Val loss: 0.4858, F1-score: 0.0000
Epoch: 006, Train Loss: 0.4836, Val loss: 0.4671, F1-score: 0.0000
Epoch: 007, Train Loss: 0.4653, Val loss: 0.4245, F1-score: 0.0000
Epoch: 008, Train Loss: 0.4234, Val loss: 0.3854, F1-score: 0.0000
Epoch: 009, Train Loss: 0.3845, Val loss: 0.3831, F1-score: 0.0000
Epoch: 010, Train Loss: 0.3809, Val loss: 0.3942, F1-score: 0.0000
Epoch: 011, Train Loss: 0.3906, Val loss: 0.3895, F1-score: 0.0000
Epoch: 012, Train Loss: 0.3850, Val loss: 0.3745, F1-score: 0.0000
Epoch: 013, Train Loss: 0.3691, Val loss: 0.3618, F1-score: 0.0000
Epoch: 014, Train Loss: 0.3569, Val loss: 0.3602, F1-score: 0.0000
Epoch: 015, Train Loss: 0.3564, Val loss: 0.3622, F1-score: 0.

In [69]:
test_acc, f1_score = test()
print(f'Test Accuracy: {test_acc:.4f}, F1-score: {f1_score:.4f}')

Test Accuracy: 0.3364, F1-score: 0.5692


In [70]:
test_nodes, test_edges = custom_utils.gather_dataset("test")

test_labels = {}
model.eval()
for id, sentences in test_nodes.items():
    X_test = bert.encode(sentences, convert_to_tensor=True)
    out = model(X_test)
    pred = out.argmax(dim=1)
    test_labels[id] = pred.tolist()

In [71]:
# # uncomment to generate test labels
# with open("test_labels.json", "w") as json_file:
#     json.dump(test_labels, json_file)