In [2]:
import json
from pathlib import Path
import os 
import numpy as np
from tqdm import tqdm
import networkx as nx

from sentence_transformers import SentenceTransformer

import torch
import torch.nn as nn
from torch_geometric.utils.convert import from_networkx
from torch_geometric.nn import GCNConv, GATConv, GATv2Conv

In [3]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
bert = SentenceTransformer('all-MiniLM-L6-v2').to(device)
bert.train()
def embed_text(text):
    # Embeds text into 384-dimensional Space
    return bert.encode(text)

In [4]:
def extract_data(path,label_file=None):
    set_of_edge_attr = set()
    assert path != None
    # One-hot encode
    dict_speakers =  {'UI': np.array([1,0,0,0]), 'PM': np.array([0,1,0,0]), 'ME': np.array([0,0,1,0]), 'ID': np.array([0,0,0,1])}

    files = [file.split('.')[0] for file in os.listdir(path) if file.endswith('.json')]
    data = []
    vocab_of_texts_edges = set()
    if label_file!=None:
        with open(f"{label_file}.json", "r") as file:
            labels = json.load(file)
    

    for file_name in tqdm(files):
        with open(path +"/"+ f"{file_name}.txt", "r") as file:
            c = file.readline()
            while len(c) != 0:
                index1, text, index2 = c.split(' ')
                set_of_edge_attr.add(text)
                c = file.readline()

    list_of_edge_attr = list(set_of_edge_attr)
    dict_edge_attr = dict()
    for i in range(len(list_of_edge_attr)):
        vec = np.zeros(len(list_of_edge_attr))
        vec[i] = 1
        dict_edge_attr[list_of_edge_attr[i]] = vec


    for file_name in tqdm(files):

        # Loading Nodes
        with open(path +'/'+ f"{file_name}.json", "r") as file:
            file_data = json.load(file)
        graph = nx.Graph()

        N_nodes = len(file_data)
        graph.add_nodes_from(list(range(N_nodes)))

        # Loading Nodes attributes
        nodes_attr = [file_data[i]['text'] for i in range(N_nodes)]
        
        #Loading Nodes labels and one hot encoding them
        nodes_labels = np.zeros((N_nodes, 4))
        for i in range(N_nodes):
            nodes_labels[i] = dict_speakers[file_data[i]['speaker']]
        # Embedding Nodes attributes with bert

        # Extracting Edges
        edges_indices = []
        edges_attr = []
        with open(path +"/"+ f"{file_name}.txt", "r") as file:
            c = file.readline()
            while len(c) != 0:
                index1, text, index2 = c.split(' ')
                edges_indices.append((int(index1),int(index2)))

                #nodes_attr[int(index2)] = nodes_attr[int(index1)] +" . "+ nodes_attr[int(index2)]

                edges_attr.append(dict_edge_attr[text])
                edges_attr.append(dict_edge_attr[text])

                # edges_indices.append((int(index2)-1,int(index2)))
                # edges_attr.append(np.zeros((len(dict_edge_attr[text]))))

                c = file.readline()

        nodes_attr = embed_text(nodes_attr)
        
        # Concatenating the attributes
        nodes_attr = np.hstack([nodes_labels,nodes_attr])

        # Embedding edges features

        # Add edges list to graph
        graph.add_edges_from(edges_indices)


        data_loader = from_networkx(graph)
        if label_file != None:
            data_loader.y = torch.tensor(labels[file_name],dtype=torch.float)
        data_loader.x = torch.tensor(nodes_attr,dtype=torch.float)
        data_loader.edge_attr = torch.tensor(edges_attr, dtype=torch.float)
        data.append(data_loader)

    
    print(set_of_edge_attr)
    return data, files

In [5]:
training_data, files = extract_data('training','training_labels')

  0%|          | 0/97 [00:00<?, ?it/s]

100%|██████████| 97/97 [00:00<00:00, 1313.65it/s]
  data_loader.edge_attr = torch.tensor(edges_attr, dtype=torch.float)
100%|██████████| 97/97 [03:37<00:00,  2.24s/it]

{'Result', 'Q-Elab', 'Background', 'Alternation', 'Comment', 'Clarification_question', 'Continuation', 'Narration', 'Contrast', 'Conditional', 'Correction', 'Acknowledgement', 'Parallel', 'Question-answer_pair', 'Elaboration', 'Explanation'}





In [19]:
training_data[0].edge_attr.shape

torch.Size([790, 16])

In [8]:
Total_N = len(files)
print(Total_N)

97


In [6]:
class GCN(torch.nn.Module):
    def __init__(self):
        super(GCN, self).__init__()
        torch.manual_seed(12345)

        self.linear1 = nn.Linear(388,128)#projections

        self.linear2 = nn.Linear(128,128) #transormatio n data

        self.conv1 = GATv2Conv(in_channels=128, out_channels=32, dropout=0.2,heads=4,edge_dim=16, aggr="mean")
        self.conv2 = GATv2Conv(in_channels=32*4, out_channels=128, dropout=0.2,heads=4, edge_dim=16, concat=False,aggr="max")

        self.fc = nn.Linear(128, 16)
        self.classifier = nn.Linear(16, 1)
        self.relu = nn.ReLU()
        self.tanh = nn.Tanh()
        self.dropout = nn.Dropout(0.2)

    def forward(self, x, edge_index, edge_attr):
        x = x.to(torch.float)

        x = self.linear1(x)

        x = self.relu(x)

        x = self.dropout(x)

        h = self.conv1(x, edge_index=edge_index, edge_attr=edge_attr)
        h = self.tanh(h)

        h = self.conv2(h, edge_index=edge_index, edge_attr=edge_attr)
        h = self.tanh(h)

        out = self.fc(h)
        
        out = self.dropout(out)
        out = self.relu(out)
        out = self.classifier(out)


        return out, h

model = GCN().to(device)

print(model.parameters)

<bound method Module.parameters of GCN(
  (linear1): Linear(in_features=388, out_features=128, bias=True)
  (linear2): Linear(in_features=128, out_features=128, bias=True)
  (conv1): GATv2Conv(128, 32, heads=4)
  (conv2): GATv2Conv(128, 128, heads=4)
  (fc): Linear(in_features=128, out_features=16, bias=True)
  (classifier): Linear(in_features=16, out_features=1, bias=True)
  (relu): ReLU()
  (tanh): Tanh()
  (dropout): Dropout(p=0.2, inplace=False)
)>


In [9]:
from tqdm import tqdm
from torch.optim.lr_scheduler import StepLR

# Define the scheduler
model = GCN()
criterion = torch.nn.BCEWithLogitsLoss(pos_weight=torch.Tensor([4]))  #Initialize the CrossEntropyLoss function.

optimizer = torch.optim.Adam(model.parameters(), lr=1e-3, weight_decay=1e-6)  # Initialize the Adam optimizer.
scheduler = StepLR(optimizer, step_size=1, gamma=0.01)

def train(data):
    optimizer.zero_grad()  # Clear gradients.
    out, h = model(data.x, data.edge_index, data.edge_attr)  # Perform a single forward pass.
    loss = criterion(out.reshape((-1)), data.y)  # Compute the loss solely based on the training nodes.
    loss.backward()  # Derive gradients.
    optimizer.step()  # Update parameters based on gradients.
    return loss, h

def test(data):
    y_pred, h = model(data.x, data.edge_index)
    accuracy = np.sum(y_pred>0 != data.y)/len(data.y)

for epoch in tqdm(range(100)):
    tmp_loss = 0
    for i in range(int(Total_N*0.8)):
        loss, h = train(training_data[i])
        tmp_loss+=loss
    scheduler.step()
    print("Train Loss : ",tmp_loss.item()/int(Total_N*0.8))

    accuracy = 0
    count = 0
    loss = 0
    confusion_matrix = np.zeros((2,2))
    for i in range(int(np.floor(Total_N*0.8)),Total_N):
        y_pred, h = model(training_data[i].x, training_data[i].edge_index, training_data[i].edge_attr)
        loss += criterion(y_pred.reshape((-1)),training_data[i].y).sum()
        y_pred = torch.sigmoid(y_pred)
        y_pred = (y_pred>0.5).reshape((-1)).to(torch.int)

        for j in range(len(y_pred)):
            confusion_matrix[int(y_pred[j])][int(training_data[i].y[j])] += 1
        
        #if i == Total_N-1:
        #    print(y_pred)
        #    print(training_data[i].y)
        accuracy += np.count_nonzero((y_pred-training_data[i].y)<0.5)
        count += y_pred.shape[0]
        #print(((y_pred-my_data[i].y)).shape)
        #print("Non zeros = ",((y_pred-my_data[i].y)<0.5).sum())
        #print(count)
    #print("Precision = ", 2*confusion_matrix[1,1]/(2*confusion_matrix[1,1]+confusion_matrix[1,0]+confusion_matrix[0,1]))
    #print(confusion_matrix)
    #print(loss)

    print("Dev Loss = ", loss.item()/(Total_N-int(np.floor(Total_N*0.8))))
    print("F1-Score = ", 2*confusion_matrix[1,1]/(2*confusion_matrix[1,1]+confusion_matrix[1,0]+confusion_matrix[0,1]))
    print(confusion_matrix)

  0%|          | 0/100 [00:00<?, ?it/s]

Train Loss :  1.02944024816736


  1%|          | 1/100 [00:06<11:04,  6.72s/it]

Dev Loss =  0.9854220390319824
F1-Score =  0.3345593243427326
[[6595.  256.]
 [9514. 2456.]]


  1%|          | 1/100 [00:08<13:21,  8.10s/it]


KeyboardInterrupt: 

In [197]:
print(2*confusion_matrix[1,1]/(2*confusion_matrix[1,1]+confusion_matrix[1,0]+confusion_matrix[0,1]))

0.5450488737781556


In [8]:
Epoch: 0, Loss: 0.12241587093990172
Epoch: 10, Loss: 0.07922085104758819
Epoch: 20, Loss: 0.0772609277797719
Epoch: 30, Loss: 0.06915923811600523
Epoch: 40, Loss: 0.06570792215201787
Epoch: 50, Loss: 0.057126596673168176
Epoch: 60, Loss: 0.05551746983318211
Epoch: 70, Loss: 0.05230997337835497
Epoch: 80, Loss: 0.05226752843568713
Epoch: 90, Loss: 0.051357696348241944
Dev Loss =  tensor(0.0001, grad_fn=<DivBackward0>)
Accuracy =  0.9994015161590637
F1 Score =  0.893870789618995
[[1.2141e+04 2.8740e+03]
 [9.0000e+00   1.4000e+01]]

SyntaxError: invalid syntax (4235973778.py, line 1)

In [29]:
test_data,test_files = extract_data('test',None)

100%|██████████| 40/40 [00:00<00:00, 1067.43it/s]
100%|██████████| 40/40 [02:08<00:00,  3.20s/it]

{'Result', 'Correction', 'Explanation', 'Background', 'Q-Elab', 'Contrast', 'Continuation', 'Elaboration', 'Acknowledgement', 'Comment', 'Narration', 'Parallel', 'Conditional', 'Alternation', 'Question-answer_pair', 'Clarification_question'}





In [31]:
test_labels = dict()
for i in range(len(test_files)):
    y_pred, h = model(test_data[i].x, test_data[i].edge_index,test_data[i].edge_attr)
    y_pred = (y_pred>0.5).reshape((-1)).to(torch.int)
    test_labels.__setitem__(test_files[i],y_pred.tolist())
print(test_labels)
with open("test_labels.json", "w") as file:
    json.dump(test_labels, file, indent=4)

{'ES2003a': [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 1, 1, 1, 0, 1, 1, 1, 1, 0, 0, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 0, 1, 1, 1, 0, 0, 0, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 1, 1, 1, 1, 1, 0, 0, 1, 1, 1, 1, 1, 0, 1, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 0, 0, 0, 1, 0, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 1, 1], 'ES2003b': [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1,

In [None]:
data_graphs = extract_graph(data,data_of_edges)


NameError: name 'extract_graph' is not defined