In [None]:
from transformers import AutoTokenizer, AutoModelForCausalLM, Trainer, TrainingArguments, EarlyStoppingCallback, DataCollatorForLanguageModeling
import datasets
import pickle
import json
import pandas as pd
import random
import numpy as np
import torch

In [None]:
# Experiment identifier; it is concatenated into every 'experiments/EXP...' path used below.
EXP = '1'

In [None]:
import os

# Root folder holding every artefact of this experiment.
newpath = 'experiments/EXP' + EXP

# exist_ok=True replaces the check-then-create pattern and is safe to re-run.
os.makedirs(newpath, exist_ok=True)

# NOTE(review): there is no path separator before 'model', so this is the
# sibling folder 'experiments/EXP<id>model', not a sub-folder of newpath.
# The save/load cells further down use the same string, so it is kept as is.
os.makedirs(newpath + 'model', exist_ok=True)

In [None]:
torch.cuda.is_available()

### Graph

In [None]:
import networkx as nx

# Load the knowledge graph; "GRAPH_PATH.graphml" is a placeholder to be replaced with the real file.
graph = nx.read_graphml("GRAPH_PATH.graphml")

### Test set

In [None]:
test_triples = set()

In [None]:
random.seed(42)

# random.sample() needs a sequence: passing the NodeView directly raises
# TypeError on Python >= 3.11 (and was deprecated before). Materialise the
# nodes once; the node set does not change inside this loop.
node_pool = list(graph.nodes())

# Number of negative ('no relation') test triples, drawn between 29 and 49.
r = random.randint(29,49)
while len(test_triples) < r:
    nodes = random.sample(node_pool, k=2)
    # Keep the pair only if the two nodes are not linked in either direction.
    if not graph.has_edge(nodes[0], nodes[1]) and not graph.has_edge(nodes[1], nodes[0]):
        t = (nodes[0], 'no relation', nodes[1])
        test_triples.add(t)

In [None]:
len(test_triples)

In [None]:
edges = sorted(graph.edges(data=True))

In [None]:
# Total number of test triples (negative + positive) to collect.
TEST_SET_SIZE = 500

edges_to_remove = set()
cnt = 0

# Feasibility check: the sampling loop below draws with replacement and only
# stops once enough *distinct* triples exist, so it would spin forever on a
# graph with too few edges. Scan just far enough to know it can terminate.
still_needed = TEST_SET_SIZE - len(test_triples)
fresh_triples = set()
for src, dst, attrs in edges:
    if len(fresh_triples) >= still_needed:
        break
    candidate = (src, attrs['relation'], dst)
    if candidate not in test_triples:
        fresh_triples.add(candidate)
if len(fresh_triples) < still_needed:
    raise ValueError(
        "graph has too few distinct edges to build a test set of "
        + str(TEST_SET_SIZE) + " triples"
    )

while len(test_triples) < TEST_SET_SIZE:
    # Same RNG call as before so the sampled test set is unchanged for a given seed.
    node1, node2, attrs = random.sample(edges, k=1)[0]
    test_triples.add((node1, attrs['relation'], node2))
    # Held-out edges are removed from the graph so they cannot leak into training walks.
    if graph.has_edge(node1, node2):
        graph.remove_edge(node1, node2)
        cnt += 1
    edges_to_remove.add((node1, node2))

# Number of edges actually removed from the graph.
cnt

In [None]:
len(test_triples)

In [None]:
with open('experiments/EXP' + EXP + '/test_triples.pkl', 'wb') as file1:
    pickle.dump(test_triples, file1)

In [None]:
with open('experiments/EXP' + EXP + '/edges_to_remove.pkl', 'wb') as file1:
    pickle.dump(edges_to_remove, file1)

In [None]:
test_dataset = []
ground_truth = []

for head, relation, tail in test_triples:
    # One conversation per triple: system prompt, question, expected answer.
    chat = [
        {"role": "system", "content": "You are a chatbot that has to predict the relationship between nodes."},
        {"role": "user", "content": "Which is the relationship between the node '" + head + "' and the node '" + tail + "'?"},
        {"role": "assistant", "content": relation},
    ]
    # Both lists hold the very same three-message chat object. The generation
    # and scoring cells read the assistant turn back out of test_dataset
    # (they look for role == 'assistant' and index message 2), so the answer
    # must stay in there.
    test_dataset.append(chat)
    ground_truth.append(chat)

In [None]:
with open('experiments/EXP' + EXP + '/test_dataset.pkl', 'wb') as file1:
    pickle.dump(test_dataset, file1)

In [None]:
with open('experiments/EXP' + EXP + '/ground_truth.pkl', 'wb') as file1:
    pickle.dump(ground_truth, file1)

### Train set

In [None]:
# Reload the held-out edge pairs written by the test-set section.
# pickle.load executes arbitrary code from the file: only load files produced by this notebook.
with open('experiments/EXP' + EXP + '/edges_to_remove.pkl', 'rb') as file:
    edges_to_remove = pickle.load(file)

In [None]:
# Make sure none of the held-out edges is still present in the graph
# (needed when the graph was reloaded after the test set was built).
cnt = 0
for src, dst in edges_to_remove:
    if not graph.has_edge(src, dst):
        continue
    graph.remove_edge(src, dst)
    cnt += 1

# Number of edges removed by this cell.
cnt

In [None]:
graph.number_of_edges()

In [None]:
# (node, degree centrality) pairs ordered from the most to the least central node.
deg = sorted(
    nx.degree_centrality(graph).items(),
    key=lambda node_score: node_score[1],
    reverse=True,
)

In [None]:
def jaccard_similarity(list1, list2):
    """Return the Jaccard similarity |A ∩ B| / |A ∪ B| of two collections.

    Duplicates are ignored because both inputs are converted to sets.

    Args:
        list1: first iterable of hashable items.
        list2: second iterable of hashable items.

    Returns:
        A float in [0, 1]. When both inputs are empty the union is empty and
        0.0 is returned instead of raising ZeroDivisionError.
    """
    set1 = set(list1)
    set2 = set(list2)

    union = len(set1 | set2)
    if union == 0:
        return 0.0

    return len(set1 & set2) / union

In [None]:
# Radius of the ego graph extracted around each candidate node.
RADIUS_EGO = 2
# An ego graph is kept only if its node set overlaps the last kept one by less than this Jaccard value.
MAX_EGO_OVERLAP = 0.1
WALK_SEED = 42

train_set = []       # one list of random walks per selected ego graph
m_n_values = []      # (walk length m, number of walks n) per selected ego graph
ego_nodes_tot = []   # centre node of each selected ego graph

num_chat = 0         # total number of walks generated

random.seed(WALK_SEED)
# nx.generate_random_paths draws from NumPy, not from the `random` module, so
# random.seed() alone does not make the walks reproducible. A single shared
# RandomState gives reproducible yet different walks for every ego graph.
path_rng = np.random.RandomState(WALK_SEED)

# Node list of the most recently selected ego graph (None until the first selection).
ego1_nodes = None

# Visit nodes from the most to the least central.
for node, _centrality in deg:
    ego = nx.ego_graph(graph, node, radius=RADIUS_EGO)
    ego_nodes = list(ego.nodes())

    # The first usable ego graph is always kept; later ones must be
    # sufficiently different from the last one kept.
    if ego1_nodes is not None and jaccard_similarity(ego1_nodes, ego_nodes) >= MAX_EGO_OVERLAP:
        continue

    m = int(nx.diameter(ego))
    if m < 1:
        # Isolated node: the ego graph is a single node with diameter 0, which
        # previously caused a ZeroDivisionError and cannot yield any walk.
        continue
    n = int(ego.number_of_nodes() / m)

    paths = list(nx.generate_random_paths(ego, n, path_length=m, seed=path_rng))
    num_chat += len(paths)

    # The three lists are always appended together so they stay index-aligned.
    train_set.append(paths)
    m_n_values.append((m, n))
    ego_nodes_tot.append(node)

    ego1_nodes = ego_nodes

In [None]:
len(train_set)

In [None]:
with open('experiments/EXP' + EXP + '/train_set.pkl', 'wb') as file1:
    pickle.dump(train_set, file1)

In [None]:
with open('experiments/EXP' + EXP + '/ego_nodes_tot.pkl', 'wb') as file1:
    pickle.dump(ego_nodes_tot, file1)

In [None]:
# Edge attribute that stores the relation label:
# 'relation' for WN18RR and YAGO3, 'display_relation' for PrimeKG.
RELATION_ATTR = 'relation'

random.seed(42)

for ego in train_set:
    for rp in ego:
        if len(rp) <= 2:
            continue

        # relation_types[k] is the relation on the edge rp[k] -> rp[k+1].
        relation_types = [
            graph.edges[rp[k], rp[k + 1]][RELATION_ATTR]
            for k in range(len(rp) - 1)
        ]

        # Collect non-adjacent pairs of the walk that are not directly linked
        # and whose connecting sub-path mixes at least two relation types.
        no_rel = []
        for i in range(len(rp) - 2):
            for j in range(i + 2, len(rp)):
                # The sub-path from rp[i] to rp[j] uses edges i .. j-1, i.e.
                # the slice [i:j]. The former slice [i:j-1] dropped the last
                # edge, so two-hop pairs could never qualify.
                if rp[i] != rp[j] and not graph.has_edge(rp[i], rp[j]) and len(set(relation_types[i:j])) > 1:
                    no_rel.append((rp[i], rp[j]))

        # Scatter the (node, node) tuples at random positions inside the walk.
        for nr in no_rel:
            random_position = random.randint(0, len(rp))
            rp.insert(random_position, nr)

In [None]:
with open('experiments/EXP' + EXP + '/m_n_values.pkl', 'wb') as f:
    pickle.dump(m_n_values, f)

In [None]:
# Graph schema (must match the attribute used to read relations from the graph):
#   WN18RR / YAGO3: RELATION_ATTR = 'relation',         NODE_NAME_ATTR = None
#   PrimeKG:        RELATION_ATTR = 'display_relation', NODE_NAME_ATTR = 'node_name'
RELATION_ATTR = 'relation'
NODE_NAME_ATTR = None

train_dataset = []
triples = set()

def create_message(role, content):
    """Return one chat message as a {'role', 'content'} dict."""
    return {"role": role, "content": content}

def _node_label(node):
    """Return the text shown for a node: its id, or its NODE_NAME_ATTR attribute when set."""
    if NODE_NAME_ATTR is None:
        return node
    return graph.nodes[node][NODE_NAME_ATTR]

def _next_node_index(walk, start):
    """Return the index of the first node (str) at or after `start`, skipping tuples; None if absent."""
    for idx in range(start, len(walk)):
        if isinstance(walk[idx], str):
            return idx
    return None

# Hashable fingerprints of chats already stored, so duplicates are detected
# in constant time instead of scanning train_dataset for every chat.
seen_chats = set()

for walks in train_set:
    for walk in walks:
        chat = [create_message("system", "You are a chatbot that has to predict the relationship between nodes.")]

        for i, elem in enumerate(walk):
            if isinstance(elem, tuple):
                # Injected (node, node) pair: an explicit negative example.
                a = _node_label(elem[0])
                b = _node_label(elem[1])
                rel = 'no relation'
            else:
                # Walk node: pair it with the next node of the walk.
                j = _next_node_index(walk, i + 1)
                if j is None:
                    # Last node of the walk, nothing left to pair it with.
                    continue
                a = _node_label(elem)
                b = _node_label(walk[j])
                rel = graph[elem][walk[j]][RELATION_ATTR]

            triples.add((a, rel, b))
            user_message = create_message("user", f"Which is the relationship between the node '{a}' and the node '{b}'?")
            assistant_message = create_message("assistant", rel)
            chat.extend([user_message, assistant_message])

        fingerprint = tuple((msg["role"], msg["content"]) for msg in chat)
        # Drop chats that only contain the system prompt, and exact duplicates.
        if len(chat) > 1 and fingerprint not in seen_chats:
            seen_chats.add(fingerprint)
            train_dataset.append(chat)

In [None]:
with open('experiments/EXP' + EXP + '/train_val_triples.pkl', 'wb') as file1:
    pickle.dump(triples, file1)

In [None]:
import random

random.shuffle(train_dataset)

In [None]:
# Hold out the first tenth of the (already shuffled) chats for validation.
l = len(train_dataset) // 10
val_dataset, train_dataset = train_dataset[:l], train_dataset[l:]

In [None]:
with open('experiments/EXP' + EXP + '/train_dataset.pkl', 'wb') as file1:
    pickle.dump(train_dataset, file1)

In [None]:
with open('experiments/EXP' + EXP + '/val_dataset.pkl', 'wb') as file2:
    pickle.dump(val_dataset, file2)

### Model and dataset loading

In [None]:
import pickle

with open('experiments/EXP' + EXP + '/train_dataset.pkl', 'rb') as file:
    train_dataset = pickle.load(file)

In [None]:
with open('experiments/EXP' + EXP + '/val_dataset.pkl', 'rb') as file:
    val_dataset = pickle.load(file)

In [None]:
# Drop chats without at least one question/answer pair after the system
# prompt. The list is rebuilt rather than deleted from while iterating:
# `del` inside `enumerate` shifts the remaining items and skips the element
# that follows every deletion.
for t in train_dataset:
    if len(t) < 3:
        print(t)
train_dataset = [t for t in train_dataset if len(t) >= 3]

In [None]:
# Drop chats without at least one question/answer pair after the system
# prompt. The list is rebuilt rather than deleted from while iterating:
# `del` inside `enumerate` shifts the remaining items and skips the element
# that follows every deletion.
for t in val_dataset:
    if len(t) < 3:
        print(t)
val_dataset = [t for t in val_dataset if len(t) >= 3]

In [None]:
tokenizer = AutoTokenizer.from_pretrained("sapienzanlp/Minerva-350m-base-v1.0")
model = AutoModelForCausalLM.from_pretrained("sapienzanlp/Minerva-350m-base-v1.0")
# The EOS token is reused as the padding token.
tokenizer.pad_token = tokenizer.eos_token
# ChatML-style template. It is assembled with implicit string concatenation:
# the previous backslash line continuations embedded the indentation of the
# following source lines in the template, so every rendered message (and the
# generation prompt) was preceded by a long run of spaces.
tokenizer.chat_template = (
    "{% if not add_generation_prompt is defined %}{% set add_generation_prompt = false %}{% endif %}"
    "{% for message in messages %}"
    "{{'<|im_start|>' + message['role'] + '\n' + message['content'] + '<|im_end|>' + '\n'}}"
    "{% endfor %}"
    "{% if add_generation_prompt %}"
    "{{ '<|im_start|>assistant\n' }}"
    "{% endif %}"
)

In [None]:
from datasets import Dataset

def _encode_chat(example):
    """Render one chat with the tokenizer's chat template and return its token ids."""
    token_ids = tokenizer.apply_chat_template(example["chat"], tokenize=True, add_generation_prompt=False)
    return {"formatted_chat": token_ids}

# dataset1 = training chats, dataset2 = validation chats.
dataset1 = Dataset.from_dict({"chat": train_dataset}).map(_encode_chat)
dataset2 = Dataset.from_dict({"chat": val_dataset}).map(_encode_chat)

### Training

In [None]:
training_args = TrainingArguments(
    output_dir='outputs',
    num_train_epochs=10,
    per_device_train_batch_size=2,
    # Checkpointing and evaluation share the same step interval, which
    # load_best_model_at_end requires.
    save_steps=250,
    save_total_limit=3,
    load_best_model_at_end=True,
    greater_is_better=False,
    do_eval=True,
    metric_for_best_model="eval_loss",
    eval_strategy="steps",
    # Explicit instead of relying on the fallback to logging_steps.
    eval_steps=250,
    logging_steps=250,
    warmup_steps=50,
    prediction_loss_only=True,
    # Mixed precision needs a CUDA device; fall back to full precision on CPU
    # instead of failing when the arguments are validated.
    fp16=torch.cuda.is_available(),
    report_to="none"
)

trainer = Trainer(
    model=model,
    args=training_args,
    # Each example is the list of token ids produced by the chat template.
    train_dataset=dataset1['formatted_chat'],
    eval_dataset=dataset2['formatted_chat'],
    # Causal-LM collator (mlm=False): pads the batch and derives labels from the inputs.
    data_collator=DataCollatorForLanguageModeling(tokenizer, mlm=False),
    # Stop after 3 evaluations without an improvement of eval_loss.
    callbacks=[EarlyStoppingCallback(early_stopping_patience=3)]
)

trainer.train()

In [None]:
# Persist the fine-tuned weights and the tokenizer in the same folder.
# NOTE(review): no separator before 'model', so the target is 'experiments/EXP<id>model'.
model.save_pretrained('experiments/EXP' + EXP + 'model')
tokenizer.save_pretrained('experiments/EXP' + EXP + 'model')

### Metrics

In [None]:
def set_seed(seed):
    """Seed Python, NumPy and PyTorch (CPU and all CUDA devices) and make cuDNN deterministic.

    Args:
        seed: integer seed applied to every random number generator.
    """
    seeders = (
        random.seed,
        np.random.seed,
        torch.manual_seed,
        torch.cuda.manual_seed_all,
    )
    for apply_seed in seeders:
        apply_seed(seed)

    # Deterministic kernels, with cuDNN auto-tuning switched off.
    cudnn = torch.backends.cudnn
    cudnn.deterministic = True
    cudnn.benchmark = False

set_seed(42)

In [None]:
# Reload the tokenizer and the model from the folder written by the save cell above.
tokenizer = AutoTokenizer.from_pretrained('experiments/EXP' + EXP + 'model')
final_model = AutoModelForCausalLM.from_pretrained('experiments/EXP' + EXP + 'model')

In [None]:
with open('experiments/EXP' + EXP + '/test_dataset.pkl', 'rb') as file:
    test_dataset = pickle.load(file)

In [None]:
# Number of sampled answers per question (the metrics below go up to Hit@10).
NUM_GENERATIONS = 10

post_ft = []  # list of (chat, [generated answers]) pairs

# Tensors must live on the same device as the model weights.
model_device = final_model.device

for chat in test_dataset:
    # Split the stored chat into the prompt turns and the expected answer.
    prompt = [msg for msg in chat if msg['role'] != 'assistant']
    answers = [msg['content'] for msg in chat if msg['role'] == 'assistant']
    expected = answers[-1] if answers else ''
    if expected == '':
        # Nothing to compare against: skip this chat.
        continue

    inputs = tokenizer.apply_chat_template(prompt, add_generation_prompt=True, return_dict=True, return_tensors="pt")
    inputs = {k: v.to(model_device) for k, v in inputs.items()}
    prompt_length = inputs["input_ids"].shape[1]

    # Generate at most as many new tokens as the tokenised expected answer has.
    tok = tokenizer(expected, return_tensors="pt")
    out = final_model.generate(
        **inputs,
        max_new_tokens=len(tok['input_ids'][0]),
        do_sample=True,
        num_return_sequences=NUM_GENERATIONS,
    )

    # Keep only the continuation, i.e. drop the prompt tokens.
    generations = [tokenizer.decode(sequence[prompt_length:]) for sequence in out]
    post_ft.append((chat, generations))

In [None]:
with open('experiments/EXP' + EXP + '/results_postft.pkl', 'wb') as file:
    pickle.dump(post_ft, file)

### Results

In [None]:
import pickle

with open('experiments/EXP' + EXP + '/results_postft.pkl', 'rb') as file:
    post_ft = pickle.load(file)

In [None]:
# Strip every generation and cut it to the length of the stripped expected
# answer (the assistant message at index 2 of the chat).
clean = []
for chat, predictions in post_ft:
    limit = len(chat[2]['content'].strip())
    trimmed = [text.strip()[:limit] for text in predictions]
    clean.append((chat, trimmed))

In [None]:
# Reciprocal rank of the expected answer among the generations of each chat.
mrr_list = []
for d, g in clean:
    c = d[2]['content'].strip()
    if c in g:
        mrr_list.append(1 / (g.index(c) + 1))
    else:
        # A miss contributes 0. It used to be scored 1/10, i.e. exactly like
        # a correct answer found at rank 10, which inflated the mean.
        mrr_list.append(0.0)

# Mean reciprocal rank (0.0 when there is nothing to score).
mrr_mean = sum(mrr_list) / len(mrr_list) if mrr_list else 0.0
mrr_mean

In [None]:
def _hit_flags(results, k):
    """Return one 0/1 flag per chat: 1 when the expected answer is among the first k generations."""
    return [
        1 if chat[2]['content'].strip() in generations[:k] else 0
        for chat, generations in results
    ]

def _mean(flags):
    """Return the arithmetic mean of `flags`, or 0.0 for an empty list."""
    return sum(flags) / len(flags) if flags else 0.0

hit1_list = _hit_flags(clean, 1)
hit1_mean = _mean(hit1_list)

hit3_list = _hit_flags(clean, 3)
hit3_mean = _mean(hit3_list)

hit5_list = _hit_flags(clean, 5)
hit5_mean = _mean(hit5_list)

hit7_list = _hit_flags(clean, 7)
hit7_mean = _mean(hit7_list)

# Ten answers are generated per chat, so Hit@10 looks at all of them.
hit10_list = _hit_flags(clean, 10)
hit10_mean = _mean(hit10_list)

print("Hit@1 = " + str(hit1_mean))
print("Hit@3 = " + str(hit3_mean))
print("Hit@5 = " + str(hit5_mean))
print("Hit@7 = " + str(hit7_mean))
print("Hit@10 = " + str(hit10_mean))

In [None]:
from sklearn.metrics import precision_recall_fscore_support
import re

# Uncomment for WN18RR graph
# class_labels = [
#     'derivationally related form', 'has part', 'hyponym', 'hypernym', 'instance hypernym',
#     'instance hyponym', 'member holonym', 'member meronym', 'member of domain topic',
#     'no relation', 'part of', 'synset domain topic of'
# ]

# Uncomment for YAGO3 graph
# class_labels = [
#     'actedIn', 'created', 'diedIn', 'directed', 'graduatedFrom', 'happenedIn', 'hasChild',
#     'hasGender', 'hasMusicalRole', 'hasWonPrize', 'influences', 'isAffiliatedTo', 'isConnectedTo',
#     'isLocatedIn', 'no relation', 'participatedIn', 'playsFor', 'wasBornIn', 'wroteMusicFor'
# ]

# Uncomment for PrimeKG graph
# class_labels = [
#     'associated with', 'contraindication', 'enzyme', 'expression absent', 'expression present',
#     'indication', 'interacts with', 'no relation', 'off-label use', 'parent-child',
#     'phenotype absent', 'phenotype present', 'ppi', 'side effect',
#     'synergistic interaction', 'target', 'transporter'
# ]

# class_labels must be defined by uncommenting one of the lists above.

# Try longer labels first: with first-match search in list order, a
# prediction such as "instance hypernym" would match the shorter label
# "hypernym" and be counted for the wrong class.
labels_longest_first = sorted(class_labels, key=len, reverse=True)

y_true = []
y_pred = []

for d, g in clean:
    # Stripped like in the ranking metrics so it compares equal to class_labels.
    true_label = d[2]['content'].strip()
    # Only the first generation is scored here.
    prediction_text = g[0]

    predicted_label = None
    for label in labels_longest_first:
        if re.search(r'\b' + re.escape(label) + r'\b', prediction_text):
            predicted_label = label
            break

    predicted_label = predicted_label if predicted_label else "unknown"

    y_true.append(true_label)
    y_pred.append(predicted_label)

# Predictions that contain no known label are left out of the scores.
filtered_y_true = []
filtered_y_pred = []
for t, p in zip(y_true, y_pred):
    if p != "unknown":
        filtered_y_true.append(t)
        filtered_y_pred.append(p)

precision, recall, f1, support = precision_recall_fscore_support(
    filtered_y_true, filtered_y_pred, labels=class_labels, average=None
)

for i, label in enumerate(class_labels):
    print(f"Classe: {label}")
    print(f"  Precision: {precision[i]:.2f}")
    print(f"  Recall:    {recall[i]:.2f}")
    print(f"  F1-score:  {f1[i]:.2f}")
    print(f"  Support:   {support[i]}")

In [None]:
from sklearn.metrics import precision_recall_fscore_support

# Support-weighted averages over the predictions that matched a known label.
weighted_scores = precision_recall_fscore_support(
    filtered_y_true, filtered_y_pred, average='weighted'
)
precision_weighted, recall_weighted, f1_weighted = weighted_scores[:3]

for prefix, score in (
    ("Weighted Precision: ", precision_weighted),
    ("Weighted Recall:    ", recall_weighted),
    ("Weighted F1-score:  ", f1_weighted),
):
    print(f"{prefix}{score:.2f}")