In [None]:
from transformers import AutoTokenizer, AutoModelForCausalLM, Trainer, TrainingArguments, EarlyStoppingCallback, DataCollatorForLanguageModeling
import datasets
import pickle
import json
import random
import numpy as np

In [2]:
import torch

In [3]:
# Experiment identifier; it is concatenated into the 'experiments/EXP<EXP>' paths used below.
EXP = '2'

In [4]:
import os

# Output directory for this experiment's artefacts (datasets, results).
newpath = 'experiments/EXP' + EXP

# exist_ok=True makes the cell safe to re-run and avoids the check-then-create race.
os.makedirs(newpath, exist_ok=True)

# NOTE(review): there is no path separator before 'model', so this creates the
# sibling directory 'experiments/EXP<EXP>model' rather than a 'model' folder inside
# `newpath`. The save/load cells further down build the same string, so the path is
# left unchanged here -- confirm whether 'experiments/EXP<EXP>/model' was intended.
os.makedirs(newpath + 'model', exist_ok=True)

### Preprocessing

In [11]:
import networkx as nx

# Load the graph from GraphML. Later cells read the 'node_name' node attribute and
# the 'display_relation' edge attribute from it.
graph = nx.read_graphml("../graph.graphml")

### Train set

In [8]:
# Artefact produced by experiment 1. Only unpickle files you created yourself:
# pickle.load executes arbitrary code from the file.
m_n_path = 'experiments/EXP1/m_n_values.pkl'
with open(m_n_path, 'rb') as fh:
    m_n_values = pickle.load(fh)

In [9]:
# Train/validation triples saved by experiment 1 (trusted local pickle).
community_triples_path = 'experiments/EXP1/train_val_triples.pkl'
with open(community_triples_path, 'rb') as fh:
    train_val_triples_community = pickle.load(fh)

In [10]:
# Held-out test triples from experiment 1; used below to filter leaking walks.
test_triples_path = 'experiments/EXP1/test_triples.pkl'
with open(test_triples_path, 'rb') as fh:
    test_triples = pickle.load(fh)

In [None]:
# Sample random walks from the graph until 5000 walks of exactly 5 nodes are collected.
train_set = []
path_len = 5

while len(train_set) < 5000:
    # NOTE(review): assumes node ids are the strings '0'..'129374' -- confirm against the graph.
    start = str(random.randint(0, 129374))
    neighbours = graph[start]
    walk = [start]
    degree = len(neighbours)
    step = 1

    # Extend the walk by a uniformly chosen neighbour of the current node.
    # The walk also stops once the step counter exceeds the current node's degree.
    while degree > 0 and step < path_len and step <= degree:
        nxt = list(neighbours)[random.randint(0, degree - 1)]
        neighbours = graph[nxt]
        walk.append(nxt)
        degree = len(neighbours)
        step += 1

    # Walks that stopped early are discarded.
    if len(walk) == 5:
        train_set.append(walk)

In [53]:
# Blank out every walk that touches an entity appearing as head or tail of a test triple.
# The names are collected once into a set so each node costs one lookup instead of a
# scan over all test triples.
# NOTE(review): assumes tt[0] and tt[2] are hashable (e.g. strings) -- confirm.
test_entities = set()
for tt in test_triples:
    test_entities.add(tt[0])
    test_entities.add(tt[2])

for i, walk in enumerate(train_set):
    # Leaking walks are replaced by [] in place; the next cell drops the empty entries.
    if any(graph.nodes[n]['node_name'] in test_entities for n in walk):
        train_set[i] = []

In [56]:
# Keep only the walks that were not blanked out by the leakage filter.
t_set = [walk for walk in train_set if walk != []]

In [None]:
# Number of walks that survived the leakage filter.
len(t_set)

In [58]:
# Shuffle the surviving walks in place.
# NOTE(review): no seed is set before this point in the visible cells, so the order differs between runs.
random.shuffle(t_set)

In [61]:
# Keep the first 1826 shuffled walks as the pool for the train/validation chats.
train_set = t_set[:1826]

In [None]:
# Turn every walk into a multi-turn chat: one question/answer pair per edge of the walk.
train_dataset = []
triples = set()

for walk in train_set:
    # Each conversation opens with the same system instruction.
    chat = [{"role": "system", "content": "You are a chatbot that has to predict the relationship between nodes."}]

    for src, dst in zip(walk, walk[1:]):
        rel = graph[src][dst]['display_relation']
        a = graph.nodes[src]['node_name']
        b = graph.nodes[dst]['node_name']

        # Record the (head, relation, tail) triple covered by this turn.
        triples.add((a, rel, b))

        question = "Which is the relationship between the node '" + a + "' and the node '" + b + "'?"
        chat.append({"role": "user", "content": question})
        chat.append({"role": "assistant", "content": rel})

    # Skip conversations that are exact duplicates of one already stored.
    if chat not in train_dataset:
        train_dataset.append(chat)

In [None]:
# Number of distinct (head, relation, tail) triples covered by the training walks.
len(triples)

In [67]:
# Persist the triples covered by this experiment's train/validation chats.
triples_path = 'experiments/EXP' + EXP + '/train_val_triples.pkl'
with open(triples_path, 'wb') as out_file:
    pickle.dump(triples, out_file)

In [75]:
# `random` is already imported in the first cell; this re-import is harmless.
import random

# Shuffle the chats in place before they are split into validation and training parts.
random.shuffle(train_dataset)

In [None]:
# The first 182 shuffled chats become the validation split; the rest stay for training.
# NOTE: re-running this cell shrinks train_dataset again because the same name is reassigned.
val_dataset, train_dataset = train_dataset[:182], train_dataset[182:]

In [None]:
# Number of training chats after the split.
len(train_dataset)

In [None]:
# Number of validation chats after the split.
len(val_dataset)

In [78]:
# Persist the training chats so the training section can be run from a fresh kernel.
train_path = 'experiments/EXP' + EXP + '/train_dataset.pkl'
with open(train_path, 'wb') as out_file:
    pickle.dump(train_dataset, out_file)

In [79]:
# Persist the validation chats alongside the training chats.
val_path = 'experiments/EXP' + EXP + '/val_dataset.pkl'
with open(val_path, 'wb') as out_file:
    pickle.dump(val_dataset, out_file)

### Training

In [4]:
# Reload the training chats written by the preprocessing section (trusted local pickle).
train_path = 'experiments/EXP' + EXP + '/train_dataset.pkl'
with open(train_path, 'rb') as fh:
    train_dataset = pickle.load(fh)

In [5]:
# Reload the validation chats written by the preprocessing section (trusted local pickle).
val_path = 'experiments/EXP' + EXP + '/val_dataset.pkl'
with open(val_path, 'rb') as fh:
    val_dataset = pickle.load(fh)

In [80]:
# Base checkpoint that is fine-tuned below.
base_checkpoint = "sapienzanlp/Minerva-350m-base-v1.0"

tokenizer = AutoTokenizer.from_pretrained(base_checkpoint)
# Reuse the end-of-sequence token for padding.
tokenizer.pad_token = tokenizer.eos_token

model = AutoModelForCausalLM.from_pretrained(base_checkpoint)

In [None]:
from datasets import Dataset


def _chat_to_token_ids(example):
    """Render one chat with the tokenizer's chat template and return it tokenized."""
    # NOTE(review): apply_chat_template needs a chat template on the tokenizer -- confirm the base checkpoint ships one.
    token_ids = tokenizer.apply_chat_template(example["chat"], tokenize=True, add_generation_prompt=False)
    return {"formatted_chat": token_ids}


# Training split: the tokenized chats are stored in the 'formatted_chat' column.
dataset1 = Dataset.from_dict({"chat": train_dataset}).map(_chat_to_token_ids)

# Validation split, processed the same way.
dataset2 = Dataset.from_dict({"chat": val_dataset}).map(_chat_to_token_ids)

In [None]:
# Fine-tuning configuration: up to 10 epochs, with the best checkpoint (lowest eval_loss)
# restored when training ends.
training_args = TrainingArguments(
    output_dir='outputs',
    num_train_epochs=10,
    per_device_train_batch_size=2,
    # Checkpoint every 250 steps and keep at most 3 checkpoints on disk.
    save_steps=250,
    save_total_limit=3,
    # Best model = lowest eval_loss (greater_is_better=False).
    load_best_model_at_end=True,
    greater_is_better=False,
    do_eval=True,
    metric_for_best_model="eval_loss",
    # NOTE(review): eval_steps is not set; presumably it falls back to logging_steps (250) -- confirm in the TrainingArguments docs.
    eval_strategy="steps",
    logging_steps=250,
    warmup_steps=50,
    prediction_loss_only=True,
    # NOTE(review): fp16 presumably requires a CUDA device -- confirm for the target machine.
    fp16=True
)

# The datasets passed in are the 'formatted_chat' columns, i.e. the tokenized chats
# produced by apply_chat_template(tokenize=True) in the previous cell.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=dataset1['formatted_chat'],
    eval_dataset=dataset2['formatted_chat'],
    # mlm=False selects causal language modelling rather than masked LM.
    data_collator=DataCollatorForLanguageModeling(tokenizer, mlm=False),
    # Early stopping with a patience of 3 evaluations.
    callbacks=[EarlyStoppingCallback(early_stopping_patience=3)]
)

trainer.train()

In [None]:
# NOTE(review): there is no '/' before 'model', so this resolves to 'experiments/EXP<EXP>model'.
# The reload cell in the Metrics section uses the same string, so it is kept as is.
model_dir = 'experiments/EXP' + EXP + 'model'

# Store the fine-tuned weights and the tokenizer side by side.
model.save_pretrained(model_dir)
tokenizer.save_pretrained(model_dir)

### Metrics

In [None]:
def set_seed(seed):
    """Seed the Python, NumPy and PyTorch (CPU and CUDA) RNGs and make cuDNN deterministic.

    Args:
        seed: integer seed passed unchanged to every generator.
    """
    seeders = (random.seed, np.random.seed, torch.manual_seed, torch.cuda.manual_seed_all)
    for seed_fn in seeders:
        seed_fn(seed)

    # Turn off cuDNN autotuning and request deterministic kernels.
    torch.backends.cudnn.benchmark = False
    torch.backends.cudnn.deterministic = True


# Fix the seed used by the sampling-based generation below.
set_seed(42)

In [None]:
# Reload the fine-tuned artefacts from the directory written by the save cell.
# NOTE(review): the path has no '/' before 'model' ('experiments/EXP<EXP>model'), matching the save cell.
finetuned_dir = 'experiments/EXP' + EXP + 'model'

tokenizer = AutoTokenizer.from_pretrained(finetuned_dir)
final_model = AutoModelForCausalLM.from_pretrained(finetuned_dir)

In [None]:
# The test chats come from experiment 1 (trusted local pickle).
test_path = 'experiments/EXP1/test_dataset.pkl'
with open(test_path, 'rb') as fh:
    test_dataset = pickle.load(fh)

In [None]:
# For every test chat, sample 10 completions from the fine-tuned model.
post_ft = []
num_samples = 10

for sample in test_dataset:
    # Non-assistant messages form the prompt; the last assistant message is the target.
    prompt = [m for m in sample if m['role'] != 'assistant']
    ground_truth = ''
    for m in sample:
        if m['role'] == 'assistant':
            ground_truth = m['content']

    inputs = tokenizer.apply_chat_template(prompt, add_generation_prompt=True, return_dict=True, return_tensors="pt")
    inputs = dict(inputs.items())

    # Chats without an assistant answer are skipped.
    if ground_truth == '':
        continue

    # Generation is capped at the tokenized length of the ground-truth answer.
    target_len = len(tokenizer(ground_truth, return_tensors="pt")['input_ids'][0])
    out = final_model.generate(**inputs, max_new_tokens=target_len, do_sample=True, num_return_sequences=num_samples)

    # Decode only the newly generated tokens by slicing off the prompt.
    prompt_len = len(inputs["input_ids"][0])
    generations = [tokenizer.decode(out[j][prompt_len:]) for j in range(num_samples)]

    post_ft.append((sample, generations))

In [None]:
# Persist the raw (chat, generations) pairs for later analysis.
results_path = 'experiments/EXP' + EXP + '/results_postft.pkl'
with open(results_path, 'wb') as out_file:
    pickle.dump(post_ft, out_file)

### Results

In [91]:
# Strip surrounding whitespace from every generation, keeping it paired with its chat.
clean = [(d, [pred.strip() for pred in g]) for d, g in post_ft]

In [None]:
# Mean reciprocal rank of the ground-truth relation among the generations of each chat.
# NOTE(review): assumes the ground truth is the third message (d[2]) of every test chat -- confirm.
# NOTE(review): the generations are sampled, so "rank" here is the sampling order -- confirm this is the intended ranking.
mrr_list = []
for d, g in clean:
    c = d[2]['content']
    if c in g:
        # list.index is 0-based while ranks start at 1. Without the +1 a correct
        # first generation divides by zero and every other rank is off by one.
        mrr_list.append(1 / (g.index(c) + 1))
    else:
        # The answer was never generated, so the reciprocal rank is 0.
        mrr_list.append(0.0)

# Guard against an empty result list instead of raising ZeroDivisionError.
mrr_mean = sum(mrr_list) / len(mrr_list) if mrr_list else 0.0
mrr_mean

In [None]:
def hits_at_k(results, k):
    """Return one 0/1 flag per result: 1 when the ground truth is among the first k generations.

    Args:
        results: iterable of (chat, generations) pairs.
        k: number of leading generations to inspect.

    Returns:
        A list of ints (1 for a hit, 0 for a miss), in the order of `results`.
    """
    # NOTE(review): assumes the ground truth is the third message (d[2]) of every chat -- confirm.
    return [1 if d[2]['content'] in g[:k] else 0 for d, g in results]


# The same computation at every cutoff; the per-cutoff names are kept for later cells.
hit1_list = hits_at_k(clean, 1)
hit1_mean = sum(hit1_list) / len(hit1_list)

hit3_list = hits_at_k(clean, 3)
hit3_mean = sum(hit3_list) / len(hit3_list)

hit5_list = hits_at_k(clean, 5)
hit5_mean = sum(hit5_list) / len(hit5_list)

hit7_list = hits_at_k(clean, 7)
hit7_mean = sum(hit7_list) / len(hit7_list)

# Each chat has 10 generations, so a cutoff of 10 inspects all of them.
hit10_list = hits_at_k(clean, 10)
hit10_mean = sum(hit10_list) / len(hit10_list)

print("Hit@1 = " + str(hit1_mean))
print("Hit@3 = " + str(hit3_mean))
print("Hit@5 = " + str(hit5_mean))
print("Hit@7 = " + str(hit7_mean))
print("Hit@10 = " + str(hit10_mean))