In [1]:
import pandas as pd
import numpy as np
import ast

# Module-level dataset shared by the cells below: `evaluate` filters this frame
# on its 'doi' column and reads the task column of each remaining row.
# NOTE(review): unpickling can execute arbitrary code — only load a trusted 'df.pkl'.
df = pd.read_pickle('df.pkl')

In [33]:
def recupera_list_ss(task, stage, fold, model, emb, ft):
    """Load the stored predictions of one model for a task/stage/fold/embedding.

    The file ``{task}_{stage}_{fold}_{emb}`` is read from ``finetuning_ss/``
    when ``ft == '-FT'`` and from ``pre-trained_ss/`` for any other value.
    The first line containing the prefix ``"<model>: "`` is selected and the
    text after that prefix is parsed with ``ast.literal_eval``.

    Args:
        task: Task name used in the file name.
        stage: Stage label used in the file name.
        fold: Fold identifier used in the file name.
        model: Model name; must be a ``str`` because it builds the line prefix.
        emb: Embedding model name used in the file name.
        ft: ``'-FT'`` selects the fine-tuning directory; anything else selects
            the pre-trained directory.

    Returns:
        The Python literal stored after the prefix (``evaluate`` indexes it by
        row position, so presumably one prediction list per test row — TODO
        confirm), or ``None`` when no line carries the prefix.

    Raises:
        FileNotFoundError: If the predictions file does not exist.
        ValueError, SyntaxError: If the stored text is not a valid literal.
    """
    directory = "finetuning_ss" if ft == '-FT' else "pre-trained_ss"
    prefix = model + ': '
    with open(f"{directory}/{task}_{stage}_{fold}_{emb}", 'r', encoding='utf-8') as f:
        for line in f:
            # Test for the full "<model>: " prefix (not just the model name) so
            # a line that merely mentions the model cannot cause an IndexError.
            if prefix in line:
                # maxsplit=1 keeps the payload intact even if the prefix text
                # happens to occur again inside the stored predictions.
                payload = line.split(prefix, 1)[1]
                return ast.literal_eval(payload)
    # No matching line: keep the historical "not found" result.
    return None

In [34]:
def hits_at(k, true, preds):
    """Return the fraction of items in ``true`` found among the first ``k`` of ``preds``.

    Each item of ``true`` contributes 1 when it occurs in ``preds[:k]`` and 0
    otherwise; the result is the ``np.mean`` of those flags.
    """
    flags = []
    for target in true:
        found = target in preds[:k]
        flags.append(int(found))
    return np.mean(flags)

def evaluate(task, fold, estagio, model, emb, ft='-FT'):
    """Score one model/embedding on one task, fold and stage, and log the result.

    Reads the test DOIs from ``splits/test_doi_{task}_{fold}_{estagio}.csv``
    (column ``node``), keeps the rows of the module-level ``df`` whose ``doi``
    is listed there, and computes Hits@k per row with the task-specific ``k``.
    The mean over rows is appended as one line to
    ``./results_ss/resultados{ft}-SS-{emb}-{model}.txt``.

    Predictions are matched to rows by position: the i-th entry returned by
    ``recupera_list_ss`` is scored against the i-th filtered row, in ``df``
    order. NOTE(review): assumes the predictions were stored in that same
    order — confirm against the code that produced them.
    """
    test_dois = pd.read_csv(f'splits/test_doi_{task}_{fold}_{estagio}.csv')['node'].tolist()
    test_rows = df[df['doi'].isin(test_dois)]
    # Cut-off k used for Hits@k, per task.
    k_dict = {"name": 50, "bioActivity": 5, "collectionSpecie": 50, "collectionType": 1, "collectionSite": 20}
    preds = recupera_list_ss(task, estagio, fold, model, emb, ft)
    scores = [
        hits_at(k_dict[task], row[task], preds[position])
        for position, (_, row) in enumerate(test_rows.iterrows())
    ]
    result = f"Tarefa: {task} | Estagio: {estagio} | Fold: {fold} | Hits@{k_dict[task]}: {np.mean(scores)}\n"
    output_path = ''.join(['./results_ss/resultados', ft, '-SS-', emb, '-', str(model), '.txt'])
    # Append mode: every call adds a line, so re-running accumulates entries.
    with open(output_path, 'a', encoding='utf-8') as out:
        out.write(result)

In [35]:
# Experiment grid: folds '0'..'9', five tasks, three models, five embedding models.
folds = [str(n) for n in range(10)]
tarefas = ['collectionType', 'collectionSite', 'bioActivity', 'collectionSpecie', 'name']
models = ['qwen14b', 'phi14b', 'llama8b']
embedding_models = [
    'all-MiniLM-L6-v2',
    'Qwen3-Embedding-0.6B',
    'bge-m3',
    'paraphrase-multilingual-MiniLM-L12-v2',
    'multilingual-e5-large',
]

# Evaluate every combination with ft='' (the pre-trained predictions).
# Nesting order is model > task > fold > stage > embedding; it determines the
# order in which lines are appended to the result files.
for model, tarefa, fold, estagio, emb in (
    (m, t, f, s, e)
    for m in models
    for t in tarefas
    for f in folds
    for s in ['1st', '2nd', '3rd', '4th']
    for e in embedding_models
):
    evaluate(tarefa, fold, estagio, model, emb, ft='')

In [49]:
from collections import defaultdict
import re

def parse_line(line):
    """Parse one result line of the form written by ``evaluate``.

    Expected shape (surrounding whitespace is stripped first)::

        Tarefa: <task> | Estagio: <stage> | Fold: <int> | Hits@<k>: <value>

    Args:
        line: A single line of text from a results file.

    Returns:
        ``(tarefa, estagio, fold, metrica, valor)`` with ``fold`` as ``int``
        and ``valor`` as ``float``, or ``None`` when the line does not match
        (this includes lines whose value is ``nan``).
    """
    # The value accepts an optional exponent: floats below 1e-4 are written in
    # scientific notation (e.g. "5e-05"), and because re.match is not anchored
    # at the end, a digits-only pattern would capture just "5" and yield 5.0.
    pattern = (
        r"Tarefa: (.*?) \| Estagio: (.*?) \| Fold: (\d+) \| (Hits@\d+): "
        r"([0-9.]+(?:[eE][-+]?[0-9]+)?)"
    )
    match = re.match(pattern, line.strip())
    if match is None:
        return None
    tarefa, estagio, fold, metrica, valor = match.groups()
    return tarefa, estagio, int(fold), metrica, float(valor)

def process_file(file_path):
    """Aggregate a results file into mean scores per task and stage.

    Every line accepted by ``parse_line`` is grouped by
    ``(tarefa, estagio, metrica)``; the fold number is ignored.

    Returns:
        A pair ``(resumo, r2)``:

        * ``resumo`` — list of ``(tarefa, estagio, metrica, n_values, mean)``
          tuples, sorted by the grouping key.
        * ``r2`` — dict mapping ``(tarefa, estagio)`` to the mean rounded to
          two decimals. The metric is not part of this key, so if one pair has
          several metrics the last one in sorted order wins.
    """
    grouped = {}
    with open(file_path, "r", encoding="utf-8") as handle:
        for raw_line in handle:
            record = parse_line(raw_line)
            if not record:
                continue
            tarefa, estagio, _fold, metrica, valor = record
            grouped.setdefault((tarefa, estagio, metrica), []).append(valor)

    # Build the summary
    resumo = []
    r2 = {}
    for key in sorted(grouped):
        tarefa, estagio, metrica = key
        valores = grouped[key]
        media = sum(valores) / len(valores)
        resumo.append((tarefa, estagio, metrica, len(valores), media))
        r2[(tarefa, estagio)] = round(media, 2)

    return resumo, r2

def imprimir_resumo(resumo):
    """Print the summary as a fixed-width text table.

    ``resumo`` is an iterable of ``(tarefa, estagio, metrica, num_folds, media)``
    tuples; the mean is shown with four decimals. A 65-character dashed rule
    is printed after the header and after the last row.
    """
    rule = "-" * 65
    print(f"{'Tarefa':<20} {'Estagio':<10} {'Metrica':<10} {'Folds':<6} {'Media':<10}")
    print(rule)
    for entry in resumo:
        tarefa, estagio, metrica, num_folds, media = entry
        print(f"{tarefa:<20} {estagio:<10} {metrica:<10} {num_folds:<6} {media:<10.4f}")
    print(rule)


# Report the '-FT' result files: for each model/embedding pair print one row
# per task with one "& value" cell per stage (leading zero stripped, two
# decimals) — presumably for pasting into a LaTeX table.
# NOTE(review): the evaluation cell in this notebook calls evaluate with ft='',
# which writes 'resultados-SS-…' files; the '-FT' files read here must have
# been produced by a separate run.
ft = '-FT'
tarefas = ['name', 'bioActivity', 'collectionSpecie', 'collectionSite', 'collectionType']
estagios = ['1st', '2nd', '3rd', '4th']
for model in models:
    for emb in embedding_models:
        print(f'Model: {model}')
        print(f'Embedding: {emb}')
        caminho_arquivo = f'./results_ss/resultados{ft}-SS-{emb}-{model}.txt'
        resumo, r2 = process_file(caminho_arquivo)
        for tarefa in tarefas:
            for estagio in estagios:
                valor = r2[(tarefa, estagio)]
                # Emits "& <value> " — same bytes as two prints with end=' '.
                print('&', f'{valor:.2f}'.lstrip('0'), end=' ')
            print()


Model: qwen14b
Embedding: all-MiniLM-L6-v2
& .61 & .64 & .64 & .67 
& .70 & .72 & .74 & .74 
& .86 & .88 & .90 & .91 
& .62 & .62 & .64 & .66 
& .92 & .92 & .93 & .94 
Model: qwen14b
Embedding: Qwen3-Embedding-0.6B
& .59 & .62 & .62 & .63 
& .71 & .73 & .75 & .76 
& .89 & .90 & .91 & .92 
& .58 & .58 & .60 & .62 
& .92 & .92 & .93 & .94 
Model: qwen14b
Embedding: bge-m3
& .62 & .64 & .65 & .67 
& .71 & .73 & .75 & .76 
& .89 & .91 & .92 & .92 
& .62 & .63 & .64 & .66 
& .92 & .92 & .93 & .94 
Model: qwen14b
Embedding: paraphrase-multilingual-MiniLM-L12-v2
& .57 & .60 & .61 & .63 
& .70 & .72 & .73 & .74 
& .78 & .79 & .80 & .79 
& .61 & .61 & .63 & .65 
& .92 & .92 & .93 & .94 
Model: qwen14b
Embedding: multilingual-e5-large
& .61 & .64 & .64 & .66 
& .71 & .73 & .74 & .75 
& .89 & .91 & .91 & .92 
& .62 & .62 & .64 & .66 
& .92 & .92 & .93 & .94 
Model: phi14b
Embedding: all-MiniLM-L6-v2
& .65 & .68 & .69 & .70 
& .73 & .74 & .75 & .74 
& .83 & .84 & .86 & .87 
& .61 & .61 & .63 & .64