In [549]:
import pandas as pd
output_test = pd.read_json("../eval-data/output_recente.json")
ground_test = pd.read_json("../eval-data/test_recente.json")

In [550]:
import re
def pre_process_data(df, is_test=False):
    df = df.fillna("")
    df = df.astype(str)
    df = df.applymap(lambda x: x.lower())
    # Retira pontuações. Útil para ganhar acurácia, O LLM as vezes retorna a frase corretamente, mas sem ponto final. 
    # Logo, pontuação no nosso caso é irrelevante.
    df = df.applymap(lambda x: re.sub(r'[^\w\s]', '', x))
    # Embaralha o conjunto de teste
    if is_test: 
        df = df.sample(frac=1).reset_index(drop=True)
    # Se não for o conjunto de teste, é o output. Remove as colunas start_time e end_time (não é necessário fazer isso no test_set, no arquivo já não há as colunas)
    else: 
        df =  df.drop(columns=['start_time', 'end_time'])
    return df

output_test = pre_process_data(output_test)
ground_test = pre_process_data(ground_test, True)

  df = df.applymap(lambda x: x.lower())
  df = df.applymap(lambda x: re.sub(r'[^\w\s]', '', x))
  df = df.applymap(lambda x: x.lower())
  df = df.applymap(lambda x: re.sub(r'[^\w\s]', '', x))


In [551]:
import os
import json


def calcularResultados(tp, fp, fn, tn):
    precision = round(tp / (tp + fp) if (tp + fp) > 0 else 0, 4)
    
    recall = round(tp / (tp + fn) if (tp + fn) > 0 else 0, 4)
    f1 = round(2 * (precision * recall) / (precision + recall) if (precision + recall) > 0 else 0, 4)
    # acuracia para essa classe em questão. Total de acertos / total de exemplos
    accuracy = round((tp + tn) / (tp + fp + fn + tn) if (tp + fp + fn + tn) > 0 else 0, 4)
    return precision, recall, f1, accuracy



def calculateMetrics(pred_label, test_label, target_label):
    """
    Calcula métricas precisao, revocação, f1 e acurácia para o label 'category' ou 'action'
    """
    unique_labels = set(pred_label).union(set(test_label))
    results = []
    for label in unique_labels:
        # Condições para cada métrica
        # True Positive: previu a categoria e era a categoria
        tp = float(((pred_label == label) & (test_label == label)).sum())
        # False Positive: previu a categoria e não era
        fp = float(((pred_label == label) & (test_label != label)).sum())
        # False Negative: não previu a categoria, mas de fato era
        fn = float(((pred_label != label) & (test_label == label)).sum())
        # True Negatives: não previu a categoria e de fato não era
        tn = float(((pred_label != label) & (test_label != label)).sum())
        print(f"tp: {tp}, fp: {fp}, fn: {fn}, tn: {tn}")
        precision, recall, f1, accuracy = calcularResultados(tp, fp, fn, tn)
        
        new_json = {
            target_label: label,
            'precision': precision,
            'recall': recall,
            'f1-score': f1,
            'accuracy': accuracy,
        }
        results.append(new_json)
    return results



def media_por_label(teste, output,target_label):
    """
    Essa função faz o mesmo que a função "media_por_categoria" usada no repo 'llm_evaluation'
    Porém mais flexível, podendo ser usada para calcular o quanto que, para cada action ou category, o LLM acerta os outros paramêtros
    """
    unique_labels = set(output[target_label]).union(set(teste[target_label]))
    media = {label: 0 for label in unique_labels}
    media['evaluation'] = "avg accuracy of parameters"
    media['label'] = target_label
    total_labels = len(teste.columns)
    for label in unique_labels:
        indices = output[output[target_label] == label].index
        linhas_label_teste = teste.loc[indices]
        linhas_label_output = output.loc[indices]
        for (_, row_test), (_, row_out) in zip(linhas_label_teste.iterrows(), linhas_label_output.iterrows()):
            acertos = sum(val_test == val_out for val_test, val_out in zip(row_test.values, row_out.values))/total_labels
            media[label] += acertos
        print(media[label])
        print(len(linhas_label_output))
        media[label] = round(media[label] / len(linhas_label_output) if len(linhas_label_output) > 0 else 0, 4)
    media['label'] = target_label
    return [media]




def save_results(file, results):
    directory = os.path.dirname(file)
    if not os.path.exists(directory):
        os.makedirs(directory)
    with open(file, 'w') as f:
        json.dump(results, f, indent=4)


In [552]:
output_test.head(15)

Unnamed: 0,intent,category,action,requirement,targets,magnitude
0,configuration files has to be retrieved from c...,transfer,pull,,central media server,
1,pull custom monitoring tools from the local ne...,transfer,pull,,custom monitoring tools,
2,for iot devices integrated into the campus net...,regulate,block,video streaming services,iot devices integrated into the campus network,
3,error,,,,,
4,keep a record of realtime data throughput per ...,construct,discover,,personal laptops in the faculty network,
5,identifique dispositivos móveis que suportam c...,construct,discover,,,
6,notify devices in the quality assurance lab th...,transfer,push,secure peertopeer file sharing service and cri...,devices in the quality assurance lab,
7,realtime data throughput per device associated...,regulate,monitor,data throughput per device,research lab computers,
8,streaming services must have priority in the n...,regulate,prioritize,,streaming services,download speed greater than 100mbps
9,for devices connected via the staffonly networ...,regulate,block,,devices connected via the staffonly network,


In [553]:
ground_test.head(15)

Unnamed: 0,intent,category,action,requirement,targets,magnitude
0,give preference to research lab computers,regulate,prioritize,,research lab computers,latency lower than 10ms
1,video conferencing traffic must have priority ...,regulate,prioritize,,video conferencing traffic,upload speed greater than 10mbps
2,throttle download and upload speed to a maximu...,regulate,limit,download and upload speed,devices connected to the dormitory network,30 mbps
3,notify devices in the marketing department tha...,construct,advertise,configuration files,devices in the marketing department,
4,guest wifi network needs to be prioritized,regulate,prioritize,,guest wifi network,latency lower than 10ms
5,for public devices on the common area networks...,regulate,block,websites containing explicit content,public devices on the common area networks,
6,advertise the presence of bug fixes for existi...,construct,advertise,bug fixes for existing software,compatible devices,
7,secure peertopeer file sharing servicecritical...,transfer,pull,secure peertopeer file sharing servicecritical...,cloud,
8,streaming services must have priority in the n...,regulate,prioritize,,streaming services,download speed greater than 100mbps
9,ensure tracking of upload and download speeds ...,regulate,monitor,upload and download speeds per device,personal laptops in the faculty network,


In [554]:
metrics = calculateMetrics(output_test['category'], ground_test['category'], 'category')
metrics


tp: 0.0, fp: 4.0, fn: 2.0, tn: 9.0
tp: 0.0, fp: 1.0, fn: 0.0, tn: 14.0
tp: 3.0, fp: 1.0, fn: 7.0, tn: 4.0
tp: 1.0, fp: 5.0, fn: 2.0, tn: 7.0


[{'category': 'transfer',
  'precision': 0.0,
  'recall': 0.0,
  'f1-score': 0,
  'accuracy': 0.6},
 {'category': '',
  'precision': 0.0,
  'recall': 0,
  'f1-score': 0,
  'accuracy': 0.9333},
 {'category': 'regulate',
  'precision': 0.75,
  'recall': 0.3,
  'f1-score': 0.4286,
  'accuracy': 0.4667},
 {'category': 'construct',
  'precision': 0.1667,
  'recall': 0.3333,
  'f1-score': 0.2222,
  'accuracy': 0.5333}]

In [555]:
media = media_por_label(ground_test, output_test, 'category')
media

0.6666666666666666
4
0.16666666666666666
1
1.5
4
1.1666666666666665
6


[{'transfer': 0.1667,
  '': 0.1667,
  'regulate': 0.375,
  'construct': 0.1944,
  'evaluation': 'avg accuracy of parameters',
  'label': 'category'}]

In [556]:
test_size = 15
model_name = 'llama'
current_model = 'teste'

metrics_categoria = calculateMetrics(output_test['category'], ground_test['category'], 'category')
media_acertos_categoria = media_por_label(ground_test.head(test_size), output_test.head(test_size), 'category')
metrics_action = calculateMetrics(output_test['action'], ground_test['action'], 'action')
media_acertos_action = media_por_label(ground_test.head(test_size), output_test.head(test_size), 'action')
# Junta as metricas com a media de acerto e salva
results = [{'intents': test_size}] + metrics_categoria + media_acertos_categoria + metrics_action + media_acertos_action
print(results)
file_path = f"./results/{model_name}/{current_model}.json"
save_results(file_path, results)

tp: 0.0, fp: 4.0, fn: 2.0, tn: 9.0
tp: 0.0, fp: 1.0, fn: 0.0, tn: 14.0
tp: 3.0, fp: 1.0, fn: 7.0, tn: 4.0
tp: 1.0, fp: 5.0, fn: 2.0, tn: 7.0
0.6666666666666666
4
0.16666666666666666
1
1.5
4
1.1666666666666665
6
tp: 0.0, fp: 1.0, fn: 0.0, tn: 14.0
tp: 0.0, fp: 1.0, fn: 1.0, tn: 13.0
tp: 1.0, fp: 0.0, fn: 5.0, tn: 9.0
tp: 0.0, fp: 4.0, fn: 1.0, tn: 10.0
tp: 0.0, fp: 1.0, fn: 1.0, tn: 13.0
tp: 0.0, fp: 0.0, fn: 2.0, tn: 13.0
tp: 0.0, fp: 0.0, fn: 2.0, tn: 13.0
tp: 1.0, fp: 4.0, fn: 0.0, tn: 10.0
tp: 0.0, fp: 2.0, fn: 1.0, tn: 12.0
0.16666666666666666
1
0.16666666666666666
1
0.8333333333333334
1
0.6666666666666666
4
0.16666666666666666
1
0
0
0
0
1.0
5
0.5
2
[{'intents': 15}, {'category': 'transfer', 'precision': 0.0, 'recall': 0.0, 'f1-score': 0, 'accuracy': 0.6}, {'category': '', 'precision': 0.0, 'recall': 0, 'f1-score': 0, 'accuracy': 0.9333}, {'category': 'regulate', 'precision': 0.75, 'recall': 0.3, 'f1-score': 0.4286, 'accuracy': 0.4667}, {'category': 'construct', 'precision': 0.1667

In [557]:
results

[{'intents': 15},
 {'category': 'transfer',
  'precision': 0.0,
  'recall': 0.0,
  'f1-score': 0,
  'accuracy': 0.6},
 {'category': '',
  'precision': 0.0,
  'recall': 0,
  'f1-score': 0,
  'accuracy': 0.9333},
 {'category': 'regulate',
  'precision': 0.75,
  'recall': 0.3,
  'f1-score': 0.4286,
  'accuracy': 0.4667},
 {'category': 'construct',
  'precision': 0.1667,
  'recall': 0.3333,
  'f1-score': 0.2222,
  'accuracy': 0.5333},
 {'transfer': 0.1667,
  '': 0.1667,
  'regulate': 0.375,
  'construct': 0.1944,
  'evaluation': 'avg accuracy of parameters',
  'label': 'category'},
 {'action': '',
  'precision': 0.0,
  'recall': 0,
  'f1-score': 0,
  'accuracy': 0.9333},
 {'action': 'push',
  'precision': 0.0,
  'recall': 0.0,
  'f1-score': 0,
  'accuracy': 0.8667},
 {'action': 'prioritize',
  'precision': 1.0,
  'recall': 0.1667,
  'f1-score': 0.2858,
  'accuracy': 0.6667},
 {'action': 'pull',
  'precision': 0.0,
  'recall': 0.0,
  'f1-score': 0,
  'accuracy': 0.6667},
 {'action': 'monito