In [592]:
import pandas as pd
output_test = pd.read_json("../eval-data/output_recente.json")
ground_test = pd.read_json("../eval-data/test_recente.json")

In [593]:
import re
def pre_process_data(df, is_test=False):
    df = df.fillna("")
    df = df.astype(str)
    df = df.applymap(lambda x: x.lower())
    # Retira pontuações. Útil para ganhar acurácia, O LLM as vezes retorna a frase corretamente, mas sem ponto final. 
    # Logo, pontuação no nosso caso é irrelevante.
    df = df.applymap(lambda x: re.sub(r'[^\w\s]', '', x))
    # Embaralha o conjunto de teste
    if is_test: 
        df = df.sample(frac=1).reset_index(drop=True)
    # Se não for o conjunto de teste, é o output. Remove as colunas start_time e end_time (não é necessário fazer isso no test_set, no arquivo já não há as colunas)
    else: 
        df =  df.drop(columns=['start_time', 'end_time'])
    return df

output_test = pre_process_data(output_test)
ground_test = pre_process_data(ground_test, True)

  df = df.applymap(lambda x: x.lower())
  df = df.applymap(lambda x: re.sub(r'[^\w\s]', '', x))
  df = df.applymap(lambda x: x.lower())
  df = df.applymap(lambda x: re.sub(r'[^\w\s]', '', x))


In [594]:
import os
import json


def calcularResultados(tp, fp, fn, tn):
    precision = round(tp / (tp + fp) if (tp + fp) > 0 else 0, 4)
    
    recall = round(tp / (tp + fn) if (tp + fn) > 0 else 0, 4)
    f1 = round(2 * (precision * recall) / (precision + recall) if (precision + recall) > 0 else 0, 4)
    # acuracia para essa classe em questão. Total de acertos / total de exemplos
    accuracy = round((tp + tn) / (tp + fp + fn + tn) if (tp + fp + fn + tn) > 0 else 0, 4)
    return precision, recall, f1, accuracy



def calculateMetrics(pred_label, test_label, target_label):
    """
    Calcula métricas precisao, revocação, f1 e acurácia para o label 'category' ou 'action'
    """
    unique_labels = set(pred_label).union(set(test_label))
    results = []
    for label in unique_labels:
        # Condições para cada métrica
        # True Positive: previu a categoria e era a categoria
        tp = float(((pred_label == label) & (test_label == label)).sum())
        # False Positive: previu a categoria e não era
        fp = float(((pred_label == label) & (test_label != label)).sum())
        # False Negative: não previu a categoria, mas de fato era
        fn = float(((pred_label != label) & (test_label == label)).sum())
        # True Negatives: não previu a categoria e de fato não era
        tn = float(((pred_label != label) & (test_label != label)).sum())
        print(f"tp: {tp}, fp: {fp}, fn: {fn}, tn: {tn}")
        precision, recall, f1, accuracy = calcularResultados(tp, fp, fn, tn)
        
        new_json = {
            target_label: label,
            'precision': precision,
            'recall': recall,
            'f1-score': f1,
            'accuracy': accuracy,
        }
        results.append(new_json)
    return results



def media_por_label(teste, output,target_label):
    """
    Essa função faz o mesmo que a função "media_por_categoria" usada no repo 'llm_evaluation'
    Porém mais flexível, podendo ser usada para calcular o quanto que, para cada action ou category, o LLM acerta os outros paramêtros
    """
    unique_labels = set(output[target_label]).union(set(teste[target_label]))
    media = {label: 0 for label in unique_labels}
    media['evaluation'] = "avg accuracy of parameters"
    media['label'] = target_label
    total_labels = len(teste.columns)
    for label in unique_labels:
        indices = output[output[target_label] == label].index
        linhas_label_teste = teste.loc[indices]
        linhas_label_output = output.loc[indices]
        for (_, row_test), (_, row_out) in zip(linhas_label_teste.iterrows(), linhas_label_output.iterrows()):
            acertos = sum(val_test == val_out for val_test, val_out in zip(row_test.values, row_out.values))/total_labels
            media[label] += acertos
        print(media[label])
        print(len(linhas_label_output))
        media[label] = round(media[label] / len(linhas_label_output) if len(linhas_label_output) > 0 else 0, 4)
    media['label'] = target_label
    return [media]




def save_results(file, results):
    directory = os.path.dirname(file)
    if not os.path.exists(directory):
        os.makedirs(directory)
    with open(file, 'w') as f:
        json.dump(results, f, indent=4)


In [595]:
output_test

Unnamed: 0,intent,category,action,requirement,targets,magnitude
0,retrieve custom monitoring tools from the work...,construct,discover,,workstations in the design team,
1,ensure tracking of upload and download speeds ...,regulate,limit,,devices connected to the dormitory network,uploaddownload speed per device
2,ensure that guest wifi network have higher net...,regulate,prioritize,,guest wifi network,download speed 100mbps
3,list lan endpoints configured for vpn compatib...,construct,discover,,lan endpoints,
4,pinpoint lan endpoints with compatibility for ...,construct,discover,,lan endpoints,
...,...,...,...,...,...,...
495,duration of active sessions associated with de...,regulate,discover,,devices connected to the dormitory network,
496,iot devices across the campus should have its ...,construct,discover,,iot devices across the campus,
497,let workstations in the design team know that ...,transfer,push,updated antivirus definitions,workstations in the design team,
498,log duration of active sessions associated wit...,regulate,discover,,personal laptops in the faculty network,


In [596]:
ground_test

Unnamed: 0,intent,category,action,requirement,targets,magnitude
0,throttle minimum bandwidth to a maximum of 80 ...,regulate,limit,minimum bandwidth,devices connected to the dormitory network,80 mbps
1,push new application features to devices in th...,transfer,push,new application features,devices in the quality assurance lab,
2,pinpoint lan endpoints with compatibility for ...,construct,discover,realtime streaming capabilities,lan endpoints,
3,push custom monitoring tools to all laptops us...,transfer,push,custom monitoring tools,all laptops used by remote workers,
4,make sure upload speed is restricted for iot d...,regulate,limit,upload speed,iot device operations,10mbps per device
...,...,...,...,...,...,...
495,ensure that printers and scanners connected to...,regulate,prioritize,,printers and scanners connected to the network,download speed greater than 100mbps
496,video conferencing traffic must have priority ...,regulate,prioritize,,video conferencing traffic,upload speed greater than 10mbps
497,for enduser devices in the campus guest networ...,regulate,block,connections to online gaming environments,enduser devices in the campus guest network,
498,push custom monitoring tools to workstations i...,transfer,push,custom monitoring tools,workstations in the design team,


In [597]:
metrics = calculateMetrics(output_test['category'], ground_test['category'], 'category')
metrics


tp: 44.0, fp: 98.0, fn: 82.0, tn: 276.0
tp: 101.0, fp: 106.0, fn: 148.0, tn: 145.0
tp: 0.0, fp: 13.0, fn: 0.0, tn: 487.0
tp: 34.0, fp: 104.0, fn: 91.0, tn: 271.0


[{'category': 'transfer',
  'precision': 0.3099,
  'recall': 0.3492,
  'f1-score': 0.3284,
  'accuracy': 0.64},
 {'category': 'regulate',
  'precision': 0.4879,
  'recall': 0.4056,
  'f1-score': 0.443,
  'accuracy': 0.492},
 {'category': 'error',
  'precision': 0.0,
  'recall': 0,
  'f1-score': 0,
  'accuracy': 0.974},
 {'category': 'construct',
  'precision': 0.2464,
  'recall': 0.272,
  'f1-score': 0.2586,
  'accuracy': 0.61}]

In [598]:
media = media_por_label(ground_test, output_test, 'category')
media

30.66666666666669
142
40.49999999999999
207
2.1666666666666665
13
27.166666666666682
138


[{'transfer': 0.216,
  'regulate': 0.1957,
  'error': 0.1667,
  'construct': 0.1969,
  'evaluation': 'avg accuracy of parameters',
  'label': 'category'}]

In [599]:
test_size = 500
model_name = 'llama'
current_model = 'teste'
output_test = output_test.head(test_size)
ground_test = ground_test.head(test_size)
metrics_categoria = calculateMetrics(output_test['category'], ground_test['category'], 'category')
media_acertos_categoria = media_por_label(ground_test, output_test, 'category')
metrics_action = calculateMetrics(output_test['action'], ground_test['action'], 'action')
media_acertos_action = media_por_label(ground_test, output_test, 'action')
# Junta as metricas com a media de acerto e salva
results = [{'intents': test_size}] + metrics_categoria + media_acertos_categoria + metrics_action + media_acertos_action
print(results)
file_path = f"./results/{model_name}/{current_model}.json"
save_results(file_path, results)

tp: 44.0, fp: 98.0, fn: 82.0, tn: 276.0
tp: 101.0, fp: 106.0, fn: 148.0, tn: 145.0
tp: 0.0, fp: 13.0, fn: 0.0, tn: 487.0
tp: 34.0, fp: 104.0, fn: 91.0, tn: 271.0
30.66666666666669
142
40.49999999999999
207
2.1666666666666665
13
27.166666666666682
138
tp: 0.0, fp: 13.0, fn: 0.0, tn: 487.0
tp: 0.0, fp: 3.0, fn: 0.0, tn: 497.0
tp: 13.0, fp: 84.0, fn: 50.0, tn: 353.0
tp: 7.0, fp: 53.0, fn: 55.0, tn: 385.0
tp: 0.0, fp: 3.0, fn: 0.0, tn: 497.0
tp: 3.0, fp: 13.0, fn: 60.0, tn: 424.0
tp: 7.0, fp: 50.0, fn: 55.0, tn: 388.0
tp: 9.0, fp: 53.0, fn: 53.0, tn: 385.0
tp: 9.0, fp: 32.0, fn: 54.0, tn: 405.0
tp: 5.0, fp: 36.0, fn: 57.0, tn: 402.0
tp: 11.0, fp: 96.0, fn: 52.0, tn: 341.0
2.1666666666666665
13
0.6666666666666666
3
21.666666666666668
97
10.833333333333332
60
0.5
3
3.9999999999999996
16
8.0
57
15.166666666666663
62
9.666666666666666
41
7.500000000000001
41
20.333333333333336
107
[{'intents': 500}, {'category': 'transfer', 'precision': 0.3099, 'recall': 0.3492, 'f1-score': 0.3284, 'accuracy':

In [600]:
results

[{'intents': 500},
 {'category': 'transfer',
  'precision': 0.3099,
  'recall': 0.3492,
  'f1-score': 0.3284,
  'accuracy': 0.64},
 {'category': 'regulate',
  'precision': 0.4879,
  'recall': 0.4056,
  'f1-score': 0.443,
  'accuracy': 0.492},
 {'category': 'error',
  'precision': 0.0,
  'recall': 0,
  'f1-score': 0,
  'accuracy': 0.974},
 {'category': 'construct',
  'precision': 0.2464,
  'recall': 0.272,
  'f1-score': 0.2586,
  'accuracy': 0.61},
 {'transfer': 0.216,
  'regulate': 0.1957,
  'error': 0.1667,
  'construct': 0.1969,
  'evaluation': 'avg accuracy of parameters',
  'label': 'category'},
 {'action': '',
  'precision': 0.0,
  'recall': 0,
  'f1-score': 0,
  'accuracy': 0.974},
 {'action': 'track',
  'precision': 0.0,
  'recall': 0,
  'f1-score': 0,
  'accuracy': 0.994},
 {'action': 'push',
  'precision': 0.134,
  'recall': 0.2063,
  'f1-score': 0.1625,
  'accuracy': 0.732},
 {'action': 'prioritize',
  'precision': 0.1167,
  'recall': 0.1129,
  'f1-score': 0.1148,
  'accuracy