# Try SciFit with Log dataset

The goal is to understand if the SciFit model can works with different and more complex dataset, like the logs we are using.
The approach is similar to the LogPrecis application, but more hand-down approach with low/none-application specific analysis
blind few-shot classification operations.

## We need to TEST
https://github.com/r-three/t-few/blob/master/src/pl_train.py

In [30]:
import polars as pl
import numpy as np
import pandas as pd

import json
from copy import deepcopy as dp

from datasets import load_dataset, IterableDataset 
from datasets import Dataset as hfDataset
from sentence_transformers.losses import CosineSimilarityLoss

import torch
import torch.nn as nn
import torchtext as tt
from torchtext.data.utils import get_tokenizer
from torch.utils.data import Dataset, DataLoader

from setfit import SetFitModel, Trainer, TrainingArguments

import math

In [2]:
log_location = {
    "train": "/home/rising/2024-06-21-category-1-sorted-cplabels.json",
    "evaluation": "/home/rising/2024-06-21-random-luis-matteo.json"
}

In [3]:
def import_from_json(log_location):
	data = list()
	with open(log_location) as log_file:
		log_lines = log_file.readlines()
		for line in log_lines:
			old_obj = json.loads(line)

			new_obj = dict()

			try:
				new_obj['label'] = old_obj['label']
				new_obj['log'] = dp(old_obj)
				new_obj['log'].pop('label', None)

				new_obj['log'] = json.dumps(new_obj['log'])
				new_obj['text-label'] = 'label-n-' + str(old_obj['label'])
			except KeyError:
				new_obj['log'] = dp(old_obj)
				new_obj['log'].pop('label', None)

				new_obj['log'] = json.dumps(new_obj['log'])

			data.append(new_obj)

	return data

In [4]:
df = pl.DataFrame(import_from_json(log_location["train"]))

In [5]:
df.head()

label,log,text-label
i64,str,str
3694672,"""{""kind"": ""Even…","""label-n-369467…"
4176,"""{""kind"": ""Even…","""label-n-4176"""
61648,"""{""kind"": ""Even…","""label-n-61648"""
151632,"""{""kind"": ""Even…","""label-n-151632…"
3280976,"""{""kind"": ""Even…","""label-n-328097…"


In [6]:
df.select(pl.col("label").value_counts()).unnest("label")

label,count
i64,u32
24992,8
98384,7
3695024,114
4512,93
3702864,7
…,…
147536,5
49168,3
-2,166
3702832,1


In [7]:
df.shape

(4473, 3)

In [8]:
train_over_test_ratio = math.floor(0.75 * df.shape[0])
train_over_test_ratio

3354

In [15]:
df_X_numpy = df.select(pl.col("*").exclude("label")).to_numpy()
df_y_numpy = df.select(pl.col("label")).to_numpy()
df_X_numpy, df_y_numpy

(array([['{"kind": "Event", "apiVersion": "audit.k8s.io/v1", "level": "RequestResponse", "auditID": "0ec16783-6b73-4e80-8a39-2dd9265d37fd", "stage": "ResponseComplete", "requestURI": "/api/v1/endpoints?allowWatchBookmarks=true&resourceVersion=2219852&timeout=8m58s&timeoutSeconds=538&watch=true", "verb": "watch", "user": {"username": "system:apiserver", "uid": "3debce64-170e-4de3-8a01-219a85be9879", "groups": ["system:masters"]}, "sourceIPs": ["::1"], "userAgent": "kube-apiserver/v1.28.7 (linux/amd64) kubernetes/c8dcb00", "objectRef": {"resource": "endpoints", "apiVersion": "v1", "namespace": null, "apiGroup": "core"}, "responseStatus": {"metadata": {}, "code": 200}, "requestReceivedTimestamp": "2024-05-28T09:15:35.273102Z", "stageTimestamp": "2024-05-28T09:24:33.274576Z", "annotations": {"authorization.k8s.io/decision": "allow", "authorization.k8s.io/reason": ""}}',
         'label-n-3694672'],
        ['{"kind": "Event", "apiVersion": "audit.k8s.io/v1", "level": "RequestResponse", "au

In [37]:
df_train = df.head(train_over_test_ratio)
df_test = df.head(-train_over_test_ratio)

X_train = df_X_numpy[train_over_test_ratio:]
y_train = df_y_numpy[train_over_test_ratio:]

X_test = df_X_numpy[:train_over_test_ratio]
y_test = df_y_numpy[:train_over_test_ratio]

In [17]:
X_train.shape, y_train.shape, X_test.shape, y_test.shape

((1119, 2), (1119, 1), (3354, 2), (3354, 1))

In [38]:
# TRY WITH torch.utils.data.Dataset

# class NLP_Log_Dataset(hfDataset):
#   def __init__(self, X, y):
#     self.X = X
#     self.y = y
#     #self.column_names = ("log", "label", "text-label")
#   def __len__(self):
#     return len(self.X)
#   def __getitem__(self, i):
#     return self.X[i], self.y[i]
# NOT WORKING
# def gen_train():
#     for i in df_train:
#         yield i
# def gen_test():
#     for i in df_test:
#         yield i

In [39]:
# NOT WORKING
# df_train_hf = IterableDataset.from_generator(gen_train)
# df_test_hf = IterableDataset.from_generator(gen_test)

# train_dataset = NLP_Log_Dataset(X_train, y_train)
# test_dataset = NLP_Log_Dataset(X_test, y_test)

In [44]:
ds_train = hfDataset.from_dict(df_train.to_dict())
ds_test = hfDataset.from_dict(df_test.to_dict())

In [45]:
ds_train, ds_test

(Dataset({
     features: ['label', 'log', 'text-label'],
     num_rows: 3354
 }),
 Dataset({
     features: ['label', 'log', 'text-label'],
     num_rows: 1119
 }))

In [46]:
model = SetFitModel.from_pretrained("sentence-transformers/paraphrase-mpnet-base-v2")

args = TrainingArguments(
    batch_size=16,
    num_epochs=4,
    evaluation_strategy="epoch",
    save_strategy="epoch",
    load_best_model_at_end=True,
)

trainer = Trainer(
    model=model,
    args=args,
    train_dataset=ds_train_hf,
    eval_dataset=ds_test_hf,
    metric="accuracy",
    column_mapping={"log": "text", "label": "label"}  # Map dataset columns to text/label expected by trainer
)

model_head.pkl not found on HuggingFace Hub, initialising classification head with random weights. You should TRAIN this model on a downstream task to use it for predictions and inference.
Applying column mapping to the training dataset
Applying column mapping to the evaluation dataset


Map:   0%|          | 0/3354 [00:00<?, ? examples/s]

In [47]:
trainer.train()

***** Running training *****
  Num unique pairs = 8850130
  Batch size = 16
  Num epochs = 4
  Total optimization steps = 2212536


: 

In [None]:
trainer.evaluate()