# RTD Anomaly Detector with DistilBERT

#### Import all the required modules

All the files are under the `modules` folder:
- distillbert_st
- logs_handler
- metric_training_callback
- prophet_model
- rtd_dataset

In [None]:
from modules.distillbert_st import *
from modules.logs_handler import *
from modules.metric_training_callback import *
from modules.prophet_model import *


#### Create the LogsHandler instance

In [2]:
# Input log files. Forward slashes keep both paths portable across operating
# systems and avoid the invalid "\A" escape sequence that a backslash
# separator would put inside a regular (non-raw) string literal.
normal_logs_file_path = 'logs/NormalLogs.txt'
anomalous_logs_file_path = 'logs/AnomoulousLogs.txt'
# Handler that receives both log files (normal first, anomalous second).
logs_handler = LogsHandler(normal_logs_file_path, anomalous_logs_file_path)


Define the operations that prepare the logs for classification:
- balance the normal and anomalous logs
- concatenate the previously balanced normal and anomalous logs
- reduce the set of log features to evaluate

In [3]:
# Balance the normal and anomalous logs and concatenate them
logs_handler._concate_logs()

# Reduce the set of log features that will be evaluated
logs_handler._reduce_feature_logs()


#### Define DistillBERT_ST Class 
This class contains all the methods needed to run the machine learning pipeline based on the DistilBERT NLP model.

In [4]:
# Classifier wrapper built on top of the prepared logs handler
distilber_st = DistillBERT_ST(logs_handler)
# Helper used in the last cell of the notebook to predict future peaks
predictor = Prophet_ST()


##### Training of the model 

1) Define tokenizer and model:
 - tokenizer: A tokenizer is a tool that converts raw text into a format that a machine learning model can understand. Specifically, it breaks down the text into smaller units called tokens. These tokens are then converted into numerical representations that can be fed into a model. 
 - model: In the context of NLP, a model is a machine learning algorithm that has been trained to understand and generate human language. The model takes the tokenized input and processes it to perform tasks such as classification, translation, or text generation.

In [None]:
# Define the tokenizer and the model; called without load_path here, unlike
# the prediction section, which passes the folder of a previously saved model
distilber_st._define_model()
# NOTE(review): presumably selects the compute device (CPU/GPU) for the model
# -- confirm against modules/distillbert_st
distilber_st._control_device()


2) Split the logs into train and test sets

In [6]:
# Hold out 20% of the logs for testing; the fixed seed keeps the split
# reproducible between runs.
(
    train_texts,
    test_texts,
    train_labels,
    test_labels,
) = distilber_st._split_dataset(test_size=0.2, random_state=42)


3) Tokenize train and test set 

In [7]:
# Tokenize the train and test texts.
(
    train_encodings,
    test_encodings,
) = distilber_st._tokenization(train_texts, test_texts)


4) Create train and test dataset

In [8]:
# Pair the encodings with their labels to build the train and test datasets.
train_dataset, test_dataset = distilber_st._create_train_and_test_dataset_rtd(
    train_encodings,
    test_encodings,
    train_labels,
    test_labels,
)


In [9]:
# 5) Configure the training run and build the trainer
training_args = TrainingArguments(
    # Where the model checkpoints and the training logs are written
    output_dir='./training_results',
    logging_dir='./training_results/logs',
    # Number of steps between logging events
    logging_steps=20,
    # Schedule: number of training epochs and of warmup steps
    num_train_epochs=10,
    warmup_steps=500,
    # Per-device batch size for training and for evaluation
    per_device_train_batch_size=64,
    per_device_eval_batch_size=64,
    # Weight decay for regularization
    weight_decay=0.01,
)
trainer = distilber_st._define_trainer(
    training_args=training_args,
    train_dataset=train_dataset,
    test_dataset=test_dataset,
)

# Register a callback on the trainer to collect the training metrics
metrics_callback = MetricsCallback(trainer)
trainer.add_callback(metrics_callback)


6) Start Training

In [None]:
# Run the training with the trainer (and metrics callback) configured above
trainer.train()


7) Evaluate Training

In [None]:
# Evaluate the trained model and print the returned results
results = trainer.evaluate()
print(results)


8) Save the trained model and tokenizer

In [None]:
# Save the model and the tokenizer in a dedicated folder, so that the
# prediction section can reload them without retraining
distilber_st._save_model('./training_model')

# Save the metrics (F1 score, accuracy, precision and recall) as PNG images in a folder
metrics_callback.plot_and_save_metrics('./training_results/metrics12')


##### Prediction of new logs from a text file

1) Load the previously saved model and tokenizer to avoid retraining the model

In [13]:
# Reload the model and tokenizer from the folder written by the save step
distilber_st._define_model(load_path='training_model')
# NOTE(review): presumably selects the compute device (CPU/GPU) for the
# reloaded model -- confirm against modules/distillbert_st
distilber_st._control_device()


2) Load logs_to_predict file 

In [14]:
# File with the logs to classify. The forward slash keeps the path portable
# across operating systems and avoids the invalid "\L" escape sequence that a
# backslash separator would put inside a regular (non-raw) string literal.
distilber_st._define_logs_to_predict(normal_logs_file_path='logs/Logs_To_Predict.txt')


3) Predict the logs 

In [None]:
# Classify the loaded logs; the output folder is 'results_prediction_logs'
distilber_st._predict_logs(output_folder_path='results_prediction_logs')


In [None]:
# Predict future peaks from the same log file, using the same output folder as
# the log predictions.
# NOTE(review): periods=30 is presumably the forecast horizon -- confirm its
# unit against modules/prophet_model
predictor.predict_future_peaks(logs_file_path='logs/Logs_To_Predict.txt', output_folder_path='results_prediction_logs', periods=30)
