In [None]:
import json
import pandas as pd
import transformers
import torch
from transformers import (BertTokenizer, BertForSequenceClassification, Trainer, TrainingArguments, get_cosine_schedule_with_warmup)
from sklearn.model_selection import train_test_split
from torch.utils.data import DataLoader
from tqdm import tqdm
from statistics import mean

# Data loading & preprocessing
# TRAM data: a JSON file whose 'sentences' entry is loaded into a DataFrame.
data_path = '/path-to-data/'
# Decode explicitly as UTF-8 instead of relying on the platform default encoding,
# which can mis-decode (or reject) non-ASCII text on some systems.
with open(data_path, encoding='utf-8') as f:
    data = json.load(f)

raw = pd.DataFrame(data['sentences'])
raw


In [None]:
# Flatten the 'mappings' column: one row per list element, rows without any
# mapping dropped, and each remaining element expanded into its own columns.
mappings = (
    raw['mappings']
    .explode()
    .dropna()
    .apply(pd.Series)
)
mappings


In [None]:
# Split each mapped id into a base technique ("T<digits>") and an optional
# numeric sub-technique, then place those columns next to the sentence text.
# NOTE(review): the optional outer group is unnamed, so the extract result
# presumably also carries a positional column for it - confirm whether that is wanted.
attack_id_pattern = r"(?P<attack_id>T\d+)(\.(?P<subclass_id>\d+))?"
df = pd.concat(
    [raw['text'], mappings['attack_id'].str.extract(attack_id_pattern)],
    axis=1,
)
df

In [None]:
# Rebuild the full technique id from its parts: "T1234" when there is no
# sub-technique, "T1234.001" when there is one.
# The base id is re-derived from the column on every run, so executing this cell
# twice does not append the sub-technique suffix a second time.
base_id = df['attack_id'].str.extract(r"^(T\d+)", expand=False)
has_subclass = df['subclass_id'].notna()
df['attack_id'] = base_id.mask(has_subclass, base_id + "." + df['subclass_id'])
df

In [None]:
# Path to the second dataset (CSV). The following cell keeps only its
# 'attack_id' and 'text' columns.
csv_path = '/path-to-second-data/'
csv_data = pd.read_csv(csv_path)
csv_data

In [None]:
# Restrict the CSV frame to the label and text columns used downstream.
csv_data = csv_data.loc[:, ['attack_id', 'text']]
csv_data

In [None]:
# Every attack id from the CSV, converted to a string with surrounding whitespace removed.
attack_ids_list = list(map(str.strip, map(str, csv_data['attack_id'].tolist())))
attack_ids_list

In [None]:
# Report how many attack ids were read from the CSV (one per row, duplicates included).
attack_ids_count = len(attack_ids_list)
print("Number of elements in attack_ids_list:", attack_ids_count)

In [None]:
# Stack the rows derived from the JSON data on top of the CSV rows.
# ignore_index=True discards both original indexes and renumbers the result from 0.
final_df = pd.concat([df, csv_data], ignore_index=True)
final_df

In [None]:
# Rows per attack id; dropna=False also counts the rows that have no attack id.
final_df['attack_id'].value_counts(dropna=False)

In [None]:
# Positive examples: rows whose attack id appears in the CSV-provided list.
classes_of_interest = attack_ids_list
positive_data = final_df[final_df['attack_id'].isin(classes_of_interest)]

# Negative examples: up to 1000 rows without an attack id, with missing values
# replaced by the literal 'none'.
# The sample size is capped at the number of available rows (sample() raises when
# asked for more rows than exist) and the draw is seeded so reruns are reproducible.
unlabeled = final_df[final_df['attack_id'].isna()]
negative_data = unlabeled.sample(n=min(1000, len(unlabeled)), random_state=42).fillna('none')

data = pd.concat((positive_data, negative_data))
data

In [None]:
# Positive examples: rows whose attack id is one of the classes of interest.
positive_data = final_df[final_df['attack_id'].isin(classes_of_interest)]

# Negative examples: up to 1000 rows without an attack id, with missing values
# replaced by the literal 'none'.
# The sample size is capped at the number of available rows (sample() raises when
# asked for more rows than exist) and the draw is seeded so reruns are reproducible.
unlabeled = final_df[final_df['attack_id'].isna()]
negative_data = unlabeled.sample(n=min(1000, len(unlabeled)), random_state=42).fillna('none')

data = pd.concat((positive_data, negative_data))
data

In [None]:
import transformers
import torch
from transformers import (BertTokenizer, BertForSequenceClassification, Trainer, TrainingArguments, get_cosine_schedule_with_warmup)
# NOTE(review): `cuda` is not referenced again in the visible notebook; the
# training cells build their own `device` with a CPU fallback.
cuda = torch.device('cuda')
# Tokenizer loaded from the same "jackaduma/SecBERT" checkpoint that is fine-tuned later.
tokenizer = BertTokenizer.from_pretrained("jackaduma/SecBERT", max_length=512)

In [None]:
# Tokenize every sentence to a fixed length of 512 ids (longer inputs are
# truncated, shorter ones padded) and keep only the input-id tensor.
x_tokens = tokenizer(
    data['text'].tolist(),
    max_length=512,
    truncation=True,
    padding='max_length',
    return_tensors='pt',
).input_ids
x_tokens

In [None]:
# Label mapping: number the distinct attack ids in order of first appearance.
index_to_label = {
    position: label
    for position, label in enumerate(data['attack_id'].unique())
}
index_to_label

In [None]:
import csv

# Destination file for the index -> attack id mapping.
csv_file_path = 'index_to_label_mapping2.csv'

# Write a header line followed by one "index,attack_id" line per class.
with open(csv_file_path, mode='w', newline='') as csv_file:
    writer = csv.writer(csv_file)
    writer.writerow(['Index', 'Attack_ID'])
    writer.writerows(index_to_label.items())

print(f"Index to Label mapping has been saved to {csv_file_path}")


In [None]:
# Inverse lookup: attack id -> integer index.
label_to_index = dict(zip(index_to_label.values(), index_to_label.keys()))
label_to_index

In [None]:
# Encode every row's attack id as its integer class index.
# Series.map does a plain dictionary lookup, which avoids relying on
# Series.replace silently downcasting an object column to integers (deprecated
# behaviour). Converting with dtype='int64' fails loudly if any label is missing
# from the mapping, and the int64 tensor is built directly instead of going
# through a float32 torch.Tensor first.
y_all = torch.tensor(
    data['attack_id'].map(label_to_index).to_numpy(dtype='int64'),
    dtype=torch.long,
)
y_all

In [None]:
import torch
from sklearn.preprocessing import LabelEncoder
# Data splitting: hold out 20% of the examples for testing. The shuffle is seeded
# so the same split (and therefore the same trained model inputs) is obtained on
# every run.
x_train, x_test, y_train, y_test = train_test_split(
    x_tokens, y_all, test_size=0.2, shuffle=True, random_state=42
)
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
# NOTE(review): the encoder is fitted on the training labels only, so the encoded
# ids depend on which classes ended up in y_train and may not line up with
# index_to_label if a class is absent from the training split - confirm.
encoder = LabelEncoder()
y_train_encoded = encoder.fit_transform(y_train)

def _load_data(x, y, batch_size=10, device=torch.device("cpu")):
    """Yield consecutive ``(inputs, labels)`` mini-batches placed on ``device``.

    Args:
        x: Tensor of inputs, sliced along its first dimension.
        y: Labels aligned with ``x``; may be a tensor, a NumPy array or a
            sequence that supports slicing.
        batch_size: Maximum number of examples per batch; the final batch may
            be smaller.
        device: Device each batch is moved to before it is yielded.

    Yields:
        Tuples ``(x_batch, y_batch)`` in the original order (no shuffling).
        Both are copies, so the caller's data is never modified.

    Raises:
        ValueError: If ``x`` and ``y`` differ in length (raised when iteration starts).
    """
    if len(x) != len(y):
        raise ValueError(
            f"x and y must have the same length, got {len(x)} and {len(y)}"
        )

    for start in range(0, len(x), batch_size):
        batch = slice(start, start + batch_size)

        x_tensor = x[batch].clone().detach().to(device)
        # as_tensor accepts tensors, arrays and lists alike (torch.tensor warns
        # when handed an existing tensor); clone() keeps the batch independent
        # of the source labels.
        y_tensor = torch.as_tensor(y[batch]).clone().detach().to(device)

        yield x_tensor, y_tensor

In [None]:
# Shape of the token-id tensor; the second dimension should be 512 because the
# tokenizer call pads/truncates every sentence to max_length=512.
x_tokens.shape

In [None]:
from sklearn.preprocessing import LabelEncoder

# Fit a fresh LabelEncoder on the training labels and encode them.
# These are the same two statements that already run in the data-splitting cell.
encoder = LabelEncoder()
y_train_encoded = encoder.fit_transform(y_train)


In [None]:
# Sanity check: the encoded training labels must span 0..K-1 with no gaps, because
# K (the number of distinct encoded labels) is what the model cell passes as num_labels.
assert y_train_encoded.max() == len(set(y_train_encoded)) - 1, "Max label should be num_labels - 1"


In [None]:
import torch
from torch.utils.data import DataLoader
from transformers import BertForSequenceClassification
from tqdm import tqdm
from statistics import mean
from torch.optim import AdamW

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Load the pre-trained SecBERT model for sequence classification, with one
# output per distinct encoded training label.
model = BertForSequenceClassification.from_pretrained(
    "jackaduma/SecBERT",
    num_labels=len(set(y_train_encoded)),
    output_attentions=False,
    output_hidden_states=False,
)

model = model.to(device)

# Define the optimizer
optimizer = AdamW(model.parameters(), lr=2e-5, eps=1e-8)

# from_pretrained() hands the model back in evaluation mode (dropout disabled);
# switch to training mode before fine-tuning.
model.train()

# Training loop: 5 passes over the training data in fixed order, batches of 10.
for epoch in range(5):
    epoch_losses = []
    for x, y in tqdm(_load_data(x_train, y_train_encoded, batch_size=10, device=device)):
        model.zero_grad()
        # Attend only to real tokens: padding positions are identified by the
        # tokenizer's own pad id rather than a hard-coded 0.
        outputs = model(input_ids=x, attention_mask=x.ne(tokenizer.pad_token_id).to(int), labels=y)
        loss = outputs.loss
        epoch_losses.append(loss.item())
        loss.backward()
        optimizer.step()
    print(f"epoch {epoch} loss: {mean(epoch_losses)}")


In [None]:
# Save the trained model
model_path = './saved_model-name'

# The classifier's output ids are the LabelEncoder's ids, which are positions in
# encoder.classes_ (the original label indices seen during training). Store the
# resulting id -> attack id mapping in the model config so the saved checkpoint
# can decode its own predictions, independently of the separately written CSV.
id2label = {
    encoded_id: str(index_to_label[int(original_index)])
    for encoded_id, original_index in enumerate(encoder.classes_)
}
model.config.id2label = id2label
model.config.label2id = {label: encoded_id for encoded_id, label in id2label.items()}

model.save_pretrained(model_path)
tokenizer.save_pretrained(model_path)