In [1]:
import pandas as pd
import re
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import MinMaxScaler
from keras.preprocessing.sequence import pad_sequences
from joblib import dump

In [2]:
# Dotted-quad shape: four groups of 1-3 ASCII digits separated by dots.
# Compiled once at definition time instead of on every call.
_IPV4_PATTERN = re.compile(r'(\d{1,3})\.(\d{1,3})\.(\d{1,3})\.(\d{1,3})', re.ASCII)


def is_ip_address(value):
    """Check whether ``value`` is a dotted-quad IPv4 address.

    Args:
        value: The candidate host value. Anything that is not a ``str``
            (``None``, ``float('nan')``, bytes, ...) is reported as not an IP.

    Returns:
        bool: True only when the whole string is four dot-separated decimal
        groups and every group is in the range 0-255.
    """
    if not isinstance(value, str):
        return False
    # fullmatch (rather than match + '$') also rejects a trailing newline.
    found = _IPV4_PATTERN.fullmatch(value)
    if found is None:
        return False
    # The regex only checks the shape; each octet must also fit in one byte.
    return all(int(octet) <= 255 for octet in found.groups())

def modify_host_column(dataset):
    """Turn the 'host' column into a binary flag: 1 for an IP address, 0 otherwise.

    Every host value is converted to ``str`` before being checked with
    ``is_ip_address``. The frame is modified in place and also returned.
    """
    def _ip_flag(host_text):
        if is_ip_address(host_text):
            return 1
        return 0

    host_as_text = dataset['host'].astype(str)
    dataset['host'] = host_as_text.apply(_ip_flag)
    return dataset

def remove_columns(dataset, numeric_columns, categorical_columns):
    """Drop every column that is not a feature, the label or the timestamp.

    Args:
        dataset: DataFrame to prune. It is modified in place.
        numeric_columns: Names of the numeric feature columns to keep.
        categorical_columns: Names of the categorical feature columns to keep.

    Returns:
        The same ``dataset`` object, now containing only the listed feature
        columns plus 'label' and '@timestamp' (whichever of them were present).
    """
    keep = set(numeric_columns) | set(categorical_columns) | {'label', '@timestamp'}
    # Decide what to drop up front, then drop once, instead of dropping in
    # place one column at a time while looping over the frame's own columns.
    to_drop = [column for column in dataset.columns if column not in keep]
    if to_drop:
        dataset.drop(columns=to_drop, inplace=True)
    return dataset

def replace_null_values(dataset, numeric_columns, categorical_columns,numeric_imputer, categorical_imputer, fit=True):
    """Fill missing values in the numeric and categorical feature columns.

    Args:
        dataset: DataFrame to impute. It is modified in place.
        numeric_columns: Names of the numeric columns to impute.
        categorical_columns: Names of the categorical columns to impute.
        numeric_imputer: Object exposing ``fit_transform`` / ``transform``
            (e.g. a scikit-learn ``SimpleImputer``) used for the numeric columns.
        categorical_imputer: Same, for the categorical columns.
        fit: When True (default, the original behavior) the imputers are
            fitted on ``dataset`` before transforming it. Pass False to reuse
            statistics already learned on another frame, e.g. to impute a
            test set with the values learned from the training set.

    Returns:
        The same ``dataset`` object with the listed columns imputed.
    """
    if fit:
        dataset[numeric_columns] = numeric_imputer.fit_transform(dataset[numeric_columns])
        dataset[categorical_columns] = categorical_imputer.fit_transform(dataset[categorical_columns])
    else:
        # Reuse the statistics the imputers already hold; do not refit.
        dataset[numeric_columns] = numeric_imputer.transform(dataset[numeric_columns])
        dataset[categorical_columns] = categorical_imputer.transform(dataset[categorical_columns])
    return dataset

def normalize_numeric_columns(dataset, numeric_columns, scaler=None, fit=True):
    """Rescale the numeric feature columns with a min-max scaler.

    Args:
        dataset: DataFrame to rescale. It is modified in place.
        numeric_columns: Names of the columns to rescale.
        scaler: Optional object exposing ``fit_transform`` / ``transform``.
            When omitted, a fresh ``MinMaxScaler`` is created and fitted on
            ``dataset`` (the original behavior).
        fit: When True (default) the scaler is fitted on ``dataset``. Pass
            False together with an already fitted ``scaler`` to apply the
            ranges learned on another frame, e.g. to scale a test set with
            the training minimum and maximum.

    Returns:
        The same ``dataset`` object with the listed columns rescaled.

    Raises:
        ValueError: If ``fit`` is False and no ``scaler`` is supplied, since
            there would be nothing fitted to apply.
    """
    if scaler is None:
        if not fit:
            raise ValueError("fit=False requires an already fitted scaler")
        scaler = MinMaxScaler()
    if fit:
        dataset[numeric_columns] = scaler.fit_transform(dataset[numeric_columns])
    else:
        dataset[numeric_columns] = scaler.transform(dataset[numeric_columns])
    return dataset

def top_n(dataset, categorical_columns, top_val=10, exclude=('status_code',)):
    """Keep only the most frequent categories of each column; relabel the rest 'Other'.

    Args:
        dataset: DataFrame to process. It is modified in place.
        categorical_columns: Names of the categorical columns to process.
        top_val: Number of most frequent categories to keep per column
            (default 10, the value that used to be hard-coded).
        exclude: Column names that are left untouched so that all of their
            values are kept. Defaults to ``('status_code',)``, matching the
            original behavior. A single string is treated as one column name.

    Returns:
        The same ``dataset`` object with rare categories replaced by 'Other'.
    """
    if isinstance(exclude, str):
        # Avoid treating a bare string as a collection of characters.
        exclude = (exclude,)
    for col in categorical_columns:
        if col in exclude:
            continue
        # value_counts() is ordered by decreasing frequency, so the first
        # top_val index entries are the categories to keep.
        frequent = dataset[col].value_counts().index[:top_val].tolist()
        dataset[col] = dataset[col].where(dataset[col].isin(frequent), other='Other')
    return dataset

def align_columns(train, test):
    """Give the train and test frames the same columns, in the same order.

    Columns present in only one frame are added to the other one filled with
    False, and both frames end up with their columns sorted by name. The
    input frames are not modified; new frames are returned.

    Args:
        train: Training DataFrame.
        test: Test DataFrame.

    Returns:
        tuple: ``(train, test)`` re-indexed on the sorted union of their columns.
    """
    all_columns = sorted(set(train.columns) | set(test.columns))
    # reindex adds the missing columns (filled with False) and reorders in a
    # single step, without touching the caller's frames.
    aligned_train = train.reindex(columns=all_columns, fill_value=False)
    aligned_test = test.reindex(columns=all_columns, fill_value=False)
    return aligned_train, aligned_test

def add_padding(max_len,data):
    """Bring every sequence in ``data`` to length ``max_len`` via ``pad_sequences``.

    Short sequences are padded at the end ('post') with 0.0 and the result is
    returned as float32. Handling of sequences longer than ``max_len`` is left
    to the default truncation setting of ``pad_sequences``.
    """
    return pad_sequences(
        data,
        maxlen=max_len,
        dtype='float32',
        padding='post',
        value=0.0,
    )

def create_window(df, window_size_records, label_column):
    """Split a frame into consecutive fixed-size windows of records.

    The rows are first ordered by '@timestamp'. Each window holds up to
    ``window_size_records`` consecutive rows (the last one may be shorter),
    has '@timestamp' removed and is cast to float.

    Args:
        df: Source DataFrame; must contain '@timestamp' and ``label_column``.
        window_size_records: Number of rows per window.
        label_column: Name of the column holding the labels.

    Returns:
        tuple: ``(data, labels)`` where ``data`` is a list of DataFrames with
        the feature columns of each window and ``labels`` is a list of arrays
        with the corresponding label values.
    """
    ordered = df.sort_values('@timestamp')
    features, targets = [], []

    # Walk the ordered rows in steps of one window.
    for start in range(0, len(ordered), window_size_records):
        chunk = ordered.iloc[start:start + window_size_records]
        if chunk.empty:
            continue

        chunk = chunk.drop(columns=['@timestamp']).astype(float)
        targets.append(chunk[label_column].values)
        features.append(chunk.drop(columns=[label_column]))

    return features, targets

In [None]:
# Load the raw training and test sets from CSV files in the working directory.
train_csv_path = 'dataset.csv'
test_csv_path = 'dataset_test.csv'

dataset = pd.read_csv(train_csv_path)
dataset_test = pd.read_csv(test_csv_path)

In [104]:
# Feature groups consumed by the preprocessing helpers. 'host' is listed as
# numeric because modify_host_column() turns it into a 0/1 flag.
numeric_columns = [
    'request_body_len',
    'trans_depth',
    'response_body_len',
    'host',
]
categorical_columns = [
    'dest_port',
    'method',
    'version',
    'status_code',
    'response_content_type',
    'request_content_type',
]

# For each split: drop the unused columns first, then binarise 'host'.
dataset = modify_host_column(
    remove_columns(dataset, numeric_columns, categorical_columns)
)
dataset_test = modify_host_column(
    remove_columns(dataset_test, numeric_columns, categorical_columns)
)


In [105]:
# Fit the imputers and the scaler on the training set only, then apply the
# already fitted transformers to the test set. Refitting them on the test set
# would fill and scale it with its own statistics, so the same raw value would
# map to different feature values in train and test.
numeric_imputer = SimpleImputer(strategy='mean')
categorical_imputer = SimpleImputer(strategy='most_frequent')
scaler = MinMaxScaler()

# Training set: learn the fill values and the min/max ranges.
dataset = replace_null_values(dataset, numeric_columns, categorical_columns, numeric_imputer, categorical_imputer)
dataset[numeric_columns] = scaler.fit_transform(dataset[numeric_columns])

# Test set: reuse what was learned on the training set (transform only).
# Scaled test values may fall outside [0, 1] when they exceed the training range.
dataset_test[numeric_columns] = numeric_imputer.transform(dataset_test[numeric_columns])
dataset_test[categorical_columns] = categorical_imputer.transform(dataset_test[categorical_columns])
dataset_test[numeric_columns] = scaler.transform(dataset_test[numeric_columns])


In [106]:
# Collapse rare categories into 'Other' before encoding.
dataset = top_n(dataset, categorical_columns)
dataset_test = top_n(dataset_test, categorical_columns)

# One-hot encode the categorical features. Passing columns= forces every
# listed column to be encoded; without it get_dummies only encodes
# object/string/category columns and would pass numeric-typed ones
# (e.g. dest_port, status_code) through unencoded.
categorical_encoded = pd.get_dummies(dataset[categorical_columns], columns=categorical_columns)
categorical_encoded_test = pd.get_dummies(dataset_test[categorical_columns], columns=categorical_columns)

# Replace the original categorical columns with their encoded counterparts.
dataset = dataset.drop(columns=categorical_columns)
dataset = pd.concat([dataset, categorical_encoded], axis=1)

dataset_test = dataset_test.drop(columns=categorical_columns)
dataset_test = pd.concat([dataset_test, categorical_encoded_test], axis=1)

In [None]:
# Give both splits the same columns, then show a preview and the column lists.
dataset, dataset_test = align_columns(dataset, dataset_test)

for frame in (dataset_test, dataset):
    print(frame.head())
for frame in (dataset, dataset_test):
    print(frame.columns.to_list())

In [3]:
# EXAMPLE: how align_columns works
# Training frame with columns 'A', 'B', 'C'.
data_train = dict(A=[1, 2, 3], B=[4, 5, 6], C=[7, 8, 9])
train = pd.DataFrame(data_train)

# Test frame with columns 'B', 'C', 'D'.
data_test = dict(B=[10, 11, 12], C=[13, 14, 15], D=[16, 17, 18])
test = pd.DataFrame(data_test)

# Show both frames before alignment.
for title, frame in (("\ntrain:", train), ("\ntest:", test)):
    print(title)
    print(frame)

train, test = align_columns(train, test)

# Show both frames after alignment.
for title, frame in (("\ntrain modificato:", train), ("\ntest modificato:", test)):
    print(title)
    print(frame)


train:
   A  B  C
0  1  4  7
1  2  5  8
2  3  6  9

test:
    B   C   D
0  10  13  16
1  11  14  17
2  12  15  18

train modificato:
   A  B  C      D
0  1  4  7  False
1  2  5  8  False
2  3  6  9  False

test modificato:
       A   B   C   D
0  False  10  13  16
1  False  11  14  17
2  False  12  15  18


In [109]:
# Keep only the training rows whose label is 0.
is_label_zero = dataset['label'] == 0
data_label_0 = dataset.loc[is_label_zero]

In [111]:
# Number of consecutive records per window. create_window() slices by record
# count, not by elapsed time, so this value is a row count and not a duration.
window_size_records = 600
# Backward-compatible alias: other cells still refer to the original name.
window_size_seconds = window_size_records
label_column = 'label'

# Training windows come from the label-0 rows only; test windows use the full test set.
X_train, y_train = create_window(data_label_0, window_size_records, label_column)
X_test, y_test = create_window(dataset_test, window_size_records, label_column)

In [None]:
# Pad every window (and the test labels) to the common window length.
X_train_padded = add_padding(max_len=window_size_seconds, data=X_train)
X_test_padded = add_padding(max_len=window_size_seconds, data=X_test)
y_test_padded = add_padding(max_len=window_size_seconds, data=y_test)

In [None]:
# Persist the padded arrays with joblib for later use.
dump(value=y_test_padded, filename='y_test.joblib')
dump(value=X_test_padded, filename='X_test.joblib')
dump(value=X_train_padded, filename='X_train.joblib')