In [1]:
import json
import re
from pathlib import Path

import pandas as pd
from sklearn.impute import SimpleImputer
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler

In [8]:
def load_json(file_path, dataset=None):
    """Read a JSON Lines file and append every parsed record to ``dataset``.

    Args:
        file_path: Path of a text file holding one JSON document per line.
        dataset: List that receives the parsed records. It is extended in
            place; when omitted, a new list is created.

    Returns:
        The list passed as ``dataset`` (or the newly created one), with one
        entry appended per non-blank line of the file.

    Raises:
        OSError: If the file cannot be opened.
        json.JSONDecodeError: If a non-blank line is not valid JSON.
    """
    if dataset is None:
        dataset = []
    # Explicit encoding: without it open() falls back to the platform locale,
    # so the same file could decode differently on another machine.
    with open(file_path, 'r', encoding='utf-8') as file:
        for line in file:
            # Blank lines (e.g. a trailing empty line) carry no record and
            # would make json.loads raise.
            if not line.strip():
                continue
            dataset.append(json.loads(line))
    return dataset

def is_ip_address(value):
    """Return True when ``value`` is a dotted-decimal IPv4 address.

    The value must consist of exactly four groups of one to three ASCII
    digits separated by dots, and every group must be in the range 0-255.

    Args:
        value: Candidate host value. Anything that is not a ``str`` is
            reported as not being an IP address.

    Returns:
        bool: True for a well-formed IPv4 address, False otherwise.
    """
    if not isinstance(value, str):
        return False
    # fullmatch (instead of '^...$') rejects a trailing newline, and [0-9]
    # (instead of \d) rejects non-ASCII digits.
    if re.fullmatch(r'(?:[0-9]{1,3}\.){3}[0-9]{1,3}', value) is None:
        return False
    # Three digits can spell values up to 999; a real octet stops at 255.
    return all(int(octet) <= 255 for octet in value.split('.'))

def modify_host_column(dataset):
    """Turn the 'host' column into a flag: 1 if the value is an IP address, 0 otherwise.

    The column is cast to ``str`` before each value is checked with
    ``is_ip_address``. ``dataset`` is modified in place and also returned.
    """
    def _ip_flag(host):
        return 1 if is_ip_address(host) else 0

    host_as_text = dataset['host'].astype(str)
    dataset['host'] = host_as_text.apply(_ip_flag)
    return dataset

def remove_columns(dataset, numeric_columns, categorical_columns):
    """Drop every column that is neither a listed feature nor 'label'.

    Args:
        dataset: DataFrame to prune. It is modified in place.
        numeric_columns: Names of the numeric feature columns to keep.
        categorical_columns: Names of the categorical feature columns to keep.

    Returns:
        The same ``dataset`` object, now holding only the listed columns that
        were present plus 'label' (if present).
    """
    wanted = {*numeric_columns, *categorical_columns, 'label'}
    # dict.fromkeys de-duplicates while keeping order: a repeated column label
    # must be dropped once, otherwise the second attempt raises KeyError.
    unwanted = [column for column in dict.fromkeys(dataset.columns) if column not in wanted]
    if unwanted:
        # One drop call instead of one in-place drop per column.
        dataset.drop(columns=unwanted, inplace=True)
    return dataset

def replace_null_values(dataset, numeric_columns, categorical_columns, numeric_imputer, categorical_imputer, fit=True):
    """Fill missing values in the numeric and categorical feature columns.

    Args:
        dataset: DataFrame to impute. It is modified in place.
        numeric_columns: Names of the columns handled by ``numeric_imputer``.
        categorical_columns: Names of the columns handled by
            ``categorical_imputer``.
        numeric_imputer: Object exposing ``fit_transform`` and ``transform``
            (e.g. a ``SimpleImputer``).
        categorical_imputer: Same, for the categorical columns.
        fit: When True (default) both imputers are fitted on ``dataset``
            before transforming it. Pass False to reuse imputers that were
            already fitted on another frame, so that frame's statistics are
            applied instead of statistics computed from ``dataset``.

    Returns:
        The same ``dataset`` object with the listed columns overwritten by
        the imputers' output.
    """
    if fit:
        dataset[numeric_columns] = numeric_imputer.fit_transform(dataset[numeric_columns])
        dataset[categorical_columns] = categorical_imputer.fit_transform(dataset[categorical_columns])
    else:
        # Transform only: the fill values learned earlier stay untouched.
        dataset[numeric_columns] = numeric_imputer.transform(dataset[numeric_columns])
        dataset[categorical_columns] = categorical_imputer.transform(dataset[categorical_columns])
    return dataset

def normalize_numeric_columns(dataset, numeric_columns, scaler=None, fit=True):
    """Rescale the numeric feature columns with a scaler.

    Args:
        dataset: DataFrame to rescale. It is modified in place.
        numeric_columns: Names of the columns to rescale.
        scaler: Object exposing ``fit_transform`` and ``transform``. When
            omitted, a new ``MinMaxScaler`` is created for this call.
        fit: When True (default) the scaler is fitted on ``dataset`` before
            transforming it. Pass False together with an already fitted
            ``scaler`` to apply another frame's scaling to ``dataset``.

    Returns:
        The same ``dataset`` object with the listed columns overwritten by
        the scaler's output.

    Raises:
        ValueError: If ``fit`` is False and no ``scaler`` is supplied, since
            a freshly created scaler has nothing to transform with.
    """
    if scaler is None:
        if not fit:
            raise ValueError('fit=False requires an already fitted scaler')
        scaler = MinMaxScaler()
    if fit:
        scaled = scaler.fit_transform(dataset[numeric_columns])
    else:
        scaled = scaler.transform(dataset[numeric_columns])
    dataset[numeric_columns] = scaled
    return dataset

def top_n(dataset, categorical_columns, top_val=10, reference=None):
    """Keep the most frequent categories of each column and replace the rest with 'Other'.

    Args:
        dataset: DataFrame to reduce. It is modified in place.
        categorical_columns: Names of the columns to reduce.
        top_val: Number of most frequent categories to keep per column.
        reference: Optional DataFrame whose value counts decide which
            categories are kept. When omitted, the counts come from
            ``dataset`` itself. Passing the training frame here while
            reducing the test frame gives both frames the same category set.

    Returns:
        The same ``dataset`` object; in each listed column every value
        outside the kept categories is replaced by the string 'Other'.
    """
    for col in categorical_columns:
        if reference is None:
            top_n_categories = dataset[col].value_counts().index[:top_val].tolist()
        else:
            ranked = reference[col].value_counts().index
            # Skip the placeholder itself: a reference that was already
            # reduced must still contribute all of its real categories.
            top_n_categories = [category for category in ranked if category != 'Other'][:top_val]
        dataset[col] = dataset[col].where(dataset[col].isin(top_n_categories), other='Other')
    return dataset

In [None]:
# Read the training records (one JSON document per line) straight into a DataFrame.
train_file_path = 'dataset_train.json'
dataset = pd.DataFrame(load_json(train_file_path, []))

In [None]:
# Read the test records (one JSON document per line) straight into a DataFrame.
test_file_path = 'dataset_test.json'
dataset_test = pd.DataFrame(load_json(test_file_path, []))

In [5]:
# Feature columns kept for modelling; every other column except 'label' is dropped.
numeric_columns = [
    'request_body_len',
    'trans_depth',
    'response_body_len',
    'host',
]
categorical_columns = [
    'dest_port',
    'method',
    'version',
    'status_code',
    'response_content_type',
    'request_content_type',
]

# Same preparation for both splits: prune the columns, then turn 'host' into a 1/0 IP flag.
dataset = modify_host_column(remove_columns(dataset, numeric_columns, categorical_columns))
dataset_test = modify_host_column(remove_columns(dataset_test, numeric_columns, categorical_columns))

In [6]:
numeric_imputer = SimpleImputer(strategy='mean')
categorical_imputer = SimpleImputer(strategy='most_frequent')
scaler = MinMaxScaler()

# Training split: the imputers and the scaler learn their statistics here.
dataset = replace_null_values(dataset, numeric_columns, categorical_columns, numeric_imputer, categorical_imputer)
dataset[numeric_columns] = scaler.fit_transform(dataset[numeric_columns])

# Test split: only transform, reusing what was learned on the training split.
# Refitting on the test data would fill and scale the two splits with
# different statistics and leak test information into the preprocessing.
# Scaled test values may fall outside [0, 1] when they exceed the training range.
dataset_test[numeric_columns] = numeric_imputer.transform(dataset_test[numeric_columns])
dataset_test[categorical_columns] = categorical_imputer.transform(dataset_test[categorical_columns])
dataset_test[numeric_columns] = scaler.transform(dataset_test[numeric_columns])


In [9]:
# Reduce each categorical column of the training split to its most frequent categories.
dataset = top_n(dataset, categorical_columns)

# Give the test split the training split's category set: computing the top
# categories separately per split would produce different one-hot columns.
for col in categorical_columns:
    kept_categories = dataset[col].unique()
    dataset_test[col] = dataset_test[col].where(dataset_test[col].isin(kept_categories), other='Other')

# One-hot encoding of the categorical data. Passing `columns=` encodes every
# listed column, including any whose dtype is numeric (by default get_dummies
# only converts object, string and category columns).
categorical_encoded = pd.get_dummies(dataset[categorical_columns], columns=categorical_columns)
categorical_encoded_test = pd.get_dummies(dataset_test[categorical_columns], columns=categorical_columns)

# Align the test encoding with the training encoding: columns missing from the
# test split are added as zeros, columns unknown to the training split are dropped.
categorical_encoded_test = categorical_encoded_test.reindex(columns=categorical_encoded.columns, fill_value=0)

dataset = dataset.drop(columns=categorical_columns)  # remove the original categorical columns
dataset = pd.concat([dataset, categorical_encoded], axis=1)  # append the encoded columns

dataset_test = dataset_test.drop(columns=categorical_columns)  # remove the original categorical columns
dataset_test = pd.concat([dataset_test, categorical_encoded_test], axis=1)  # append the encoded columns


In [11]:
# Split the training frame by label; only the rows with label 0 feed X_train / y_train.
label_is_zero = dataset['label'].eq(0)
data_label_0 = dataset.loc[label_is_zero]
data_label_1 = dataset.loc[~label_is_zero]
X_train = data_label_0.drop(columns='label')
y_train = data_label_0['label']

In [12]:
# Separate the test features from the test labels.
X_test = dataset_test.drop(columns=['label'])
y_test = dataset_test.loc[:, 'label']

In [13]:
# Cast the training features, test features and test labels to float32.
X_train, X_test, y_test = (
    frame.astype('float32') for frame in (X_train, X_test, y_test)
)

In [14]:
# Create the output directory first: to_json does not create missing parent
# directories, so a fresh checkout would otherwise fail on the first write.
Path('dati').mkdir(parents=True, exist_ok=True)

# Each frame is written as JSON Lines (one record per line).
output_path = 'dati/X_train.json'
X_train.to_json(output_path, orient='records', lines=True)

output_path = 'dati/X_test.json'
X_test.to_json(output_path, orient='records', lines=True)

output_path = 'dati/y_test.json'
y_test.to_json(output_path, orient='records', lines=True)

output_path = 'dati/y_train.json'
y_train.to_json(output_path, orient='records', lines=True)

In [None]:
def _write_csv_and_preview(data, csv_path):
    """Wrap ``data`` in a DataFrame, write it to ``csv_path`` without the index, and print its head."""
    frame = pd.DataFrame(data)
    frame.to_csv(csv_path, index=False)
    print(frame.head())
    return frame


# Same order as the exports above it in the notebook: X_train, X_test, y_test, y_train.
X_train_df = _write_csv_and_preview(X_train, 'dati/X_train.csv')

X_test_df = _write_csv_and_preview(X_test, 'dati/X_test.csv')

y_test_df = _write_csv_and_preview(y_test, 'dati/y_test.csv')

y_train_df = _write_csv_and_preview(y_train, 'dati/y_train.csv')