In [None]:
import pandas as pd
import numpy as np
import json
import re
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import MinMaxScaler

In [49]:
def load_json(file_path, dataset=None):
    """Load a JSON Lines file, appending one parsed object per line to ``dataset``.

    Args:
        file_path: Path of a text file holding one JSON document per line.
        dataset: List that receives the parsed records; it is extended in
            place. When omitted, a new list is created.

    Returns:
        The list the records were appended to (``dataset`` itself when given).

    Raises:
        json.JSONDecodeError: If a non-blank line is not valid JSON.
    """
    if dataset is None:
        dataset = []
    # Read as UTF-8 explicitly instead of relying on the platform default encoding.
    with open(file_path, 'r', encoding='utf-8') as file:
        for line in file:
            # Blank lines carry no record and would make json.loads raise.
            if line.strip():
                dataset.append(json.loads(line))
    return dataset

def is_ip_address(value):
    """Return True when ``value`` is a dotted-quad IPv4 address.

    The whole string must consist of four groups of one to three ASCII digits
    separated by dots, and every group must be in the range 0-255.

    Args:
        value: The string to check.

    Returns:
        bool: True for strings such as '192.168.1.1'; False otherwise
        (host names, out-of-range octets, trailing newline, empty string).
    """
    # fullmatch (unlike match + '$') rejects a trailing newline; re.ASCII keeps
    # \d from matching non-ASCII digits.
    match = re.fullmatch(r'(\d{1,3})\.(\d{1,3})\.(\d{1,3})\.(\d{1,3})', value, flags=re.ASCII)
    if match is None:
        return False
    # Three digits can spell values up to 999; a valid octet is at most 255.
    return all(int(octet) <= 255 for octet in match.groups())

def modify_host_column(dataset):
    """Replace the 'host' column with a binary flag: 1 for an IP address, 0 otherwise.

    Every value is converted to a string before being checked with
    ``is_ip_address``. The frame is modified in place and also returned.
    """
    def ip_flag(host):
        if is_ip_address(host):
            return 1
        return 0

    host_as_text = dataset['host'].astype(str)
    dataset['host'] = host_as_text.apply(ip_flag)
    return dataset

def remove_columns(dataset, numeric_columns, categorical_columns, label_column='label'):
    """Drop every column that is neither a selected feature nor the label.

    Args:
        dataset: DataFrame to prune; it is modified in place.
        numeric_columns: Names of the numeric feature columns to keep.
        categorical_columns: Names of the categorical feature columns to keep.
        label_column: Name of the target column, which is always kept.

    Returns:
        The same ``dataset`` object, with the unwanted columns removed.
    """
    keep = set(numeric_columns) | set(categorical_columns) | {label_column}
    # Decide what to drop first and drop once: this avoids mutating the frame
    # while iterating over its columns and replaces one drop per column with a
    # single call.
    unwanted = [column for column in dataset.columns if column not in keep]
    dataset.drop(columns=unwanted, inplace=True)
    return dataset

def replace_null_values(dataset, numeric_columns, categorical_columns, numeric_imputer, categorical_imputer, fit=True):
    """Fill missing values in the numeric and categorical columns.

    Args:
        dataset: DataFrame to impute; it is modified in place.
        numeric_columns: Names of the columns handled by ``numeric_imputer``.
        categorical_columns: Names of the columns handled by ``categorical_imputer``.
        numeric_imputer: Object exposing ``fit_transform`` / ``transform``.
        categorical_imputer: Object exposing ``fit_transform`` / ``transform``.
        fit: When True (the default) the imputers are fitted on ``dataset``
            before transforming it. Pass False to apply imputers that were
            already fitted on another dataset (e.g. the training set) without
            refitting them.

    Returns:
        The same ``dataset`` object, with the selected columns overwritten.
    """
    method_name = 'fit_transform' if fit else 'transform'
    column_groups = (
        (numeric_columns, numeric_imputer),
        (categorical_columns, categorical_imputer),
    )
    for columns, imputer in column_groups:
        dataset[columns] = getattr(imputer, method_name)(dataset[columns])
    return dataset

def normalize_numeric_columns(dataset, numeric_columns, scaler=None):
    """Rescale the numeric columns of ``dataset``.

    Args:
        dataset: DataFrame to rescale; it is modified in place.
        numeric_columns: Names of the columns to rescale.
        scaler: Optional, already fitted scaler exposing ``transform``. When
            given, it is applied as-is (no refitting), so a scaler fitted on
            the training set can be reused on other datasets. When omitted, a
            new ``MinMaxScaler`` is fitted on ``dataset`` itself.

    Returns:
        The same ``dataset`` object, with the numeric columns overwritten.
    """
    if scaler is None:
        scaled = MinMaxScaler().fit_transform(dataset[numeric_columns])
    else:
        scaled = scaler.transform(dataset[numeric_columns])
    dataset[numeric_columns] = scaled
    return dataset

def top_n(dataset, categorical_columns, top_val=10, other_label='Other'):
    """Keep the most frequent categories of each column and merge the rest.

    For every column in ``categorical_columns`` the ``top_val`` most frequent
    values (as ranked by ``value_counts``) are kept; every other value is
    replaced by ``other_label``.

    Args:
        dataset: DataFrame to process; it is modified in place.
        categorical_columns: Names of the columns to reduce.
        top_val: Number of most frequent categories to keep per column.
        other_label: Replacement for values outside the kept categories.

    Returns:
        The same ``dataset`` object, with the listed columns overwritten.
    """
    for col in categorical_columns:
        # value_counts sorts by descending frequency, so the head is the top N.
        frequent = dataset[col].value_counts().index[:top_val].tolist()
        dataset[col] = dataset[col].where(dataset[col].isin(frequent), other=other_label)
    return dataset

def align_columns(train, test):
    """Give ``train`` and ``test`` the same columns, in the same sorted order.

    A column present in only one frame is added to the other one (in place)
    and filled with False. The returned frames are reindexed copies whose
    columns are sorted by label.

    Returns:
        tuple: ``(train, test)`` with identical, sorted column labels.
    """
    train_names = set(train.columns)
    test_names = set(test.columns)

    # Columns found only in train: add them to test, filled with False.
    for name in train_names - test_names:
        test[name] = False

    # Columns found only in test: add them to train, filled with False.
    for name in test_names - train_names:
        train[name] = False

    # Sort the column labels so both frames share one layout.
    aligned_train = train.reindex(columns=sorted(train.columns))
    aligned_test = test.reindex(columns=sorted(test.columns))
    return aligned_train, aligned_test

In [None]:
# Load the training set and the two test sets from their CSV files.
df_train, df_test_19, df_test_04 = (
    pd.read_csv(csv_path)
    for csv_path in ('dataset_train.csv', 'dataset_test_19.csv', 'dataset_test_04.csv')
)

In [None]:
# Feature columns kept for the rest of the notebook; every other column
# except 'label' is dropped by remove_columns.
numeric_columns = [
    'request_body_len',
    'trans_depth',
    'response_body_len',
    'host',
]
categorical_columns = [
    'dest_port',
    'method',
    'version',
    'status_code',
    'response_content_type',
    'request_content_type',
]

# Keep only the selected columns, then turn 'host' into a 0/1 flag
# (1 when the value is an IP address).
df_train = modify_host_column(remove_columns(df_train, numeric_columns, categorical_columns))
df_test_19 = modify_host_column(remove_columns(df_test_19, numeric_columns, categorical_columns))
df_test_04 = modify_host_column(remove_columns(df_test_04, numeric_columns, categorical_columns))

In [None]:
numeric_imputer = SimpleImputer(strategy='mean')
categorical_imputer = SimpleImputer(strategy='most_frequent')
numeric_scaler = MinMaxScaler()

# Learn the imputation values and the min/max range from the training set only.
df_train = replace_null_values(df_train, numeric_columns, categorical_columns, numeric_imputer, categorical_imputer)
df_train[numeric_columns] = numeric_scaler.fit_transform(df_train[numeric_columns])

# Apply the training-set statistics to the test sets WITHOUT refitting:
# refitting on each test set would leak test statistics into preprocessing and
# put the test features on a different scale from the training features.
# Scaled test values may therefore fall outside [0, 1].
for _df_test in (df_test_19, df_test_04):
    _df_test[numeric_columns] = numeric_imputer.transform(_df_test[numeric_columns])
    _df_test[categorical_columns] = categorical_imputer.transform(_df_test[categorical_columns])
    _df_test[numeric_columns] = numeric_scaler.transform(_df_test[numeric_columns])


In [None]:
# (rows, columns) of each dataset after column selection, imputation and scaling.
print(df_train.shape, df_test_19.shape, df_test_04.shape, sep='\n')

(1132966, 11)
(932294, 11)
(1288025, 11)


In [None]:
# Collapse rare categories. Each dataset ranks its own categories, so the kept
# categories (and the resulting dummy columns) can differ between datasets.
df_train, df_test_19, df_test_04 = (
    top_n(frame, categorical_columns)
    for frame in (df_train, df_test_19, df_test_04)
)

# One-hot encode the categorical columns with pandas get_dummies.
categorical_encoded_train = pd.get_dummies(df_train[categorical_columns])
categorical_encoded_test_19 = pd.get_dummies(df_test_19[categorical_columns])
categorical_encoded_test_04 = pd.get_dummies(df_test_04[categorical_columns])

# Swap the original categorical columns for their one-hot encoded counterparts.
df_train = pd.concat(
    [df_train.drop(columns=categorical_columns), categorical_encoded_train],
    axis=1,
)
df_test_19 = pd.concat(
    [df_test_19.drop(columns=categorical_columns), categorical_encoded_test_19],
    axis=1,
)
df_test_04 = pd.concat(
    [df_test_04.drop(columns=categorical_columns), categorical_encoded_test_04],
    axis=1,
)


In [None]:
# (rows, columns) of each dataset after one-hot encoding.
print(df_train.shape, df_test_19.shape, df_test_04.shape, sep='\n')

(1132966, 63)
(932294, 63)
(1288025, 63)


In [None]:
# Align the three datasets pairwise. The third call gives df_test_19 the
# columns that df_train picked up from df_test_04 in the second call.
df_train, df_test_19 = align_columns(train=df_train, test=df_test_19)
df_train, df_test_04 = align_columns(train=df_train, test=df_test_04)
df_test_19, df_test_04 = align_columns(train=df_test_19, test=df_test_04)

In [None]:
# (rows, columns) of each dataset after column alignment.
print(df_train.shape, df_test_19.shape, df_test_04.shape, sep='\n')

(1132966, 73)
(932294, 73)
(1288025, 73)


In [14]:
# EXAMPLE: how align_columns works
# Training frame with columns 'A', 'B', 'C'
data_train = dict(A=[1, 2, 3], B=[4, 5, 6], C=[7, 8, 9])
train = pd.DataFrame(data_train)

# Test frame with columns 'B', 'C', 'D'
data_test = dict(B=[10, 11, 12], C=[13, 14, 15], D=[16, 17, 18])
test = pd.DataFrame(data_test)

# Show both frames before alignment.
print("\ntrain:", train, sep="\n")
print("\ntest:", test, sep="\n")

train, test = align_columns(train, test)

# Show both frames after alignment: each gained the column it was missing.
print("\ntrain modificato:", train, sep="\n")
print("\ntest modificato:", test, sep="\n")


train:
   A  B  C
0  1  4  7
1  2  5  8
2  3  6  9

test:
    B   C   D
0  10  13  16
1  11  14  17
2  12  15  18

train modificato:
   A  B  C      D
0  1  4  7  False
1  2  5  8  False
2  3  6  9  False

test modificato:
       A   B   C   D
0  False  10  13  16
1  False  11  14  17
2  False  12  15  18


In [None]:
# Split the training rows by label: 0 versus any other value.
data_label_0 = df_train.loc[df_train['label'] == 0]
data_label_1 = df_train.loc[df_train['label'] != 0]

# The training features come from the label-0 rows only.
X_train = data_label_0.drop(columns='label')

In [None]:
# Separate features (X) and labels (y) for each test set.
X_test_19, y_test_19 = df_test_19.drop(columns='label'), df_test_19['label']
X_test_04, y_test_04 = df_test_04.drop(columns='label'), df_test_04['label']

In [None]:
# Cast every feature matrix and label vector to 32-bit floats.
X_train = X_train.astype(np.float32)

X_test_19, y_test_19 = X_test_19.astype(np.float32), y_test_19.astype(np.float32)
X_test_04, y_test_04 = X_test_04.astype(np.float32), y_test_04.astype(np.float32)

In [None]:
# Persist the prepared splits as CSV files (without the index column).
X_train_df = pd.DataFrame(X_train)
X_train_df.to_csv('X_train.csv', index=False)

X_test_df_19 = pd.DataFrame(X_test_19)
X_test_df_19.to_csv('X_test_19.csv', index=False)

# Labels of test set 19: built from y_test_19 and written to their own file.
# (Previously this wrote X_test_19 to 'X_test_19.csv' a second time, so the
# labels were never saved.)
y_test_df_19 = pd.DataFrame(y_test_19)
y_test_df_19.to_csv('y_test_19.csv', index=False)

X_test_df_04 = pd.DataFrame(X_test_04)
X_test_df_04.to_csv('X_test_04.csv', index=False)

y_test_df_04 = pd.DataFrame(y_test_04)
y_test_df_04.to_csv('y_test_04.csv', index=False)