In [1]:
import pandas as pd
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split
import json
import re

In [8]:
def load_json(file_path, dataset=None):
    """Read a JSON Lines file and append one parsed object per line to ``dataset``.

    Parameters
    ----------
    file_path : str or path-like
        File containing one JSON document per line.
    dataset : list, optional
        List that receives the parsed objects; it is extended in place.
        A new list is created when omitted.

    Returns
    -------
    list
        The same list object that was passed in (or the newly created one).

    Raises
    ------
    json.JSONDecodeError
        If a non-blank line is not valid JSON.
    """
    if dataset is None:
        dataset = []
    # Decode explicitly as UTF-8 so the result does not depend on the
    # platform's default locale encoding.
    with open(file_path, 'r', encoding='utf-8') as file:
        for line in file:
            # Blank lines (e.g. a trailing empty line) carry no record and
            # would otherwise make json.loads raise.
            if not line.strip():
                continue
            dataset.append(json.loads(line))
    return dataset

def is_ip_address(value):
    """Return True when ``value`` is a dotted-quad IPv4 address.

    The whole string must consist of four dot-separated groups of one to
    three ASCII digits, and every group must be in the range 0-255.

    Parameters
    ----------
    value : str
        Text to check.

    Returns
    -------
    bool
    """
    # fullmatch (unlike match + '$') also rejects a trailing newline.
    match = re.fullmatch(
        r'(\d{1,3})\.(\d{1,3})\.(\d{1,3})\.(\d{1,3})', value, flags=re.ASCII
    )
    if match is None:
        return False
    # The digit pattern alone would accept values such as 999.
    return all(int(octet) <= 255 for octet in match.groups())

def modify_host_column(dataset):
    """Replace the 'host' column with 1 where the host is an IP address, else 0.

    The column is cast to ``str`` before the check and the frame is modified
    in place; the same frame is returned.
    """
    def as_flag(host):
        if is_ip_address(host):
            return 1
        return 0

    host_text = dataset['host'].astype(str)
    dataset['host'] = host_text.apply(as_flag)
    return dataset

def remove_columns(dataset, numeric_columns, categorical_columns):
    """Drop, in place, every column that is neither a listed feature nor 'label'.

    Returns the same (mutated) frame.
    """
    def is_wanted(name):
        return name in numeric_columns or name in categorical_columns or name == 'label'

    unwanted = [name for name in dataset.columns if not is_wanted(name)]
    dataset.drop(columns=unwanted, inplace=True)
    return dataset

def replace_null_values(dataset, numeric_columns, categorical_columns,
                        numeric_imputer, categorical_imputer, fit=True):
    """Fill missing values in the numeric and categorical columns of ``dataset``.

    Parameters
    ----------
    dataset : pandas.DataFrame
        Frame to impute; the listed columns are overwritten in place.
    numeric_columns, categorical_columns : list
        Column names handled by the respective imputer.
    numeric_imputer, categorical_imputer
        Objects exposing ``fit_transform`` and ``transform``.
    fit : bool, default True
        When True the imputers are (re)fitted on ``dataset`` before filling.
        Pass False to reuse imputers that were already fitted on another
        frame (typically the training split), so that both frames are
        filled with the same statistics.

    Returns
    -------
    pandas.DataFrame
        The same frame that was passed in.
    """
    for columns, imputer in (
        (numeric_columns, numeric_imputer),
        (categorical_columns, categorical_imputer),
    ):
        impute = imputer.fit_transform if fit else imputer.transform
        dataset[columns] = impute(dataset[columns])
    return dataset

def normalize_numeric_columns(dataset, numeric_columns, scaler=None, fit=True):
    """Scale the numeric columns of ``dataset`` in place.

    Parameters
    ----------
    dataset : pandas.DataFrame
        Frame whose ``numeric_columns`` are overwritten with scaled values.
    numeric_columns : list
        Names of the columns to scale.
    scaler : optional
        Object exposing ``fit_transform`` and ``transform``. When omitted a
        new ``MinMaxScaler`` is created and fitted on ``dataset``.
    fit : bool, default True
        When True the scaler is fitted on ``dataset``. Pass False together
        with an already fitted ``scaler`` to apply the same scaling to
        another frame (for example the test split).

    Returns
    -------
    pandas.DataFrame
        The same frame that was passed in.

    Raises
    ------
    ValueError
        If ``fit`` is False but no scaler is supplied.
    """
    if scaler is None:
        if not fit:
            raise ValueError("fit=False requires an already fitted scaler")
        scaler = MinMaxScaler()
    scale = scaler.fit_transform if fit else scaler.transform
    dataset[numeric_columns] = scale(dataset[numeric_columns])
    return dataset

def top_n(dataset, categorical_columns, top_val=10):
    """Collapse infrequent categories of each listed column into 'Other'.

    For every column, the ``top_val`` most frequent values (as ranked by
    ``value_counts``) are kept and every other value is replaced with the
    string 'Other'. Columns are overwritten in place.

    Parameters
    ----------
    dataset : pandas.DataFrame
    categorical_columns : list
        Names of the columns to process.
    top_val : int, default 10
        Number of most frequent values to keep per column.

    Returns
    -------
    pandas.DataFrame
        The same frame that was passed in.
    """
    for col in categorical_columns:
        frequent = dataset[col].value_counts().index[:top_val].tolist()
        dataset[col] = dataset[col].where(dataset[col].isin(frequent), other='Other')
    return dataset

In [3]:
# Load the training records (one JSON object per line) into a DataFrame.
train_file_path = 'dataset_train.json'
dataset = pd.DataFrame(load_json(train_file_path, []))
print(dataset.head())

   status_code   method  request_body_len @type                          uri  \
0          200  CONNECT               0.0  http                   /sdkTunnel   
1          304      GET               0.0  http                            /   
2          200      GET               0.0  http                     /metrics   
3          200  CONNECT               0.0  http                   /sdkTunnel   
4          200      GET               0.0  http  /realtime/index.php?lang=en   

  version                                          source_ip  \
0     1.1  5af08578006f6d82c78bc5fd1f149e62d2b5fefd5ed99d...   
1     1.1  7c117019b8489d2f6017fbadaf89d0f70dbf02d50d0fd8...   
2     1.1  28f33c01fc50aead587f431b10723df3ed7cd7564921b2...   
3     1.1  5af08578006f6d82c78bc5fd1f149e62d2b5fefd5ed99d...   
4     1.1  fe7560f664a25aef610273189411c39779d6b926da6338...   

                    uid  trans_depth     request_content_type  ... orig_fuids  \
0  CWcf913k0wd2CmwaU900            1  text/xml; chars

In [4]:
# Load the test records (one JSON object per line) into a DataFrame.
test_file_path = 'dataset_test.json'
dataset_test = pd.DataFrame(load_json(test_file_path, []))
print(dataset_test.head())

   status_code method  request_body_len @type  \
0          101    GET                 0  http   
1          200    GET                 0  http   
2          307   POST               565  http   
3          200   POST               565  http   
4          200    GET                 0  http   

                                                 uri version  \
0  /socket.io/?associate=837cc430-bf0a-4477-a4c2-...     1.1   
1           /cost/documents/privacy.php?d=1877551200     1.1   
2  /webhdfs/v1/user/hdp/Data/smart_homes/apio/tes...     1.1   
3  /webhdfs/v1/user/hdp/Data/smart_homes/apio/tes...     1.1   
4  /webhdfs/v1/user/hdp/Data/smart_homes/apio/tes...     1.1   

                                           source_ip             info_msg  \
0  cca5d72e4cd19d47753ebbd685675716b3f8f3d1cc6364...  Switching Protocols   
1  4ad9234dae1ee8f5acf4b6c8921643b26bc354d10d3462...                  NaN   
2  d6980d33e07148a195c197deffa2577ac08cde23fc27d4...                  NaN   
3  d6980d33e

In [5]:
# Feature columns kept for modelling; remove_columns drops every other
# column except 'label'.
numeric_columns = [
    'request_body_len',
    'trans_depth',
    'response_body_len',
    'host',
]
categorical_columns = [
    'dest_port',
    'method',
    'version',
    'status_code',
    'response_content_type',
    'request_content_type',
]

# Prune the columns, then turn 'host' into a 0/1 "is an IP address" flag,
# for both splits.
dataset = modify_host_column(
    remove_columns(dataset, numeric_columns, categorical_columns)
)
dataset_test = modify_host_column(
    remove_columns(dataset_test, numeric_columns, categorical_columns)
)

In [6]:
numeric_imputer = SimpleImputer(strategy='mean')
categorical_imputer = SimpleImputer(strategy='most_frequent')
numeric_scaler = MinMaxScaler()

# Training split: fit the imputers and the scaler here.
dataset = replace_null_values(dataset, numeric_columns, categorical_columns, numeric_imputer, categorical_imputer)
dataset[numeric_columns] = numeric_scaler.fit_transform(dataset[numeric_columns])

# Test split: reuse the transformers fitted on the training split. Re-fitting
# them on the test data would fill and scale the two splits with different
# statistics, so the same raw value would map to different feature values.
dataset_test[numeric_columns] = numeric_imputer.transform(dataset_test[numeric_columns])
dataset_test[categorical_columns] = categorical_imputer.transform(dataset_test[categorical_columns])
dataset_test[numeric_columns] = numeric_scaler.transform(dataset_test[numeric_columns])


In [9]:
dataset = top_n(dataset, categorical_columns)

# Bucket the test split with the categories that were kept for the training
# split. Ranking the test split by its own frequencies would keep a different
# set of values and therefore produce different one-hot columns.
for col in categorical_columns:
    kept_categories = dataset[col].unique()
    dataset_test[col] = dataset_test[col].where(
        dataset_test[col].isin(kept_categories), other='Other'
    )

# One-hot encode the categorical data of both splits in a single pass so that
# train and test end up with exactly the same dummy columns, in the same order.
n_train = len(dataset)
combined_encoded = pd.get_dummies(
    pd.concat([dataset[categorical_columns], dataset_test[categorical_columns]])
)
categorical_encoded = combined_encoded.iloc[:n_train]
categorical_encoded_test = combined_encoded.iloc[n_train:]

# Replace the original categorical columns with their encoded counterparts.
dataset = pd.concat([dataset.drop(columns=categorical_columns), categorical_encoded], axis=1)
dataset_test = pd.concat([dataset_test.drop(columns=categorical_columns), categorical_encoded_test], axis=1)


In [10]:
# Preview both splits after one-hot encoding.
print("\nAnteprima dopo la codifica one-hot:")
for frame in (dataset, dataset_test):
    print(frame.head())


Anteprima dopo la codifica one-hot:
   request_body_len  trans_depth  host  response_body_len  label  \
0               0.0          0.0   1.0           0.000000      0   
1               0.0          0.0   0.0           0.000000      0   
2               0.0          0.0   1.0           0.000009      0   
3               0.0          0.0   1.0           0.000000      0   
4               0.0          0.0   0.0           0.000008      0   

   dest_port_80  dest_port_3000  dest_port_7000  dest_port_7001  \
0          True           False           False           False   
1          True           False           False           False   
2         False            True           False           False   
3          True           False           False           False   
4          True           False           False           False   

   dest_port_7003  ...  request_content_type_application/json  \
0           False  ...                                  False   
1           False  ..

In [11]:
# Split the training frame by label; only the label-0 rows feed X_train / y_train.
is_label_0 = dataset['label'] == 0
data_label_0 = dataset[is_label_0]
data_label_1 = dataset[~is_label_0]

X_train = data_label_0.drop(columns='label')
y_train = data_label_0['label']

In [12]:
# Separate features and target for the test split.
X_test = dataset_test.drop(columns='label')
y_test = dataset_test['label']

In [13]:
# Cast the feature matrices and the test labels to float32
# (y_train is not cast here).
X_train, X_test, y_test = (
    part.astype('float32') for part in (X_train, X_test, y_test)
)

In [14]:
# Persist each split as JSON Lines (one record per line).
for output_path, split in (
    ('dati/X_train.json', X_train),
    ('dati/X_test.json', X_test),
    ('dati/y_test.json', y_test),
    ('dati/y_train.json', y_train),
):
    split.to_json(output_path, orient='records', lines=True)

In [16]:
# Wrap every split in a DataFrame so features and labels share one export path.
X_train_df = pd.DataFrame(X_train)
X_test_df = pd.DataFrame(X_test)
y_test_df = pd.DataFrame(y_test)
y_train_df = pd.DataFrame(y_train)

# Write each frame to CSV without the index, then preview it.
for csv_path, frame in (
    ('dati/X_train.csv', X_train_df),
    ('dati/X_test.csv', X_test_df),
    ('dati/y_test.csv', y_test_df),
    ('dati/y_train.csv', y_train_df),
):
    frame.to_csv(csv_path, index=False)
    print(frame.head())

   request_body_len  trans_depth  host  response_body_len  dest_port_80  \
0               0.0          0.0   1.0           0.000000           1.0   
1               0.0          0.0   0.0           0.000000           1.0   
2               0.0          0.0   1.0           0.000009           0.0   
3               0.0          0.0   1.0           0.000000           1.0   
4               0.0          0.0   0.0           0.000008           1.0   

   dest_port_3000  dest_port_7000  dest_port_7001  dest_port_7003  \
0             0.0             0.0             0.0             0.0   
1             0.0             0.0             0.0             0.0   
2             1.0             0.0             0.0             0.0   
3             0.0             0.0             0.0             0.0   
4             0.0             0.0             0.0             0.0   

   dest_port_8002  ...  request_content_type_application/json  \
0             0.0  ...                                    0.0   
1   