In [None]:
import pandas as pd
import json
import re
import numpy as np
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import MinMaxScaler

In [2]:
def load_json(file_path, dataset):
    """Append every record of a JSON Lines file to ``dataset``.

    Each non-blank line of the file is parsed with ``json.loads`` and the
    result is appended to ``dataset`` in file order.

    Args:
        file_path: Path of the file to read (one JSON document per line).
        dataset: Object with an ``append`` method (e.g. a list); mutated in place.

    Returns:
        The same ``dataset`` object that was passed in.

    Raises:
        OSError: If the file cannot be opened.
        json.JSONDecodeError: If a non-blank line is not valid JSON.
    """
    # Explicit UTF-8 so parsing does not depend on the platform's default encoding.
    with open(file_path, 'r', encoding='utf-8') as file:
        for line in file:
            # Blank / whitespace-only lines carry no record; skipping them avoids
            # a JSONDecodeError on e.g. an empty line at the end of the file.
            if not line.strip():
                continue
            dataset.append(json.loads(line))
    return dataset

def remove_columns(dataset, numeric_columns, categorical_columns):
    """Drop, in place, every column that is neither a selected feature nor ``'label'``.

    Args:
        dataset: DataFrame to prune; it is modified in place.
        numeric_columns: Names of the numeric feature columns to keep.
        categorical_columns: Names of the categorical feature columns to keep.

    Returns:
        The same ``dataset`` object, now holding only the kept columns.
    """
    columns_to_keep = set(numeric_columns) | set(categorical_columns) | {'label'}
    # dict.fromkeys de-duplicates while preserving order, so a column name that
    # appears more than once in the frame is requested for removal only once.
    columns_to_drop = list(dict.fromkeys(
        column for column in dataset.columns if column not in columns_to_keep
    ))
    # One drop call for all unwanted columns instead of one in-place drop each.
    if columns_to_drop:
        dataset.drop(columns=columns_to_drop, inplace=True)
    return dataset

def replace_null_values(dataset, numeric_columns, categorical_columns, numeric_imputer, categorical_imputer, fit=True):
    """Fill missing values in the feature columns of ``dataset`` in place.

    Args:
        dataset: DataFrame whose columns are overwritten with the imputed values.
        numeric_columns: Names of the columns handled by ``numeric_imputer``.
        categorical_columns: Names of the columns handled by ``categorical_imputer``.
        numeric_imputer: Object exposing ``fit_transform`` and ``transform``
            (this notebook passes a ``SimpleImputer``).
        categorical_imputer: Same protocol, applied to the categorical columns.
        fit: When True (default, the original behaviour) the imputers are
            fitted on ``dataset`` before transforming it. Pass False to reuse
            imputers already fitted on another dataset (e.g. apply statistics
            learned on the training set to the test set).

    Returns:
        The same ``dataset`` object with the selected columns imputed.
    """
    column_groups = (
        (numeric_columns, numeric_imputer),
        (categorical_columns, categorical_imputer),
    )
    for columns, imputer in column_groups:
        # Nothing to impute for an empty group; skipping also avoids handing
        # a zero-column frame to the imputer.
        if len(columns) == 0:
            continue
        impute = imputer.fit_transform if fit else imputer.transform
        dataset[columns] = impute(dataset[columns])
    return dataset

def normalize_numeric_columns(dataset, numeric_columns, scaler=None):
    """Rescale the numeric columns of ``dataset`` in place.

    Args:
        dataset: DataFrame whose numeric columns are overwritten.
        numeric_columns: Names of the columns to rescale.
        scaler: Optional, already fitted scaler exposing ``transform``. When
            omitted (the original behaviour) a new ``MinMaxScaler`` is fitted
            on ``dataset`` itself. Supplying the scaler fitted on the training
            set lets another dataset be scaled with the same parameters.

    Returns:
        The same ``dataset`` object with the selected columns rescaled.
    """
    numeric_block = dataset[numeric_columns]
    if scaler is None:
        scaled_values = MinMaxScaler().fit_transform(numeric_block)
    else:
        # The caller-provided scaler is only applied, never re-fitted.
        scaled_values = scaler.transform(numeric_block)
    dataset[numeric_columns] = scaled_values
    return dataset

def top_n(dataset, categorical_columns, top_val=10):
    """Keep only the most frequent categories of each column, in place.

    For every column in ``categorical_columns`` the ``top_val`` most frequent
    values (as ranked by ``value_counts``) are kept; every other entry of the
    column is replaced by the string ``'Other'``.

    Args:
        dataset: DataFrame whose categorical columns are overwritten.
        categorical_columns: Names of the columns to collapse.
        top_val: Number of most frequent categories to keep per column
            (default 10, the value previously hard-coded).

    Returns:
        The same ``dataset`` object with rare categories collapsed.
    """
    for col in categorical_columns:
        column_values = dataset[col]
        # value_counts orders by descending frequency, so the head is the top list.
        frequent_categories = column_values.value_counts().index[:top_val].tolist()
        is_frequent = column_values.isin(frequent_categories)
        dataset[col] = column_values.where(is_frequent, other='Other')
    return dataset

def align_columns(train, test):
    """Give ``train`` and ``test`` the same columns, in the same sorted order.

    Columns present in only one of the two frames are added to the other one,
    filled with ``False``. These additions are made on the objects passed in,
    so the caller's originals gain the new columns; the sorted column order,
    however, exists only on the returned frames.

    Args:
        train: Training DataFrame (gains any test-only columns in place).
        test: Test DataFrame (gains any train-only columns in place).

    Returns:
        Tuple ``(train, test)`` of new DataFrames with identical, sorted columns.
    """
    # Sorted lists rather than raw set iteration: the order in which columns
    # are appended to the inputs is then reproducible from run to run.
    missing_in_test = sorted(set(train.columns) - set(test.columns))
    missing_in_train = sorted(set(test.columns) - set(train.columns))

    for col in missing_in_test:  # add the columns missing from test, filled with False
        test[col] = False

    for col in missing_in_train:  # add the columns missing from train, filled with False
        train[col] = False

    # Reorder the columns of both datasets so they share the same structure.
    train = train.reindex(columns=sorted(train.columns))
    test = test.reindex(columns=sorted(test.columns))

    return train, test

def concat_datasets(df1, df2):
    """Stack ``df1`` on top of ``df2`` and return the result with a fresh 0..n-1 index."""
    frames = [df1, df2]
    return pd.concat(frames, ignore_index=True)

In [None]:
# Read the train and test sets from the CSV files in the working directory.
df_train, df_test = (
    pd.read_csv(csv_name)
    for csv_name in ('dataset_train.csv', 'dataset_test.csv')
)

In [None]:
# Feature columns used downstream, split by how they are preprocessed.
numeric_columns = ['auth_success', 'auth_attempts']
categorical_columns = [
    'dest_port',
    'status_guess',
    'version',
    'kex_alg',
    'mac_alg',
    'host_key_alg',
    'cipher_alg',
    'client',
    'direction',
]

# Prune both datasets down to the selected features (plus 'label').
df_train, df_test = (
    remove_columns(frame, numeric_columns, categorical_columns)
    for frame in (df_train, df_test)
)


In [None]:
numeric_imputer = SimpleImputer(strategy='mean')
categorical_imputer = SimpleImputer(strategy='most_frequent')
numeric_scaler = MinMaxScaler()

# Training set: fit the imputers and the scaler here, and only here.
df_train = replace_null_values(df_train, numeric_columns, categorical_columns, numeric_imputer, categorical_imputer)
df_train[numeric_columns] = numeric_scaler.fit_transform(df_train[numeric_columns])

# Test set: reuse the parameters learned on train (transform only). Re-fitting
# on test would impute and scale it with its own statistics, so the same raw
# value could map to different feature values in train and test.
df_test[numeric_columns] = numeric_imputer.transform(df_test[numeric_columns])
df_test[categorical_columns] = categorical_imputer.transform(df_test[categorical_columns])
df_test[numeric_columns] = numeric_scaler.transform(df_test[numeric_columns])

In [None]:
# Shapes after column selection, imputation and scaling (train first, then test).
print(df_train.shape, df_test.shape, sep='\n')

(517898, 12)
(132703, 12)


In [None]:
# Collapse rare categories into 'Other'.
# NOTE(review): the frequent-category lists are computed on each dataset
# independently, so train and test can end up with different dummy columns.
df_train = top_n(df_train, categorical_columns)
df_test = top_n(df_test, categorical_columns)

# One-hot encode the categorical data with pandas' get_dummies.
categorical_encoded = pd.get_dummies(df_train[categorical_columns])
categorical_encoded_test = pd.get_dummies(df_test[categorical_columns])

# Replace the original categorical columns with their encoded counterparts.
df_train = pd.concat(
    [df_train.drop(columns=categorical_columns), categorical_encoded],
    axis=1,
)
df_test = pd.concat(
    [df_test.drop(columns=categorical_columns), categorical_encoded_test],
    axis=1,
)


In [None]:
# Shapes after one-hot encoding (train first, then test).
print(df_train.shape, df_test.shape, sep='\n')

(517898, 64)
(132703, 60)


In [None]:
# Align both frames on the same sorted column set. The second result must be
# bound back to df_test: align_columns returns the reordered frames, and a
# df_test left unassigned keeps a column order that differs from df_train.
df_train, df_test = align_columns(df_train, df_test)

In [None]:
# Shapes after column alignment (train first, then test).
print(df_train.shape, df_test.shape, sep='\n')

(517898, 69)
(132703, 69)


In [14]:
# EXAMPLE: how align_columns works
# Train dataset (has columns 'A', 'B', 'C')
data_train = dict(A=[1, 2, 3], B=[4, 5, 6], C=[7, 8, 9])
train = pd.DataFrame(data_train)

# Test dataset (has columns 'B', 'C', 'D')
data_test = dict(B=[10, 11, 12], C=[13, 14, 15], D=[16, 17, 18])
test = pd.DataFrame(data_test)

# Show both frames before alignment.
print("\ntrain:", train, sep="\n")
print("\ntest:", test, sep="\n")

train, test = align_columns(train, test)

# Show both frames after alignment.
print("\ntrain modificato:", train, sep="\n")
print("\ntest modificato:", test, sep="\n")


train:
   A  B  C
0  1  4  7
1  2  5  8
2  3  6  9

test:
    B   C   D
0  10  13  16
1  11  14  17
2  12  15  18

train modificato:
   A  B  C      D
0  1  4  7  False
1  2  5  8  False
2  3  6  9  False

test modificato:
       A   B   C   D
0  False  10  13  16
1  False  11  14  17
2  False  12  15  18


In [None]:
# Split the training rows by label value; the training features come from the
# label-0 rows only, with the label column removed.
data_label_0 = df_train.loc[df_train['label'].eq(0)]
data_label_1 = df_train.loc[df_train['label'].ne(0)]
X_train = data_label_0.drop(columns=['label'])

In [None]:
# Separate the test features from the test target.
X_test = df_test.drop(columns=['label'])
y_test = df_test['label']

In [14]:
# Cast features and target to 32-bit floats.
X_train, X_test, y_test = (
    data.astype('float32') for data in (X_train, X_test, y_test)
)


In [None]:
# Wrap each split in a DataFrame, then write them out without the index column.
X_train_df = pd.DataFrame(X_train)
X_test_df = pd.DataFrame(X_test)
y_test_df = pd.DataFrame(y_test)

exports = (
    (X_train_df, 'X_train.csv'),
    (X_test_df, 'X_test.csv'),
    (y_test_df, 'y_test.csv'),
)
for frame, csv_path in exports:
    frame.to_csv(csv_path, index=False)
