In [1]:
import pandas as pd
import json
import re
import numpy as np
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import MinMaxScaler

In [2]:
def read_dataset(file_path):
    """Load a CSV file into a pandas DataFrame.

    Args:
        file_path: Path (or any other source accepted by ``pd.read_csv``)
            of the CSV file to load.

    Returns:
        The DataFrame produced by ``pd.read_csv`` with its default options.
    """
    return pd.read_csv(file_path)

def define_columns_types():
    """Return the names of the feature columns used by the preprocessing steps.

    Returns:
        A ``(numeric_columns, categorical_columns)`` tuple. Each element is a
        list of column names: the first holds the columns treated as numeric,
        the second the columns treated as categorical.
    """
    numeric = ['auth_success', 'auth_attempts']
    categorical = [
        'dest_port',
        'status_guess',
        'version',
        'kex_alg',
        'mac_alg',
        'host_key_alg',
        'cipher_alg',
        'client',
        'direction',
    ]
    return numeric, categorical

def remove_columns(dataset, numeric_columns, categorical_columns):
    """Drop every column that is neither a feature column nor ``'label'``.

    The frame is modified in place and the same object is returned, so
    existing callers that rely on either the mutation or the return value
    keep working.

    Args:
        dataset: DataFrame to prune (mutated in place).
        numeric_columns: Names of the numeric feature columns to keep.
        categorical_columns: Names of the categorical feature columns to keep.

    Returns:
        ``dataset`` itself, containing only the listed columns and ``'label'``
        (those that were present), in their original order.
    """
    keep = (*numeric_columns, *categorical_columns, 'label')
    unwanted = [name for name in dataset.columns if name not in keep]
    # Drop everything in one call: dropping column-by-column rebuilds the
    # frame on each iteration and raises KeyError on the second occurrence
    # of a duplicated unwanted column name.
    if unwanted:
        dataset.drop(columns=unwanted, inplace=True)
    return dataset

def replace_null_values(dataset, numeric_columns, categorical_columns):
    """Fill missing values in the feature columns of ``dataset``.

    Numeric columns are imputed with a ``SimpleImputer(strategy='mean')`` and
    categorical columns with a ``SimpleImputer(strategy='most_frequent')``.
    Both imputers are fitted on the frame passed in, and the imputed values
    are written back into that same frame.

    Args:
        dataset: DataFrame to impute (mutated in place).
        numeric_columns: Names of the columns imputed with the column mean.
        categorical_columns: Names of the columns imputed with the most
            frequent value.

    Returns:
        ``dataset`` itself, with the listed columns overwritten.
    """
    imputation_plan = (
        (numeric_columns, SimpleImputer(strategy='mean')),
        (categorical_columns, SimpleImputer(strategy='most_frequent')),
    )
    for columns, imputer in imputation_plan:
        # The statistics are learned from this very frame (fit_transform).
        dataset[columns] = imputer.fit_transform(dataset[columns])
    return dataset

def normalize_numeric_columns(dataset, numeric_columns):
    """Rescale the numeric columns of ``dataset`` with a ``MinMaxScaler``.

    The scaler is created with its default settings (scikit-learn documents
    the default target range as [0, 1]) and is fitted on the frame passed in.

    Args:
        dataset: DataFrame to rescale (mutated in place).
        numeric_columns: Names of the columns to rescale.

    Returns:
        ``dataset`` itself, with the listed columns overwritten by their
        scaled values.
    """
    scaled_values = MinMaxScaler().fit_transform(dataset[numeric_columns])
    dataset[numeric_columns] = scaled_values
    return dataset

def top_n(dataset, categorical_columns, top_val=10):
    """Keep only the most frequent categories of each categorical column.

    For every listed column, the ``top_val`` most frequent values (as ranked
    by ``Series.value_counts``) are kept and every other entry is replaced by
    the string ``'Other'``. Missing values are not counted by
    ``value_counts`` and therefore also become ``'Other'``.

    Args:
        dataset: DataFrame to process (mutated in place).
        categorical_columns: Names of the columns to reduce.
        top_val: Number of most frequent categories to keep per column.
            Defaults to 10, the value that was previously hard-coded.

    Returns:
        ``dataset`` itself, with the listed columns overwritten.
    """
    for col in categorical_columns:
        # value_counts() is sorted by descending frequency, so the first
        # `top_val` index entries are the categories to keep.
        frequent_categories = dataset[col].value_counts().index[:top_val].tolist()
        is_frequent = dataset[col].isin(frequent_categories)
        dataset[col] = dataset[col].where(is_frequent, other='Other')
    return dataset

def categorical_data_encoding(dataset, categorical_columns):
    """One-hot encode the categorical columns of ``dataset``.

    Args:
        dataset: DataFrame containing the categorical columns. It is not
            modified; a new frame is returned.
        categorical_columns: Names of the columns to one-hot encode.

    Returns:
        A new DataFrame made of the non-categorical columns of ``dataset``
        followed by the dummy columns (named ``<column>_<value>``) produced by
        ``pd.get_dummies``.
    """
    columns = list(categorical_columns)
    # Pass `columns=` explicitly: without it get_dummies only encodes
    # object/string/category columns, so a numeric categorical column
    # (e.g. an integer port number) would pass through unencoded.
    encoded = pd.get_dummies(dataset[columns], columns=columns)
    remaining = dataset.drop(columns=columns)
    return pd.concat([remaining, encoded], axis=1)

def align_columns(train, test):
    """Give the train and test frames the same set and order of columns.

    Columns present in only one of the two frames are added to the other and
    filled with ``False``; the columns of both results are then sorted by
    name so the two frames share the same structure.

    Args:
        train: Training DataFrame (not modified).
        test: Test DataFrame (not modified).

    Returns:
        A ``(train, test)`` tuple of new DataFrames with identical, sorted
        columns.
    """
    all_columns = sorted(set(train.columns) | set(test.columns))
    # reindex adds the missing columns in a single step and returns a new
    # frame, so the caller's frames are not mutated as a side effect and no
    # column-by-column insertion is needed.
    aligned_train = train.reindex(columns=all_columns, fill_value=False)
    aligned_test = test.reindex(columns=all_columns, fill_value=False)
    return aligned_train, aligned_test

def save_test(df, output_prefix):
    """Split ``df`` into features and labels, cast to float32 and save as CSV.

    Two files are written to the current working directory:
    ``X_<output_prefix>.csv`` (features) and ``y_<output_prefix>.csv``
    (labels), both without the index column.

    Args:
        df: DataFrame holding the feature columns plus a ``'label'`` column.
        output_prefix: Text inserted into the two output file names.

    Returns:
        A ``(x, y)`` tuple: the float32 feature DataFrame and the float32
        label Series.
    """
    # Separate the features (X) from the labels (y), converting both to float32.
    features = df.drop(columns='label').astype('float32')
    labels = df['label'].astype('float32')

    # Persist both parts as CSV files.
    features.to_csv(f'X_{output_prefix}.csv', index=False)
    labels.to_csv(f'y_{output_prefix}.csv', index=False)

    return features, labels

def save_train(df, output_prefix):
    """Save the features of the rows labelled 0 as a float32 CSV file.

    Only rows whose ``'label'`` equals 0 are kept (presumably the
    normal/benign class — confirm against the dataset). The label column is
    then removed and the result is written to ``X_<output_prefix>.csv`` in
    the current working directory, without the index column.

    Args:
        df: DataFrame holding the feature columns plus a ``'label'`` column.
        output_prefix: Text inserted into the output file name.

    Returns:
        The float32 feature DataFrame that was written to disk.
    """
    label_zero_rows = df.loc[df['label'] == 0]

    # Drop the label and convert the remaining features to float32.
    features = label_zero_rows.drop(columns='label').astype('float32')

    # Persist the features as a CSV file.
    features.to_csv(f'X_{output_prefix}.csv', index=False)

    return features

In [None]:
"""
    Eseguire tutte le funzioni di preprocessing nell'ordine corretto:
    1. Caricare il dataset ->  read_dataset(file_path)
    2. Definire i tipi di colonne -> define_columns_types()
    3. Rimuovere le colonne non necessarie -> remove_columns(dataset, numeric_columns, categorical_columns)
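    4. Modificare la colonna 'host' -> modify_host_column(dataset) (nota: funzione non definita nelle celle di questo notebook; passo non eseguito nelle celle sottostanti)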
    5. Sostituire i valori nulli -> replace_null_values(dataset, numeric_columns, categorical_columns)
    6. Normalizzare le colonne numeriche -> normalize_numeric_columns(dataset, numeric_columns)
    7. Sostituire i valori delle colonne categoriche con i top n valori -> top_n(dataset, categorical_columns)
    8. Codificare le colonne categoriche -> categorical_data_encoding(dataset, categorical_columns)
    9. Allineare le colonne tra train e test -> align_columns(train, test)
    10. Salvare i dati di train -> save_train(df, output_prefix)
    11. Salvare i dati di test -> save_test(df, output_prefix)

    L'output è il dataset X, che contiene solo le feature, e y, che contiene solo le label
"""

"""
    Esempio di utilizzo delle funzioni di preprocessing con un dataset di train e un dataset di test:
    
    df_train=read_dataset('train_ssh_row.csv')
    df_test=read_dataset('test_ssh_row.csv')

    numeric_columns, categorical_columns=define_columns_types()

    df_train=remove_columns(df_train, numeric_columns, categorical_columns)
    df_train=modify_host_column(df_train)
    df_train=replace_null_values(df_train, numeric_columns, categorical_columns)
    df_train=normalize_numeric_columns(df_train, numeric_columns)
    df_train=top_n(df_train, categorical_columns)
    df_train=categorical_data_encoding(df_train, categorical_columns)

    df_test=remove_columns(df_test, numeric_columns, categorical_columns)
    df_test=modify_host_column(df_test)
    df_test=replace_null_values(df_test, numeric_columns, categorical_columns)
    df_test=normalize_numeric_columns(df_test, numeric_columns)
    df_test=top_n(df_test, categorical_columns)
    df_test=categorical_data_encoding(df_test, categorical_columns)

    df_train, df_test=align_columns(df_train, df_test)

    save_train(df_train, 'train')
    save_test(df_test, 'test')
"""

In [None]:
# Load the raw train and test splits from CSV files in the working directory.
df_train = read_dataset(file_path='train_ssh_row.csv')
df_test = read_dataset(file_path='test_ssh_row.csv')

In [5]:
# Column lists shared by every preprocessing step below.
numeric_columns, categorical_columns = define_columns_types()

# Keep only the feature columns and 'label' in both splits.
df_train = df_train.pipe(remove_columns, numeric_columns, categorical_columns)
df_test = df_test.pipe(remove_columns, numeric_columns, categorical_columns)


In [6]:
# Impute missing values, then rescale the numeric columns, for each split.
# NOTE(review): both helpers fit on the frame they receive, so the test split
# is imputed and scaled with statistics computed from the test split itself
# rather than from the training split — confirm this is intended.
df_train = (
    df_train
    .pipe(replace_null_values, numeric_columns, categorical_columns)
    .pipe(normalize_numeric_columns, numeric_columns)
)

df_test = (
    df_test
    .pipe(replace_null_values, numeric_columns, categorical_columns)
    .pipe(normalize_numeric_columns, numeric_columns)
)

In [8]:
# Reduce each categorical column to its most frequent values, then one-hot
# encode it. The frequent values are computed per split, so the two splits may
# end up with different dummy columns; align_columns is applied afterwards.
df_train = (
    df_train
    .pipe(top_n, categorical_columns)
    .pipe(categorical_data_encoding, categorical_columns)
)

df_test = (
    df_test
    .pipe(top_n, categorical_columns)
    .pipe(categorical_data_encoding, categorical_columns)
)

In [10]:
# Give both splits the same columns in the same (sorted) order.
df_train, df_test = align_columns(train=df_train, test=df_test)

In [None]:
# Write X_train.csv (rows with label 0 only), then X_test.csv and y_test.csv.
save_train(df=df_train, output_prefix='train')
save_test(df=df_test, output_prefix='test')