In [10]:
import os
import glob
import pandas as pd
import numpy as np
import dask.dataframe as dd
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler

In [11]:
def init_dataset(file_extension, directory="."):
    """Load, merge, and pre-clean every CSV whose name ends with ``file_extension``.

    Parameters
    ----------
    file_extension : str
        Suffix that the file names must end with (used as ``*{file_extension}``).
    directory : str, optional
        Folder that is searched for matching files. Defaults to the current
        working directory.

    Returns
    -------
    pandas.DataFrame
        The concatenated rows, ordered by ``Timestamp``, with the identifier,
        timestamp, and flag columns listed below removed and +/-inf replaced
        by NaN. The index is not reset after sorting.

    Raises
    ------
    FileNotFoundError
        If no file in ``directory`` matches the suffix.
    KeyError
        If a column that must be parsed or dropped is missing from the data.
    """
    pattern = os.path.join(directory, f"*{file_extension}")
    # glob yields files in arbitrary order; sorting keeps the merged frame
    # (and any seeded split derived from it) reproducible across runs.
    all_filenames = sorted(glob.glob(pattern))
    if not all_filenames:
        raise FileNotFoundError(f"No files matching {pattern!r} were found")

    frames = [pd.read_csv(f, delimiter=',', encoding='UTF-8') for f in all_filenames]
    dropped_columns = ["Flow ID", "Src IP", "Src Port", "Dst IP", 'Timestamp', 'Bwd PSH Flags', 'Bwd URG Flags']

    dataset = (
        pd.concat(frames, ignore_index=True)
        # NOTE(review): keep=False discards *every* copy of a duplicated row,
        # not just the extras. Preserved from the original; confirm intent.
        .drop_duplicates(keep=False)
        .assign(Timestamp=lambda df: pd.to_datetime(df["Timestamp"]))
        # mergesort is stable, so rows with equal timestamps keep file order.
        .sort_values(by="Timestamp", kind="mergesort")
        .drop(columns=dropped_columns)
        .replace([np.inf, -np.inf], np.nan)
    )
    return dataset

In [12]:
def norm_dataset(dataset):
    """Split a cleaned frame into a feature matrix and one-hot encoded labels.

    Every column except the last is treated as a feature; the last column is
    the label. The feature columns at positions 15 and 16 are coerced to
    numbers, their infinities and unparseable entries become NaN, and those
    gaps are filled with the column mean.
    (Presumably these are the per-second rate columns — TODO confirm.)

    The input frame is not modified.

    Parameters
    ----------
    dataset : pandas.DataFrame
        Frame whose last column holds the class label.

    Returns
    -------
    tuple of numpy.ndarray
        ``(X, y)`` where ``X`` is the feature matrix and ``y`` is the dense
        one-hot encoding of the label column.
    """
    features = dataset.iloc[:, :-1]
    labels = dataset.iloc[:, -1].to_numpy().reshape(-1, 1)

    # Text such as 'inf' / 'Infinity' must end up as a real NaN: the string
    # 'nan' is not a missing value and cannot be averaged by the imputer.
    to_impute = (
        features.iloc[:, 15:17]
        .apply(pd.to_numeric, errors="coerce")
        .replace([np.inf, -np.inf], np.nan)
    )

    # Explicit copy: the array backing a DataFrame may be a view (or read-only),
    # and the slice assignment below must not touch the caller's data.
    X = features.to_numpy(copy=True)

    imputer = SimpleImputer(missing_values=np.nan, strategy='mean')
    # NOTE(review): a column that is entirely NaN may be dropped by the imputer,
    # which would make this assignment fail on a shape mismatch — verify data.
    X[:, 15:17] = imputer.fit_transform(to_impute.to_numpy(dtype=float))

    onehotencoder1 = OneHotEncoder()
    y = onehotencoder1.fit_transform(labels).toarray()
    return X, y

In [13]:
def split_dataset(X, y):
    """Split features and targets into train/test/validation sets and scale them.

    20% of the rows are held out as the test set, then 20% of the remaining
    rows are held out as the validation set (both with ``random_state=1``).
    Min-max scalers are fitted on the training portion only and then applied
    to the test and validation portions.

    Parameters
    ----------
    X : array-like of shape (n_samples, n_features)
        Feature matrix.
    y : array-like of shape (n_samples,) or (n_samples, n_targets)
        Targets. One-dimensional input is treated as a single column;
        two-dimensional input (for example one-hot labels) keeps its columns.

    Returns
    -------
    tuple of numpy.ndarray
        ``(X_train_scaled, y_train_scaled, X_test_scaled, y_test_scaled,
        X_val_scaled, y_val_scaled)``. Each ``X`` array has shape
        ``(n, n_features, 1)``; each ``y`` array has shape ``(n, n_targets)``.
    """
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=1)
    X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.2, random_state=1)

    def as_columns(target):
        # (n,) -> (n, 1); (n, k) stays (n, k). Forcing (n, 1) would raise for
        # one-hot encoded labels that have more than one column.
        target = np.asarray(target)
        return target.reshape(target.shape[0], -1)

    def add_trailing_axis(features):
        # (n, f) -> (n, f, 1)
        return np.reshape(features, (features.shape[0], features.shape[1], 1))

    y_train, y_test, y_val = (as_columns(part) for part in (y_train, y_test, y_val))

    # Feature scaling: fit on the training split only so that no statistics
    # from the held-out splits leak into the scalers.
    x_scaler = MinMaxScaler(feature_range=(0, 1))
    y_scaler = MinMaxScaler(feature_range=(0, 1))

    X_train_scaled = add_trailing_axis(x_scaler.fit_transform(X_train))
    y_train_scaled = y_scaler.fit_transform(y_train)

    X_test_scaled = add_trailing_axis(x_scaler.transform(X_test))
    y_test_scaled = y_scaler.transform(y_test)

    X_val_scaled = add_trailing_axis(x_scaler.transform(X_val))
    y_val_scaled = y_scaler.transform(y_val)

    return (X_train_scaled, y_train_scaled, X_test_scaled, y_test_scaled, X_val_scaled, y_val_scaled)

In [None]:
if __name__ == "__main__":
    # Run the preprocessing pipeline end to end: load and merge the CSV files,
    # build the feature matrix and encoded labels, then split and scale them.
    dataset = init_dataset('CICFlowMeter.csv')
    X, y = norm_dataset(dataset)
    (
        X_train_scaled,
        y_train_scaled,
        X_test_scaled,
        y_test_scaled,
        X_val_scaled,
        y_val_scaled,
    ) = split_dataset(X, y)