In [134]:
'''
Author: Damiano Pasquini
email: damiano23@ru.is
'''

import os
import pandas as pd
import numpy as np
import sklearn
from sklearn.tree import DecisionTreeClassifier
# from sklearn.model_selection import StratifiedShuffleSplit

path = "./MachineLearningCVE"

In [135]:

def preprocess_data(data_frame):
    # Preprocess the columns
    # TODO: infinite values and NaN values are meaningful so they must be handled differently and not dropped
    data_frame.rename(columns=lambda x: x.strip(), inplace=True)  # Remove leading/trailing spaces from column names
    data_frame["Label"] = data_frame["Label"].apply(lambda x: x.strip())
    # change all values of Label column to either "DoS", "Scan", "Benign", or "Exploit"
    #TODO: fix this
    data_frame.replace([np.inf, -np.inf], np.nan, inplace=True) # Replace inf values with NaN
    data_frame.dropna(inplace=True) # Drop rows with NaN values
    data_frame["Label"] = data_frame["Label"].apply(preprocess_label_column()) #TODO: add label
    return data_frame

In [136]:
def preprocess_label_column(label):
    # transform the content of the column "Label". If the value is "BENIGN" then it is changed to "Benign", if it contains "DoS" then it is changed to "DoS", if it contains "PortScan" then it is changed to "Scan", otherwise it is changed to "Exploit"
    if "BENIGN" in label:
        return "Benign"
    elif "DoS" in label:
        return "DoS"
    elif "PortScan" in label:
        return "Scan"
    else:
        return "Exploit"

In [137]:

def merge_csv_files(path_to_csv_files, index=True, write_to_file=False):
    csv_files = [f for f in os.listdir(path_to_csv_files) if f.endswith('.csv')]
    dfs = [pd.read_csv(os.path.join(path_to_csv_files, f)) for f in csv_files]
    df = pd.concat(dfs, ignore_index=True)
    if write_to_file:
        df.to_csv("./combined.csv", index=index)
    return df

In [138]:

def train_test_by_days(path_to_files, df):
    d_train = pd.DataFrame()
    d_test = pd.DataFrame()
    for file in os.listdir(path_to_files):
        if file.endswith(".csv"):
            first_word = file.split()[0].strip()
            if first_word in ["Monday", "Tuesday", "Wednesday"]:
                d_train = pd.concat([d_train, df])
            elif first_word in ["Thursday", "Friday"]:
                d_test = pd.concat([d_test, df])
    return d_train, d_test

In [139]:

def get_dataset(path_to_files, splitmode=None):
    """
    Function to obtain the train dataset and the test dataset given the path where the csv files are,
    and the percentage of training dataset
    :param path_to_files: directory where the csv files are located 
    :param splitmode: number between 0 and 1, if specified indicates the percentage of the train set (d_train)  
    :return: d_train and d_test, relatively training dataset and testing dataset
    """
    # Ensure splitmode is within the valid range
    if splitmode is not None and (splitmode < 0 or splitmode > 1):
        raise ValueError("splitmode should be between 0 and 1")
    
    df = merge_csv_files(path_to_files)
    d_train = pd.DataFrame()
    d_test = pd.DataFrame()
    # Preprocess the columns of the combined dataframe
    preprocess_data(df)
    if splitmode is not None:
        df = sklearn.utils.shuffle(df) # Shuffle the rows
        # Split the processed data into two distinct Pandas data frames: one for the training set and one for the testing set.
        split_index = int(len(df) * splitmode)
        d_train = pd.concat([d_train, df[:split_index]])
        d_test = pd.concat([d_test, df[split_index:]])
    else:
        # TODO: not working: error -> KeyError: 'Label'
        d_train, d_test = train_test_by_days(path_to_files, df)
    return d_train, d_test

In [140]:
def split_x_y(data):
    """
    Split the dataset in data and labels (relatively X and y)
    :param data: dataset
    :return: tuple X and y, relatively data and labels for the given dataset
    """
    X = data.drop(columns=["Label"])
    y = data["Label"] # TODO: not working: error -> KeyError: 'Label' when running with splitmode
    return X, y

In [141]:
'''
Using scikit-learn, train a decision tree classifier on the training set.
Test the model firs with splitmode = 0.6 and then without splitmode.
'''

print("Creating train and test sets...")
train_data, test_data = get_dataset(path, splitmode=0.6)
train_data.to_csv("train_data.csv")
test_data.to_csv("test_data.csv")
X_train, y_train = split_x_y(train_data)
X_test, y_test = split_x_y(test_data)

print("Training the decision tree classifier...")
clf = DecisionTreeClassifier()
clf.fit(X_train, y_train)
clf.predict(X_test)
clf.predict(X_train)
print("Accuracy on training set: ", clf.score(X_train, y_train))
print("Accuracy on testing set: ", clf.score(X_test, y_test))

Creating train and test sets...


AttributeError: 'Benign' is not a valid function for 'Series' object