In [None]:
import csv, os
import pandas as pd
import random
from sklearn.tree import DecisionTreeClassifier

# Folder that get_dataset() scans for the ".csv" files to load.
path_to_files = "./MachineLearningCVE"

'''
This function takes as input the path to the folder containing the CSV files and "splitmode", a value
between 0 and 1 that gives the fraction of the dataset used for training. Both the training and the
test set are returned as randomized pandas dataframes.
Preprocess the data by assigning to the "Label" column the value "DoS" if the value is “DoS” or “DDos”, assigning "Scan"
if the value is "PortScan" and "Exploit" for the remaining values.
Split the processed data into two distinct Pandas data frames: one for the training set and one for the testing set, 
according to the value of the "splitmode" parameter.
Return these two data frames as "d_train" and "d_test".
Call the function get_dataset(), the first parameter "path_to_files" and the second parameter "splitmode".
Create a function to obtain X_train, X_test, y_train, y_test from d_train and d_test.
'''

def preprocess_label(label):
    """Collapse a raw traffic label into one of three coarse classes.

    The comparison ignores letter case and surrounding whitespace, so
    "DoS", "DDos" and "DDoS" are all recognised.

    Args:
        label: Raw value of the "Label" column. Non-string values are
            converted with ``str()`` before matching.

    Returns:
        "DoS" for a DoS/DDoS label, "Scan" for a PortScan label and
        "Exploit" for every other value.
    """
    # NOTE(review): only whole-value matches count; a label with a suffix
    # (e.g. "DoS <tool name>") still falls through to "Exploit" -- confirm
    # whether such labels should be folded into "DoS" as well.
    normalized = str(label).strip().casefold()
    if normalized in ("dos", "ddos"):
        return "DoS"
    if normalized == "portscan":
        return "Scan"
    return "Exploit"

# Characters that may separate the leading weekday from the rest of a CSV
# file name (e.g. a name such as "Monday-WorkingHours.csv"); they are mapped
# to spaces so that a plain split() isolates the first word.
_NAME_SEPARATORS = str.maketrans("-_.", "   ")
_TRAIN_DAYS = frozenset({"Monday", "Tuesday", "Wednesday"})
_TEST_DAYS = frozenset({"Thursday", "Friday"})


def _read_labelled_csv(file_path):
    """Load one CSV file and map its "Label" column through preprocess_label."""
    df = pd.read_csv(file_path)
    # The label header may be written with a leading space (" Label").
    df.rename(columns={" Label": "Label"}, inplace=True)
    df["Label"] = df["Label"].apply(preprocess_label)
    return df


def _stack(frames):
    """Concatenate *frames* row-wise; return an empty frame for no input."""
    if not frames:
        return pd.DataFrame()
    # One concat over the whole list avoids re-copying the accumulated rows
    # on every file; ignore_index gives the result a unique 0..n-1 index.
    return pd.concat(frames, ignore_index=True)


def get_dataset(path_to_files, splitmode=None):
    """Load the CSV files of a folder and split them into train and test sets.

    Every file whose name ends in ".csv" is read and its "Label" column is
    reduced to the coarse classes produced by ``preprocess_label``.

    Args:
        path_to_files: Folder containing the CSV files.
        splitmode: Fraction (0 to 1) of the rows that goes into the training
            set. When given, all rows are pooled, shuffled and split at that
            fraction. The shuffle seed is drawn from the ``random`` module,
            so calling ``random.seed()`` beforehand makes the split
            reproducible. When ``None``, files whose name starts with
            Monday/Tuesday/Wednesday form the training set and files starting
            with Thursday/Friday form the test set, in file-name order and
            without shuffling; other files are ignored.

    Returns:
        A ``(d_train, d_test)`` tuple of DataFrames, each with a fresh
        0..n-1 index. Both are empty when no rows are available.

    Raises:
        ValueError: If ``splitmode`` is outside the range [0, 1].
    """
    # Ensure splitmode is within the valid range
    if splitmode is not None and (splitmode < 0 or splitmode > 1):
        raise ValueError("splitmode should be between 0 and 1")

    # Sorted so the result does not depend on the directory listing order.
    csv_files = sorted(f for f in os.listdir(path_to_files) if f.endswith(".csv"))

    if splitmode is not None:
        # Split on rows rather than on whole files so the requested fraction
        # is actually honoured and neither set ends up empty by chance.
        data = _stack(
            [_read_labelled_csv(os.path.join(path_to_files, f)) for f in csv_files]
        )
        if len(data) == 0:
            return data, data.copy()
        seed = random.randrange(2**32)
        shuffled = data.sample(frac=1, random_state=seed)
        n_train = int(round(len(shuffled) * splitmode))
        d_train = shuffled.iloc[:n_train].reset_index(drop=True)
        d_test = shuffled.iloc[n_train:].reset_index(drop=True)
        return d_train, d_test

    # Split based on the first word of the CSV file name
    train_frames = []
    test_frames = []
    for file in csv_files:
        # A name always ends in "csv" here, so the split is never empty.
        first_word = file.translate(_NAME_SEPARATORS).split()[0]
        if first_word in _TRAIN_DAYS:
            train_frames.append(_read_labelled_csv(os.path.join(path_to_files, file)))
        elif first_word in _TEST_DAYS:
            test_frames.append(_read_labelled_csv(os.path.join(path_to_files, file)))
    return _stack(train_frames), _stack(test_frames)

def get_X_y(d):
    """Separate a labelled frame into its feature columns and its target.

    Args:
        d: DataFrame that contains a "Label" column.

    Returns:
        A ``(X, y)`` tuple: ``X`` is ``d`` without the "Label" column and
        ``y`` is the "Label" column itself. ``d`` is left unmodified.
    """
    features = d.drop(columns=["Label"])
    target = d["Label"]
    return features, target

In [None]:
'''
Using scikit-learn, train a decision tree classifier on the training set.
Test the model first with splitmode = 0.6 and then without splitmode.
'''

def _drop_non_finite_rows(X, y):
    """Return X and y restricted to the rows whose features are all finite.

    Presumably the "float32 problem": scikit-learn's input validation rejects
    feature matrices that contain infinity (and, depending on the version,
    NaN), reporting a value too large for float32. Rows holding such values
    are removed from the features and the target alike.
    """
    as_missing = X.replace([float("inf"), float("-inf")], float("nan"))
    # A plain boolean array selects by position, so it also works when the
    # frame's index contains duplicate labels.
    keep = as_missing.notna().all(axis=1).to_numpy()
    return X[keep], y[keep]


# Experiment 1: split controlled by splitmode = 0.6.
d_train, d_test = get_dataset(path_to_files, splitmode=0.6)
X_train, y_train = _drop_non_finite_rows(*get_X_y(d_train))
X_test, y_test = _drop_non_finite_rows(*get_X_y(d_test))

clf = DecisionTreeClassifier()
clf.fit(X_train, y_train)
print("Accuracy on training set: ", clf.score(X_train, y_train))
print("Accuracy on test set: ", clf.score(X_test, y_test))

# Experiment 2: no splitmode, so get_dataset splits by file name.
d_train_2, d_test_2 = get_dataset(path_to_files)
X_train_2, y_train_2 = _drop_non_finite_rows(*get_X_y(d_train_2))
X_test_2, y_test_2 = _drop_non_finite_rows(*get_X_y(d_test_2))

clf_2 = DecisionTreeClassifier()
clf_2.fit(X_train_2, y_train_2)
# Score with clf_2, the model fitted on this split (not the first model).
print("Accuracy on training set: ", clf_2.score(X_train_2, y_train_2))
print("Accuracy on test set: ", clf_2.score(X_test_2, y_test_2))