In [5]:
import os
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.metrics import classification_report, accuracy_score
from sklearn.model_selection import train_test_split
from tqdm import tqdm

In [None]:
# Directory containing the dataset CSV files.
# Set the CICIOT2023_DIR environment variable to point at a different location
# without editing the notebook; the literal below is only the fallback.
DATASET_DIRECTORY = os.environ.get('CICIOT2023_DIR', 'C:/Users/EDWIN/Downloads/CICIoT2023')

In [7]:
def load_and_preprocess_data_sample(dataset_directory, sample_size=0.1, chunk_size=10000, random_state=None):
    """Load a random row sample from every CSV file in a directory and prepare it for modelling.

    Each ``*.csv`` file in ``dataset_directory`` is read in chunks, keeping a
    random ``sample_size`` fraction of its data rows. Rows containing missing
    values are dropped. The last column is assumed to be the label; all other
    columns are treated as features.

    Parameters
    ----------
    dataset_directory : str
        Directory that is scanned (non-recursively) for files ending in ``.csv``.
    sample_size : float, default 0.1
        Fraction of each file's data rows to keep; must satisfy ``0 < sample_size <= 1``.
    chunk_size : int, default 10000
        Number of rows per chunk passed to ``pandas.read_csv``.
    random_state : int or None, default None
        Seed for the row sampling. When ``None`` the global NumPy RNG is used,
        so a prior ``np.random.seed(...)`` call still controls the sample.

    Returns
    -------
    X : numpy.ndarray
        Feature matrix after ``StandardScaler.fit_transform``.
    y : numpy.ndarray
        Labels after ``LabelEncoder.fit_transform``.
    label_encoder : LabelEncoder
        The fitted encoder, for mapping encoded labels back to class names.

    Raises
    ------
    ValueError
        If ``sample_size`` is outside ``(0, 1]`` or no rows could be loaded.

    Notes
    -----
    The scaler is fit on every sampled row. If the result is later split into
    train and test sets, the test rows have already influenced the scaling.
    """
    if not 0 < sample_size <= 1:
        raise ValueError(f"sample_size must be in the interval (0, 1], got {sample_size!r}")

    csv_files = sorted(k for k in os.listdir(dataset_directory) if k.endswith('.csv'))

    # One generator for all files, so each file gets a different draw.
    # Falling back to the module-level RNG keeps np.random.seed() effective for existing callers.
    rng = np.random if random_state is None else np.random.RandomState(random_state)

    scaler = StandardScaler()
    label_encoder = LabelEncoder()
    X_list = []
    y_list = []

    for file in tqdm(csv_files):
        file_path = os.path.join(dataset_directory, file)

        # Count data rows; the context manager guarantees the handle is closed.
        with open(file_path, 'rb') as handle:
            total_rows = sum(1 for _ in handle) - 1  # Subtract 1 for header

        rows_to_sample = int(total_rows * sample_size)
        if rows_to_sample <= 0:
            # Empty, header-only, or too small to contribute a single sampled row.
            continue

        # Line 0 is the header, so data rows are numbered 1..total_rows.
        skip_rows = sorted(
            rng.choice(
                np.arange(1, total_rows + 1),
                total_rows - rows_to_sample,
                replace=False,
            ).tolist()
        )

        for chunk in pd.read_csv(file_path, chunksize=chunk_size, skiprows=skip_rows):
            chunk = chunk.dropna()
            if chunk.empty:
                # An empty chunk carries no data and may have object dtype columns.
                continue

            # Assuming the last column is the label
            X_list.append(chunk.iloc[:, :-1].values)
            y_list.append(chunk.iloc[:, -1].values)

    if not X_list:
        raise ValueError(f"No data rows could be loaded from CSV files in {dataset_directory!r}")

    # Concatenate the processed chunks
    X = np.concatenate(X_list, axis=0)
    y = np.concatenate(y_list, axis=0)

    # Normalize the features using the scaler
    X = scaler.fit_transform(X)

    # Encode the labels if not numeric
    y = label_encoder.fit_transform(y)

    return X, y, label_encoder

In [8]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report

In [9]:
# Seed NumPy's global RNG: the loader draws its random row sample from it,
# so without this the sampled rows (and every metric below) change on each run.
np.random.seed(42)

# Load and preprocess data
X, y, label_encoder = load_and_preprocess_data_sample(DATASET_DIRECTORY)

# Split the dataset into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Initialize the DecisionTreeClassifier
decision_tree_model = DecisionTreeClassifier(random_state=42)

# Train the model
decision_tree_model.fit(X_train, y_train)

# Make predictions on the test set
y_pred = decision_tree_model.predict(X_test)

# Evaluate the model's performance
accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy: {accuracy * 100:.2f}%")

# Display a detailed classification report.
# Passing `labels` explicitly keeps the report aligned with `target_names` even when a
# rare class is absent from the test split; otherwise the label count inferred from the
# data would not match the number of names and classification_report would raise.
print(classification_report(
    y_test,
    y_pred,
    labels=np.arange(len(label_encoder.classes_)),
    target_names=label_encoder.classes_,
))

100%|██████████| 169/169 [01:15<00:00,  2.25it/s]


Accuracy: 99.37%
                         precision    recall  f1-score   support

       Backdoor_Malware       0.48      0.53      0.50        57
          BenignTraffic       0.93      0.92      0.92     22044
       BrowserHijacking       0.62      0.57      0.59       119
       CommandInjection       0.56      0.59      0.57       118
 DDoS-ACK_Fragmentation       1.00      1.00      1.00      5650
        DDoS-HTTP_Flood       1.00      1.00      1.00       569
        DDoS-ICMP_Flood       1.00      1.00      1.00    143970
DDoS-ICMP_Fragmentation       1.00      1.00      1.00      9073
      DDoS-PSHACK_Flood       1.00      1.00      1.00     81822
       DDoS-RSTFINFlood       1.00      1.00      1.00     81421
         DDoS-SYN_Flood       1.00      1.00      1.00     81043
         DDoS-SlowLoris       0.99      0.99      0.99       430
DDoS-SynonymousIP_Flood       1.00      1.00      1.00     71731
         DDoS-TCP_Flood       1.00      1.00      1.00     90327
       