## Mounting Drive


In [None]:
# Mount Google Drive at /content/drive; later cells write the preprocessed
# CSV and the trained models under /content/drive/MyDrive.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


## Loading Data

In [None]:
import kagglehub

# Download the latest version of the dataset; `path` is the local folder
# that holds the downloaded files and is reused by the following cells.
dataset_handle = "chethuhn/network-intrusion-dataset"
path = kagglehub.dataset_download(dataset_handle)

print("Path to dataset files:", path)

Using Colab cache for faster access to the 'network-intrusion-dataset' dataset.
Path to dataset files: /kaggle/input/network-intrusion-dataset


In [None]:
import os

# List the files inside the downloaded folder
# (`path` is the dataset directory returned by kagglehub in the previous cell).
print(os.listdir(path))


['Friday-WorkingHours-Afternoon-PortScan.pcap_ISCX.csv', 'Thursday-WorkingHours-Morning-WebAttacks.pcap_ISCX.csv', 'Tuesday-WorkingHours.pcap_ISCX.csv', 'Friday-WorkingHours-Afternoon-DDos.pcap_ISCX.csv', 'Monday-WorkingHours.pcap_ISCX.csv', 'Friday-WorkingHours-Morning.pcap_ISCX.csv', 'Thursday-WorkingHours-Afternoon-Infilteration.pcap_ISCX.csv', 'Wednesday-workingHours.pcap_ISCX.csv']


## Unified Data

In [None]:
import pandas as pd
import os

# Folder path (from kagglehub)
data_path = path

# Collect the CSV files in sorted order: os.listdir() returns entries in
# arbitrary order, and the row order of the combined frame feeds the seeded
# resampling later in the notebook, so a stable order keeps runs reproducible.
files = sorted(f for f in os.listdir(data_path) if f.endswith(".csv"))
if not files:
    # Fail with a clear message instead of pd.concat's generic
    # "No objects to concatenate" error.
    raise FileNotFoundError(f"No CSV files found in {data_path!r}")

# Combine all CSVs into one dataframe with a fresh 0..n-1 index
df_list = [pd.read_csv(os.path.join(data_path, f)) for f in files]
df = pd.concat(df_list, ignore_index=True)

print("Combined dataset shape:", df.shape)


Combined dataset shape: (2830743, 79)


In [None]:
# Preview the first rows of the combined frame.
df.head()

Unnamed: 0,Destination Port,Flow Duration,Total Fwd Packets,Total Backward Packets,Total Length of Fwd Packets,Total Length of Bwd Packets,Fwd Packet Length Max,Fwd Packet Length Min,Fwd Packet Length Mean,Fwd Packet Length Std,...,min_seg_size_forward,Active Mean,Active Std,Active Max,Active Min,Idle Mean,Idle Std,Idle Max,Idle Min,Label
0,22,1266342,41,44,2664,6954,456,0,64.97561,109.864573,...,32,0.0,0.0,0,0,0.0,0.0,0,0,BENIGN
1,22,1319353,41,44,2664,6954,456,0,64.97561,109.864573,...,32,0.0,0.0,0,0,0.0,0.0,0,0,BENIGN
2,22,160,1,1,0,0,0,0,0.0,0.0,...,32,0.0,0.0,0,0,0.0,0.0,0,0,BENIGN
3,22,1303488,41,42,2728,6634,456,0,66.536585,110.129945,...,32,0.0,0.0,0,0,0.0,0.0,0,0,BENIGN
4,35396,77,1,2,0,0,0,0,0.0,0.0,...,32,0.0,0.0,0,0,0.0,0.0,0,0,BENIGN


## Preprocessing

In [None]:
# Per-column count of missing values.
df.isnull().sum()

Unnamed: 0,0
Destination Port,0
Flow Duration,0
Total Fwd Packets,0
Total Backward Packets,0
Total Length of Fwd Packets,0
...,...
Idle Mean,0
Idle Std,0
Idle Max,0
Idle Min,0


In [None]:
# Summarise the frame: index range, column names and dtypes.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2830743 entries, 0 to 2830742
Data columns (total 79 columns):
 #   Column                        Dtype  
---  ------                        -----  
 0    Destination Port             int64  
 1    Flow Duration                int64  
 2    Total Fwd Packets            int64  
 3    Total Backward Packets       int64  
 4   Total Length of Fwd Packets   int64  
 5    Total Length of Bwd Packets  int64  
 6    Fwd Packet Length Max        int64  
 7    Fwd Packet Length Min        int64  
 8    Fwd Packet Length Mean       float64
 9    Fwd Packet Length Std        float64
 10  Bwd Packet Length Max         int64  
 11   Bwd Packet Length Min        int64  
 12   Bwd Packet Length Mean       float64
 13   Bwd Packet Length Std        float64
 14  Flow Bytes/s                  float64
 15   Flow Packets/s               float64
 16   Flow IAT Mean                float64
 17   Flow IAT Std                 float64
 18   Flow IAT Max         

## Duplicate and Null Remover

In [None]:
# Remove exact duplicate rows first, then every row that still contains a
# missing value. The surviving rows keep their original index labels.
df = df.drop_duplicates().dropna()

print("After cleaning:", df.shape)


After cleaning: (2522009, 79)


In [None]:
# Inspect the raw column names (many of them carry a leading space).
df.columns

Index([' Destination Port', ' Flow Duration', ' Total Fwd Packets',
       ' Total Backward Packets', 'Total Length of Fwd Packets',
       ' Total Length of Bwd Packets', ' Fwd Packet Length Max',
       ' Fwd Packet Length Min', ' Fwd Packet Length Mean',
       ' Fwd Packet Length Std', 'Bwd Packet Length Max',
       ' Bwd Packet Length Min', ' Bwd Packet Length Mean',
       ' Bwd Packet Length Std', 'Flow Bytes/s', ' Flow Packets/s',
       ' Flow IAT Mean', ' Flow IAT Std', ' Flow IAT Max', ' Flow IAT Min',
       'Fwd IAT Total', ' Fwd IAT Mean', ' Fwd IAT Std', ' Fwd IAT Max',
       ' Fwd IAT Min', 'Bwd IAT Total', ' Bwd IAT Mean', ' Bwd IAT Std',
       ' Bwd IAT Max', ' Bwd IAT Min', 'Fwd PSH Flags', ' Bwd PSH Flags',
       ' Fwd URG Flags', ' Bwd URG Flags', ' Fwd Header Length',
       ' Bwd Header Length', 'Fwd Packets/s', ' Bwd Packets/s',
       ' Min Packet Length', ' Max Packet Length', ' Packet Length Mean',
       ' Packet Length Std', ' Packet Length Variance', '

## Rename Columns

In [None]:
# Strip stray whitespace from every column name. The raw headers look like
# ' Destination Port', ' Flow Duration', ' Label', ... so this turns ' Label'
# into 'Label' (as before) and also gives the remaining columns clean names
# that can be referenced without the leading space.
df.rename(columns=str.strip, inplace=True)


In [None]:
# Normalize dash types
# NOTE(review): the two search strings below look like mis-decoded en/em
# dashes; confirm they match the label text actually present in the CSVs.
df['Label'] = df['Label'].str.replace('â€“', '-', regex=False)
df['Label'] = df['Label'].str.replace('â€”', '-', regex=False)

# Simplify web attack labels
# (The binary step below collapses every non-benign label to 'Attack', so this
# grouping does not change the final labels; it is kept for readability.)
df['Label'] = df['Label'].replace({
    'Web Attack - Brute Force': 'Web Attack',
    'Web Attack - XSS': 'Web Attack',
    'Web Attack - Sql Injection': 'Web Attack'
})

# Binary simplify: 'BENIGN' -> 'Normal', everything else -> 'Attack'.
# 'Normal' is also treated as benign so that re-running this cell does not
# flip already-converted 'Normal' rows to 'Attack'.
benign_mask = df['Label'].isin(['BENIGN', 'Normal'])
df['Label'] = benign_mask.map({True: 'Normal', False: 'Attack'})

# Check distribution
print(df['Label'].value_counts())


Label
Normal    2096134
Attack     425875
Name: count, dtype: int64


In [None]:
# Flow identifiers / timestamps that should not be used as model features.
drop_cols = ['Flow ID', 'Source IP', 'Source Port', 'Destination IP', 'Timestamp']

# Match on the whitespace-stripped name: the raw headers in this dataset carry
# leading spaces, so an exact comparison would silently drop nothing.
# Columns that are absent are simply skipped.
cols_present = [c for c in df.columns if str(c).strip() in drop_cols]
df.drop(columns=cols_present, inplace=True)


In [None]:
from sklearn.preprocessing import LabelEncoder

# Integer-encode every object-typed column except the target; 'Label' is
# mapped to 0/1 separately in the next section.
object_cols = df.select_dtypes(include=['object']).columns
for col in (name for name in object_cols if name != 'Label'):
    le = LabelEncoder()
    df[col] = le.fit_transform(df[col])


## Binary Encoding

In [None]:
# Features: every column except the target.
X = df.drop(columns='Label')

# Target: binary encoding of the two class names.
label_to_int = {'Normal': 0, 'Attack': 1}
y = df['Label'].map(label_to_int)


In [None]:
import numpy as np

# Count non-finite entries per column. Printing the full 78-row Series gets
# truncated in the middle, which hides exactly the columns of interest, so
# report the overall total and list only the columns that are affected.
inf_counts = np.isinf(X).sum()  # Count of infinite values per column
nan_counts = np.isnan(X).sum()  # Count of NaN values per column

for kind, counts in (("Infinite", inf_counts), ("NaN", nan_counts)):
    affected = counts[counts > 0]
    print(f"{kind} values: {int(counts.sum())} total in {len(affected)} column(s)")
    if not affected.empty:
        print(affected.to_string())


 Destination Port              0
 Flow Duration                 0
 Total Fwd Packets             0
 Total Backward Packets        0
Total Length of Fwd Packets    0
                              ..
 Active Min                    0
Idle Mean                      0
 Idle Std                      0
 Idle Max                      0
 Idle Min                      0
Length: 78, dtype: int64
 Destination Port              0
 Flow Duration                 0
 Total Fwd Packets             0
 Total Backward Packets        0
Total Length of Fwd Packets    0
                              ..
 Active Min                    0
Idle Mean                      0
 Idle Std                      0
 Idle Max                      0
 Idle Min                      0
Length: 78, dtype: int64


In [None]:
from sklearn.impute import SimpleImputer
from imblearn.over_sampling import SMOTE
import numpy as np
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler
from collections import Counter
from imblearn.under_sampling import RandomUnderSampler
from imblearn.over_sampling import SMOTE
import pandas as pd

In [None]:


# Report non-finite entries before cleaning.
print("Checking infinities and NaNs in X:")
inf_counts = np.isinf(X).sum()
nan_counts = np.isnan(X).sum()
print("Infinities:", inf_counts)
print("NaNs:", nan_counts)

# Turn +/-inf into NaN, then fill every NaN with the mean of its column.
# From here on X is a plain array; column names are re-attached later when
# the final frame is rebuilt.
inf_mask = np.isinf(X)
X = np.where(inf_mask, np.nan, X)
imputer = SimpleImputer(strategy='mean')
X = imputer.fit_transform(X)

# Class balance before any resampling.
print("Original class distribution:", Counter(y))

Checking infinities and NaNs in X:
Infinities:  Destination Port              0
 Flow Duration                 0
 Total Fwd Packets             0
 Total Backward Packets        0
Total Length of Fwd Packets    0
                              ..
 Active Min                    0
Idle Mean                      0
 Idle Std                      0
 Idle Max                      0
 Idle Min                      0
Length: 78, dtype: int64
NaNs:  Destination Port              0
 Flow Duration                 0
 Total Fwd Packets             0
 Total Backward Packets        0
Total Length of Fwd Packets    0
                              ..
 Active Min                    0
Idle Mean                      0
 Idle Std                      0
 Idle Max                      0
 Idle Min                      0
Length: 78, dtype: int64
Original class distribution: Counter({0: 2096134, 1: 425875})


## Undersampling and Class Balancing

In [None]:
import numpy as np
import pandas as pd
from sklearn.impute import SimpleImputer
from cuml.preprocessing import StandardScaler as cuStandardScaler
from imblearn.over_sampling import RandomOverSampler
from imblearn.under_sampling import RandomUnderSampler
from collections import Counter


In [None]:
from cuml.preprocessing import StandardScaler as cuStandardScaler
from cuml.model_selection import train_test_split
from imblearn.over_sampling import RandomOverSampler

from imblearn.under_sampling import RandomUnderSampler

# NOTE(review): resampling happens here, before the train/test split further
# down. Random over-sampling repeats minority rows, so copies of the same row
# can end up in both the training and the test split, which inflates the
# reported metrics. Consider splitting first and resampling the training
# portion only.

# Step 1: randomly drop majority-class rows until minority/majority = 0.7.
rus = RandomUnderSampler(sampling_strategy=0.7, random_state=42)
X_under, y_under = rus.fit_resample(X, y)

# Step 2: randomly over-sample the minority class so both classes end up
# with the same number of rows.
ros = RandomOverSampler(random_state=42)
X_resampled, y_resampled = ros.fit_resample(X_under, y_under)

# Feature scaling is performed once, in the next cell; fitting the scaler
# here as well only repeated the same computation.






In [None]:
# Standardise the resampled features with cuML's StandardScaler.
# NOTE(review): the scaler is fitted on the full resampled set, i.e. before
# the train/test split further down, so test rows influence the scaling
# statistics. Confirm this is intended.
scaler = cuStandardScaler()
X_resampled_scaled = scaler.fit_transform(X_resampled)


In [None]:
# Re-attach the feature names to the scaled array.
# df.columns.drop('Label') yields the same names as
# df.drop('Label', axis=1).columns without copying the whole frame first.
feature_names = df.columns.drop('Label')
X_final = pd.DataFrame(X_resampled_scaled, columns=feature_names)

# Attach the labels by position: converting to a plain array first gives the
# Series a fresh 0..n-1 index, so the column-wise concat below cannot
# misalign rows even if y_resampled carried a different index.
y_final = pd.Series(np.asarray(y_resampled), name='Label')
df_final = pd.concat([X_final, y_final], axis=1)




In [None]:
# Report the size of the balanced, scaled dataset (features plus 'Label').
print(f"The dataset has {df_final.shape[0]} rows and {df_final.shape[1]} columns.")

The dataset has 1216784 rows and 79 columns.


## Saving Preprocessed Data

In [None]:
# Persist the preprocessed frame to Drive as CSV, without the index column.
output_path = "/content/drive/MyDrive/cnet_preprocessed_data.csv"
df_final.to_csv(path_or_buf=output_path, index=False)

print("File saved at: " + output_path)

File saved at: /content/drive/MyDrive/cnet_preprocessed_data.csv


In [None]:
import pandas as pd

In [None]:
# Reload the preprocessed dataset written to Drive by the saving cell.
# Note: this rebinds `data_path` and `df`, replacing the values used during
# preprocessing.
data_path = "/content/drive/MyDrive/cnet_preprocessed_data.csv"

df = pd.read_csv(data_path)

## Test Train Split

In [None]:
from cuml.model_selection import train_test_split

# Separate the target from the features.
y = df['Label']
X = df.drop(columns='Label')

# Hold out 20% of the rows for testing, stratified on the label so both
# splits keep the same class proportions; the seed makes the split repeatable.
split_parts = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)
X_train, X_test, y_train, y_test = split_parts

print("Train shape:", X_train.shape, "Test shape:", X_test.shape)

Train shape: (973428, 78) Test shape: (243356, 78)


## Principal Component Analysis

In [None]:
from cuml.decomposition import PCA

# Learn a 50-component projection from the training split only, then apply
# that same projection to the test split.
pca = PCA(n_components=50)
X_train_pca = pca.fit_transform(X_train)
X_test_pca = pca.transform(X_test)

n_reduced = X_train_pca.shape[1]
print("Reduced feature dimensions:", n_reduced)

Reduced feature dimensions: 50


## Evaluation Metrics

In [None]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
import cupy as cp # Import cupy

def _to_host_array(values):
    """Return ``values`` in a form scikit-learn metrics can consume.

    Objects exposing ``to_numpy`` (e.g. pandas/cudf Series) are converted with
    it, CuPy arrays are copied to the host with ``.get()``, and anything else
    is returned unchanged.
    """
    if hasattr(values, 'to_numpy'):
        return values.to_numpy()
    if isinstance(values, cp.ndarray):
        return values.get()
    return values


def evaluate_model(y_true, y_pred, name):
    """Print and return weighted classification metrics for one model.

    Args:
        y_true: Ground-truth labels (Series-like, CuPy array, or array-like).
        y_pred: Predicted labels, in any of the same forms.
        name: Display name of the model, used in the printed header and as
            the first element of the returned list.

    Returns:
        ``[name, accuracy, precision, recall, f1]`` where precision, recall
        and F1 use ``average='weighted'``.
    """
    # scikit-learn metrics run on the CPU, so move both inputs to the host.
    y_true_np = _to_host_array(y_true)
    y_pred_np = _to_host_array(y_pred)

    acc = accuracy_score(y_true_np, y_pred_np)
    prec, rec, f1 = (
        metric(y_true_np, y_pred_np, average='weighted')
        for metric in (precision_score, recall_score, f1_score)
    )

    print(f"\nðŸ“Š {name} Results:")
    report_rows = (
        ("Accuracy ", acc),
        ("Precision", prec),
        ("Recall   ", rec),
        ("F1 Score ", f1),
    )
    for caption, value in report_rows:
        print(f"{caption}: {value:.4f}")
    return [name, acc, prec, rec, f1]

## KNN

In [None]:
from cuml.neighbors import KNeighborsClassifier
import joblib

# Fit a 5-nearest-neighbour classifier on the PCA features and score it on
# the held-out split.
knn = KNeighborsClassifier(n_neighbors=5)
knn.fit(X_train_pca, y_train)

y_pred_knn = knn.predict(X_test_pca)
result_knn = evaluate_model(y_test, y_pred_knn, "GPU KNN")

# Persist the fitted model to Drive
knn_model_path = "/content/drive/MyDrive/gpu_knn_model.pkl"
joblib.dump(knn, knn_model_path)
print("KNN model saved to Drive.")


ðŸ“Š GPU KNN Results:
Accuracy : 0.9888
Precision: 0.9888
Recall   : 0.9888
F1 Score : 0.9888
KNN model saved to Drive.


## Random Forest

In [None]:
from cuml.ensemble import RandomForestClassifier

# Fit a 100-tree forest (depth capped at 16, fixed seed) on the PCA features
# and score it on the held-out split.
rf_params = {"n_estimators": 100, "max_depth": 16, "random_state": 42}
rf = RandomForestClassifier(**rf_params)
rf.fit(X_train_pca, y_train)

y_pred_rf = rf.predict(X_test_pca)
result_rf = evaluate_model(y_test, y_pred_rf, "GPU Random Forest")

# Persist the fitted model to Drive (joblib is imported in the KNN cell)
rf_model_path = "/content/drive/MyDrive/gpu_randomforest_model.pkl"
joblib.dump(rf, rf_model_path)
print("Random Forest model saved to Drive.")


ðŸ“Š GPU Random Forest Results:
Accuracy : 0.9979
Precision: 0.9979
Recall   : 0.9979
F1 Score : 0.9979
Random Forest model saved to Drive.


## Naive Bayes

In [None]:
from cuml.naive_bayes import GaussianNB
import joblib
import cupy as cp

# GaussianNB is given CuPy arrays here, so convert the inputs up front.
X_train_pca_cp, y_train_cp, X_test_pca_cp = (
    X_train_pca.to_cupy(),
    y_train.to_cupy(),
    X_test_pca.to_cupy(),
)

nb = GaussianNB()
nb.fit(X_train_pca_cp, y_train_cp)
y_pred_nb = nb.predict(X_test_pca_cp)

# evaluate_model moves its inputs to NumPy itself, so the prediction can be
# passed straight through whatever array type predict() returned.
result_nb = evaluate_model(y_test, y_pred_nb, "GPU Naive Bayes")

# Persist the fitted model to Drive
nb_model_path = "/content/drive/MyDrive/gpu_naivebayes_model.pkl"
joblib.dump(nb, nb_model_path)
print("Naive Bayes model saved to Drive.")


ðŸ“Š GPU Naive Bayes Results:
Accuracy : 0.5645
Precision: 0.7135
Recall   : 0.5645
F1 Score : 0.4724
Naive Bayes model saved to Drive.
