<a href="https://colab.research.google.com/github/chanshekk/Real-Time-Intrusion-Detection-System-for-Cloud-Security-using-Spark/blob/main/Real_Time_Intrusion_Detection_System_for_Cloud_Security_using_Spark.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import pandas as pd

# Read the two UNSW-NB15 partitions (training first, then testing) from the Drive folder
partition_paths = [
    "/content/drive/MyDrive/UNSW_NB15_Dataset/UNSW_NB15_training-set.csv",
    "/content/drive/MyDrive/UNSW_NB15_Dataset/UNSW_NB15_testing-set.csv",
]
train_df, test_df = (pd.read_csv(csv_path) for csv_path in partition_paths)

# Stack the partitions row-wise; ignore_index renumbers the rows from 0
frames_to_stack = [train_df, test_df]
merged_df = pd.concat(frames_to_stack, ignore_index=True)

# Write the combined table to the working directory without the index column
merged_df.to_csv("UNSW_NB15_full.csv", index=False)

print("Merged dataset saved as 'UNSW_NB15_full.csv'")
print("Shape of merged dataset:", merged_df.shape)


Merged dataset saved as 'UNSW_NB15_full.csv'
Shape of merged dataset: (257673, 45)


In [None]:
# ================== Part 0: Setup and Preprocessing ==================
!pip install -q pyspark

import pandas as pd
import numpy as np
import time
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, StandardScaler, MinMaxScaler
from sklearn.impute import SimpleImputer
from sklearn.feature_selection import SelectKBest, chi2
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score, f1_score, confusion_matrix, classification_report

from pyspark.sql import SparkSession
from pyspark.ml.feature import VectorAssembler, VectorSlicer
from pyspark.ml.classification import DecisionTreeClassifier as SparkDTClassifier
from pyspark.ml.evaluation import MulticlassClassificationEvaluator

In [None]:
# Read the merged UNSW-NB15 CSV
df = pd.read_csv("/content/UNSW_NB15_full.csv")

# Store one integer code per attack category in `label`
label_encoder = LabelEncoder()
label_encoder.fit(df['attack_cat'])
df['label'] = label_encoder.transform(df['attack_cat'])

# Columns excluded from modelling; names that are not present in the frame are skipped
drop_cols = ['id', 'attack_cat', 'label_name', 'proto', 'service', 'state']
df = df.drop(columns=drop_cols, errors='ignore')

# Target vector and feature matrix
y = df["label"]
X = df.drop(columns=["label"])

# Fill missing values with the column mean, then restore the column names
imputer = SimpleImputer(strategy='mean')
X_imputed = imputer.fit_transform(X)
X = pd.DataFrame(data=X_imputed, columns=X.columns)

# Zero-mean / unit-variance features used by the runs without feature selection
scaler = StandardScaler()
scaler.fit(X)
X_scaled = scaler.transform(X)


In [None]:
# ================== Part 1: Execution without Feature Selection (scikit-learn) ==================
# Baseline run: a scikit-learn decision tree trained on every standardized feature.
# This cell does not use Spark; the Spark counterpart is Part 2.
start1 = time.time()

X_train, X_test, y_train, y_test = train_test_split(X_scaled, y, test_size=0.3, random_state=42)
# Seed the tree: scikit-learn permutes the features at each split, so ties between
# equally good splits could otherwise be resolved differently from run to run.
model1 = DecisionTreeClassifier(max_depth=10, random_state=42)
model1.fit(X_train, y_train)
y_pred1 = model1.predict(X_test)

end1 = time.time()

print("\n=== Part 1: Execution Without Feature Selection (scikit-learn) ===")
print("Execution Time: {:.2f} seconds".format(end1 - start1))
print("Accuracy:", accuracy_score(y_test, y_pred1))
print("F1-score:", f1_score(y_test, y_pred1, average='weighted'))
print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred1))
# zero_division=0 reports 0.0 (without a warning) for any class that is never predicted
print("Classification Report:\n", classification_report(y_test, y_pred1, zero_division=0))



=== Part 1: Execution without Feature Selection on Spark ===
Execution Time: 8.33 seconds
Accuracy: 0.7989573361620657
F1-score: 0.7731619744194524
Confusion Matrix:
 [[   39     1    13   515    81     0   135     0     0     0]
 [    0    17    14   553    75     8     9    12     8     0]
 [    2     3   307  4383   143    11    63    31    15     0]
 [    1    11   193 12128   408    57   367   132    29     2]
 [    0     4    26  1364  3211     1  2566    12    40     0]
 [    0     3    21   286    39 17222    11     6     1     0]
 [    0     0    13   629  1406     1 25912    35    29     1]
 [    0     0    26  1110   104     1    94  2850     8     0]
 [    0     0    17   282    35     3    22    32    66     0]
 [    0     0     0    34     2     1     1     0     0     9]]
Classification Report:
               precision    recall  f1-score   support

           0       0.93      0.05      0.09       784
           1       0.44      0.02      0.05       696
           2  

In [None]:
# ================== Part 2: Execution without Feature Selection (Spark) ==================
# Spark decision tree trained on every standardized feature.
#
# One-off setup (session start-up, pandas -> Spark conversion, vector assembly) is done
# BEFORE the timer starts. Part 4 reuses `spark_df` and never pays this cost, so timing
# it here would make the Part 2 vs Part 4 comparison unfair.
spark = SparkSession.builder.appName("UNSW-NB15").getOrCreate()
spark_df = spark.createDataFrame(pd.DataFrame(X_scaled, columns=X.columns).assign(label=y.values))
spark_df = spark_df.na.drop()

assembler = VectorAssembler(inputCols=X.columns.tolist(), outputCol="features")
# Cache and materialize the assembled frame: Spark is lazy, so without this the
# conversion and assembly would be re-executed inside every later fit/transform.
spark_df = assembler.transform(spark_df).cache()
spark_df.count()

# Timed region: split, fit, predict and collect predictions
start2 = time.time()

train_spark, test_spark = spark_df.randomSplit([0.7, 0.3], seed=42)
dt_spark = SparkDTClassifier(labelCol="label", featuresCol="features", maxDepth=10)
model2 = dt_spark.fit(train_spark)
pred2 = model2.transform(test_spark)

evaluator2 = MulticlassClassificationEvaluator(labelCol="label", predictionCol="prediction", metricName="f1")
pred_df2 = pred2.select("label", "prediction").toPandas()

end2 = time.time()

print("\n=== Part 2: Execution Without Feature Selection (Spark) ===")
print("Execution Time: {:.2f} seconds".format(end2 - start2))
print("Accuracy:", accuracy_score(pred_df2["label"], pred_df2["prediction"]))
print("F1-score:", evaluator2.evaluate(pred2))
print("Confusion Matrix:\n", confusion_matrix(pred_df2["label"], pred_df2["prediction"]))
# zero_division=0 reports 0.0 (without a warning) for any class that is never predicted
print("Classification Report:\n", classification_report(pred_df2["label"], pred_df2["prediction"], zero_division=0))



=== Part 2: Execution Without Feature Selection ===
Execution Time: 159.19 seconds
Accuracy: 0.7995289123569913
F1-score: 0.7631682791661978
Confusion Matrix:
 [[   44     0     0   626    41     0   134     5     0     0]
 [    0    43     6   567    47     1    27    13    11     0]
 [    3     2   141  4317    90    19   175    80    58     0]
 [    8    10    42 12171   332    25   530   209    55     4]
 [    4     0     5  1024  2285     1  3885    42    29     1]
 [    0     5    23   325    13 17169    32     8     7     3]
 [    7     0     0   462   653     0 26775    51    32     1]
 [    0     2     3   930    80     1   116  2943    11     0]
 [    0     0     1   102    36     8    51    56   193     0]
 [    0     0     0    38     4     0     1     0     0    14]]
Classification Report:
               precision    recall  f1-score   support

           0       0.67      0.05      0.10       850
           1       0.69      0.06      0.11       715
           2       0.

In [None]:
# ================== Part 3: Execution With Feature Selection (scikit-learn) ==================
# scikit-learn decision tree trained on the 20 features with the highest chi2 score.
# This cell does not use Spark; the Spark counterpart is Part 4.
start3 = time.time()

# Split BEFORE fitting the scaler and the selector. Fitting them on all rows would let
# the held-out labels influence which features are kept (data leakage) and inflate the
# test metrics.
X_train_raw, X_test_raw, y_train_fs, y_test_fs = train_test_split(X, y, test_size=0.3, random_state=42)

# chi2 requires non-negative inputs, hence min-max scaling rather than standardization
minmax_scaler = MinMaxScaler()
X_train_minmax = minmax_scaler.fit_transform(X_train_raw)
X_test_minmax = minmax_scaler.transform(X_test_raw)

# Rank features on the training rows only, then apply the same column mask to the test rows
selector = SelectKBest(score_func=chi2, k=20)
X_train_fs = selector.fit_transform(X_train_minmax, y_train_fs)
X_test_fs = selector.transform(X_test_minmax)

# Seeded so that ties between equally good splits are resolved the same way on every run
model3 = DecisionTreeClassifier(max_depth=10, random_state=42)
model3.fit(X_train_fs, y_train_fs)
y_pred3 = model3.predict(X_test_fs)

end3 = time.time()

# Full-data views kept under their original names for any later cell that expects them;
# they are produced by the train-fitted scaler/selector and are outside the timed region.
X_minmax = minmax_scaler.transform(X)
X_selected = selector.transform(X_minmax)

print("\n=== Part 3: Execution With Feature Selection (scikit-learn) ===")
print("Execution Time: {:.2f} seconds".format(end3 - start3))
print("Accuracy:", accuracy_score(y_test_fs, y_pred3))
print("F1-score:", f1_score(y_test_fs, y_pred3, average='weighted'))
print("Confusion Matrix:\n", confusion_matrix(y_test_fs, y_pred3))
# zero_division=0 reports 0.0 for classes that are never predicted instead of emitting
# an undefined-metric warning for each averaging mode
print("Classification Report:\n", classification_report(y_test_fs, y_pred3, zero_division=0))



=== Part 3: Execution With Feature Selection on Spark===
Execution Time: 1.66 seconds
Accuracy: 0.8039248661095444
F1-score: 0.7719664323254988
Confusion Matrix:
 [[   34     0     3   490   116     0   141     0     0     0]
 [    0    23     5   529   112     0    16     7     4     0]
 [    3     7   154  4396   147    13   145    28    65     0]
 [    6    25   111 12056   336    45   434   247    68     0]
 [    0     2    36   900  2902     7  3283    75    19     0]
 [    0     2    49   315    31 17157    23     5     7     0]
 [    0     0     9   407   938     3 26587    61    21     0]
 [    0     1     4   964    17     2   148  3050     7     0]
 [    0     0    10   102    41     3    69    50   182     0]
 [    0     0     9    34     2     0     2     0     0     0]]
Classification Report:
               precision    recall  f1-score   support

           0       0.79      0.04      0.08       784
           1       0.38      0.03      0.06       696
           2      

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


In [None]:
# ================== Part 4: Spark With Feature Selection ==================
# Spark decision tree trained on the features kept by the SelectKBest `selector`.
start4 = time.time()

# The selector was fit on columns in X.columns order and the "features" vector was
# assembled from X.columns in the same order, so selector indices map directly onto
# vector slots.
selected_indices = selector.get_support(indices=True).tolist()
slicer = VectorSlicer(inputCol="features", outputCol="selectedFeatures", indices=selected_indices)
sliced_df = slicer.transform(spark_df)

train_sel, test_sel = sliced_df.randomSplit([0.7, 0.3], seed=42)
dt_sel = SparkDTClassifier(labelCol="label", featuresCol="selectedFeatures", maxDepth=10)
model4 = dt_sel.fit(train_sel)
pred4 = model4.transform(test_sel)

evaluator4 = MulticlassClassificationEvaluator(labelCol="label", predictionCol="prediction", metricName="f1")
pred_df4 = pred4.select("label", "prediction").toPandas()

end4 = time.time()

print("\n=== Part 4: Spark With Feature Selection ===")
print("Execution Time: {:.2f} seconds".format(end4 - start4))
print("Accuracy:", accuracy_score(pred_df4["label"], pred_df4["prediction"]))
print("F1-score:", evaluator4.evaluate(pred4))
print("Confusion Matrix:\n", confusion_matrix(pred_df4["label"], pred_df4["prediction"]))
# zero_division=0 reports 0.0 for classes that are never predicted instead of emitting
# an undefined-metric warning for each averaging mode
print("Classification Report:\n", classification_report(pred_df4["label"], pred_df4["prediction"], zero_division=0))



=== Part 4: Spark With Feature Selection ===
Execution Time: 64.01 seconds
Accuracy: 0.8002536625770047
F1-score: 0.7698436204858861
Confusion Matrix:
 [[   41     0    16   605    60     0   127     1     0     0]
 [    0    30    15   556    78     0    20     8     8     0]
 [    1     3   139  4431   144     6    60    38    63     0]
 [   12    12   146 12167   396    44   336   202    71     0]
 [    3     7    13   999  2988     8  3115   107    36     0]
 [    0     2    14   370    43 17133    13     2     8     0]
 [    1     3     7   510  1079     4 26238    88    51     0]
 [    0     1    17   985    56     0    98  2919    10     0]
 [    0     0     8   116    54     1    47    42   179     0]
 [    0     2     1    44     7     0     1     0     2     0]]
Classification Report:
               precision    recall  f1-score   support

           0       0.71      0.05      0.09       850
           1       0.50      0.04      0.08       715
           2       0.37      

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
