<a href="https://colab.research.google.com/github/chanshekk/Real-Time-Intrusion-Detection-System-for-Cloud-Security-using-Spark/blob/main/Real_Time_Intrusion_Detection_System_for_Cloud_Security_using_Spark.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import pandas as pd

# Load training and testing datasets
train_df = pd.read_csv("/content/drive/MyDrive/UNSW_NB15_Dataset/UNSW_NB15_training-set.csv")
test_df = pd.read_csv("/content/drive/MyDrive/UNSW_NB15_Dataset/UNSW_NB15_testing-set.csv")

# Merge datasets vertically (row-wise)
merged_df = pd.concat([train_df, test_df], ignore_index=True)

# Save the merged dataset to a new CSV file
merged_df.to_csv("UNSW_NB15_full.csv", index=False)

print("Merged dataset saved as 'UNSW_NB15_full.csv'")
print("Shape of merged dataset:", merged_df.shape)


Merged dataset saved as 'UNSW_NB15_full.csv'
Shape of merged dataset: (257673, 45)


In [2]:
# ================== Part 0: Setup and Preprocessing ==================
!pip install -q pyspark

import pandas as pd
import numpy as np
import time
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, StandardScaler, MinMaxScaler
from sklearn.impute import SimpleImputer
from sklearn.feature_selection import SelectKBest, chi2
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score, f1_score, confusion_matrix, classification_report

from pyspark.sql import SparkSession
from pyspark.ml.feature import VectorAssembler, VectorSlicer
from pyspark.ml.classification import DecisionTreeClassifier as SparkDTClassifier
from pyspark.ml.evaluation import MulticlassClassificationEvaluator

In [3]:
# Load dataset
df = pd.read_csv("/content/UNSW_NB15_full.csv")
label_encoder = LabelEncoder()
df['label'] = label_encoder.fit_transform(df['attack_cat'])

drop_cols = ['id', 'attack_cat', 'label_name', 'proto', 'service', 'state']
df = df.drop(columns=[col for col in drop_cols if col in df.columns])

X = df.drop("label", axis=1)
y = df["label"]

# Handle NaNs
imputer = SimpleImputer(strategy='mean')
X_imputed = imputer.fit_transform(X)
X = pd.DataFrame(X_imputed, columns=X.columns)

# Standardize features for non-FS parts
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)


In [4]:
# ================== Part 1: Execution without Feature Selection on Spark ==================
start1 = time.time()

X_train, X_test, y_train, y_test = train_test_split(X_scaled, y, test_size=0.3, random_state=42)
model1 = DecisionTreeClassifier(max_depth=10)
model1.fit(X_train, y_train)
y_pred1 = model1.predict(X_test)

end1 = time.time()

print("\n=== Part 1: Execution without Feature Selection on Spark ===")
print("Execution Time: {:.2f} seconds".format(end1 - start1))
print("Accuracy:", accuracy_score(y_test, y_pred1))
print("F1-score:", f1_score(y_test, y_pred1, average='weighted'))
print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred1))
print("Classification Report:\n", classification_report(y_test, y_pred1))



=== Part 1: Execution without Feature Selection on Spark ===
Execution Time: 5.49 seconds
Accuracy: 0.7989832087138754
F1-score: 0.7731410561786786
Confusion Matrix:
 [[   39     1    13   515    81     0   135     0     0     0]
 [    0    17    13   554    75     8     9    12     8     0]
 [    2     3   303  4385   143    13    63    31    15     0]
 [    1    11   193 12130   408    57   366   132    28     2]
 [    0     4    26  1365  3211     1  2565    12    40     0]
 [    0     3    21   285    39 17223    11     6     1     0]
 [    0     0    13   628  1406     1 25913    35    29     1]
 [    0     0    26  1110   104     1    94  2850     8     0]
 [    0     0    17   282    35     2    22    31    68     0]
 [    0     0     0    34     2     1     1     0     0     9]]
Classification Report:
               precision    recall  f1-score   support

           0       0.93      0.05      0.09       784
           1       0.44      0.02      0.05       696
           2  

In [5]:
# ================== Part 2: Execution without Feature Selection ==================
start2 = time.time()

spark = SparkSession.builder.appName("UNSW-NB15").getOrCreate()
spark_df = spark.createDataFrame(pd.DataFrame(X_scaled, columns=X.columns).assign(label=y.values))
spark_df = spark_df.na.drop()

assembler = VectorAssembler(inputCols=X.columns.tolist(), outputCol="features")
spark_df = assembler.transform(spark_df)

train_spark, test_spark = spark_df.randomSplit([0.7, 0.3], seed=42)
dt_spark = SparkDTClassifier(labelCol="label", featuresCol="features", maxDepth=10)
model2 = dt_spark.fit(train_spark)
pred2 = model2.transform(test_spark)

evaluator2 = MulticlassClassificationEvaluator(labelCol="label", predictionCol="prediction", metricName="f1")
pred_df2 = pred2.select("label", "prediction").toPandas()

end2 = time.time()

print("\n=== Part 2: Execution Without Feature Selection ===")
print("Execution Time: {:.2f} seconds".format(end2 - start2))
print("Accuracy:", accuracy_score(pred_df2["label"], pred_df2["prediction"]))
print("F1-score:", evaluator2.evaluate(pred2))
print("Confusion Matrix:\n", confusion_matrix(pred_df2["label"], pred_df2["prediction"]))
print("Classification Report:\n", classification_report(pred_df2["label"], pred_df2["prediction"]))



=== Part 2: Execution Without Feature Selection ===
Execution Time: 156.06 seconds
Accuracy: 0.7997100999119946
F1-score: 0.7698961454335016
Confusion Matrix:
 [[   47     0     0   600    75     0   128     0     0     0]
 [    0    21     5   579    80     1    14     8     7     0]
 [    1     2   179  4420   138     7    68    28    42     0]
 [    7     3    55 12166   480     9   359   254    53     0]
 [    4     0    10  1037  3033     1  3098    54    39     0]
 [    0     1    32   334    32 17159    17     2     8     0]
 [    7     0     7   463  1277     0 26131    60    36     0]
 [    0     1     4  1008    80     0    85  2900     8     0]
 [    0     0    15   172    42     1    25    36   156     0]
 [    0     0     0    48     8     0     1     0     0     0]]
Classification Report:
               precision    recall  f1-score   support

           0       0.71      0.06      0.10       850
           1       0.75      0.03      0.06       715
           2       0.

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


In [6]:
# ================== Part 3: Execution With Feature Selection on Spark ==================
start3 = time.time()

minmax_scaler = MinMaxScaler()
X_minmax = minmax_scaler.fit_transform(X)

selector = SelectKBest(score_func=chi2, k=20)
X_selected = selector.fit_transform(X_minmax, y)

X_train_fs, X_test_fs, y_train_fs, y_test_fs = train_test_split(X_selected, y, test_size=0.3, random_state=42)
model3 = DecisionTreeClassifier(max_depth=10)
model3.fit(X_train_fs, y_train_fs)
y_pred3 = model3.predict(X_test_fs)

end3 = time.time()

print("\n=== Part 3: Execution With Feature Selection on Spark===")
print("Execution Time: {:.2f} seconds".format(end3 - start3))
print("Accuracy:", accuracy_score(y_test_fs, y_pred3))
print("F1-score:", f1_score(y_test_fs, y_pred3, average='weighted'))
print("Confusion Matrix:\n", confusion_matrix(y_test_fs, y_pred3))
print("Classification Report:\n", classification_report(y_test_fs, y_pred3))



=== Part 3: Execution With Feature Selection on Spark===
Execution Time: 2.32 seconds
Accuracy: 0.8039248661095444
F1-score: 0.7719196440795272
Confusion Matrix:
 [[   34     0     3   490   116     0   141     0     0     0]
 [    0    23     5   529   112     0    16     7     4     0]
 [    3     8   152  4396   144    17   143    30    65     0]
 [    6    25   112 12056   334    46   434   247    68     0]
 [    0     2    35   900  2903     7  3283    75    19     0]
 [    0     2    48   315    31 17158    23     5     7     0]
 [    0     0     9   407   938     3 26587    61    21     0]
 [    0     1     3   964    17     3   148  3050     7     0]
 [    0     0    10   102    41     3    69    50   182     0]
 [    0     0     9    34     2     0     2     0     0     0]]
Classification Report:
               precision    recall  f1-score   support

           0       0.79      0.04      0.08       784
           1       0.38      0.03      0.06       696
           2      

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


In [7]:
# ================== Part 4: Execution with Feature Selection ==================
start4 = time.time()

selected_indices = selector.get_support(indices=True).tolist()
slicer = VectorSlicer(inputCol="features", outputCol="selectedFeatures", indices=selected_indices)
sliced_df = slicer.transform(spark_df)

train_sel, test_sel = sliced_df.randomSplit([0.7, 0.3], seed=42)
dt_sel = SparkDTClassifier(labelCol="label", featuresCol="selectedFeatures", maxDepth=10)
model4 = dt_sel.fit(train_sel)
pred4 = model4.transform(test_sel)

evaluator4 = MulticlassClassificationEvaluator(labelCol="label", predictionCol="prediction", metricName="f1")
pred_df4 = pred4.select("label", "prediction").toPandas()

end4 = time.time()

print("\n=== Part 4: Spark With Feature Selection ===")
print("Execution Time: {:.2f} seconds".format(end4 - start4))
print("Accuracy:", accuracy_score(pred_df4["label"], pred_df4["prediction"]))
print("F1-score:", evaluator4.evaluate(pred4))
print("Confusion Matrix:\n", confusion_matrix(pred_df4["label"], pred_df4["prediction"]))
print("Classification Report:\n", classification_report(pred_df4["label"], pred_df4["prediction"]))



=== Part 4: Spark With Feature Selection ===
Execution Time: 65.86 seconds
Accuracy: 0.800137184863074
F1-score: 0.7709684910643314
Confusion Matrix:
 [[   37     0    20   584    60     0   148     1     0     0]
 [    0    29    16   560    79     0    20     4     7     0]
 [    2     3   179  4323   171     3   107    27    70     0]
 [    9    12   181 11915   454    21   500   223    71     0]
 [    3     7    18   968  3086     5  3080    85    24     0]
 [    0     2    29   346    70 17116    12     1     9     0]
 [    1     3     8   390  1130     1 26354    56    38     0]
 [    0     1    63   890    61     0   129  2923    19     0]
 [    0     0    38    76    58     0    54    35   186     0]
 [    0     2     1    40    13     0     1     0     0     0]]
Classification Report:
               precision    recall  f1-score   support

           0       0.71      0.04      0.08       850
           1       0.49      0.04      0.07       715
           2       0.32      0

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
