# Intrusion Detection System using Clustering

## Importing packages 

In [2]:

import pyspark.sql.functions as funcs
from pyspark.ml.linalg import Vectors, VectorUDT
from pyspark.sql.types import *
from pyspark.sql import SparkSession

In [None]:
# Start (or reuse) a Spark session running locally; "local[4]" pins the
# master to 4 local worker threads.
spark = (
    SparkSession.builder
    .master("local[4]")
    .appName("ReadFromCsv")
    .getOrCreate()
)

# Load the training CSV: first row is the header, fields are comma-separated,
# and column types are inferred from the data.
iris = (
    spark.read
    .format("csv")
    .option("header", True)
    .option("sep", ",")
    .option("inferSchema", "True")
    .load("./data/TrainDf.csv")
)

# Show the inferred column names and types.
iris.printSchema()

# Data preparation and visualization

In [None]:
from pyspark.ml.feature import VectorAssembler, StringIndexer
from pyspark.ml import Pipeline

# Indexer that encodes the string column "status" as a numeric "label" column.
label_indexer = StringIndexer(inputCol="status", outputCol="label")

# Learn the status -> index mapping from the loaded dataframe ...
label_indexer_model = label_indexer.fit(iris)
# ... then append the "label" column to obtain the working dataframe.
new_df = label_indexer_model.transform(iris)

In [None]:
# Every column except the last one is used as a model input.
# NOTE(review): assumes the last CSV column is the "status" target - confirm.
feature_cols = iris.columns[:-1]

# Pack the input columns into a single vector column named "vec_features".
assembler = VectorAssembler(inputCols=feature_cols, outputCol="vec_features")
assembler_df = assembler.transform(new_df)

# Keep only the rows whose status is "normal".
normal = assembler_df.filter(funcs.col("status") == "normal")

In [None]:
# PCA projects the assembled feature vectors onto their principal components.
from pyspark.ml.feature import PCA

# Read "vec_features" and write the top 9 principal components to "features".
# NOTE(review): the original comment states k=9 equals the number of input
# features; if so, every component is kept and no reduction happens - confirm.
pca = PCA(inputCol="vec_features", outputCol="features", k=9)

# Fit the projection on the "normal" rows only.
pcaModel = pca.fit(normal)

# Project those same rows; the result is what K-Means is trained on.
normal_reduction_df = pcaModel.transform(normal)

In [None]:
# Preview 3 rows of the projected dataframe. limit(3) is applied before
# toPandas() so only those rows are collected to the driver, instead of
# materialising the whole dataframe in pandas just to look at its head.
normal_reduction_df.limit(3).toPandas()

# Train Model (K-Means Clustering)

## Training of Data

In [None]:
# K-Means estimator from Spark ML.
from pyspark.ml.clustering import KMeans

# Number of clusters to fit.
k_num = 2
# Cluster on the "features" column, capped at 100 iterations.
kmeans = KMeans(k=k_num, maxIter=100, featuresCol='features')

# Fit the clustering model on the PCA-projected "normal" rows.
model = kmeans.fit(normal_reduction_df)

In [None]:
# Report the Within Set Sum of Squared Errors (WSSSE) of the fitted model on
# the data it was trained on.
# KMeansModel.computeCost was deprecated in Spark 2.4 and removed in Spark 3.0,
# so use it only where it still exists; otherwise read the equivalent value
# from the training summary (the model was fitted on this same dataframe).
if hasattr(model, "computeCost"):
    cost = model.computeCost(normal_reduction_df)
else:
    cost = model.summary.trainingCost
print("With K= ",k_num)
print("Within Set Sum of Squared Errors = " + str(cost))
print('--'*30)

# Prediction on the Training Dataset

In [None]:
# Score the full labelled dataset with the trained K-Means model.
# The PCA model fitted on the "normal" rows (pcaModel) is reused here on
# purpose: re-fitting PCA on this dataframe would project the rows onto a
# different set of components than the one the cluster centres were learned
# in, so distances to those centres would not be comparable.
test_reduction_df = pcaModel.transform(assembler_df)

# Assign each row to its nearest cluster and keep only the columns of interest.
predictions = model.transform(test_reduction_df)
predictions = predictions.select("features", "label", "prediction")

# Preview 5 rows; limit() before toPandas() avoids collecting the whole
# dataframe to the driver.
predictions.limit(5).toPandas()

#  Calculation of Silhouette Score