# Intrusion Detection based Anomaly method using Classification algorithms 
- Decision Tree, 

- Random Forest Tree, 

- Gradient Boost Tree, 

- Naive Bayes 

- Logistic Regression

### Importing Packages and configuring spark engine

In [1]:
import pyspark.sql.functions as funcs
from pyspark.ml.linalg import Vectors, VectorUDT
from pyspark.sql.types import *
from pyspark.sql import SparkSession

In [2]:
spark = SparkSession.builder\
.master("local[4]")\
.appName("ReadFromCsv")\
.config("spark.driver.memory","3g")\
.config("spark.executor.memory", "4g")\
.getOrCreate()

In [3]:
'''logger = spark.sparkContext._jvm.org.apache.log4j
logger.LogManager.getLogger("org"). setLevel(logger.Level.ERROR)
logger.LogManager.getLogger("akka").setLevel(logger.Level.ERROR)'''

'logger = spark.sparkContext._jvm.org.apache.log4j\nlogger.LogManager.getLogger("org"). setLevel(logger.Level.ERROR)\nlogger.LogManager.getLogger("akka").setLevel(logger.Level.ERROR)'

# 1. Load Dataset

In [4]:
iris = spark.read \
.format("csv")\
.option("header", True)\
.option("sep", ",")\
.option("inferSchema", "True")\
.load("TrainDf.csv")

In [5]:
iris.printSchema()

root
 |-- duration: integer (nullable = true)
 |-- src_bytes: integer (nullable = true)
 |-- dst_bytes: integer (nullable = true)
 |-- land: integer (nullable = true)
 |-- wrong_fragment: integer (nullable = true)
 |-- urgent: integer (nullable = true)
 |-- hot: integer (nullable = true)
 |-- num_failed_logins: integer (nullable = true)
 |-- logged_in: integer (nullable = true)
 |-- num_compromised: integer (nullable = true)
 |-- root_shell: integer (nullable = true)
 |-- su_attempted: integer (nullable = true)
 |-- num_root: integer (nullable = true)
 |-- num_file_creations: integer (nullable = true)
 |-- num_shells: integer (nullable = true)
 |-- num_access_files: integer (nullable = true)
 |-- num_outbound_cmds: integer (nullable = true)
 |-- is_host_login: integer (nullable = true)
 |-- is_guest_login: integer (nullable = true)
 |-- count: integer (nullable = true)
 |-- srv_count: integer (nullable = true)
 |-- serror_rate: double (nullable = true)
 |-- srv_serror_rate: double (nul

# 2. Data Preparation 

In [6]:
from pyspark.ml.feature import VectorAssembler, StringIndexer
from pyspark.ml import Pipeline

In [7]:
feature_cols = iris.columns[:-1]

In [8]:
label_indexer = StringIndexer(inputCol = "status", outputCol = "label")

In [9]:
assembler = VectorAssembler(inputCols = feature_cols, outputCol = 'features')

In [10]:
pipe = Pipeline(stages=[assembler, label_indexer])
pipe_model = pipe.fit(iris)

In [11]:
data = pipe_model.transform(iris)
data = data.select("features","label")

In [12]:
train, test = data.randomSplit([0.70, 0.30])

# 3. Train Model

### 3.1 Decision Tree Algorithm

In [13]:
from pyspark.ml.evaluation import MulticlassClassificationEvaluator
from pyspark.ml.classification import DecisionTreeClassifier

#### 3.1.1 Training and Predicting of Model

In [14]:
dt = DecisionTreeClassifier(labelCol="label", featuresCol="features")
modeldt = dt.fit(train)
predictiondt = modeldt.transform(test)
predictiondt.toPandas().head()

Unnamed: 0,features,label,rawPrediction,probability,prediction
0,"(454.0, 492.0, 14334.0, 0.0, 0.0, 0.0, 2.0, 0....",0.0,"[654.0, 48.0]","[0.9316239316239316, 0.06837606837606838]",0.0
1,"(53.0, 2628.0, 3860.0, 0.0, 0.0, 0.0, 3.0, 0.0...",1.0,"[654.0, 48.0]","[0.9316239316239316, 0.06837606837606838]",0.0
2,"(1.0, 259.0, 901953.0, 0.0, 0.0, 0.0, 1.0, 0.0...",0.0,"[178.0, 40.0]","[0.8165137614678899, 0.1834862385321101]",0.0
3,"(2.0, 54540.0, 8314.0, 0.0, 0.0, 0.0, 2.0, 0.0...",1.0,"[12.0, 871.0]","[0.013590033975084938, 0.986409966024915]",1.0
4,"(5.0, 54540.0, 8314.0, 0.0, 0.0, 0.0, 2.0, 0.0...",1.0,"[12.0, 871.0]","[0.013590033975084938, 0.986409966024915]",1.0


#### 3.1.2 Confusion Matrix of Decision Tree

In [15]:
predictiondt.select("prediction", "label")\
.groupBy("prediction", "label").count()\
.orderBy("prediction", "label", ascending=True).withColumn("status",
funcs.when(funcs.col("label").isin(1), "Anomaly")\
.otherwise("Normal")).toPandas().head()

Unnamed: 0,prediction,label,count,status
0,0.0,0.0,20104,Normal
1,0.0,1.0,259,Anomaly
2,1.0,0.0,113,Normal
3,1.0,1.0,17263,Anomaly


In [16]:
predictiondt.groupBy(["label","prediction"]).count().toPandas().head()

Unnamed: 0,label,prediction,count
0,1.0,1.0,17263
1,0.0,1.0,113
2,1.0,0.0,259
3,0.0,0.0,20104


#### 3.1.3 Calculation of Accuracy

In [17]:
evaluatordt = MulticlassClassificationEvaluator(
    labelCol="label", predictionCol="prediction", metricName="accuracy")
dt = evaluatordt.evaluate(predictiondt)

print("--- Decision Tree --- ")
print("Accuracy Rate =", round(dt,4))
print("  Error  Rate = %g " % round((1.0 - dt),4))

--- Decision Tree --- 
Accuracy Rate = 0.9901
  Error  Rate = 0.0099 


In [18]:
predictionAndLabel = predictiondt.select("prediction", "label").rdd

from pyspark.mllib.evaluation import MulticlassMetrics
metrics = MulticlassMetrics(predictionAndLabel)
cm = metrics.confusionMatrix()
rows = cm.toArray().tolist()

confusion_matrix = spark.createDataFrame(rows,["normal","anomaly"])
confusion_matrix.show()

+-------+-------+
| normal|anomaly|
+-------+-------+
|20104.0|  113.0|
|  259.0|17263.0|
+-------+-------+



In [19]:
predictiondt.withColumn("A", funcs.struct("prediction","label")).crosstab("prediction","label").show()

+----------------+-----+-----+
|prediction_label|  0.0|  1.0|
+----------------+-----+-----+
|             1.0|  113|17263|
|             0.0|20104|  259|
+----------------+-----+-----+



### 3.2 Random Forest Algorithm

In [20]:
from pyspark.ml.classification import RandomForestClassifier

#### 3.2.1 Training and Predicting of Model

In [21]:
rf = RandomForestClassifier(labelCol="label", featuresCol="features", numTrees=10)
modelrf = rf.fit(train)
predictionrf = modelrf.transform(test)
predictionrf.toPandas().head(3)

Unnamed: 0,features,label,rawPrediction,probability,prediction
0,"(454.0, 492.0, 14334.0, 0.0, 0.0, 0.0, 2.0, 0....",0.0,"[8.681892321918907, 1.3181076780810923]","[0.8681892321918907, 0.13181076780810924]",0.0
1,"(53.0, 2628.0, 3860.0, 0.0, 0.0, 0.0, 3.0, 0.0...",1.0,"[8.175290104858725, 1.8247098951412761]","[0.8175290104858725, 0.1824709895141276]",0.0
2,"(1.0, 259.0, 901953.0, 0.0, 0.0, 0.0, 1.0, 0.0...",0.0,"[8.819719058928754, 1.180280941071244]","[0.8819719058928756, 0.11802809410712442]",0.0


#### 3.2.2 Calculation of Accuracy

In [22]:
evaluatorrf = MulticlassClassificationEvaluator(
    labelCol="label", predictionCol="prediction", metricName="accuracy")
rf = evaluatorrf.evaluate(predictionrf)

print("--- Random Forest Tree --- ")
print("Accuracy Rate =", round(rf,4))
print("  Error  Rate = %g " % round((1.0 - rf),4))

--- Random Forest Tree --- 
Accuracy Rate = 0.9821
  Error  Rate = 0.0179 


#### 3.2.3 Confusion Matrix of Decision Tree

In [23]:
predictionAndLabel = predictionrf.select("prediction", "label").rdd

from pyspark.mllib.evaluation import MulticlassMetrics
metrics = MulticlassMetrics(predictionAndLabel)
cm = metrics.confusionMatrix()
rows = cm.toArray().tolist()

confusion_matrix = spark.createDataFrame(rows,["normal","anomaly"])
confusion_matrix.show()

+-------+-------+
| normal|anomaly|
+-------+-------+
|20123.0|   94.0|
|  581.0|16941.0|
+-------+-------+



In [24]:
predictionAndLabels = predictionrf.select("prediction", "label").rdd
metrics = MulticlassMetrics(predictionAndLabels)
metrics.confusionMatrix()

DenseMatrix(2, 2, [20123.0, 581.0, 94.0, 16941.0], 0)

In [25]:
predictionrf.withColumn("A", funcs.struct("prediction","label")).crosstab("prediction","label").show()

+----------------+-----+-----+
|prediction_label|  0.0|  1.0|
+----------------+-----+-----+
|             1.0|   94|16941|
|             0.0|20123|  581|
+----------------+-----+-----+



In [26]:
predictionrfevaluator = MulticlassClassificationEvaluator(predictionCol="prediction", \
                    labelCol="label",metricName="accuracy")
predictionrfevaluator.evaluate(predictionrf)      

predictionrf.groupBy("label","prediction").count().show()

+-----+----------+-----+
|label|prediction|count|
+-----+----------+-----+
|  1.0|       1.0|16941|
|  0.0|       1.0|   94|
|  1.0|       0.0|  581|
|  0.0|       0.0|20123|
+-----+----------+-----+



### 3.3 Naive Bayes Algorithm

In [27]:
from pyspark.ml.classification import NaiveBayes
from pyspark.ml.evaluation import MulticlassClassificationEvaluator

#### 3.3.1 Training and Predicting of Model

In [28]:
nb = NaiveBayes(smoothing=1.0, modelType="multinomial")

modelnb = nb.fit(train)
predictionnb = modelnb.transform(test)
predictionnb.toPandas().head(3)

Unnamed: 0,features,label,rawPrediction,probability,prediction
0,"(454.0, 492.0, 14334.0, 0.0, 0.0, 0.0, 2.0, 0....",0.0,"[-24532.222984350396, -63788.003361581854]","[1.0, 0.0]",0.0
1,"(53.0, 2628.0, 3860.0, 0.0, 0.0, 0.0, 3.0, 0.0...",1.0,"[-7013.151465440639, -16872.072534181578]","[1.0, 0.0]",0.0
2,"(1.0, 259.0, 901953.0, 0.0, 0.0, 0.0, 1.0, 0.0...",0.0,"[-1377373.2928589834, -3822601.502826015]","[1.0, 0.0]",0.0


#### 3.3.2 Calculation of Accuracy

In [29]:
evaluatornb = MulticlassClassificationEvaluator(labelCol="label", 
                                              predictionCol="prediction", 
                                              metricName="accuracy")
nb = evaluatornb.evaluate(predictionnb)

print("--- Naive Bayes --- ")
print("Accuracy Rate =", round(nb,4))
print("  Error  Rate = %g " % round((1.0 - nb),4))

--- Naive Bayes --- 
Accuracy Rate = 0.5186
  Error  Rate = 0.4814 


### 3.4 Gradient Boost Tree

In [30]:
from pyspark.ml.classification import GBTClassifier

#### 3.4.1 Training and Predicting of Model

In [31]:
gbt = GBTClassifier(labelCol="label", featuresCol="features", maxIter=10)

modelgbt = gbt.fit(train)
predictiongbt = modelgbt.transform(test)
predictiongbt.toPandas().head(3)

Unnamed: 0,features,label,rawPrediction,probability,prediction
0,"(454.0, 492.0, 14334.0, 0.0, 0.0, 0.0, 2.0, 0....",0.0,"[0.047137151485949685, -0.047137151485949685]","[0.523551135483466, 0.476448864516534]",0.0
1,"(53.0, 2628.0, 3860.0, 0.0, 0.0, 0.0, 3.0, 0.0...",1.0,"[-0.403685284087618, 0.403685284087618]","[0.30845109453864805, 0.691548905461352]",1.0
2,"(1.0, 259.0, 901953.0, 0.0, 0.0, 0.0, 1.0, 0.0...",0.0,"[0.8948856274584156, -0.8948856274584156]","[0.856899229415159, 0.14310077058484105]",0.0


#### 3.4.2 Calculation of Accuracy

In [32]:
evaluatorgbt = MulticlassClassificationEvaluator(
    labelCol="label", predictionCol="prediction", metricName="accuracy")
gbt = evaluatorgbt.evaluate(predictiongbt)

print("--- Gradient Boost Tree --- ")
print("Accuracy Rate =", round(gbt,4))
print("  Error  Rate = %g " % round((1.0 - gbt),4))

--- Gradient Boost Tree --- 
Accuracy Rate = 0.9908
  Error  Rate = 0.0092 


### 3.5 Logistic Regression

In [33]:
from pyspark.ml.classification import LogisticRegression

#### 3.5.1 Training and Predicting of Model

In [34]:
lr = LogisticRegression(regParam=0.01)
modellr = lr.fit(train)
predictionlr = modellr.transform(test)
predictionlr.toPandas().head(3)

Unnamed: 0,features,label,rawPrediction,probability,prediction
0,"(454.0, 492.0, 14334.0, 0.0, 0.0, 0.0, 2.0, 0....",0.0,"[-0.1709324425779426, 0.1709324425779426]","[0.45737063389216565, 0.5426293661078343]",1.0
1,"(53.0, 2628.0, 3860.0, 0.0, 0.0, 0.0, 3.0, 0.0...",1.0,"[0.12584733567313067, -0.12584733567313067]","[0.5314203763733614, 0.4685796236266385]",0.0
2,"(1.0, 259.0, 901953.0, 0.0, 0.0, 0.0, 1.0, 0.0...",0.0,"[3.640972855869962, -3.640972855869962]","[0.9744434503047767, 0.02555654969522331]",0.0


#### 3.5.2 Calculation of Accuracy

In [35]:
evaluatorlr = MulticlassClassificationEvaluator(
    labelCol="label", predictionCol="prediction", metricName="accuracy")
lr = evaluatorlr.evaluate(predictionlr)

In [36]:
print("--- Logistic Regression --- ")
print("Accuracy Rate =", round(lr,4))
print("  Error  Rate = %g " % round((1.0 - lr),4))

--- Logistic Regression --- 
Accuracy Rate = 0.9462
  Error  Rate = 0.0538 


## 3.6 Comparison of Accucary  Rate of Algorithms

In [37]:
print("Gradient Boost Tree Accuracy =", round(gbt,5))
print("      Decision Tree Accuracy =", round(dt,5))
print(" Random Forest Tree Accuracy =", round(rf,5))
print("Logistic Regression Accuracy =", round(lr,5))
print("        Naive Bayes Accuracy =", round(nb,5))

Gradient Boost Tree Accuracy = 0.99081
      Decision Tree Accuracy = 0.99014
 Random Forest Tree Accuracy = 0.98211
Logistic Regression Accuracy = 0.94616
        Naive Bayes Accuracy = 0.51861


# 4. Streaming Process

In [38]:
schema = StructType(
[
    StructField("duration", FloatType(), True),
    StructField("src_bytes", FloatType(), True),
    StructField("dst_bytes", FloatType(), True),
    StructField("land", FloatType(), True),
    StructField("wrong_fragment", FloatType(), True),
    StructField("urgent", FloatType(), True),
    StructField("hot", FloatType(), True),
    StructField("num_failed_logins", FloatType(), True),
    StructField("logged_in", FloatType(), True),
    StructField("num_compromised", FloatType(), True),
    StructField("root_shell", FloatType(), True),
    StructField("su_attempted", FloatType(), True),
    StructField("num_root", FloatType(), True),
    StructField("num_file_creations", FloatType(), True),
    StructField("num_shells", FloatType(), True),
    StructField("num_access_files", FloatType(), True),
    StructField("num_outbound_cmds", FloatType(), True),
    StructField("is_host_login", FloatType(), True),
    StructField("is_guest_login", FloatType(), True),
    StructField("count", FloatType(), True),
    StructField("srv_count", FloatType(), True),
    StructField("serror_rate", FloatType(), True),
    StructField("srv_serror_rate", FloatType(), True),
    StructField("rerror_rate", FloatType(), True),
    StructField("srv_rerror_rate", FloatType(), True),
    StructField("same_srv_rate", FloatType(), True),
    StructField("diff_srv_rate", FloatType(), True),
    StructField("srv_diff_host_rate", FloatType(), True),
    StructField("dst_host_count", FloatType(), True),
    StructField("dst_host_srv_count", FloatType(), True),
    StructField("dst_host_same_srv_rate", FloatType(), True),
    StructField("dst_host_diff_srv_rate", FloatType(), True),
    StructField("dst_host_same_src_port_rate", FloatType(), True),
    StructField("dst_host_srv_diff_host_rate", FloatType(), True),
    StructField("dst_host_serror_rate", FloatType(), True),
    StructField("dst_host_srv_serror_rate", FloatType(), True),
    StructField("dst_host_rerror_rate", FloatType(), True),
    StructField("dst_host_srv_rerror_rate", FloatType(), True),
    
    StructField("status", StringType(), True)
])

### 5.1 Reading Streaming Data

In [39]:
iris_data = spark.readStream \
.format("csv")\
.option("header", True)\
.option("sep", ",")\
.schema(schema)\
.load("data")\

In [40]:
iris.printSchema()

root
 |-- duration: integer (nullable = true)
 |-- src_bytes: integer (nullable = true)
 |-- dst_bytes: integer (nullable = true)
 |-- land: integer (nullable = true)
 |-- wrong_fragment: integer (nullable = true)
 |-- urgent: integer (nullable = true)
 |-- hot: integer (nullable = true)
 |-- num_failed_logins: integer (nullable = true)
 |-- logged_in: integer (nullable = true)
 |-- num_compromised: integer (nullable = true)
 |-- root_shell: integer (nullable = true)
 |-- su_attempted: integer (nullable = true)
 |-- num_root: integer (nullable = true)
 |-- num_file_creations: integer (nullable = true)
 |-- num_shells: integer (nullable = true)
 |-- num_access_files: integer (nullable = true)
 |-- num_outbound_cmds: integer (nullable = true)
 |-- is_host_login: integer (nullable = true)
 |-- is_guest_login: integer (nullable = true)
 |-- count: integer (nullable = true)
 |-- srv_count: integer (nullable = true)
 |-- serror_rate: double (nullable = true)
 |-- srv_serror_rate: double (nul

In [41]:
features_array = iris_data.selectExpr("""array(
CAST(duration AS FLOAT),
CAST(src_bytes AS FLOAT), 
CAST(dst_bytes AS FLOAT), 
CAST(land AS FLOAT),
CAST(wrong_fragment AS FLOAT), 
CAST(urgent AS FLOAT), 
CAST(hot AS FLOAT), 
CAST(num_failed_logins AS FLOAT), 
CAST(logged_in AS FLOAT),
CAST(num_compromised AS FLOAT), 
CAST(root_shell AS FLOAT),
CAST(su_attempted AS FLOAT), 
CAST(num_root AS FLOAT),
CAST(num_file_creations AS FLOAT), 
CAST(num_shells  AS FLOAT), 
CAST(num_access_files  AS FLOAT),
CAST(num_outbound_cmds  AS FLOAT), 
CAST(is_host_login  AS FLOAT),
CAST(is_guest_login  AS FLOAT), 
CAST(count AS FLOAT), 
CAST(srv_count AS FLOAT),
CAST(serror_rate AS FLOAT), 
CAST(srv_serror_rate AS FLOAT),
CAST(rerror_rate AS FLOAT), 
CAST(srv_rerror_rate AS FLOAT),
CAST(same_srv_rate AS FLOAT),
CAST(diff_srv_rate AS FLOAT), 
CAST(srv_diff_host_rate AS FLOAT),
CAST(dst_host_count AS FLOAT),
CAST(dst_host_srv_count AS FLOAT), 
CAST(dst_host_same_srv_rate AS FLOAT),
CAST(dst_host_diff_srv_rate AS FLOAT),
CAST(dst_host_same_src_port_rate AS FLOAT),
CAST(dst_host_srv_diff_host_rate AS FLOAT), 
CAST(dst_host_serror_rate AS FLOAT),
CAST(dst_host_srv_serror_rate AS FLOAT),
CAST(dst_host_rerror_rate AS FLOAT), 
CAST(dst_host_srv_rerror_rate AS FLOAT)

) as arr""", 
                                      "status")

### 5.2 Vectorization of streaming data

In [42]:
tovec_udf = funcs.udf(lambda r: Vectors.dense(r), VectorUDT())
data_stream = features_array.withColumn("features", tovec_udf("arr"))

# 5. Prediction Process

### 5.1 Prediction of Streaming Data

In [43]:
prediction = modelrf.transform(data_stream)

In [44]:
type(prediction)

pyspark.sql.dataframe.DataFrame

In [45]:
prediction.printSchema()

root
 |-- arr: array (nullable = false)
 |    |-- element: float (containsNull = true)
 |-- status: string (nullable = true)
 |-- features: vector (nullable = true)
 |-- rawPrediction: vector (nullable = true)
 |-- probability: vector (nullable = true)
 |-- prediction: double (nullable = false)



### 5.2 Adding Sliding Window Time using Current Timestamp

In [46]:
currentTimeDf = prediction.withColumn("processingTime",funcs.current_timestamp())

# 6. Start Streaming

### 6.1 Option 1 - Using Sliding Windows and Watermarking (Confusion Matrix)

In [50]:
confusion_matrix = currentTimeDf\
.withWatermark("processingTime", "5 seconds")\
.groupBy(funcs.window("processingTime", "3 seconds", "1 seconds"),"status", "prediction")\
.count()\
.withColumn("prediction",funcs.when(funcs.col("prediction").isin(1.0), "anomaly")\
.otherwise("normal"))\
.orderBy("window")

In [51]:
q = confusion_matrix.writeStream\
.outputMode("complete")\
.format("console")\
.option("truncate", "false")\
.start()

In [52]:
q.awaitTermination()

KeyboardInterrupt: 

### 6.2 Option 2 - Using Append method

In [74]:
prediction = prediction.select("features","status","prediction")

In [75]:
q = prediction.writeStream\
.outputMode("append")\
.format("console")\
.start()

In [None]:
q.awaitTermination()