In [2]:
import findspark
findspark.init()

In [3]:
import pyspark.sql.functions as funcs
from pyspark.ml.linalg import Vectors, VectorUDT
from pyspark.sql.types import *
from pyspark.sql import SparkSession

In [4]:
spark = SparkSession.builder\
.master("local[4]")\
.appName("ReadFromCsv")\
.config("spark.driver.memory","3g")\
.config("spark.executor.memory", "4g")\
.getOrCreate()

In [5]:
'''logger = spark.sparkContext._jvm.org.apache.log4j
logger.LogManager.getLogger("org"). setLevel(logger.Level.ERROR)
logger.LogManager.getLogger("akka").setLevel(logger.Level.ERROR)'''

'logger = spark.sparkContext._jvm.org.apache.log4j\nlogger.LogManager.getLogger("org"). setLevel(logger.Level.ERROR)\nlogger.LogManager.getLogger("akka").setLevel(logger.Level.ERROR)'

## Predict on Stream

## 1. Load Dataset

In [6]:
iris = spark.read \
.format("csv")\
.option("header", True)\
.option("sep", ",")\
.option("inferSchema", "True")\
.load("TrainDf.csv")

In [7]:
iris.printSchema()

root
 |-- duration: integer (nullable = true)
 |-- src_bytes: integer (nullable = true)
 |-- dst_bytes: integer (nullable = true)
 |-- land: integer (nullable = true)
 |-- wrong_fragment: integer (nullable = true)
 |-- urgent: integer (nullable = true)
 |-- hot: integer (nullable = true)
 |-- num_failed_logins: integer (nullable = true)
 |-- logged_in: integer (nullable = true)
 |-- num_compromised: integer (nullable = true)
 |-- root_shell: integer (nullable = true)
 |-- su_attempted: integer (nullable = true)
 |-- num_root: integer (nullable = true)
 |-- num_file_creations: integer (nullable = true)
 |-- num_shells: integer (nullable = true)
 |-- num_access_files: integer (nullable = true)
 |-- num_outbound_cmds: integer (nullable = true)
 |-- is_host_login: integer (nullable = true)
 |-- is_guest_login: integer (nullable = true)
 |-- count: integer (nullable = true)
 |-- srv_count: integer (nullable = true)
 |-- serror_rate: double (nullable = true)
 |-- srv_serror_rate: double (nul

## 2. Data Preparation 

In [8]:
from pyspark.ml.feature import VectorAssembler, StringIndexer
from pyspark.ml import Pipeline

In [9]:
feature_cols = iris.columns[:-1]

In [None]:
label_indexer = StringIndexer(inputCol = "status", outputCol = "label")

In [None]:
assembler = VectorAssembler(inputCols = feature_cols, outputCol = 'features')

In [None]:
pipe = Pipeline(stages=[assembler, label_indexer])
pipe_model = pipe.fit(iris)

In [None]:
data = pipe_model.transform(iris)
data = data.select("features","label")

In [None]:
train, test = data.randomSplit([0.70, 0.30])

## 3. Train Model

### 3.1 Decision Tree Algorithm

In [None]:
from pyspark.ml.evaluation import MulticlassClassificationEvaluator
from pyspark.ml.classification import DecisionTreeClassifier

In [None]:
dt = DecisionTreeClassifier(labelCol="label", featuresCol="features")
modeldt = dt.fit(train)
predictiondt = modeldt.transform(test)
predictiondt.toPandas().head()

Unnamed: 0,features,label,rawPrediction,probability,prediction
0,"(718.0, 1412.0, 25260.0, 0.0, 0.0, 0.0, 15.0, ...",1.0,"[7.0, 16.0]","[0.30434782608695654, 0.6956521739130435]",1.0
1,"(176.0, 1559.0, 2732.0, 0.0, 0.0, 0.0, 3.0, 0....",1.0,"[7.0, 16.0]","[0.30434782608695654, 0.6956521739130435]",1.0
2,"(454.0, 492.0, 14334.0, 0.0, 0.0, 0.0, 2.0, 0....",0.0,"[7.0, 16.0]","[0.30434782608695654, 0.6956521739130435]",1.0
3,"(4.0, 54540.0, 8314.0, 0.0, 0.0, 0.0, 2.0, 0.0...",1.0,"[9.0, 867.0]","[0.010273972602739725, 0.9897260273972602]",1.0
4,"(6.0, 54540.0, 8314.0, 0.0, 0.0, 0.0, 2.0, 0.0...",1.0,"[9.0, 867.0]","[0.010273972602739725, 0.9897260273972602]",1.0


#### Confusion Matrix of Decision Tree

In [None]:
predictiondt.select("prediction", "label")\
.groupBy("prediction", "label").count()\
.orderBy("prediction", "label", ascending=True).withColumn("status",
funcs.when(funcs.col("label").isin(1), "Anomaly")\
.otherwise("Normal")).toPandas().head()

In [None]:
predictiondt.groupBy(["label","prediction"]).count().toPandas().head()

In [19]:
evaluatordt = MulticlassClassificationEvaluator(
    labelCol="label", predictionCol="prediction", metricName="accuracy")
dt = evaluatordt.evaluate(predictiondt)

print("--- Decision Tree --- ")
print("Accuracy Rate =", round(dt,4))
print("  Error  Rate = %g " % round((1.0 - dt),4))

--- Decision Tree --- 
Accuracy Rate = 0.9902
  Error  Rate = 0.0098 


In [20]:
predictionAndLabel = predictiondt.select("prediction", "label").rdd

from pyspark.mllib.evaluation import MulticlassMetrics
metrics = MulticlassMetrics(predictionAndLabel)
cm = metrics.confusionMatrix()
rows = cm.toArray().tolist()

confusion_matrix = spark.createDataFrame(rows,["normal","anomaly"])
confusion_matrix.show()

+-------+-------+
| normal|anomaly|
+-------+-------+
|19950.0|  126.0|
|  244.0|17522.0|
+-------+-------+



In [21]:
predictiondt.withColumn("A", funcs.struct("prediction","label")).crosstab("prediction","label").show()

+----------------+-----+-----+
|prediction_label|  0.0|  1.0|
+----------------+-----+-----+
|             1.0|  126|17522|
|             0.0|19950|  244|
+----------------+-----+-----+



### 3.2 Random Forest Algorithm

In [22]:
from pyspark.ml.classification import RandomForestClassifier

In [23]:
rf = RandomForestClassifier(labelCol="label", featuresCol="features", numTrees=10)
modelrf = rf.fit(train)
predictionrf = modelrf.transform(test)
predictionrf.toPandas().head(3)

Unnamed: 0,features,label,rawPrediction,probability,prediction
0,"(4675.0, 459.0, 81673.0, 0.0, 0.0, 0.0, 2.0, 1...",0.0,"[8.767788859442819, 1.2322111405571827]","[0.8767788859442817, 0.12322111405571826]",0.0
1,"(179.0, 1559.0, 2855.0, 0.0, 0.0, 0.0, 3.0, 0....",1.0,"[7.754950224124448, 2.245049775875551]","[0.7754950224124448, 0.2245049775875551]",0.0
2,"(49.0, 2402.0, 3939.0, 0.0, 0.0, 0.0, 4.0, 0.0...",1.0,"[6.960469575464311, 3.03953042453569]","[0.6960469575464311, 0.30395304245356897]",0.0


In [24]:
evaluatorrf = MulticlassClassificationEvaluator(
    labelCol="label", predictionCol="prediction", metricName="accuracy")
rf = evaluatorrf.evaluate(predictionrf)

print("--- Random Forest Tree --- ")
print("Accuracy Rate =", round(rf,4))
print("  Error  Rate = %g " % round((1.0 - rf),4))

--- Random Forest Tree --- 
Accuracy Rate = 0.9747
  Error  Rate = 0.0253 


In [25]:
predictionAndLabel = predictionrf.select("prediction", "label").rdd

from pyspark.mllib.evaluation import MulticlassMetrics
metrics = MulticlassMetrics(predictionAndLabel)
cm = metrics.confusionMatrix()
rows = cm.toArray().tolist()

confusion_matrix = spark.createDataFrame(rows,["normal","anomaly"])
confusion_matrix.show()

+-------+-------+
| normal|anomaly|
+-------+-------+
|20018.0|   58.0|
|  898.0|16868.0|
+-------+-------+



In [26]:
predictionAndLabels = predictionrf.select("prediction", "label").rdd
metrics = MulticlassMetrics(predictionAndLabels)
metrics.confusionMatrix()

DenseMatrix(2, 2, [20018.0, 898.0, 58.0, 16868.0], 0)

In [27]:
predictionrf.withColumn("A", funcs.struct("prediction","label")).crosstab("prediction","label").show()

+----------------+-----+-----+
|prediction_label|  0.0|  1.0|
+----------------+-----+-----+
|             1.0|   58|16868|
|             0.0|20018|  898|
+----------------+-----+-----+



In [48]:
predictionrfevaluator = MulticlassClassificationEvaluator(predictionCol="prediction", \
                    labelCol="label",metricName="accuracy")
predictionrfevaluator.evaluate(predictionrf)      

predictionrf.groupBy("label","prediction").count().show()

+-----+----------+-----+
|label|prediction|count|
+-----+----------+-----+
|  1.0|       1.0|16868|
|  0.0|       1.0|   58|
|  1.0|       0.0|  898|
|  0.0|       0.0|20018|
+-----+----------+-----+



### 3.3 Naive Bayes Algorithm

In [49]:
from pyspark.ml.classification import NaiveBayes
from pyspark.ml.evaluation import MulticlassClassificationEvaluator

In [50]:
nb = NaiveBayes(smoothing=1.0, modelType="multinomial")

In [51]:
modelnb = nb.fit(train)
predictionnb = modelnb.transform(test)
predictionnb.toPandas().head(3)

Unnamed: 0,features,label,rawPrediction,probability,prediction
0,"(4675.0, 459.0, 81673.0, 0.0, 0.0, 0.0, 2.0, 1...",0.0,"[-138859.03499841096, -103422.87369832343]","[0.0, 1.0]",1.0
1,"(179.0, 1559.0, 2855.0, 0.0, 0.0, 0.0, 3.0, 0....",1.0,"[-5546.76759700956, -4685.807149256234]","[0.0, 1.0]",1.0
2,"(49.0, 2402.0, 3939.0, 0.0, 0.0, 0.0, 4.0, 0.0...",1.0,"[-6716.193222195983, -5349.711133564788]","[0.0, 1.0]",1.0


In [52]:
evaluatornb = MulticlassClassificationEvaluator(labelCol="label", 
                                              predictionCol="prediction", 
                                              metricName="accuracy")
nb = evaluatornb.evaluate(predictionnb)

print("--- Naive Bayes --- ")
print("Accuracy Rate =", round(nb,4))
print("  Error  Rate = %g " % round((1.0 - nb),4))

--- Naive Bayes --- 
Accuracy Rate = 0.4152
  Error  Rate = 0.5848 


### 3.4 Gradient Boost Tree

In [53]:
from pyspark.ml.classification import GBTClassifier

In [54]:
gbt = GBTClassifier(labelCol="label", featuresCol="features", maxIter=10)

In [55]:
modelgbt = gbt.fit(train)
predictiongbt = modelgbt.transform(test)
predictiongbt.toPandas().head(3)

Unnamed: 0,features,label,rawPrediction,probability,prediction
0,"(4675.0, 459.0, 81673.0, 0.0, 0.0, 0.0, 2.0, 1...",0.0,"[0.8857438341786618, -0.8857438341786618]","[0.8546425796028204, 0.14535742039717958]",0.0
1,"(179.0, 1559.0, 2855.0, 0.0, 0.0, 0.0, 3.0, 0....",1.0,"[-0.4320190484593675, 0.4320190484593675]","[0.2964963624924221, 0.7035036375075778]",1.0
2,"(49.0, 2402.0, 3939.0, 0.0, 0.0, 0.0, 4.0, 0.0...",1.0,"[-0.7025987245847151, 0.7025987245847151]","[0.1969926499230797, 0.8030073500769204]",1.0


In [56]:
evaluatorgbt = MulticlassClassificationEvaluator(
    labelCol="label", predictionCol="prediction", metricName="accuracy")
gbt = evaluatorgbt.evaluate(predictiongbt)

print("--- Gradient Boost Tree --- ")
print("Accuracy Rate =", round(gbt,4))
print("  Error  Rate = %g " % round((1.0 - gbt),4))

--- Gradient Boost Tree --- 
Accuracy Rate = 0.9906
  Error  Rate = 0.0094 


### 3.5 Logistic Regression

In [57]:
from pyspark.ml.classification import LogisticRegression

In [58]:
lr = LogisticRegression(regParam=0.01)
modellr = lr.fit(train)

In [61]:
predictionlr = modellr.transform(test)
predictionlr.toPandas().head(3)

Unnamed: 0,features,label,rawPrediction,probability,prediction
0,"(4675.0, 459.0, 81673.0, 0.0, 0.0, 0.0, 2.0, 1...",0.0,"[1.1948925592515842, -1.1948925592515842]","[0.7676149523516119, 0.23238504764838808]",0.0
1,"(179.0, 1559.0, 2855.0, 0.0, 0.0, 0.0, 3.0, 0....",1.0,"[0.0751404659195054, -0.0751404659195054]","[0.5187762829297493, 0.4812237170702507]",0.0
2,"(49.0, 2402.0, 3939.0, 0.0, 0.0, 0.0, 4.0, 0.0...",1.0,"[-4.980352689988514, 4.980352689988514]","[0.0068247414155798705, 0.99317525858442]",1.0


In [62]:
evaluatorlr = MulticlassClassificationEvaluator(
    labelCol="label", predictionCol="prediction", metricName="accuracy")
lr = evaluatorlr.evaluate(predictionlr)

In [63]:
print("--- Logistic Regression --- ")
print("Accuracy Rate =", round(lr,4))
print("  Error  Rate = %g " % round((1.0 - lr),4))

--- Logistic Regression --- 
Accuracy Rate = 0.9436
  Error  Rate = 0.0564 


## 3.6 Comparison of Accucary  Rate of Algorithms

In [64]:
print("Gradient Boost Tree Accuracy =", round(gbt,5))
print("      Decision Tree Accuracy =", round(dt,5))
print(" Random Forest Tree Accuracy =", round(rf,5))
print("Logistic Regression Accuracy =", round(lr,5))
print("        Naive Bayes Accuracy =", round(nb,5))

Gradient Boost Tree Accuracy = 0.99059
      Decision Tree Accuracy = 0.99022
 Random Forest Tree Accuracy = 0.97474
Logistic Regression Accuracy = 0.94363
        Naive Bayes Accuracy = 0.41517


# 4. Streaming Process

In [66]:
schema = StructType(
[
    StructField("duration", FloatType(), True),
    StructField("src_bytes", FloatType(), True),
    StructField("dst_bytes", FloatType(), True),
    StructField("land", FloatType(), True),
    StructField("wrong_fragment", FloatType(), True),
    StructField("urgent", FloatType(), True),
    StructField("hot", FloatType(), True),
    StructField("num_failed_logins", FloatType(), True),
    StructField("logged_in", FloatType(), True),
    StructField("num_compromised", FloatType(), True),
    StructField("root_shell", FloatType(), True),
    StructField("su_attempted", FloatType(), True),
    StructField("num_root", FloatType(), True),
    StructField("num_file_creations", FloatType(), True),
    StructField("num_shells", FloatType(), True),
    StructField("num_access_files", FloatType(), True),
    StructField("num_outbound_cmds", FloatType(), True),
    StructField("is_host_login", FloatType(), True),
    StructField("is_guest_login", FloatType(), True),
    StructField("count", FloatType(), True),
    StructField("srv_count", FloatType(), True),
    StructField("serror_rate", FloatType(), True),
    StructField("srv_serror_rate", FloatType(), True),
    StructField("rerror_rate", FloatType(), True),
    StructField("srv_rerror_rate", FloatType(), True),
    StructField("same_srv_rate", FloatType(), True),
    StructField("diff_srv_rate", FloatType(), True),
    StructField("srv_diff_host_rate", FloatType(), True),
    StructField("dst_host_count", FloatType(), True),
    StructField("dst_host_srv_count", FloatType(), True),
    StructField("dst_host_same_srv_rate", FloatType(), True),
    StructField("dst_host_diff_srv_rate", FloatType(), True),
    StructField("dst_host_same_src_port_rate", FloatType(), True),
    StructField("dst_host_srv_diff_host_rate", FloatType(), True),
    StructField("dst_host_serror_rate", FloatType(), True),
    StructField("dst_host_srv_serror_rate", FloatType(), True),
    StructField("dst_host_rerror_rate", FloatType(), True),
    StructField("dst_host_srv_rerror_rate", FloatType(), True),
    
    StructField("status", StringType(), True)
])

In [67]:
iris_data = spark.readStream \
.format("csv")\
.option("header", True)\
.option("sep", ",")\
.schema(schema)\
.load("data")\

In [68]:
iris.printSchema()

root
 |-- duration: integer (nullable = true)
 |-- src_bytes: integer (nullable = true)
 |-- dst_bytes: integer (nullable = true)
 |-- land: integer (nullable = true)
 |-- wrong_fragment: integer (nullable = true)
 |-- urgent: integer (nullable = true)
 |-- hot: integer (nullable = true)
 |-- num_failed_logins: integer (nullable = true)
 |-- logged_in: integer (nullable = true)
 |-- num_compromised: integer (nullable = true)
 |-- root_shell: integer (nullable = true)
 |-- su_attempted: integer (nullable = true)
 |-- num_root: integer (nullable = true)
 |-- num_file_creations: integer (nullable = true)
 |-- num_shells: integer (nullable = true)
 |-- num_access_files: integer (nullable = true)
 |-- num_outbound_cmds: integer (nullable = true)
 |-- is_host_login: integer (nullable = true)
 |-- is_guest_login: integer (nullable = true)
 |-- count: integer (nullable = true)
 |-- srv_count: integer (nullable = true)
 |-- serror_rate: double (nullable = true)
 |-- srv_serror_rate: double (nul

In [69]:
features_array = iris_data.selectExpr("""array(
CAST(duration AS FLOAT),
CAST(src_bytes AS FLOAT), 
CAST(dst_bytes AS FLOAT), 
CAST(land AS FLOAT),
CAST(wrong_fragment AS FLOAT), 
CAST(urgent AS FLOAT), 
CAST(hot AS FLOAT), 
CAST(num_failed_logins AS FLOAT), 
CAST(logged_in AS FLOAT),
CAST(num_compromised AS FLOAT), 
CAST(root_shell AS FLOAT),
CAST(su_attempted AS FLOAT), 
CAST(num_root AS FLOAT),
CAST(num_file_creations AS FLOAT), 
CAST(num_shells  AS FLOAT), 
CAST(num_access_files  AS FLOAT),
CAST(num_outbound_cmds  AS FLOAT), 
CAST(is_host_login  AS FLOAT),
CAST(is_guest_login  AS FLOAT), 
CAST(count AS FLOAT), 
CAST(srv_count AS FLOAT),
CAST(serror_rate AS FLOAT), 
CAST(srv_serror_rate AS FLOAT),
CAST(rerror_rate AS FLOAT), 
CAST(srv_rerror_rate AS FLOAT),
CAST(same_srv_rate AS FLOAT),
CAST(diff_srv_rate AS FLOAT), 
CAST(srv_diff_host_rate AS FLOAT),
CAST(dst_host_count AS FLOAT),
CAST(dst_host_srv_count AS FLOAT), 
CAST(dst_host_same_srv_rate AS FLOAT),
CAST(dst_host_diff_srv_rate AS FLOAT),
CAST(dst_host_same_src_port_rate AS FLOAT),
CAST(dst_host_srv_diff_host_rate AS FLOAT), 
CAST(dst_host_serror_rate AS FLOAT),
CAST(dst_host_srv_serror_rate AS FLOAT),
CAST(dst_host_rerror_rate AS FLOAT), 
CAST(dst_host_srv_rerror_rate AS FLOAT)

) as arr""", 
                                      "status")

In [70]:
tovec_udf = funcs.udf(lambda r: Vectors.dense(r), VectorUDT())

In [71]:
data_stream = features_array.withColumn("features", tovec_udf("arr"))

## 5. Prediction

### 5.1 Prediction of Streaming Data

In [100]:
prediction = modelrf.transform(data_stream)

In [101]:
type(prediction)

pyspark.sql.dataframe.DataFrame

In [102]:
prediction.printSchema()

root
 |-- arr: array (nullable = false)
 |    |-- element: float (containsNull = true)
 |-- status: string (nullable = true)
 |-- features: vector (nullable = true)
 |-- rawPrediction: vector (nullable = true)
 |-- probability: vector (nullable = true)
 |-- prediction: double (nullable = false)



### 5.2 Sliding Window Time

In [103]:
currentTimeDf = prediction.withColumn("processingTime",funcs.current_timestamp())

# 6. Start Streaming

### 6.1 Option 1 - Confusion Matrix

In [104]:
confusion_matrix = currentTimeDf.groupBy(funcs.window("processingTime", "3 seconds", "1 seconds"),"status", "prediction")\
.count()\
.withColumn("prediction",funcs.when(funcs.col("prediction").isin(1.0), "anomaly")\
.otherwise("normal"))\
.orderBy("window")

In [105]:
q = confusion_matrix.writeStream\
.outputMode("complete")\
.format("console")\
.option("truncate", "false")\
.start()

In [None]:
q.awaitTermination()

### 6.2 Option 2 - Append

In [74]:
prediction = prediction.select("features","status","prediction")

In [75]:
q = prediction.writeStream\
.outputMode("append")\
.format("console")\
.start()

In [None]:
q.awaitTermination()