In [2]:
import findspark
findspark.init()

In [3]:
import pyspark.sql.functions as funcs
from pyspark.ml.linalg import Vectors, VectorUDT
from pyspark.sql.types import *
from pyspark.sql import SparkSession

In [4]:
spark = SparkSession.builder\
.master("local[4]")\
.appName("ReadFromCsv")\
.config("spark.driver.memory","3g")\
.config("spark.executor.memory", "4g")\
.getOrCreate()

In [5]:
'''logger = spark.sparkContext._jvm.org.apache.log4j
logger.LogManager.getLogger("org"). setLevel(logger.Level.ERROR)
logger.LogManager.getLogger("akka").setLevel(logger.Level.ERROR)'''

'logger = spark.sparkContext._jvm.org.apache.log4j\nlogger.LogManager.getLogger("org"). setLevel(logger.Level.ERROR)\nlogger.LogManager.getLogger("akka").setLevel(logger.Level.ERROR)'

## Predict on Stream

## 1. Load Dataset

In [6]:
schema = StructType(
[
    StructField("duration", FloatType(), True),
    StructField("src_bytes", FloatType(), True),
    StructField("dst_bytes", FloatType(), True),
    StructField("land", FloatType(), True),
    StructField("wrong_fragment", FloatType(), True),
    StructField("urgent", FloatType(), True),
    StructField("hot", FloatType(), True),
    StructField("num_failed_logins", FloatType(), True),
    StructField("logged_in", FloatType(), True),
    StructField("num_compromised", FloatType(), True),
    StructField("root_shell", FloatType(), True),
    StructField("su_attempted", FloatType(), True),
    StructField("num_root", FloatType(), True),
    StructField("num_file_creations", FloatType(), True),
    StructField("num_shells", FloatType(), True),
    StructField("num_access_files", FloatType(), True),
    StructField("num_outbound_cmds", FloatType(), True),
    StructField("is_host_login", FloatType(), True),
    StructField("is_guest_login", FloatType(), True),
    StructField("count", FloatType(), True),
    StructField("srv_count", FloatType(), True),
    StructField("serror_rate", FloatType(), True),
    StructField("srv_serror_rate", FloatType(), True),
    StructField("rerror_rate", FloatType(), True),
    StructField("srv_rerror_rate", FloatType(), True),
    StructField("same_srv_rate", FloatType(), True),
    StructField("diff_srv_rate", FloatType(), True),
    StructField("srv_diff_host_rate", FloatType(), True),
    StructField("dst_host_count", FloatType(), True),
    StructField("dst_host_srv_count", FloatType(), True),
    StructField("dst_host_same_srv_rate", FloatType(), True),
    StructField("dst_host_diff_srv_rate", FloatType(), True),
    StructField("dst_host_same_src_port_rate", FloatType(), True),
    StructField("dst_host_srv_diff_host_rate", FloatType(), True),
    StructField("dst_host_serror_rate", FloatType(), True),
    StructField("dst_host_srv_serror_rate", FloatType(), True),
    StructField("dst_host_rerror_rate", FloatType(), True),
    StructField("dst_host_srv_rerror_rate", FloatType(), True),
    
    StructField("status", StringType(), True)
])

In [7]:
iris = spark.read \
.format("csv")\
.option("header", True)\
.option("sep", ",")\
.option("inferSchema", "True")\
.load("TrainDf.csv")

In [8]:
iris.printSchema()

root
 |-- duration: integer (nullable = true)
 |-- src_bytes: integer (nullable = true)
 |-- dst_bytes: integer (nullable = true)
 |-- land: integer (nullable = true)
 |-- wrong_fragment: integer (nullable = true)
 |-- urgent: integer (nullable = true)
 |-- hot: integer (nullable = true)
 |-- num_failed_logins: integer (nullable = true)
 |-- logged_in: integer (nullable = true)
 |-- num_compromised: integer (nullable = true)
 |-- root_shell: integer (nullable = true)
 |-- su_attempted: integer (nullable = true)
 |-- num_root: integer (nullable = true)
 |-- num_file_creations: integer (nullable = true)
 |-- num_shells: integer (nullable = true)
 |-- num_access_files: integer (nullable = true)
 |-- num_outbound_cmds: integer (nullable = true)
 |-- is_host_login: integer (nullable = true)
 |-- is_guest_login: integer (nullable = true)
 |-- count: integer (nullable = true)
 |-- srv_count: integer (nullable = true)
 |-- serror_rate: double (nullable = true)
 |-- srv_serror_rate: double (nul

## 2. Data Preparation 

In [9]:
from pyspark.ml.feature import VectorAssembler, StringIndexer
from pyspark.ml import Pipeline

In [10]:
feature_cols = iris.columns[:-1]

In [11]:
label_indexer = StringIndexer(inputCol = "status", outputCol = "label")

In [12]:
assembler = VectorAssembler(inputCols = feature_cols, outputCol = 'features')

In [13]:
pipe = Pipeline(stages=[assembler, label_indexer])
pipe_model = pipe.fit(iris)

In [14]:
data = pipe_model.transform(iris)

In [15]:
data = data.select("features","label")

In [16]:
train, test = data.randomSplit([0.70, 0.30])

## 3. Train Model

### 3.1 Decision Tree Algorithm

In [17]:
from pyspark.ml.evaluation import MulticlassClassificationEvaluator
from pyspark.ml.classification import DecisionTreeClassifier

In [18]:
dt = DecisionTreeClassifier(labelCol="label", featuresCol="features")
modeldt = dt.fit(train)
predictiondt = modeldt.transform(test)
predictiondt.toPandas().head()

Unnamed: 0,features,label,rawPrediction,probability,prediction
0,"(4675.0, 459.0, 81673.0, 0.0, 0.0, 0.0, 2.0, 1...",0.0,"[643.0, 38.0]","[0.9441997063142438, 0.055800293685756244]",0.0
1,"(583.0, 848.0, 25323.0, 0.0, 0.0, 0.0, 1.0, 0....",0.0,"[643.0, 38.0]","[0.9441997063142438, 0.055800293685756244]",0.0
2,"(179.0, 1559.0, 2855.0, 0.0, 0.0, 0.0, 3.0, 0....",1.0,"[643.0, 38.0]","[0.9441997063142438, 0.055800293685756244]",0.0
3,"(176.0, 1559.0, 2732.0, 0.0, 0.0, 0.0, 3.0, 0....",1.0,"[643.0, 38.0]","[0.9441997063142438, 0.055800293685756244]",0.0
4,"(454.0, 492.0, 14334.0, 0.0, 0.0, 0.0, 2.0, 0....",0.0,"[643.0, 38.0]","[0.9441997063142438, 0.055800293685756244]",0.0


In [19]:
predictiondt.select("prediction", "label")\
.groupBy("prediction", "label").count()\
.orderBy("prediction", "label", ascending=True).withColumn("status",
funcs.when(funcs.col("label").isin(1), "Anomaly")\
.otherwise("Normal")).toPandas().head()

Unnamed: 0,prediction,label,count,status
0,0.0,0.0,20088,Normal
1,0.0,1.0,260,Anomaly
2,1.0,0.0,126,Normal
3,1.0,1.0,17550,Anomaly


In [20]:
predictiondt.groupBy(["label","prediction"]).count().toPandas().head()

Unnamed: 0,label,prediction,count
0,1.0,1.0,17550
1,0.0,1.0,126
2,1.0,0.0,260
3,0.0,0.0,20088


In [21]:
evaluatordt = MulticlassClassificationEvaluator(
    labelCol="label", predictionCol="prediction", metricName="accuracy")
dt = evaluatordt.evaluate(predictiondt)

print("--- Decision Tree --- ")
print("Accuracy Rate =", round(dt,4))
print("  Error  Rate = %g " % round((1.0 - dt),4))

--- Decision Tree --- 
Accuracy Rate = 0.9898
  Error  Rate = 0.0102 


In [22]:
predictionAndLabel = predictiondt.select("prediction", "label").rdd

from pyspark.mllib.evaluation import MulticlassMetrics
metrics = MulticlassMetrics(predictionAndLabel)
cm = metrics.confusionMatrix()
rows = cm.toArray().tolist()

confusion_matrix = spark.createDataFrame(rows,["normal","anomaly"])
confusion_matrix.show()

+-------+-------+
| normal|anomaly|
+-------+-------+
|20088.0|  126.0|
|  260.0|17550.0|
+-------+-------+



In [23]:
predictiondt.withColumn("A", funcs.struct("prediction","label")).crosstab("prediction","label").show()

+----------------+-----+-----+
|prediction_label|  0.0|  1.0|
+----------------+-----+-----+
|             1.0|  126|17550|
|             0.0|20088|  260|
+----------------+-----+-----+



### 3.2 Random Forest Algorithm

In [37]:
from pyspark.ml.classification import RandomForestClassifier

In [38]:
rf = RandomForestClassifier(labelCol="label", featuresCol="features", numTrees=10)
modelrf = rf.fit(train)
predictionrf = modelrf.transform(test)
predictionrf.toPandas().head(3)

Unnamed: 0,features,label,rawPrediction,probability,prediction
0,"(169.0, 1567.0, 2857.0, 0.0, 0.0, 0.0, 3.0, 0....",1.0,"[8.247522784475562, 1.7524772155244388]","[0.8247522784475562, 0.1752477215524439]",0.0
1,"(454.0, 492.0, 14334.0, 0.0, 0.0, 0.0, 2.0, 0....",0.0,"[8.131008127129164, 1.868991872870836]","[0.8131008127129163, 0.1868991872870836]",0.0
2,"(53.0, 2628.0, 3860.0, 0.0, 0.0, 0.0, 3.0, 0.0...",1.0,"[8.701079153714979, 1.298920846285021]","[0.8701079153714979, 0.1298920846285021]",0.0


In [39]:
evaluatorrf = MulticlassClassificationEvaluator(
    labelCol="label", predictionCol="prediction", metricName="accuracy")
rf = evaluatorrf.evaluate(predictionrf)

print("--- Random Forest Tree --- ")
print("Accuracy Rate =", round(rf,4))
print("  Error  Rate = %g " % round((1.0 - rf),4))

--- Random Forest Tree --- 
Accuracy Rate = 0.9838
  Error  Rate = 0.0162 


In [40]:
predictionAndLabel = predictionrf.select("prediction", "label").rdd

from pyspark.mllib.evaluation import MulticlassMetrics
metrics = MulticlassMetrics(predictionAndLabel)
cm = metrics.confusionMatrix()
rows = cm.toArray().tolist()

confusion_matrix = spark.createDataFrame(rows,["normal","anomaly"])
confusion_matrix.show()

+-------+-------+
| normal|anomaly|
+-------+-------+
|20253.0|   75.0|
|  540.0|16982.0|
+-------+-------+



In [41]:
predictionAndLabels = predictionrf.select("prediction", "label").rdd
metrics = MulticlassMetrics(predictionAndLabels)
metrics.confusionMatrix()

DenseMatrix(2, 2, [20253.0, 540.0, 75.0, 16982.0], 0)

In [42]:
predictionrf.withColumn("A", funcs.struct("prediction","label")).crosstab("prediction","label").show()

+----------------+-----+-----+
|prediction_label|  0.0|  1.0|
+----------------+-----+-----+
|             1.0|   75|16982|
|             0.0|20253|  540|
+----------------+-----+-----+



In [43]:
predictionrfevaluator = MulticlassClassificationEvaluator(predictionCol="prediction", \
                    labelCol="label",metricName="accuracy")
predictionrfevaluator.evaluate(predictionrf)      

predictionrf.groupBy("label","prediction").count().show()

+-----+----------+-----+
|label|prediction|count|
+-----+----------+-----+
|  1.0|       1.0|16982|
|  0.0|       1.0|   75|
|  1.0|       0.0|  540|
|  0.0|       0.0|20253|
+-----+----------+-----+



### 3.3 Naive Bayes Algorithm

In [44]:
from pyspark.ml.classification import NaiveBayes
from pyspark.ml.evaluation import MulticlassClassificationEvaluator

In [45]:
nb = NaiveBayes(smoothing=1.0, modelType="multinomial")

In [46]:
modelnb = nb.fit(train)
predictionnb = modelnb.transform(test)
predictionnb.toPandas().head(3)

Unnamed: 0,features,label,rawPrediction,probability,prediction
0,"(169.0, 1567.0, 2857.0, 0.0, 0.0, 0.0, 3.0, 0....",1.0,"[-5504.640635749174, -4206.538937024775]","[0.0, 1.0]",1.0
1,"(454.0, 492.0, 14334.0, 0.0, 0.0, 0.0, 2.0, 0....",0.0,"[-22938.437987205856, -12823.5907556801]","[0.0, 1.0]",1.0
2,"(53.0, 2628.0, 3860.0, 0.0, 0.0, 0.0, 3.0, 0.0...",1.0,"[-6676.187862107826, -4961.939037685276]","[0.0, 1.0]",1.0


In [47]:
evaluatornb = MulticlassClassificationEvaluator(labelCol="label", 
                                              predictionCol="prediction", 
                                              metricName="accuracy")
nb = evaluatornb.evaluate(predictionnb)

print("--- Naive Bayes --- ")
print("Accuracy Rate =", round(nb,4))
print("  Error  Rate = %g " % round((1.0 - nb),4))

--- Naive Bayes --- 
Accuracy Rate = 0.3784
  Error  Rate = 0.6216 


### 3.4 Gradient Boost Tree

In [48]:
from pyspark.ml.classification import GBTClassifier

In [49]:
gbt = GBTClassifier(labelCol="label", featuresCol="features", maxIter=10)

In [50]:
modelgbt = gbt.fit(train)
predictiongbt = modelgbt.transform(test)
predictiongbt.toPandas().head(3)

Unnamed: 0,features,label,rawPrediction,probability,prediction
0,"(169.0, 1567.0, 2857.0, 0.0, 0.0, 0.0, 3.0, 0....",1.0,"[-0.1847023672247639, 0.1847023672247639]","[0.40868486653747305, 0.591315133462527]",1.0
1,"(454.0, 492.0, 14334.0, 0.0, 0.0, 0.0, 2.0, 0....",0.0,"[0.12861703597366536, -0.12861703597366536]","[0.5639562442780877, 0.43604375572191234]",0.0
2,"(53.0, 2628.0, 3860.0, 0.0, 0.0, 0.0, 3.0, 0.0...",1.0,"[-0.18583957620995079, 0.18583957620995079]","[0.4081353415406574, 0.5918646584593426]",1.0


In [51]:
evaluatorgbt = MulticlassClassificationEvaluator(
    labelCol="label", predictionCol="prediction", metricName="accuracy")
gbt = evaluatorgbt.evaluate(predictiongbt)

print("--- Gradient Boost Tree --- ")
print("Accuracy Rate =", round(gbt,4))
print("  Error  Rate = %g " % round((1.0 - gbt),4))

--- Gradient Boost Tree --- 
Accuracy Rate = 0.9915
  Error  Rate = 0.0085 


## -->  Comparison of Accucary  Rate of Algorithms

In [52]:
print("      Decision Tree Accuracy =", round(dt,5))
print(" Random Forest Tree Accuracy =", round(rf,5))
print("Gradient Boost Tree Accuracy =", round(gbt,5))
print("        Naive Bayes Accuracy =", round(nb,5))

      Decision Tree Accuracy = 0.98806
 Random Forest Tree Accuracy = 0.98375
Gradient Boost Tree Accuracy = 0.99152
        Naive Bayes Accuracy = 0.37836


# 4. Streaming

In [24]:
iris_data = spark.readStream \
.format("csv")\
.option("header", True)\
.option("sep", ",")\
.schema(schema)\
.load("data")\

In [25]:
iris.printSchema()

root
 |-- duration: integer (nullable = true)
 |-- src_bytes: integer (nullable = true)
 |-- dst_bytes: integer (nullable = true)
 |-- land: integer (nullable = true)
 |-- wrong_fragment: integer (nullable = true)
 |-- urgent: integer (nullable = true)
 |-- hot: integer (nullable = true)
 |-- num_failed_logins: integer (nullable = true)
 |-- logged_in: integer (nullable = true)
 |-- num_compromised: integer (nullable = true)
 |-- root_shell: integer (nullable = true)
 |-- su_attempted: integer (nullable = true)
 |-- num_root: integer (nullable = true)
 |-- num_file_creations: integer (nullable = true)
 |-- num_shells: integer (nullable = true)
 |-- num_access_files: integer (nullable = true)
 |-- num_outbound_cmds: integer (nullable = true)
 |-- is_host_login: integer (nullable = true)
 |-- is_guest_login: integer (nullable = true)
 |-- count: integer (nullable = true)
 |-- srv_count: integer (nullable = true)
 |-- serror_rate: double (nullable = true)
 |-- srv_serror_rate: double (nul

In [26]:
features_array = iris_data.selectExpr("""array(
CAST(duration AS FLOAT),
CAST(src_bytes AS FLOAT), 
CAST(dst_bytes AS FLOAT), 
CAST(land AS FLOAT),
CAST(wrong_fragment AS FLOAT), 
CAST(urgent AS FLOAT), 
CAST(hot AS FLOAT), 
CAST(num_failed_logins AS FLOAT), 
CAST(logged_in AS FLOAT),
CAST(num_compromised AS FLOAT), 
CAST(root_shell AS FLOAT),
CAST(su_attempted AS FLOAT), 
CAST(num_root AS FLOAT),
CAST(num_file_creations AS FLOAT), 
CAST(num_shells  AS FLOAT), 
CAST(num_access_files  AS FLOAT),
CAST(num_outbound_cmds  AS FLOAT), 
CAST(is_host_login  AS FLOAT),
CAST(is_guest_login  AS FLOAT), 
CAST(count AS FLOAT), 
CAST(srv_count AS FLOAT),
CAST(serror_rate AS FLOAT), 
CAST(srv_serror_rate AS FLOAT),
CAST(rerror_rate AS FLOAT), 
CAST(srv_rerror_rate AS FLOAT),
CAST(same_srv_rate AS FLOAT),
CAST(diff_srv_rate AS FLOAT), 
CAST(srv_diff_host_rate AS FLOAT),
CAST(dst_host_count AS FLOAT),
CAST(dst_host_srv_count AS FLOAT), 
CAST(dst_host_same_srv_rate AS FLOAT),
CAST(dst_host_diff_srv_rate AS FLOAT),
CAST(dst_host_same_src_port_rate AS FLOAT),
CAST(dst_host_srv_diff_host_rate AS FLOAT), 
CAST(dst_host_serror_rate AS FLOAT),
CAST(dst_host_srv_serror_rate AS FLOAT),
CAST(dst_host_rerror_rate AS FLOAT), 
CAST(dst_host_srv_rerror_rate AS FLOAT)

) as arr""", 
                                      "status")

In [27]:
tovec_udf = funcs.udf(lambda r: Vectors.dense(r), VectorUDT())

In [28]:
data_stream = features_array.withColumn("features", tovec_udf("arr"))

## 5. Prediction

In [37]:
prediction = modeldt.transform(data_stream)

In [38]:
type(prediction)

pyspark.sql.dataframe.DataFrame

In [39]:
prediction.printSchema()

root
 |-- arr: array (nullable = false)
 |    |-- element: float (containsNull = true)
 |-- status: string (nullable = true)
 |-- features: vector (nullable = true)
 |-- rawPrediction: vector (nullable = true)
 |-- probability: vector (nullable = true)
 |-- prediction: double (nullable = false)



### 5.1 Option 1 - Confusion Matrix

In [40]:
confusion_matrix = prediction.groupBy("status","prediction").count()\
.withColumn("prediction",funcs.when(funcs.col("prediction").isin(1.0), "anomaly")\
.otherwise("normal"))
confusion_matrix = confusion_matrix.select("status", "prediction","count")

In [41]:
q = confusion_matrix.writeStream\
.outputMode("complete")\
.format("console")\
.start()

In [None]:
q.awaitTermination()

### 5.2 Option 2 - Append

In [74]:
prediction = prediction.select("features","status","prediction")

In [75]:
q = prediction.writeStream\
.outputMode("append")\
.format("console")\
.start()

In [None]:
q.awaitTermination()