### Importing Packages

In [1]:
import findspark
findspark.init()

In [2]:
import pyspark.sql.functions as funcs
from pyspark.ml.linalg import Vectors, VectorUDT
from pyspark.sql.types import *
from pyspark.sql import SparkSession

In [3]:
spark = SparkSession.builder\
.master("local[4]")\
.appName("ReadFromCsv")\
.config("spark.driver.memory","3g")\
.config("spark.executor.memory", "4g")\
.getOrCreate()

In [4]:
'''logger = spark.sparkContext._jvm.org.apache.log4j
logger.LogManager.getLogger("org"). setLevel(logger.Level.ERROR)
logger.LogManager.getLogger("akka").setLevel(logger.Level.ERROR)'''

'logger = spark.sparkContext._jvm.org.apache.log4j\nlogger.LogManager.getLogger("org"). setLevel(logger.Level.ERROR)\nlogger.LogManager.getLogger("akka").setLevel(logger.Level.ERROR)'

## Predict on Stream

# 1. Load Dataset

In [5]:
schema = StructType(
[
    StructField("duration", FloatType(), True),
    StructField("src_bytes", FloatType(), True),
    StructField("dst_bytes", FloatType(), True),
    StructField("land", FloatType(), True),
    StructField("wrong_fragment", FloatType(), True),
    StructField("urgent", FloatType(), True),
    StructField("hot", FloatType(), True),
    StructField("num_failed_logins", FloatType(), True),
    StructField("logged_in", FloatType(), True),
    StructField("num_compromised", FloatType(), True),
    StructField("root_shell", FloatType(), True),
    StructField("su_attempted", FloatType(), True),
    StructField("num_root", FloatType(), True),
    StructField("num_file_creations", FloatType(), True),
    StructField("num_shells", FloatType(), True),
    StructField("num_access_files", FloatType(), True),
    StructField("num_outbound_cmds", FloatType(), True),
    StructField("is_host_login", FloatType(), True),
    StructField("is_guest_login", FloatType(), True),
    StructField("count", FloatType(), True),
    StructField("srv_count", FloatType(), True),
    StructField("serror_rate", FloatType(), True),
    StructField("srv_serror_rate", FloatType(), True),
    StructField("rerror_rate", FloatType(), True),
    StructField("srv_rerror_rate", FloatType(), True),
    StructField("same_srv_rate", FloatType(), True),
    StructField("diff_srv_rate", FloatType(), True),
    StructField("srv_diff_host_rate", FloatType(), True),
    StructField("dst_host_count", FloatType(), True),
    StructField("dst_host_srv_count", FloatType(), True),
    StructField("dst_host_same_srv_rate", FloatType(), True),
    StructField("dst_host_diff_srv_rate", FloatType(), True),
    StructField("dst_host_same_src_port_rate", FloatType(), True),
    StructField("dst_host_srv_diff_host_rate", FloatType(), True),
    StructField("dst_host_serror_rate", FloatType(), True),
    StructField("dst_host_srv_serror_rate", FloatType(), True),
    StructField("dst_host_rerror_rate", FloatType(), True),
    StructField("dst_host_srv_rerror_rate", FloatType(), True),
    
    StructField("status", StringType(), True)
])

In [6]:
iris = spark.read \
.format("csv")\
.option("header", True)\
.option("sep", ",")\
.option("inferSchema", "True")\
.load("TrainDf.csv")

In [7]:
iris.printSchema()

root
 |-- duration: integer (nullable = true)
 |-- src_bytes: integer (nullable = true)
 |-- dst_bytes: integer (nullable = true)
 |-- land: integer (nullable = true)
 |-- wrong_fragment: integer (nullable = true)
 |-- urgent: integer (nullable = true)
 |-- hot: integer (nullable = true)
 |-- num_failed_logins: integer (nullable = true)
 |-- logged_in: integer (nullable = true)
 |-- num_compromised: integer (nullable = true)
 |-- root_shell: integer (nullable = true)
 |-- su_attempted: integer (nullable = true)
 |-- num_root: integer (nullable = true)
 |-- num_file_creations: integer (nullable = true)
 |-- num_shells: integer (nullable = true)
 |-- num_access_files: integer (nullable = true)
 |-- num_outbound_cmds: integer (nullable = true)
 |-- is_host_login: integer (nullable = true)
 |-- is_guest_login: integer (nullable = true)
 |-- count: integer (nullable = true)
 |-- srv_count: integer (nullable = true)
 |-- serror_rate: double (nullable = true)
 |-- srv_serror_rate: double (nul

# 2. Data Preparation for Training

In [8]:
from pyspark.ml.feature import VectorAssembler, StringIndexer
from pyspark.ml import Pipeline

In [9]:
#train, test = data.randomSplit([0.70, 0.30])

In [10]:
label_indexer = StringIndexer(inputCol = "status", outputCol = "label")

label_indexer_model = label_indexer.fit(iris)
new_df = label_indexer_model.transform(iris)

In [11]:
feature_cols = iris.columns[:-1]

In [12]:
assembler = VectorAssembler(inputCols = feature_cols, outputCol = 'features')
assembler_df = assembler.transform(new_df)

In [13]:
normal = assembler_df.where(funcs.col("status") == "normal")

# 3. Train Model (K-Means Clustering)

### 3.1 Training of Data

In [14]:
from pyspark.ml.clustering import KMeans

In [15]:
kmeans = KMeans(featuresCol='features',k=5, predictionCol="cluster",maxIter=100)

In [16]:
model = kmeans.fit(normal)

In [17]:
wssse_k3 = model.computeCost(normal)

In [18]:
predictions = model.transform(assembler_df)

In [19]:
predictions = predictions.select("features","label","cluster")

In [20]:
#predictions.toPandas().head()

### 3.2 Analysing of Trained model

In [21]:
predictions.select("cluster", "label").groupBy("cluster", "label").count()\
.orderBy("cluster", "label", ascending=True).toPandas().head(15)

Unnamed: 0,cluster,label,count
0,0,0.0,67102
1,0,1.0,58559
2,1,0.0,1
3,1,1.0,7
4,2,0.0,7
5,2,1.0,61
6,3,0.0,6
7,3,1.0,2
8,4,0.0,227
9,4,1.0,1


In [22]:
predictions.select("cluster", "label")\
.groupBy("cluster", "label").count()\
.orderBy("cluster", "label", ascending=True).withColumn("status",
funcs.when(funcs.col("label").isin(1), "Anomaly")\
.otherwise("Normal")).toPandas().head()

Unnamed: 0,cluster,label,count,status
0,0,0.0,67102,Normal
1,0,1.0,58559,Anomaly
2,1,0.0,1,Normal
3,1,1.0,7,Anomaly
4,2,0.0,7,Normal


### 3.3 Calculation of centroids

In [26]:
train_clusters = model.clusterCenters()

traind_clusters = {int(i):[float(train_clusters[i][j]) for j in range(len(train_clusters[i]))] 
              for i in range(len(train_clusters))}
train_clusters

[array([1.68645286e+02, 1.94539619e+03, 4.11033179e+03, 1.04318798e-04,
        0.00000000e+00, 1.49026855e-04, 2.31483413e-01, 1.38594975e-03,
        7.10500432e-01, 5.08896903e-01, 2.04166791e-03, 2.05657059e-03,
        5.64945903e-01, 2.23540282e-02, 6.11010104e-04, 7.52585616e-03,
        0.00000000e+00, 1.49026855e-05, 1.30100444e-02, 2.25925457e+01,
        2.77788889e+01, 1.34171858e-02, 1.20522488e-02, 4.43547137e-02,
        4.47894251e-02, 9.69341599e-01, 2.87268636e-02, 1.26571041e-01,
        1.47523427e+02, 1.90758383e+02, 8.13592143e-01, 3.99375577e-02,
        1.21030074e-01, 2.60054842e-02, 1.39331167e-02, 6.11770141e-03,
        4.67299037e-02, 4.48532085e-02]),
 array([5.158000e+03, 8.958152e+07, 7.028652e+06, 0.000000e+00,
        0.000000e+00, 0.000000e+00, 0.000000e+00, 0.000000e+00,
        1.000000e+00, 0.000000e+00, 0.000000e+00, 0.000000e+00,
        0.000000e+00, 0.000000e+00, 0.000000e+00, 0.000000e+00,
        0.000000e+00, 0.000000e+00, 0.000000e+00, 1.00

In [27]:
train_df_centers = spark.sparkContext.parallelize([(k,)+(v,) for k,v in traind_clusters.items()]).toDF(['prediction','center'])
train_df_centers.toPandas().head()

Unnamed: 0,prediction,center
0,0,"[168.64528628058775, 1945.3961878930584, 4110...."
1,1,"[5158.0, 89581520.0, 7028652.0, 0.0, 0.0, 0.0,..."
2,2,"[2519.0, 7513037.142857143, 560605.1428571428,..."
3,3,"[745.8333333333333, 18749591.5, 491326.6666666..."
4,4,"[41.757709251101325, 2199225.2246696036, 8201...."


In [28]:
train_pred_df = predictions.withColumn('prediction',funcs.col('cluster').cast(IntegerType()))
train_pred_df.toPandas().head()

Unnamed: 0,features,label,cluster,prediction
0,"(0.0, 491.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0...",0.0,0,0
1,"(0.0, 146.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0...",0.0,0,0
2,"(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...",1.0,0,0
3,"(0.0, 232.0, 8153.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...",0.0,0,0
4,"(0.0, 199.0, 420.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1...",0.0,0,0


In [29]:
train_pred_df = train_pred_df.join(train_df_centers,on='prediction',how='left')
train_pred_df.toPandas().head()

Unnamed: 0,prediction,features,label,cluster,center
0,0,"(0.0, 491.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0...",0.0,0,"[168.64528628058775, 1945.3961878930584, 4110...."
1,0,"(0.0, 146.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0...",0.0,0,"[168.64528628058775, 1945.3961878930584, 4110...."
2,0,"(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...",1.0,0,"[168.64528628058775, 1945.3961878930584, 4110...."
3,0,"(0.0, 232.0, 8153.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...",0.0,0,"[168.64528628058775, 1945.3961878930584, 4110...."
4,0,"(0.0, 199.0, 420.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1...",0.0,0,"[168.64528628058775, 1945.3961878930584, 4110...."


### 3.4 Finding Anomaly Values

In [31]:
get_dist = funcs.udf(lambda features, center : 
                 float(features.squared_distance(center)),FloatType())

In [32]:
train_pred_df = train_pred_df.withColumn('dist',get_dist(funcs.col('features'),funcs.col('center')))
train_pred_df.toPandas().sort_values(by="dist",ascending=False).head(20)

Unnamed: 0,prediction,features,label,cluster,center,dist
125665,1,"(10999.0, 0.0, 1309937401.0, 0.0, 0.0, 0.0, 0....",1.0,1,"[5158.0, 89581520.0, 7028652.0, 0.0, 0.0, 0.0,...",1.705596e+18
125668,1,"(36071.0, 1379963888.0, 0.0, 0.0, 0.0, 0.0, 0....",1.0,1,"[5158.0, 89581520.0, 7028652.0, 0.0, 0.0, 0.0,...",1.665136e+18
125663,1,"(34578.0, 1167519497.0, 0.0, 0.0, 0.0, 0.0, 0....",1.0,1,"[5158.0, 89581520.0, 7028652.0, 0.0, 0.0, 0.0,...",1.162e+18
125664,1,"(2.0, 693375640.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0....",1.0,1,"[5158.0, 89581520.0, 7028652.0, 0.0, 0.0, 0.0,...",3.646167e+17
125662,1,"(38259.0, 621568663.0, 0.0, 0.0, 0.0, 0.0, 0.0...",1.0,1,"[5158.0, 89581520.0, 7028652.0, 0.0, 0.0, 0.0,...",2.830597e+17
125727,2,"(39869.0, 0.0, 400291060.0, 0.0, 0.0, 0.0, 0.0...",1.0,2,"[2519.0, 7513037.142857143, 560605.1428571428,...",1.598409e+17
125713,2,"(39930.0, 0.0, 400291060.0, 0.0, 0.0, 0.0, 0.0...",1.0,2,"[2519.0, 7513037.142857143, 560605.1428571428,...",1.598409e+17
125661,1,"(35682.0, 381709090.0, 0.0, 0.0, 0.0, 0.0, 0.0...",1.0,1,"[5158.0, 89581520.0, 7028652.0, 0.0, 0.0, 0.0,...",8.538792e+16
125666,1,"(35301.0, 217277339.0, 0.0, 0.0, 0.0, 0.0, 0.0...",1.0,1,"[5158.0, 89581520.0, 7028652.0, 0.0, 0.0, 0.0,...",1.635563e+16
125670,3,"(103.0, 24418776.0, 118160.0, 0.0, 0.0, 0.0, 0...",0.0,3,"[745.8333333333333, 18749591.5, 491326.6666666...",32278910000000.0


In [33]:
x = train_pred_df.filter(funcs.col("label") == 0.0)
x.toPandas().head()

Unnamed: 0,prediction,features,label,cluster,center,dist
0,0,"(0.0, 491.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0...",0.0,0,"[168.64528628058775, 1945.3961878930584, 4110....",19067110.0
1,0,"(0.0, 146.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0...",0.0,0,"[168.64528628058775, 1945.3961878930584, 4110....",20209468.0
2,0,"(0.0, 232.0, 8153.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...",0.0,0,"[168.64528628058775, 1945.3961878930584, 4110....",19326102.0
3,0,"(0.0, 199.0, 420.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1...",0.0,0,"[168.64528628058775, 1945.3961878930584, 4110....",16712641.0
4,0,"(0.0, 287.0, 2251.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...",0.0,0,"[168.64528628058775, 1945.3961878930584, 4110....",6256914.5


In [34]:
averageDistance = train_pred_df.filter(funcs.col("label") == 0.0).groupBy("cluster")\
.agg(funcs.avg("dist").alias("avgDist"))
averageDistance.toPandas().head()

Unnamed: 0,cluster,avgDist
0,1,0.0
1,3,14408770000000.0
2,4,50520220000.0
3,2,3787586000000.0
4,0,3996437000.0


In [35]:
maxDistance = train_pred_df.filter(funcs.col("label") == 0.0).groupBy("cluster")\
.agg(funcs.max("dist").alias("maxDist"))
maxDistance.toPandas().head()

Unnamed: 0,cluster,maxDist
0,1,0.0
1,3,32278910000000.0
2,4,4658019000000.0
3,2,15101420000000.0
4,0,26318030000000.0


In [36]:
train_pred_df.toPandas().head(5)

Unnamed: 0,prediction,features,label,cluster,center,dist
0,0,"(0.0, 491.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0...",0.0,0,"[168.64528628058775, 1945.3961878930584, 4110....",19067110.0
1,0,"(0.0, 146.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0...",0.0,0,"[168.64528628058775, 1945.3961878930584, 4110....",20209468.0
2,0,"(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...",1.0,0,"[168.64528628058775, 1945.3961878930584, 4110....",20757094.0
3,0,"(0.0, 232.0, 8153.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...",0.0,0,"[168.64528628058775, 1945.3961878930584, 4110....",19326102.0
4,0,"(0.0, 199.0, 420.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1...",0.0,0,"[168.64528628058775, 1945.3961878930584, 4110....",16712641.0


In [39]:
anomalyDetection = train_pred_df.join(averageDistance, averageDistance.cluster == train_pred_df.cluster)

anomalyDetection.withColumn("detection", funcs.when(yeniii.dist > yeniii.avgDist, "Anomaly").otherwise("Normal")).toPandas().head(20)

Unnamed: 0,prediction,features,label,cluster,center,dist,cluster.1,avgDist,detection
0,1,"(35682.0, 381709090.0, 0.0, 0.0, 0.0, 0.0, 0.0...",1.0,1,"[5158.0, 89581520.0, 7028652.0, 0.0, 0.0, 0.0,...",8.538792e+16,1,0.0,Anomaly
1,1,"(38259.0, 621568663.0, 0.0, 0.0, 0.0, 0.0, 0.0...",1.0,1,"[5158.0, 89581520.0, 7028652.0, 0.0, 0.0, 0.0,...",2.830597e+17,1,0.0,Anomaly
2,1,"(34578.0, 1167519497.0, 0.0, 0.0, 0.0, 0.0, 0....",1.0,1,"[5158.0, 89581520.0, 7028652.0, 0.0, 0.0, 0.0,...",1.162e+18,1,0.0,Anomaly
3,1,"(2.0, 693375640.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0....",1.0,1,"[5158.0, 89581520.0, 7028652.0, 0.0, 0.0, 0.0,...",3.646167e+17,1,0.0,Anomaly
4,1,"(10999.0, 0.0, 1309937401.0, 0.0, 0.0, 0.0, 0....",1.0,1,"[5158.0, 89581520.0, 7028652.0, 0.0, 0.0, 0.0,...",1.705596e+18,1,0.0,Anomaly
5,1,"(35301.0, 217277339.0, 0.0, 0.0, 0.0, 0.0, 0.0...",1.0,1,"[5158.0, 89581520.0, 7028652.0, 0.0, 0.0, 0.0,...",1.635563e+16,1,0.0,Anomaly
6,1,"(5158.0, 89581520.0, 7028652.0, 0.0, 0.0, 0.0,...",0.0,1,"[5158.0, 89581520.0, 7028652.0, 0.0, 0.0, 0.0,...",0.0,1,0.0,Normal
7,1,"(36071.0, 1379963888.0, 0.0, 0.0, 0.0, 0.0, 0....",1.0,1,"[5158.0, 89581520.0, 7028652.0, 0.0, 0.0, 0.0,...",1.665136e+18,1,0.0,Anomaly
8,3,"(31.0, 18828976.0, 33116.0, 0.0, 0.0, 0.0, 0.0...",0.0,3,"[745.8333333333333, 18749591.5, 491326.6666666...",216259400000.0,3,14408770000000.0,Normal
9,3,"(103.0, 24418776.0, 118160.0, 0.0, 0.0, 0.0, 0...",0.0,3,"[745.8333333333333, 18749591.5, 491326.6666666...",32278910000000.0,3,14408770000000.0,Anomaly


# 4. Streaming

In [40]:
iris_data = spark.readStream \
.format("csv")\
.option("header", True)\
.option("sep", ",")\
.schema(schema)\
.load("data")\

In [41]:
iris.printSchema()

root
 |-- duration: integer (nullable = true)
 |-- src_bytes: integer (nullable = true)
 |-- dst_bytes: integer (nullable = true)
 |-- land: integer (nullable = true)
 |-- wrong_fragment: integer (nullable = true)
 |-- urgent: integer (nullable = true)
 |-- hot: integer (nullable = true)
 |-- num_failed_logins: integer (nullable = true)
 |-- logged_in: integer (nullable = true)
 |-- num_compromised: integer (nullable = true)
 |-- root_shell: integer (nullable = true)
 |-- su_attempted: integer (nullable = true)
 |-- num_root: integer (nullable = true)
 |-- num_file_creations: integer (nullable = true)
 |-- num_shells: integer (nullable = true)
 |-- num_access_files: integer (nullable = true)
 |-- num_outbound_cmds: integer (nullable = true)
 |-- is_host_login: integer (nullable = true)
 |-- is_guest_login: integer (nullable = true)
 |-- count: integer (nullable = true)
 |-- srv_count: integer (nullable = true)
 |-- serror_rate: double (nullable = true)
 |-- srv_serror_rate: double (nul

In [42]:
features_array = iris_data.selectExpr("""array(
CAST(duration AS FLOAT),
CAST(src_bytes AS FLOAT), 
CAST(dst_bytes AS FLOAT), 
CAST(land AS FLOAT),
CAST(wrong_fragment AS FLOAT), 
CAST(urgent AS FLOAT), 
CAST(hot AS FLOAT), 
CAST(num_failed_logins AS FLOAT), 
CAST(logged_in AS FLOAT),
CAST(num_compromised AS FLOAT), 
CAST(root_shell AS FLOAT),
CAST(su_attempted AS FLOAT), 
CAST(num_root AS FLOAT),
CAST(num_file_creations AS FLOAT), 
CAST(num_shells  AS FLOAT), 
CAST(num_access_files  AS FLOAT),
CAST(num_outbound_cmds  AS FLOAT), 
CAST(is_host_login  AS FLOAT),
CAST(is_guest_login  AS FLOAT), 
CAST(count AS FLOAT), 
CAST(srv_count AS FLOAT),
CAST(serror_rate AS FLOAT), 
CAST(srv_serror_rate AS FLOAT),
CAST(rerror_rate AS FLOAT), 
CAST(srv_rerror_rate AS FLOAT),
CAST(same_srv_rate AS FLOAT),
CAST(diff_srv_rate AS FLOAT), 
CAST(srv_diff_host_rate AS FLOAT),
CAST(dst_host_count AS FLOAT),
CAST(dst_host_srv_count AS FLOAT), 
CAST(dst_host_same_srv_rate AS FLOAT),
CAST(dst_host_diff_srv_rate AS FLOAT),
CAST(dst_host_same_src_port_rate AS FLOAT),
CAST(dst_host_srv_diff_host_rate AS FLOAT), 
CAST(dst_host_serror_rate AS FLOAT),
CAST(dst_host_srv_serror_rate AS FLOAT),
CAST(dst_host_rerror_rate AS FLOAT), 
CAST(dst_host_srv_rerror_rate AS FLOAT)

) as arr""", 
                                      "status")

In [43]:
tovec_udf = funcs.udf(lambda r: Vectors.dense(r), VectorUDT())

In [44]:
data_stream = features_array.withColumn("features", tovec_udf("arr"))

# 5. Prediction

### 5.1 Prediction of Streaming Data

In [45]:
streaming_prediction = model.transform(data_stream)

In [46]:
type(streaming_prediction)

pyspark.sql.dataframe.DataFrame

In [47]:
streaming_prediction.printSchema()

root
 |-- arr: array (nullable = false)
 |    |-- element: float (containsNull = true)
 |-- status: string (nullable = true)
 |-- features: vector (nullable = true)
 |-- cluster: integer (nullable = false)



### 5.2 Finding Anomaly

In [48]:
l_clusters = model.clusterCenters()

In [49]:
d_clusters = {int(i):[float(l_clusters[i][j]) for j in range(len(l_clusters[i]))] 
              for i in range(len(l_clusters))}

In [50]:
df_centers = spark.sparkContext.parallelize([(k,)+(v,) for k,v in d_clusters.items()]).toDF(['prediction','center'])

In [51]:
df_centers.toPandas().head()

Unnamed: 0,prediction,center
0,0,"[168.64528628058775, 1945.3961878930584, 4110...."
1,1,"[5158.0, 89581520.0, 7028652.0, 0.0, 0.0, 0.0,..."
2,2,"[2519.0, 7513037.142857143, 560605.1428571428,..."
3,3,"[745.8333333333333, 18749591.5, 491326.6666666..."
4,4,"[41.757709251101325, 2199225.2246696036, 8201...."


In [52]:
df_pred = streaming_prediction.withColumn('prediction',funcs.col('cluster').cast(IntegerType()))

In [53]:
df_pred = df_pred.join(df_centers,on='prediction',how='left')

In [54]:
get_dist = funcs.udf(lambda features, center : 
                 float(features.squared_distance(center)),FloatType())

In [86]:
df_pred = df_pred.withColumn('dist',get_dist(funcs.col('features'),funcs.col('center')))
new_df = df_pred.join(maxDistance, maxDistance.cluster == df_pred.cluster)

In [59]:
detection = new_df.withColumn("detection", funcs.when(yenidf.dist > yenidf.maxDist, "Anomaly").otherwise("Normal"))
detection = detection.select("features", "prediction", "dist", "maxDist", "status", "detection")

## Sliding Windows Time

In [60]:
currentTimeDf = detection.withColumn("processingTime",funcs.current_timestamp())

In [67]:
currentTimeDf.printSchema()

root
 |-- features: vector (nullable = true)
 |-- prediction: integer (nullable = false)
 |-- dist: float (nullable = true)
 |-- maxDist: float (nullable = true)
 |-- status: string (nullable = true)
 |-- detection: string (nullable = false)
 |-- processingTime: timestamp (nullable = false)



In [87]:
windowedCount = currentTimeDf.groupBy(funcs.window("processingTime", "4 seconds", "2 seconds"), "prediction", 
                                      "status", "detection", "dist", "maxDist")\
.avg("dist").orderBy("window")

### 5.3 Sorting Last 10 Anomaly

# 6. Start Streaming

### 6.1 Option 1 - Append

In [83]:
'''q = df_pred.writeStream\
.outputMode("append")\
.format("console")\
.option("truncate", "false")\
.start()'''

'q = df_pred.writeStream.outputMode("append").format("console").option("truncate", "false").start()'

In [84]:
q = windowedCount.writeStream\
.outputMode("complete")\
.format("console")\
.option("truncate", "false")\
.start()

In [None]:
q.awaitTermination()

### 6.2 Option 2 - Complete Aggregate Function

In [72]:
group = df_pred.groupBy(["cluster","status"]).count()\
.orderBy("cluster", "status", ascending=True)

In [73]:
q = group.writeStream\
.outputMode("complete")\
.format("console")\
.start()

In [None]:
q.awaitTermination()

### 6.3 Option 3 - Sliding Windows (Confusion Matrix)

In [88]:
confusion_matrix = detection.groupBy("status","detection").count().select("status", "detection","count")

In [92]:
confusion_matrix = currentTimeDf.groupBy(funcs.window("processingTime", "3 seconds", "1 seconds"),"status", "detection")\
.count().orderBy("window")

In [93]:
q = confusion_matrix.writeStream\
.outputMode("complete")\
.format("console")\
.option("truncate", "false")\
.start()

In [None]:
q.awaitTermination()