In [214]:
import pyspark
from pyspark import SparkContext, SparkConf
from pyspark.sql import SparkSession
import pandas as pd 
import numpy as np 
from pyspark.sql.functions import split, col, regexp_replace, collect_list, explode, concat, collect_set, array_union
from pyspark.ml.feature import CountVectorizer
from pyspark.ml.feature import MinHashLSH
from pyspark.ml.linalg import Vectors

In [215]:
spark = SparkSession.builder \
    .appName("Projet-Task-1") \
    .master("local[*]") \
    .config("spark.driver.memory", "2G") \
    .config("spark.driver.maxResultSize", "2g") \
    .getOrCreate()
spark

In [216]:
# spark.stop()

In [217]:
df = spark.read.text("data.txt").toDF("Log")

In [218]:
df = df.withColumn("Log", regexp_replace(col("Log"), "[<>]", ""))
df = df.withColumn("Log", regexp_replace(col("Log"), ",", ""))
df = df.withColumn("Log", split(col("Log"), " "))

df.printSchema()
df.show()

columns = ["First Server", "Second Server", "Communication Type", "Process ID"]

for i in range(len(columns)):
    print(columns[i])
    df = df.withColumn(columns[i], col("Log")[i])

df.printSchema()
df.show()

root
 |-- Log: array (nullable = true)
 |    |-- element: string (containsNull = false)

+--------------------+
|                 Log|
+--------------------+
|[null, S-1.2, Req...|
|[S-1.2, null, Res...|
|[null, S-1.3, Req...|
|[S-1.3, S-48.2, R...|
|[S-48.2, S-27.3, ...|
|[S-27.3, S-48.2, ...|
|[S-48.2, S-1.3, R...|
|[S-1.3, null, Res...|
|[null, S-1.2, Req...|
|[S-1.2, null, Res...|
|[null, S-1.3, Req...|
|[S-1.3, S-24.3, R...|
|[S-24.3, S-17.2, ...|
|[S-17.2, S-28.3, ...|
|[S-28.3, S-29.3, ...|
|[S-29.3, S-32.1, ...|
|[S-32.1, S-29.3, ...|
|[S-29.3, S-32.2, ...|
|[S-32.2, S-29.3, ...|
|[S-29.3, S-28.3, ...|
+--------------------+
only showing top 20 rows

First Server
Second Server
Communication Type
Process ID
root
 |-- Log: array (nullable = true)
 |    |-- element: string (containsNull = false)
 |-- First Server: string (nullable = true)
 |-- Second Server: string (nullable = true)
 |-- Communication Type: string (nullable = true)
 |-- Process ID: string (nullable = true)

+-----

In [219]:
grouped_df = df.groupBy("Process ID").agg(collect_list("Log").alias("Log"), 
                                          collect_list("First Server").alias("First Server"),
                                          collect_list("Second Server").alias("Second Server"),
                                          collect_list("Communication Type").alias("FCommunication Type"))

grouped_df.printSchema()
grouped_df.show()

root
 |-- Process ID: string (nullable = true)
 |-- Log: array (nullable = false)
 |    |-- element: array (containsNull = false)
 |    |    |-- element: string (containsNull = false)
 |-- First Server: array (nullable = false)
 |    |-- element: string (containsNull = false)
 |-- Second Server: array (nullable = false)
 |    |-- element: string (containsNull = false)
 |-- FCommunication Type: array (nullable = false)
 |    |-- element: string (containsNull = false)

+----------+--------------------+--------------------+--------------------+--------------------+
|Process ID|                 Log|        First Server|       Second Server| FCommunication Type|
+----------+--------------------+--------------------+--------------------+--------------------+
|         7|[[null, S-1.1, Re...|[null, S-1.1, S-2...|[S-1.1, S-23.1, S...|[Request, Request...|
|        15|[[null, S-1.1, Re...|[null, S-1.1, S-3...|[S-1.1, S-38.1, S...|[Request, Request...|
|        11|[[null, S-1.3, Re...|[null, S-1

In [220]:
df.show()

+--------------------+------------+-------------+------------------+----------+
|                 Log|First Server|Second Server|Communication Type|Process ID|
+--------------------+------------+-------------+------------------+----------+
|[null, S-1.2, Req...|        null|        S-1.2|           Request|         0|
|[S-1.2, null, Res...|       S-1.2|         null|          Response|         0|
|[null, S-1.3, Req...|        null|        S-1.3|           Request|         1|
|[S-1.3, S-48.2, R...|       S-1.3|       S-48.2|           Request|         1|
|[S-48.2, S-27.3, ...|      S-48.2|       S-27.3|           Request|         1|
|[S-27.3, S-48.2, ...|      S-27.3|       S-48.2|          Response|         1|
|[S-48.2, S-1.3, R...|      S-48.2|        S-1.3|          Response|         1|
|[S-1.3, null, Res...|       S-1.3|         null|          Response|         1|
|[null, S-1.2, Req...|        null|        S-1.2|           Request|         2|
|[S-1.2, null, Res...|       S-1.2|     

In [221]:
distinct_servers_df = df.groupBy("Process ID").agg(collect_set("First Server").alias("First Server"),
                                                   collect_set("Second Server").alias("Second Server"))

distinct_servers_df = distinct_servers_df.withColumn("Servers", array_union("First Server", "Second Server"))

distinct_servers_df.printSchema()
distinct_servers_df.show()

root
 |-- Process ID: string (nullable = true)
 |-- First Server: array (nullable = false)
 |    |-- element: string (containsNull = false)
 |-- Second Server: array (nullable = false)
 |    |-- element: string (containsNull = false)
 |-- Servers: array (nullable = false)
 |    |-- element: string (containsNull = false)

+----------+--------------------+--------------------+--------------------+
|Process ID|        First Server|       Second Server|             Servers|
+----------+--------------------+--------------------+--------------------+
|         7|[S-23.1, S-1.1, S...|[S-23.1, S-1.1, S...|[S-23.1, S-1.1, S...|
|        15|[S-1.1, S-38.1, n...|[S-1.1, S-38.1, n...|[S-1.1, S-38.1, n...|
|        11|[S-44.4, S-2.3, S...|[S-44.4, S-2.3, S...|[S-44.4, S-2.3, S...|
|         3|[S-1.3, S-28.3, S...|[S-1.3, S-28.3, S...|[S-1.3, S-28.3, S...|
|         8|       [S-1.2, null]|       [S-1.2, null]|       [S-1.2, null]|
|        16|[S-1.1, S-23.3, n...|[S-1.1, S-23.3, n...|[S-1.1, S-23.3,

In [222]:
characteristics = CountVectorizer(inputCol="Servers", outputCol="Characteristic Matrix")

model = characteristics.fit(distinct_servers_df)
char_matrix = model.transform(distinct_servers_df).select("Process ID", "Characteristic Matrix")

char_matrix.printSchema()
char_matrix.show()

servers = model.vocabulary
print("Rows of Characteristic Matrix: ", servers)

root
 |-- Process ID: string (nullable = true)
 |-- Characteristic Matrix: vector (nullable = true)

+----------+---------------------+
|Process ID|Characteristic Matrix|
+----------+---------------------+
|         7| (45,[0,3,4,5,27,3...|
|        15| (45,[0,3,11],[1.0...|
|        11| (45,[0,2,5,16,22,...|
|         3| (45,[0,2,10,19,20...|
|         8| (45,[0,1],[1.0,1.0])|
|        16| (45,[0,3,34],[1.0...|
|         0| (45,[0,1],[1.0,1.0])|
|         5| (45,[0,2,13,15],[...|
|        18| (45,[0,2,12,41],[...|
|        17| (45,[0,2,8,21,23,...|
|         6| (45,[0,1],[1.0,1.0])|
|        19| (45,[0,3,4,7,14,1...|
|         9| (45,[0,1,6,25,38]...|
|         1| (45,[0,2,6,31],[1...|
|        10| (45,[0,3],[1.0,1.0])|
|         4| (45,[0,2,9],[1.0,...|
|        12| (45,[0,1],[1.0,1.0])|
|        13| (45,[0,1,24],[1.0...|
|        14| (45,[0,1,17,26],[...|
|         2| (45,[0,1],[1.0,1.0])|
+----------+---------------------+

Rows of Characteristic Matrix:  ['null', 'S-1.2', 'S-1.3',

In [223]:
minhash = MinHashLSH(inputCol="Characteristic Matrix", outputCol="Buckets", numHashTables=2)
model = minhash.fit(char_matrix)
buckets = model.transform(char_matrix)

buckets.show()

+----------+---------------------+--------------------+
|Process ID|Characteristic Matrix|             Buckets|
+----------+---------------------+--------------------+
|         7| (45,[0,3,4,5,27,3...|[[2.96260827E8], ...|
|        15| (45,[0,3,11],[1.0...|[[7.38071409E8], ...|
|        11| (45,[0,2,5,16,22,...|[[1.6949694E7], [...|
|         3| (45,[0,2,10,19,20...|[[2.87782109E8], ...|
|         8| (45,[0,1],[1.0,1.0])|[[2.42094344E8], ...|
|        16| (45,[0,3,34],[1.0...|[[1.79449143E8], ...|
|         0| (45,[0,1],[1.0,1.0])|[[2.42094344E8], ...|
|         5| (45,[0,2,13,15],[...|[[4.04593793E8], ...|
|        18| (45,[0,2,12,41],[...|[[7.38071409E8], ...|
|        17| (45,[0,2,8,21,23,...|[[1.2528266E8], [...|
|         6| (45,[0,1],[1.0,1.0])|[[2.42094344E8], ...|
|        19| (45,[0,3,4,7,14,1...|[[7.38071409E8], ...|
|         9| (45,[0,1,6,25,38]...|[[2.33615626E8], ...|
|         1| (45,[0,2,6,31],[1...|[[7.38071409E8], ...|
|        10| (45,[0,3],[1.0,1.0])|[[7.38071409E8

In [224]:
similar_pairs = model.approxSimilarityJoin(buckets, buckets, 1, distCol="JaccardDistance")
similar_pairs.show()

+--------------------+--------------------+-------------------+
|            datasetA|            datasetB|    JaccardDistance|
+--------------------+--------------------+-------------------+
|{6, (45,[0,1],[1....|{1, [[7.38071409E...|                0.8|
|{1, (45,[0,2,6,31...|{10, [[7.38071409...|                0.8|
|{0, (45,[0,1],[1....|{5, [[4.04593793E...|                0.8|
|{4, (45,[0,2,9],[...|{13, [[2.42094344...|                0.8|
|{4, (45,[0,2,9],[...|{12, [[2.42094344...|               0.75|
|{12, (45,[0,1],[1...|{2, [[2.42094344E...|                0.0|
|{13, (45,[0,1,24]...|{19, [[7.38071409...| 0.8888888888888888|
|{2, (45,[0,1],[1....|{9, [[2.33615626E...|                0.6|
|{8, (45,[0,1],[1....|{8, [[2.42094344E...|                0.0|
|{5, (45,[0,2,13,1...|{13, [[2.42094344...| 0.8333333333333334|
|{1, (45,[0,2,6,31...|{5, [[4.04593793E...| 0.6666666666666667|
|{12, (45,[0,1],[1...|{3, [[2.87782109E...| 0.8888888888888888|
|{6, (45,[0,1],[1....|{12, [[2.42094344.