In [229]:
import pyspark
from pyspark import SparkContext, SparkConf
from pyspark.sql import SparkSession
import pandas as pd 
import numpy as np 
from pyspark.sql.functions import split, col, regexp_replace, collect_list, explode, concat, collect_set, array_union, flatten
from pyspark.ml.feature import CountVectorizer
from pyspark.ml.feature import MinHashLSH
from pyspark.ml.linalg import Vectors
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, udf
from pyspark.sql.types import ArrayType, StringType, BooleanType

In [230]:
spark = SparkSession.builder \
    .appName("Projet-Task-1") \
    .master("local[*]") \
    .config("spark.driver.memory", "2G") \
    .config("spark.driver.maxResultSize", "2g") \
    .getOrCreate()
spark

In [231]:
# spark.stop()

In [232]:
df = spark.read.text("data.txt").toDF("Log")

In [233]:
df = df.withColumn("Log", regexp_replace(col("Log"), "[<>]", ""))
df = df.withColumn("Log", regexp_replace(col("Log"), ",", ""))
df = df.withColumn("Log", split(col("Log"), " "))

# df.printSchema()
# df.show()

columns = ["First Server", "Second Server", "Communication Type", "Process ID"]

for i in range(len(columns)):
    # print(columns[i])
    df = df.withColumn(columns[i], col("Log")[i])

df.printSchema()
df.show()

root
 |-- Log: array (nullable = true)
 |    |-- element: string (containsNull = false)
 |-- First Server: string (nullable = true)
 |-- Second Server: string (nullable = true)
 |-- Communication Type: string (nullable = true)
 |-- Process ID: string (nullable = true)

+--------------------+------------+-------------+------------------+----------+
|                 Log|First Server|Second Server|Communication Type|Process ID|
+--------------------+------------+-------------+------------------+----------+
|[null, S-1.2, Req...|        null|        S-1.2|           Request|         0|
|[S-1.2, null, Res...|       S-1.2|         null|          Response|         0|
|[null, S-1.3, Req...|        null|        S-1.3|           Request|         1|
|[S-1.3, S-48.2, R...|       S-1.3|       S-48.2|           Request|         1|
|[S-48.2, S-27.3, ...|      S-48.2|       S-27.3|           Request|         1|
|[S-27.3, S-48.2, ...|      S-27.3|       S-48.2|          Response|         1|
|[S-48.2, 

In [234]:
# Remove Process ID from request 
log = udf(lambda x: x[:-1], ArrayType(StringType())) 
df = df.withColumn('Log', log('Log')) 

In [235]:
grouped_df = df.groupBy("Process ID").agg(collect_list("Log").alias("Log"), 
                                          collect_list("First Server").alias("First Server"),
                                          collect_list("Second Server").alias("Second Server"),
                                          collect_list("Communication Type").alias("FCommunication Type"))

grouped_df.printSchema()
grouped_df.show()

root
 |-- Process ID: string (nullable = true)
 |-- Log: array (nullable = false)
 |    |-- element: array (containsNull = false)
 |    |    |-- element: string (containsNull = true)
 |-- First Server: array (nullable = false)
 |    |-- element: string (containsNull = false)
 |-- Second Server: array (nullable = false)
 |    |-- element: string (containsNull = false)
 |-- FCommunication Type: array (nullable = false)
 |    |-- element: string (containsNull = false)

+----------+--------------------+--------------------+--------------------+--------------------+
|Process ID|                 Log|        First Server|       Second Server| FCommunication Type|
+----------+--------------------+--------------------+--------------------+--------------------+
|         7|[[null, S-1.1, Re...|[null, S-1.1, S-2...|[S-1.1, S-23.1, S...|[Request, Request...|
|        15|[[null, S-1.1, Re...|[null, S-1.1, S-3...|[S-1.1, S-38.1, S...|[Request, Request...|
|        11|[[null, S-1.3, Re...|[null, S-1.

In [236]:
df.show()

+--------------------+------------+-------------+------------------+----------+
|                 Log|First Server|Second Server|Communication Type|Process ID|
+--------------------+------------+-------------+------------------+----------+
|[null, S-1.2, Req...|        null|        S-1.2|           Request|         0|
|[S-1.2, null, Res...|       S-1.2|         null|          Response|         0|
|[null, S-1.3, Req...|        null|        S-1.3|           Request|         1|
|[S-1.3, S-48.2, R...|       S-1.3|       S-48.2|           Request|         1|
|[S-48.2, S-27.3, ...|      S-48.2|       S-27.3|           Request|         1|
|[S-27.3, S-48.2, ...|      S-27.3|       S-48.2|          Response|         1|
|[S-48.2, S-1.3, R...|      S-48.2|        S-1.3|          Response|         1|
|[S-1.3, null, Res...|       S-1.3|         null|          Response|         1|
|[null, S-1.2, Req...|        null|        S-1.2|           Request|         2|
|[S-1.2, null, Res...|       S-1.2|     

In [237]:
distinct_servers_df = df.groupBy("Process ID").agg(collect_set("First Server").alias("First Server"),
                                                   collect_set("Second Server").alias("Second Server"))

distinct_servers_df = distinct_servers_df.withColumn("Servers", array_union("First Server", "Second Server"))

distinct_servers_df.printSchema()
distinct_servers_df.show()

root
 |-- Process ID: string (nullable = true)
 |-- First Server: array (nullable = false)
 |    |-- element: string (containsNull = false)
 |-- Second Server: array (nullable = false)
 |    |-- element: string (containsNull = false)
 |-- Servers: array (nullable = false)
 |    |-- element: string (containsNull = false)

+----------+--------------------+--------------------+--------------------+
|Process ID|        First Server|       Second Server|             Servers|
+----------+--------------------+--------------------+--------------------+
|         7|[S-23.1, S-1.1, S...|[S-23.1, S-1.1, S...|[S-23.1, S-1.1, S...|
|        15|[S-1.1, S-38.1, n...|[S-1.1, S-38.1, n...|[S-1.1, S-38.1, n...|
|        11|[S-44.4, S-2.3, S...|[S-44.4, S-2.3, S...|[S-44.4, S-2.3, S...|
|         3|[S-1.3, S-28.3, S...|[S-1.3, S-28.3, S...|[S-1.3, S-28.3, S...|
|        30|                [S7]|                [S8]|            [S7, S8]|
|         8|       [S-1.2, null]|       [S-1.2, null]|       [S-1.2, 

In [238]:
characteristics = CountVectorizer(inputCol="Servers", outputCol="Characteristic Matrix")

model = characteristics.fit(distinct_servers_df)
char_matrix = model.transform(distinct_servers_df).select("Process ID", "Characteristic Matrix")

char_matrix.printSchema()
char_matrix.show()

servers = model.vocabulary
print("Rows of Characteristic Matrix: ", servers)

root
 |-- Process ID: string (nullable = true)
 |-- Characteristic Matrix: vector (nullable = true)

+----------+---------------------+
|Process ID|Characteristic Matrix|
+----------+---------------------+
|         7| (47,[0,3,4,5,29,4...|
|        15| (47,[0,3,13],[1.0...|
|        11| (47,[0,2,5,18,24,...|
|         3| (47,[0,2,12,21,22...|
|        30| (47,[6,8],[1.0,1.0])|
|         8| (47,[0,1],[1.0,1.0])|
|        16| (47,[0,3,36],[1.0...|
|         0| (47,[0,1],[1.0,1.0])|
|         5| (47,[0,2,15,17],[...|
|        18| (47,[0,2,14,43],[...|
|        17| (47,[0,2,10,23,25...|
|         6| (47,[0,1],[1.0,1.0])|
|        19| (47,[0,3,4,9,16,2...|
|         9| (47,[0,1,7,27,40]...|
|         1| (47,[0,2,7,33],[1...|
|        20| (47,[6,8],[1.0,1.0])|
|        10| (47,[0,3],[1.0,1.0])|
|         4| (47,[0,2,11],[1.0...|
|        12| (47,[0,1],[1.0,1.0])|
|        13| (47,[0,1,26],[1.0...|
+----------+---------------------+
only showing top 20 rows

Rows of Characteristic Matrix:  [

In [239]:
minhash = MinHashLSH(inputCol="Characteristic Matrix", outputCol="Signatures", numHashTables=5)

# MinHash produces the signatures for the Characteritic matrix 
# numvHashTables is the number of the hush functioms that we want to use and the lenght of the signature 
model = minhash.fit(char_matrix)
signatures = model.transform(char_matrix)

signatures.show()

+----------+---------------------+--------------------+
|Process ID|Characteristic Matrix|          Signatures|
+----------+---------------------+--------------------+
|         7| (47,[0,3,4,5,29,4...|[[6.3456052E7], [...|
|        15| (47,[0,3,13],[1.0...|[[3.92606065E8], ...|
|        11| (47,[0,2,5,18,24,...|[[2.60448184E8], ...|
|         3| (47,[0,2,12,21,22...|[[1.9478414E8], [...|
|        30| (47,[6,8],[1.0,1.0])|[[9.1874821E8], [...|
|         8| (47,[0,1],[1.0,1.0])|[[1.29949889E8], ...|
|        16| (47,[0,3,36],[1.0...|[[7.87420122E8], ...|
|         0| (47,[0,1],[1.0,1.0])|[[1.29949889E8], ...|
|         5| (47,[0,2,15,17],[...|[[7.87420122E8], ...|
|        18| (47,[0,2,14,43],[...|[[7.87420122E8], ...|
|        17| (47,[0,2,10,23,25...|[[1.29120096E8], ...|
|         6| (47,[0,1],[1.0,1.0])|[[1.29949889E8], ...|
|        19| (47,[0,3,4,9,16,2...|[[1.95613933E8], ...|
|         9| (47,[0,1,7,27,40]...|[[1.29949889E8], ...|
|         1| (47,[0,2,7,33],[1...|[[2.61277977E8

In [240]:
# approxSimilarityJoin uses autmatically LSH to find rows that it is most likely 
# to have same "Signatures"
# threshold: pairs with Jaccard Distance lower than threshlod
similar_pairs = model.approxSimilarityJoin(signatures, signatures, threshold=0.01, distCol="Jaccard Distance")
similar_pairs.show()

+--------------------+--------------------+----------------+
|            datasetA|            datasetB|Jaccard Distance|
+--------------------+--------------------+----------------+
|{2, (47,[0,1],[1....|{0, [[1.29949889E...|             0.0|
|{6, (47,[0,1],[1....|{2, [[1.29949889E...|             0.0|
|{2, (47,[0,1],[1....|{2, [[1.29949889E...|             0.0|
|{2, (47,[0,1],[1....|{8, [[1.29949889E...|             0.0|
|{0, (47,[0,1],[1....|{0, [[1.29949889E...|             0.0|
|{19, (47,[0,3,4,9...|{19, [[1.95613933...|             0.0|
|{12, (47,[0,1],[1...|{2, [[1.29949889E...|             0.0|
|{12, (47,[0,1],[1...|{0, [[1.29949889E...|             0.0|
|{11, (47,[0,2,5,1...|{11, [[2.60448184...|             0.0|
|{6, (47,[0,1],[1....|{12, [[1.29949889...|             0.0|
|{6, (47,[0,1],[1....|{6, [[1.29949889E...|             0.0|
|{2, (47,[0,1],[1....|{12, [[1.29949889...|             0.0|
|{12, (47,[0,1],[1...|{12, [[1.29949889...|             0.0|
|{5, (47,[0,2,15,1...|{5

In [241]:
similar_pairs.count()

44

In [242]:
similar_pairs = similar_pairs.select("datasetA.Process ID", "datasetB.Process ID", 
                     "Jaccard Distance") \
                    # .filter((col("datasetA.Process ID") != col("datasetB.Process ID")))

In [243]:
new_cols = ["Process ID A", "Process ID B", "Jaccard Distance"]
similar_pairs = similar_pairs.toDF(*new_cols)

In [244]:
similar_pairs.show()

+------------+------------+----------------+
|Process ID A|Process ID B|Jaccard Distance|
+------------+------------+----------------+
|           2|           0|             0.0|
|           6|           2|             0.0|
|           2|           2|             0.0|
|           2|           8|             0.0|
|           0|           0|             0.0|
|          19|          19|             0.0|
|          12|           2|             0.0|
|          12|           0|             0.0|
|          11|          11|             0.0|
|           6|          12|             0.0|
|           6|           6|             0.0|
|           2|          12|             0.0|
|          12|          12|             0.0|
|           5|           5|             0.0|
|           2|           6|             0.0|
|           6|           0|             0.0|
|          20|          20|             0.0|
|           0|           8|             0.0|
|          30|          30|             0.0|
|         

In [245]:
pairs = similar_pairs.join(grouped_df, similar_pairs["Process ID A"] == col("Process ID")) \
                     .select(col("Process ID A"), col("Process ID B"), col("Log").alias("Log A")) \
                     .join(grouped_df, similar_pairs["Process ID B"] == col("Process ID")) \
                     .select(col("Process ID A"), col("Process ID B"), col("Log A"), col("Log").alias("Log B"))

In [246]:
pairs.show()

+------------+------------+--------------------+--------------------+
|Process ID A|Process ID B|               Log A|               Log B|
+------------+------------+--------------------+--------------------+
|           2|           0|[[null, S-1.2, Re...|[[null, S-1.2, Re...|
|           6|           2|[[null, S-1.2, Re...|[[null, S-1.2, Re...|
|           2|           2|[[null, S-1.2, Re...|[[null, S-1.2, Re...|
|           2|           8|[[null, S-1.2, Re...|[[null, S-1.2, Re...|
|           0|           0|[[null, S-1.2, Re...|[[null, S-1.2, Re...|
|          19|          19|[[null, S-1.1, Re...|[[null, S-1.1, Re...|
|          12|           2|[[null, S-1.2, Re...|[[null, S-1.2, Re...|
|          12|           0|[[null, S-1.2, Re...|[[null, S-1.2, Re...|
|          11|          11|[[null, S-1.3, Re...|[[null, S-1.3, Re...|
|           6|          12|[[null, S-1.2, Re...|[[null, S-1.2, Re...|
|           6|           6|[[null, S-1.2, Re...|[[null, S-1.2, Re...|
|           2|      

In [247]:
def original_check(x,y):
    return x==y

orifinal_checking = udf(original_check, BooleanType())
same_pairs = pairs.filter(orifinal_checking(col("Log A"), col("Log B")))

In [248]:
same_pairs = same_pairs.groupBy("Log A").agg(collect_set("Process ID A"))

In [249]:
same_pairs.show()

+--------------------+-------------------------+
|               Log A|collect_set(Process ID A)|
+--------------------+-------------------------+
|[[null, S-1.3, Re...|                     [17]|
|[[null, S-1.1, Re...|                     [19]|
|[[null, S-1.3, Re...|                     [18]|
|[[null, S-1.2, Re...|         [12, 2, 8, 6, 0]|
|[[null, S-1.3, Re...|                     [11]|
|[[null, S-1.3, Re...|                      [4]|
|[[null, S-1.1, Re...|                     [16]|
|[[null, S-1.2, Re...|                      [9]|
|[[null, S-1.1, Re...|                     [10]|
|[[null, S-1.2, Re...|                     [14]|
|[[null, S-1.3, Re...|                      [3]|
|[[S7, S8, Response]]|                 [20, 30]|
|[[null, S-1.3, Re...|                      [5]|
|[[null, S-1.1, Re...|                      [7]|
|[[null, S-1.2, Re...|                     [13]|
|[[null, S-1.1, Re...|                     [15]|
|[[null, S-1.3, Re...|                      [1]|
+-------------------

# Shingling

In [250]:
def k_shingling(text, k):
    shingles = set()
    for i in range(len(text) - k + 1):
        shingle = text[i:i + k]
        shingles.add(shingle)
    return list(shingles)

k_shingling_udf = udf(lambda text: k_shingling(text, 5), ArrayType(StringType()))

In [251]:
df = spark.read.text("data.txt").toDF("Log")
log = udf(lambda x: x[1:-4], StringType()) 
df = df.withColumn('Log_split', log('Log')) 
df.collect()
df_shingles = df.withColumn("Shingles", k_shingling_udf(df["Log_split"]))
df_shingles.show()

+--------------------+--------------------+--------------------+
|                 Log|           Log_split|            Shingles|
+--------------------+--------------------+--------------------+
|<null, S-1.2, Req...|null, S-1.2, Request|[null,, l, S-, qu...|
|<S-1.2, null, Res...|S-1.2, null, Resp...|[2, nu, .2, n, l,...|
|<null, S-1.3, Req...|null, S-1.3, Request|[null,, l, S-, qu...|
|<S-1.3, S-48.2, R...|S-1.3, S-48.2, Re...|[ S-48, 3, S-, qu...|
|<S-48.2, S-27.3, ...|S-48.2, S-27.3, R...|[.2, S, eques, 2,...|
|<S-27.3, S-48.2, ...|S-27.3, S-48.2, R...|[3, S-, .3, S, es...|
|<S-48.2, S-1.3, R...|S-48.2, S-1.3, Re...|[.2, S, espon, 2,...|
|<S-1.3, null, Res...|S-1.3, null, Resp...|[ponse, l, Re, nu...|
|<null, S-1.2, Req...|null, S-1.2, Request|[null,, l, S-, qu...|
|<S-1.2, null, Res...|S-1.2, null, Resp...|[2, nu, .2, n, l,...|
|<null, S-1.3, Req...|null, S-1.3, Request|[null,, l, S-, qu...|
|<S-1.3, S-24.3, R...|S-1.3, S-24.3, Re...|[3, S-, , Req, qu...|
|<S-24.3, S-17.2, ...|S-2

In [252]:
df_shingles.collect()

[Row(Log='<null, S-1.2, Request, 0>', Log_split='null, S-1.2, Request', Shingles=['null,', 'l, S-', 'quest', '1.2, ', 'ull, ', ', S-1', '.2, R', '2, Re', ' Requ', 'S-1.2', 'eques', 'll, S', ' S-1.', '-1.2,', 'Reque', ', Req']),
 Row(Log='<S-1.2, null, Response, 0>', Log_split='S-1.2, null, Response', Shingles=['2, nu', '.2, n', 'l, Re', 'null,', ', nul', '1.2, ', 'ull, ', ' null', ' Resp', 'll, R', ', Res', 'espon', 'S-1.2', '-1.2,', 'spons', 'Respo', 'ponse']),
 Row(Log='<null, S-1.3, Request, 1>', Log_split='null, S-1.3, Request', Shingles=['null,', 'l, S-', 'quest', '1.3, ', 'ull, ', ', S-1', '-1.3,', ' Requ', 'eques', '3, Re', 'll, S', ' S-1.', 'Reque', '.3, R', ', Req', 'S-1.3']),
 Row(Log='<S-1.3, S-48.2, Request, 1>', Log_split='S-1.3, S-48.2, Request', Shingles=[' S-48', '3, S-', 'quest', '1.3, ', 'S-48.', ', S-4', '-1.3,', '.2, R', '.3, S', '2, Re', ' Requ', 'eques', '8.2, ', 'Reque', '48.2,', ', Req', 'S-1.3', '-48.2']),
 Row(Log='<S-48.2, S-27.3, Request, 1>', Log_split='S-4

In [253]:
df_shingles = df_shingles.withColumn("Log", regexp_replace(col("Log"), "[<>]", ""))
df_shingles = df_shingles.withColumn("Log", regexp_replace(col("Log"), ",", ""))
df_shingles = df_shingles.withColumn("Log", split(col("Log"), " "))

df_shingles.printSchema()
df_shingles.show()

columns = ["First Server", "Second Server", "Communication Type", "Process ID"]

for i in range(len(columns)):
    print(columns[i])
    df_shingles = df_shingles.withColumn(columns[i], col("Log")[i])

df_shingles.printSchema()
df_shingles.show()

root
 |-- Log: array (nullable = true)
 |    |-- element: string (containsNull = false)
 |-- Log_split: string (nullable = true)
 |-- Shingles: array (nullable = true)
 |    |-- element: string (containsNull = true)

+--------------------+--------------------+--------------------+
|                 Log|           Log_split|            Shingles|
+--------------------+--------------------+--------------------+
|[null, S-1.2, Req...|null, S-1.2, Request|[null,, l, S-, qu...|
|[S-1.2, null, Res...|S-1.2, null, Resp...|[2, nu, .2, n, l,...|
|[null, S-1.3, Req...|null, S-1.3, Request|[null,, l, S-, qu...|
|[S-1.3, S-48.2, R...|S-1.3, S-48.2, Re...|[ S-48, 3, S-, qu...|
|[S-48.2, S-27.3, ...|S-48.2, S-27.3, R...|[.2, S, eques, 2,...|
|[S-27.3, S-48.2, ...|S-27.3, S-48.2, R...|[3, S-, .3, S, es...|
|[S-48.2, S-1.3, R...|S-48.2, S-1.3, Re...|[.2, S, espon, 2,...|
|[S-1.3, null, Res...|S-1.3, null, Resp...|[ponse, l, Re, nu...|
|[null, S-1.2, Req...|null, S-1.2, Request|[null,, l, S-, qu...|
|[S

In [270]:
log = udf(lambda x: x[:-1], ArrayType(StringType())) 
df_shingles = df_shingles.withColumn('Log', log('Log')) 

In [272]:
grouped_df = df_shingles.groupBy("Process ID").agg(collect_set("Shingles").alias("Shingles"),collect_list("Log").alias("Log"))
grouped_df = grouped_df.withColumn("Flat shingles", flatten(col("Shingles")))
grouped_df.printSchema()

root
 |-- Process ID: string (nullable = true)
 |-- Shingles: array (nullable = false)
 |    |-- element: array (containsNull = false)
 |    |    |-- element: string (containsNull = true)
 |-- Log: array (nullable = false)
 |    |-- element: array (containsNull = false)
 |    |    |-- element: string (containsNull = true)
 |-- Flat shingles: array (nullable = false)
 |    |-- element: string (containsNull = true)



In [273]:
characteristics = CountVectorizer(inputCol="Flat shingles", outputCol="Characteristic Matrix")

model = characteristics.fit(grouped_df)
char_matrix = model.transform(grouped_df).select("Process ID", "Characteristic Matrix")

char_matrix.printSchema()
char_matrix.show()

shingles = model.vocabulary
print("Rows of Characteristic Matrix: ", shingles)

root
 |-- Process ID: string (nullable = true)
 |-- Characteristic Matrix: vector (nullable = true)

+----------+---------------------+
|Process ID|Characteristic Matrix|
+----------+---------------------+
|         7| (229,[0,1,2,3,4,5...|
|        15| (229,[0,1,2,3,4,5...|
|        11| (229,[0,1,2,3,4,5...|
|         3| (229,[0,1,2,3,4,5...|
|        30| (229,[0,1,2,3,4,5...|
|         8| (229,[0,1,2,3,4,5...|
|        16| (229,[0,1,2,3,4,5...|
|         0| (229,[0,1,2,3,4,5...|
|         5| (229,[0,1,2,3,4,5...|
|        18| (229,[0,1,2,3,4,5...|
|        17| (229,[0,1,2,3,4,5...|
|         6| (229,[0,1,2,3,4,5...|
|        19| (229,[0,1,2,3,4,5...|
|         9| (229,[0,1,2,3,4,5...|
|         1| (229,[0,1,2,3,4,5...|
|        20| (229,[0,1,2,3,4,5...|
|        10| (229,[0,1,2,3,4,5...|
|         4| (229,[0,1,2,3,4,5...|
|        12| (229,[0,1,2,3,4,5...|
|        13| (229,[0,1,2,3,4,5...|
+----------+---------------------+
only showing top 20 rows

Rows of Characteristic Matrix:  [

In [274]:
minhash = MinHashLSH(inputCol="Characteristic Matrix", outputCol="Signatures", numHashTables=5)

# MinHash produces the signatures for the Characteritic matrix 
# numvHashTables is the number of the hush functioms that we want to use and the lenght of the signature 
model = minhash.fit(char_matrix)
signatures = model.transform(char_matrix)

signatures.show()

+----------+---------------------+--------------------+
|Process ID|Characteristic Matrix|          Signatures|
+----------+---------------------+--------------------+
|         7| (229,[0,1,2,3,4,5...|[[6.0966673E7], [...|
|        15| (229,[0,1,2,3,4,5...|[[6.3456052E7], [...|
|        11| (229,[0,1,2,3,4,5...|[[6.1796466E7], [...|
|         3| (229,[0,1,2,3,4,5...|[[5.8477294E7], [...|
|        30| (229,[0,1,2,3,4,5...|[[1.29949889E8], ...|
|         8| (229,[0,1,2,3,4,5...|[[1.29120096E8], ...|
|        16| (229,[0,1,2,3,4,5...|[[6.3456052E7], [...|
|         0| (229,[0,1,2,3,4,5...|[[1.29120096E8], ...|
|         5| (229,[0,1,2,3,4,5...|[[1.29949889E8], ...|
|        18| (229,[0,1,2,3,4,5...|[[6.013688E7], [3...|
|        17| (229,[0,1,2,3,4,5...|[[1.29949889E8], ...|
|         6| (229,[0,1,2,3,4,5...|[[1.29120096E8], ...|
|        19| (229,[0,1,2,3,4,5...|[[5.9307087E7], [...|
|         9| (229,[0,1,2,3,4,5...|[[1.29120096E8], ...|
|         1| (229,[0,1,2,3,4,5...|[[5.9307087E7]

In [288]:
# approxSimilarityJoin uses autmatically LSH to find rows that it is most likely 
# to have same "Signatures"
# threshold: pairs with Jaccard Distance lower than threshlod
similar_pairs = model.approxSimilarityJoin(signatures, signatures, threshold=0.2, distCol="Jaccard Distance")
similar_pairs.show()

+--------------------+--------------------+-------------------+
|            datasetA|            datasetB|   Jaccard Distance|
+--------------------+--------------------+-------------------+
|{12, (229,[0,1,2,...|{0, [[1.29120096E...|0.06666666666666665|
|{11, (229,[0,1,2,...|{11, [[6.1796466E...|                0.0|
|{2, (229,[0,1,2,3...|{8, [[1.29120096E...|                0.0|
|{9, (229,[0,1,2,3...|{9, [[1.29120096E...|                0.0|
|{18, (229,[0,1,2,...|{18, [[6.013688E7...|                0.0|
|{15, (229,[0,1,2,...|{15, [[6.3456052E...|                0.0|
|{5, (229,[0,1,2,3...|{5, [[1.29949889E...|                0.0|
|{4, (229,[0,1,2,3...|{4, [[1.29949889E...|                0.0|
|{6, (229,[0,1,2,3...|{2, [[1.29120096E...|                0.0|
|{12, (229,[0,1,2,...|{2, [[1.29120096E...|0.06666666666666665|
|{13, (229,[0,1,2,...|{13, [[1.29120096...|                0.0|
|{12, (229,[0,1,2,...|{6, [[1.29120096E...|0.06666666666666665|
|{30, (229,[0,1,2,...|{30, [[1.29949889.

In [300]:
similar_pairs.count()

44

In [290]:
similar_pairs = similar_pairs.select("datasetA.Process ID", "datasetB.Process ID", 
                     "Jaccard Distance")\
                        # .filter((col("datasetA.Process ID") != col("datasetB.Process ID")))

In [291]:
similar_pairs.show()

+----------+----------+-------------------+
|Process ID|Process ID|   Jaccard Distance|
+----------+----------+-------------------+
|        12|         0|0.06666666666666665|
|        11|        11|                0.0|
|         2|         8|                0.0|
|         9|         9|                0.0|
|        18|        18|                0.0|
|        15|        15|                0.0|
|         5|         5|                0.0|
|         4|         4|                0.0|
|         6|         2|                0.0|
|        12|         2|0.06666666666666665|
|        13|        13|                0.0|
|        12|         6|0.06666666666666665|
|        30|        30|                0.0|
|        20|        30|                0.0|
|         2|         2|                0.0|
|         7|         7|                0.0|
|        12|         8|0.06666666666666665|
|         6|         6|                0.0|
|         6|         8|                0.0|
|         8|         6|         

In [292]:
new_cols = ["Process ID A", "Process ID B", "Jaccard Distance"]
similar_pairs = similar_pairs.toDF(*new_cols)

In [293]:
similar_pairs.show()

+------------+------------+-------------------+
|Process ID A|Process ID B|   Jaccard Distance|
+------------+------------+-------------------+
|          12|           0|0.06666666666666665|
|          11|          11|                0.0|
|           2|           8|                0.0|
|           9|           9|                0.0|
|          18|          18|                0.0|
|          15|          15|                0.0|
|           5|           5|                0.0|
|           4|           4|                0.0|
|           6|           2|                0.0|
|          12|           2|0.06666666666666665|
|          13|          13|                0.0|
|          12|           6|0.06666666666666665|
|          30|          30|                0.0|
|          20|          30|                0.0|
|           2|           2|                0.0|
|           7|           7|                0.0|
|          12|           8|0.06666666666666665|
|           6|           6|             

In [294]:
grouped_df.show()

+----------+--------------------+--------------------+--------------------+
|Process ID|            Shingles|                 Log|       Flat shingles|
+----------+--------------------+--------------------+--------------------+
|         7|[[ S-23, 1.1, , ....|[[null, S-1.1, Re...|[ S-23, 1.1, , .1...|
|        15|[[.1, n, 1.1, , l...|[[null, S-1.1, Re...|[.1, n, 1.1, , l,...|
|        11|[[onse,, ponse, l...|[[null, S-1.3, Re...|[onse,, ponse, l,...|
|         3|[[3, S-, -28.3, ....|[[null, S-1.3, Re...|[3, S-, -28.3, .3...|
|        30|[[, S8,, onse,,  ...|[[S7, S8, Response]]|[, S8,, onse,,  R...|
|         8|[[null,, l, S-, q...|[[null, S-1.2, Re...|[null,, l, S-, qu...|
|        16|[[.1, n, 1.1, , l...|[[null, S-1.1, Re...|[.1, n, 1.1, , l,...|
|         0|[[null,, l, S-, q...|[[null, S-1.2, Re...|[null,, l, S-, qu...|
|         5|[[3, S-, quest, 1...|[[null, S-1.3, Re...|[3, S-, quest, 1....|
|        18|[[onse,, ponse, l...|[[null, S-1.3, Re...|[onse,, ponse, l,...|
|        17|

In [295]:
pairs = similar_pairs.join(grouped_df, similar_pairs["Process ID A"] == col("Process ID")) \
                     .select(col("Process ID A"), col("Process ID B"), col("Log").alias("Log A")) \
                     .join(grouped_df, similar_pairs["Process ID B"] == col("Process ID")) \
                     .select(col("Process ID A"), col("Process ID B"), col("Log A"), col("Log").alias("Log B"))

In [296]:
pairs.collect()

[Row(Process ID A='12', Process ID B='0', Log A=[['null', 'S-1.2', 'Request'], ['S-1.2', 'null', 'Response']], Log B=[['null', 'S-1.2', 'Request'], ['S-1.2', 'null', 'Response']]),
 Row(Process ID A='11', Process ID B='11', Log A=[['null', 'S-1.3', 'Request'], ['S-1.3', 'S-44.4', 'Request'], ['S-44.4', 'S-36.2', 'Request'], ['S-36.2', 'S-20.2', 'Request'], ['S-20.2', 'S-36.2', 'Response'], ['S-36.2', 'S-44.4', 'Response'], ['S-44.4', 'S-19.4', 'Request'], ['S-19.4', 'S-2.3', 'Request'], ['S-2.3', 'S-19.4', 'Response'], ['S-19.4', 'S-44.4', 'Response'], ['S-44.4', 'S-9.3', 'Request'], ['S-9.3', 'S-44.4', 'Response'], ['S-44.4', 'S-1.3', 'Response'], ['S-1.3', 'null', 'Response']], Log B=[['null', 'S-1.3', 'Request'], ['S-1.3', 'S-44.4', 'Request'], ['S-44.4', 'S-36.2', 'Request'], ['S-36.2', 'S-20.2', 'Request'], ['S-20.2', 'S-36.2', 'Response'], ['S-36.2', 'S-44.4', 'Response'], ['S-44.4', 'S-19.4', 'Request'], ['S-19.4', 'S-2.3', 'Request'], ['S-2.3', 'S-19.4', 'Response'], ['S-19.4',

In [297]:
def original_check(x,y):
    return x==y

orifinal_checking = udf(original_check, BooleanType())
same_pairs = pairs.filter(orifinal_checking(col("Log A"), col("Log B")))

In [298]:
same_pairs = same_pairs.groupBy("Log A").agg(collect_set("Process ID A"))

In [299]:
same_pairs.show()

+--------------------+-------------------------+
|               Log A|collect_set(Process ID A)|
+--------------------+-------------------------+
|[[null, S-1.3, Re...|                     [17]|
|[[null, S-1.1, Re...|                     [19]|
|[[null, S-1.3, Re...|                     [18]|
|[[null, S-1.2, Re...|         [12, 2, 8, 6, 0]|
|[[null, S-1.3, Re...|                     [11]|
|[[null, S-1.3, Re...|                      [4]|
|[[null, S-1.1, Re...|                     [16]|
|[[null, S-1.2, Re...|                      [9]|
|[[null, S-1.1, Re...|                     [10]|
|[[null, S-1.2, Re...|                     [14]|
|[[null, S-1.3, Re...|                      [3]|
|[[S7, S8, Response]]|                 [20, 30]|
|[[null, S-1.3, Re...|                      [5]|
|[[null, S-1.1, Re...|                      [7]|
|[[null, S-1.2, Re...|                     [13]|
|[[null, S-1.1, Re...|                     [15]|
|[[null, S-1.3, Re...|                      [1]|
+-------------------