In [128]:
import pyspark
from pyspark import SparkContext, SparkConf
from pyspark.sql import SparkSession
import pandas as pd 
import numpy as np 
from pyspark.sql.functions import split, col, regexp_replace, collect_list, explode, concat, collect_set, array_union, flatten
from pyspark.ml.feature import CountVectorizer
from pyspark.ml.feature import MinHashLSH
from pyspark.ml.linalg import Vectors
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, udf
from pyspark.sql.types import ArrayType, StringType, BooleanType

In [129]:
spark = SparkSession.builder \
    .appName("Projet-Task-1") \
    .master("local[*]") \
    .config("spark.driver.memory", "2G") \
    .config("spark.driver.maxResultSize", "2g") \
    .getOrCreate()
spark

In [130]:
# spark.stop()

In [131]:
df = spark.read.text("data.txt").toDF("Log")

In [132]:
df = df.withColumn("Log", regexp_replace(col("Log"), "[<>]", ""))
df = df.withColumn("Log", regexp_replace(col("Log"), ",", ""))
df = df.withColumn("Log", split(col("Log"), " "))

# df.printSchema()
# df.show()

columns = ["First Server", "Second Server", "Communication Type", "Process ID"]

for i in range(len(columns)):
    # print(columns[i])
    df = df.withColumn(columns[i], col("Log")[i])

df.printSchema()
df.show()

root
 |-- Log: array (nullable = true)
 |    |-- element: string (containsNull = false)
 |-- First Server: string (nullable = true)
 |-- Second Server: string (nullable = true)
 |-- Communication Type: string (nullable = true)
 |-- Process ID: string (nullable = true)

+--------------------+------------+-------------+------------------+----------+
|                 Log|First Server|Second Server|Communication Type|Process ID|
+--------------------+------------+-------------+------------------+----------+
|[null, S-1.2, Req...|        null|        S-1.2|           Request|         0|
|[S-1.2, null, Res...|       S-1.2|         null|          Response|         0|
|[null, S-1.3, Req...|        null|        S-1.3|           Request|         1|
|[S-1.3, S-48.2, R...|       S-1.3|       S-48.2|           Request|         1|
|[S-48.2, S-27.3, ...|      S-48.2|       S-27.3|           Request|         1|
|[S-27.3, S-48.2, ...|      S-27.3|       S-48.2|          Response|         1|
|[S-48.2, 

In [133]:
# Remove Process ID from request 
log = udf(lambda x: x[:-1], ArrayType(StringType())) 
df = df.withColumn('Log', log('Log')) 

In [134]:
grouped_df = df.groupBy("Process ID").agg(collect_list("Log").alias("Log"), 
                                          collect_list("First Server").alias("First Server"),
                                          collect_list("Second Server").alias("Second Server"),
                                          collect_list("Communication Type").alias("FCommunication Type"))

grouped_df.printSchema()
grouped_df.show()

root
 |-- Process ID: string (nullable = true)
 |-- Log: array (nullable = false)
 |    |-- element: array (containsNull = false)
 |    |    |-- element: string (containsNull = true)
 |-- First Server: array (nullable = false)
 |    |-- element: string (containsNull = false)
 |-- Second Server: array (nullable = false)
 |    |-- element: string (containsNull = false)
 |-- FCommunication Type: array (nullable = false)
 |    |-- element: string (containsNull = false)

+----------+--------------------+--------------------+--------------------+--------------------+
|Process ID|                 Log|        First Server|       Second Server| FCommunication Type|
+----------+--------------------+--------------------+--------------------+--------------------+
|         7|[[null, S-1.1, Re...|[null, S-1.1, S-2...|[S-1.1, S-23.1, S...|[Request, Request...|
|        15|[[null, S-1.1, Re...|[null, S-1.1, S-3...|[S-1.1, S-38.1, S...|[Request, Request...|
|        11|[[null, S-1.3, Re...|[null, S-1.

In [135]:
df.show()

+--------------------+------------+-------------+------------------+----------+
|                 Log|First Server|Second Server|Communication Type|Process ID|
+--------------------+------------+-------------+------------------+----------+
|[null, S-1.2, Req...|        null|        S-1.2|           Request|         0|
|[S-1.2, null, Res...|       S-1.2|         null|          Response|         0|
|[null, S-1.3, Req...|        null|        S-1.3|           Request|         1|
|[S-1.3, S-48.2, R...|       S-1.3|       S-48.2|           Request|         1|
|[S-48.2, S-27.3, ...|      S-48.2|       S-27.3|           Request|         1|
|[S-27.3, S-48.2, ...|      S-27.3|       S-48.2|          Response|         1|
|[S-48.2, S-1.3, R...|      S-48.2|        S-1.3|          Response|         1|
|[S-1.3, null, Res...|       S-1.3|         null|          Response|         1|
|[null, S-1.2, Req...|        null|        S-1.2|           Request|         2|
|[S-1.2, null, Res...|       S-1.2|     

In [136]:
distinct_servers_df = df.groupBy("Process ID").agg(collect_set("First Server").alias("First Server"),
                                                   collect_set("Second Server").alias("Second Server"))

distinct_servers_df = distinct_servers_df.withColumn("Servers", array_union("First Server", "Second Server"))

distinct_servers_df.printSchema()
distinct_servers_df.show()

root
 |-- Process ID: string (nullable = true)
 |-- First Server: array (nullable = false)
 |    |-- element: string (containsNull = false)
 |-- Second Server: array (nullable = false)
 |    |-- element: string (containsNull = false)
 |-- Servers: array (nullable = false)
 |    |-- element: string (containsNull = false)

+----------+--------------------+--------------------+--------------------+
|Process ID|        First Server|       Second Server|             Servers|
+----------+--------------------+--------------------+--------------------+
|         7|[S-23.1, S-1.1, S...|[S-23.1, S-1.1, S...|[S-23.1, S-1.1, S...|
|        15|[S-1.1, S-38.1, n...|[S-1.1, S-38.1, n...|[S-1.1, S-38.1, n...|
|        11|[S-44.4, S-2.3, S...|[S-44.4, S-2.3, S...|[S-44.4, S-2.3, S...|
|         3|[S-1.3, S-28.3, S...|[S-1.3, S-28.3, S...|[S-1.3, S-28.3, S...|
|        30|                [S7]|                [S8]|            [S7, S8]|
|         8|       [S-1.2, null]|       [S-1.2, null]|       [S-1.2, 

In [137]:
characteristics = CountVectorizer(inputCol="Servers", outputCol="Characteristic Matrix")

model = characteristics.fit(distinct_servers_df)
char_matrix = model.transform(distinct_servers_df).select("Process ID", "Characteristic Matrix")

char_matrix.printSchema()
char_matrix.show()

servers = model.vocabulary
print("Rows of Characteristic Matrix: ", servers)

root
 |-- Process ID: string (nullable = true)
 |-- Characteristic Matrix: vector (nullable = true)

+----------+---------------------+
|Process ID|Characteristic Matrix|
+----------+---------------------+
|         7| (47,[0,3,4,5,29,4...|
|        15| (47,[0,3,13],[1.0...|
|        11| (47,[0,2,5,18,24,...|
|         3| (47,[0,2,12,21,22...|
|        30| (47,[6,8],[1.0,1.0])|
|         8| (47,[0,1],[1.0,1.0])|
|        16| (47,[0,3,36],[1.0...|
|         0| (47,[0,1],[1.0,1.0])|
|         5| (47,[0,2,15,17],[...|
|        18| (47,[0,2,14,43],[...|
|        17| (47,[0,2,10,23,25...|
|         6| (47,[0,1],[1.0,1.0])|
|        19| (47,[0,3,4,9,16,2...|
|         9| (47,[0,1,7,27,40]...|
|         1| (47,[0,2,7,33],[1...|
|        20| (47,[6,8],[1.0,1.0])|
|        10| (47,[0,3],[1.0,1.0])|
|         4| (47,[0,2,11],[1.0...|
|        12| (47,[0,1],[1.0,1.0])|
|        13| (47,[0,1,26],[1.0...|
+----------+---------------------+
only showing top 20 rows

Rows of Characteristic Matrix:  [

In [138]:
minhash = MinHashLSH(inputCol="Characteristic Matrix", outputCol="Signatures", numHashTables=5)

# MinHash produces the signatures for the Characteritic matrix 
# numvHashTables is the number of the hush functioms that we want to use and the lenght of the signature 
model = minhash.fit(char_matrix)
signatures = model.transform(char_matrix)

signatures.show()

+----------+---------------------+--------------------+
|Process ID|Characteristic Matrix|          Signatures|
+----------+---------------------+--------------------+
|         7| (47,[0,3,4,5,29,4...|[[6.3456052E7], [...|
|        15| (47,[0,3,13],[1.0...|[[3.92606065E8], ...|
|        11| (47,[0,2,5,18,24,...|[[2.60448184E8], ...|
|         3| (47,[0,2,12,21,22...|[[1.9478414E8], [...|
|        30| (47,[6,8],[1.0,1.0])|[[9.1874821E8], [...|
|         8| (47,[0,1],[1.0,1.0])|[[1.29949889E8], ...|
|        16| (47,[0,3,36],[1.0...|[[7.87420122E8], ...|
|         0| (47,[0,1],[1.0,1.0])|[[1.29949889E8], ...|
|         5| (47,[0,2,15,17],[...|[[7.87420122E8], ...|
|        18| (47,[0,2,14,43],[...|[[7.87420122E8], ...|
|        17| (47,[0,2,10,23,25...|[[1.29120096E8], ...|
|         6| (47,[0,1],[1.0,1.0])|[[1.29949889E8], ...|
|        19| (47,[0,3,4,9,16,2...|[[1.95613933E8], ...|
|         9| (47,[0,1,7,27,40]...|[[1.29949889E8], ...|
|         1| (47,[0,2,7,33],[1...|[[2.61277977E8

In [139]:
# approxSimilarityJoin uses autmatically LSH to find rows that it is most likely 
# to have same "Signatures"
# threshold: pairs with Jaccard Distance lower than threshlod
similar_pairs = model.approxSimilarityJoin(signatures, signatures, threshold=0.01, distCol="Jaccard Distance")
similar_pairs.show()

+--------------------+--------------------+----------------+
|            datasetA|            datasetB|Jaccard Distance|
+--------------------+--------------------+----------------+
|{2, (47,[0,1],[1....|{0, [[1.29949889E...|             0.0|
|{6, (47,[0,1],[1....|{2, [[1.29949889E...|             0.0|
|{2, (47,[0,1],[1....|{2, [[1.29949889E...|             0.0|
|{2, (47,[0,1],[1....|{8, [[1.29949889E...|             0.0|
|{0, (47,[0,1],[1....|{0, [[1.29949889E...|             0.0|
|{19, (47,[0,3,4,9...|{19, [[1.95613933...|             0.0|
|{12, (47,[0,1],[1...|{2, [[1.29949889E...|             0.0|
|{12, (47,[0,1],[1...|{0, [[1.29949889E...|             0.0|
|{11, (47,[0,2,5,1...|{11, [[2.60448184...|             0.0|
|{6, (47,[0,1],[1....|{12, [[1.29949889...|             0.0|
|{6, (47,[0,1],[1....|{6, [[1.29949889E...|             0.0|
|{2, (47,[0,1],[1....|{12, [[1.29949889...|             0.0|
|{12, (47,[0,1],[1...|{12, [[1.29949889...|             0.0|
|{5, (47,[0,2,15,1...|{5

In [140]:
similar_pairs.count()

44

In [141]:
similar_pairs = similar_pairs.select("datasetA.Process ID", "datasetB.Process ID", 
                     "Jaccard Distance") \
                    # .filter((col("datasetA.Process ID") != col("datasetB.Process ID")))

In [142]:
new_cols = ["Process ID A", "Process ID B", "Jaccard Distance"]
similar_pairs = similar_pairs.toDF(*new_cols)

In [143]:
similar_pairs.show()

+------------+------------+----------------+
|Process ID A|Process ID B|Jaccard Distance|
+------------+------------+----------------+
|           2|           0|             0.0|
|           6|           2|             0.0|
|           2|           2|             0.0|
|           2|           8|             0.0|
|           0|           0|             0.0|
|          19|          19|             0.0|
|          12|           2|             0.0|
|          12|           0|             0.0|
|          11|          11|             0.0|
|           6|          12|             0.0|
|           6|           6|             0.0|
|           2|          12|             0.0|
|          12|          12|             0.0|
|           5|           5|             0.0|
|           2|           6|             0.0|
|           6|           0|             0.0|
|          20|          20|             0.0|
|           0|           8|             0.0|
|          30|          30|             0.0|
|         

In [144]:
pairs = similar_pairs.join(grouped_df, similar_pairs["Process ID A"] == col("Process ID")) \
                     .select(col("Process ID A"), col("Process ID B"), col("Log").alias("Log A")) \
                     .join(grouped_df, similar_pairs["Process ID B"] == col("Process ID")) \
                     .select(col("Process ID A"), col("Process ID B"), col("Log A"), col("Log").alias("Log B"))

In [145]:
pairs.show()

+------------+------------+--------------------+--------------------+
|Process ID A|Process ID B|               Log A|               Log B|
+------------+------------+--------------------+--------------------+
|           2|           0|[[null, S-1.2, Re...|[[null, S-1.2, Re...|
|           6|           2|[[null, S-1.2, Re...|[[null, S-1.2, Re...|
|           2|           2|[[null, S-1.2, Re...|[[null, S-1.2, Re...|
|           2|           8|[[null, S-1.2, Re...|[[null, S-1.2, Re...|
|           0|           0|[[null, S-1.2, Re...|[[null, S-1.2, Re...|
|          19|          19|[[null, S-1.1, Re...|[[null, S-1.1, Re...|
|          12|           2|[[null, S-1.2, Re...|[[null, S-1.2, Re...|
|          12|           0|[[null, S-1.2, Re...|[[null, S-1.2, Re...|
|          11|          11|[[null, S-1.3, Re...|[[null, S-1.3, Re...|
|           6|          12|[[null, S-1.2, Re...|[[null, S-1.2, Re...|
|           6|           6|[[null, S-1.2, Re...|[[null, S-1.2, Re...|
|           2|      

In [146]:
def original_check(x,y):
    return x==y

orifinal_checking = udf(original_check, BooleanType())
same_pairs = pairs.filter(orifinal_checking(col("Log A"), col("Log B")))

In [147]:
from pyspark.sql.functions import col, count
same_pairs = same_pairs.groupBy("Log A").agg(collect_set("Process ID A"))

In [148]:
same_pairs.show()

+--------------------+-------------------------+
|               Log A|collect_set(Process ID A)|
+--------------------+-------------------------+
|[[null, S-1.3, Re...|                     [17]|
|[[null, S-1.1, Re...|                     [19]|
|[[null, S-1.3, Re...|                     [18]|
|[[null, S-1.2, Re...|         [12, 2, 8, 6, 0]|
|[[null, S-1.3, Re...|                     [11]|
|[[null, S-1.3, Re...|                      [4]|
|[[null, S-1.1, Re...|                     [16]|
|[[null, S-1.2, Re...|                      [9]|
|[[null, S-1.1, Re...|                     [10]|
|[[null, S-1.2, Re...|                     [14]|
|[[null, S-1.3, Re...|                      [3]|
|[[S7, S8, Response]]|                 [20, 30]|
|[[null, S-1.3, Re...|                      [5]|
|[[null, S-1.1, Re...|                      [7]|
|[[null, S-1.2, Re...|                     [13]|
|[[null, S-1.1, Re...|                     [15]|
|[[null, S-1.3, Re...|                      [1]|
+-------------------

In [149]:
from pyspark.sql.types import IntegerType


# Shingling

In [150]:
def k_shingling(text, k):
    shingles = set()
    for i in range(len(text) - k + 1):
        shingle = text[i:i + k]
        shingles.add(shingle)
    return list(shingles)

k_shingling_udf = udf(lambda text: k_shingling(text, 10), ArrayType(StringType()))

In [151]:
df = spark.read.text("data.txt").toDF("Log")
df_shingles = df.withColumn("Shingles", k_shingling_udf(df["Log"]))
df_shingles.show()

+--------------------+--------------------+
|                 Log|            Shingles|
+--------------------+--------------------+
|<null, S-1.2, Req...|[, S-1.2, R, l, S...|
|<S-1.2, null, Res...|[S-1.2, nul, ll, ...|
|<null, S-1.3, Req...|[1.3, Reque, null...|
|<S-1.3, S-48.2, R...|[ S-48.2, R, 1.3,...|
|<S-48.2, S-27.3, ...|[-48.2, S-2, S-27...|
|<S-27.3, S-48.2, ...|[-27.3, S-4, 3, S...|
|<S-48.2, S-1.3, R...|[1.3, Respo, <S-4...|
|<S-1.3, null, Res...|[ll, Respon, espo...|
|<null, S-1.2, Req...|[, S-1.2, R, l, S...|
|<S-1.2, null, Res...|[S-1.2, nul, ll, ...|
|<null, S-1.3, Req...|[1.3, Reque, null...|
|<S-1.3, S-24.3, R...|[.3, S-24.3, S-24...|
|<S-24.3, S-17.2, ...|[17.2, Requ, 24.3...|
|<S-17.2, S-28.3, ...|[S-28.3, Re, 2, S...|
|<S-28.3, S-29.3, ...|[<S-28.3, S, 29.3...|
|<S-29.3, S-32.1, ...|[.1, Reques, <S-2...|
|<S-32.1, S-29.3, ...|[-29.3, Res, S-32...|
|<S-29.3, S-32.2, ...|[, S-32.2, , 3, S...|
|<S-32.2, S-29.3, ...|[-29.3, Res, <S-3...|
|<S-29.3, S-28.3, ...|[S-28.3, R

In [152]:
df_shingles = df_shingles.withColumn("Log", regexp_replace(col("Log"), "[<>]", ""))
df_shingles = df_shingles.withColumn("Log", regexp_replace(col("Log"), ",", ""))
df_shingles = df_shingles.withColumn("Log", split(col("Log"), " "))

df_shingles.printSchema()
df_shingles.show()

columns = ["First Server", "Second Server", "Communication Type", "Process ID"]

for i in range(len(columns)):
    print(columns[i])
    df_shingles = df_shingles.withColumn(columns[i], col("Log")[i])

df_shingles.printSchema()
df_shingles.show()

root
 |-- Log: array (nullable = true)
 |    |-- element: string (containsNull = false)
 |-- Shingles: array (nullable = true)
 |    |-- element: string (containsNull = true)

+--------------------+--------------------+
|                 Log|            Shingles|
+--------------------+--------------------+
|[null, S-1.2, Req...|[, S-1.2, R, l, S...|
|[S-1.2, null, Res...|[S-1.2, nul, ll, ...|
|[null, S-1.3, Req...|[1.3, Reque, null...|
|[S-1.3, S-48.2, R...|[ S-48.2, R, 1.3,...|
|[S-48.2, S-27.3, ...|[-48.2, S-2, S-27...|
|[S-27.3, S-48.2, ...|[-27.3, S-4, 3, S...|
|[S-48.2, S-1.3, R...|[1.3, Respo, <S-4...|
|[S-1.3, null, Res...|[ll, Respon, espo...|
|[null, S-1.2, Req...|[, S-1.2, R, l, S...|
|[S-1.2, null, Res...|[S-1.2, nul, ll, ...|
|[null, S-1.3, Req...|[1.3, Reque, null...|
|[S-1.3, S-24.3, R...|[.3, S-24.3, S-24...|
|[S-24.3, S-17.2, ...|[17.2, Requ, 24.3...|
|[S-17.2, S-28.3, ...|[S-28.3, Re, 2, S...|
|[S-28.3, S-29.3, ...|[<S-28.3, S, 29.3...|
|[S-29.3, S-32.1, ...|[.1, Reque

In [153]:
grouped_df = df_shingles.groupBy("Process ID").agg(collect_set("Shingles").alias("Shingles"))
grouped_df = grouped_df.withColumn("Flat shingles", flatten(col("Shingles")))
grouped_df.printSchema()

root
 |-- Process ID: string (nullable = true)
 |-- Shingles: array (nullable = false)
 |    |-- element: array (containsNull = false)
 |    |    |-- element: string (containsNull = true)
 |-- Flat shingles: array (nullable = false)
 |    |-- element: string (containsNull = true)



In [154]:
characteristics = CountVectorizer(inputCol="Flat shingles", outputCol="Characteristic Matrix")

model = characteristics.fit(grouped_df)
char_matrix = model.transform(grouped_df).select("Process ID", "Characteristic Matrix")

char_matrix.printSchema()
char_matrix.show()

shingles = model.vocabulary
print("Rows of Characteristic Matrix: ", shingles)

root
 |-- Process ID: string (nullable = true)
 |-- Characteristic Matrix: vector (nullable = true)

+----------+---------------------+
|Process ID|Characteristic Matrix|
+----------+---------------------+
|         7| (900,[0,1,2,3,4,7...|
|        15| (900,[0,1,2,3,4,5...|
|        11| (900,[0,1,2,3,4,5...|
|         3| (900,[0,1,2,3,4,7...|
|        30| (900,[0,1,2,39,20...|
|         8| (900,[0,1,2,3,4,9...|
|        16| (900,[0,1,2,3,4,5...|
|         0| (900,[0,1,2,3,4,9...|
|         5| (900,[0,1,2,3,4,7...|
|        18| (900,[0,1,2,3,4,5...|
|        17| (900,[0,1,2,3,4,5...|
|         6| (900,[0,1,2,3,4,9...|
|        19| (900,[0,1,2,3,4,5...|
|         9| (900,[0,1,2,3,4,9...|
|         1| (900,[0,1,2,3,4,5...|
|        20| (900,[0,1,2,206,2...|
|        10| (900,[0,1,2,3,4,5...|
|         4| (900,[0,1,2,3,4,7...|
|        12| (900,[0,1,2,3,4,5...|
|        13| (900,[0,1,2,3,4,5...|
+----------+---------------------+
only showing top 20 rows

Rows of Characteristic Matrix:  [

In [155]:
minhash = MinHashLSH(inputCol="Characteristic Matrix", outputCol="Signatures", numHashTables=5)

# MinHash produces the signatures for the Characteritic matrix 
# numvHashTables is the number of the hush functioms that we want to use and the lenght of the signature 
model = minhash.fit(char_matrix)
signatures = model.transform(char_matrix)

signatures.show()

+----------+---------------------+--------------------+
|Process ID|Characteristic Matrix|          Signatures|
+----------+---------------------+--------------------+
|         7| (900,[0,1,2,3,4,7...|[[4.4370813E7], [...|
|        15| (900,[0,1,2,3,4,5...|[[6.1796466E7], [...|
|        11| (900,[0,1,2,3,4,5...|[[4.1051641E7], [...|
|         3| (900,[0,1,2,3,4,7...|[[4.8519778E7], [...|
|        30| (900,[0,1,2,39,20...|[[1.29949889E8], ...|
|         8| (900,[0,1,2,3,4,9...|[[1.29120096E8], ...|
|        16| (900,[0,1,2,3,4,5...|[[6.1796466E7], [...|
|         0| (900,[0,1,2,3,4,9...|[[1.29120096E8], ...|
|         5| (900,[0,1,2,3,4,7...|[[6.2626259E7], [...|
|        18| (900,[0,1,2,3,4,5...|[[6.2626259E7], [...|
|        17| (900,[0,1,2,3,4,5...|[[4.6030399E7], [...|
|         6| (900,[0,1,2,3,4,9...|[[5.3498536E7], [...|
|        19| (900,[0,1,2,3,4,5...|[[4.354102E7], [8...|
|         9| (900,[0,1,2,3,4,9...|[[5.7647501E7], [...|
|         1| (900,[0,1,2,3,4,5...|[[4.6860192E7]

In [156]:
# approxSimilarityJoin uses autmatically LSH to find rows that it is most likely 
# to have same "Signatures"
# threshold: pairs with Jaccard Distance lower than threshlod
similar_pairs = model.approxSimilarityJoin(signatures, signatures, threshold=0.5, distCol="Jaccard Distance")
similar_pairs.show()

+--------------------+--------------------+-------------------+
|            datasetA|            datasetB|   Jaccard Distance|
+--------------------+--------------------+-------------------+
|{12, (900,[0,1,2,...|{2, [[1.29120096E...| 0.2564102564102564|
|{30, (900,[0,1,2,...|{20, [[1.29949889...|              0.375|
|{0, (900,[0,1,2,3...|{6, [[5.3498536E7...|0.21621621621621623|
|{13, (900,[0,1,2,...|{13, [[5.183895E7...|                0.0|
|{8, (900,[0,1,2,3...|{0, [[1.29120096E...|0.21621621621621623|
|{5, (900,[0,1,2,3...|{5, [[6.2626259E7...|                0.0|
|{19, (900,[0,1,2,...|{19, [[4.354102E7...|                0.0|
|{30, (900,[0,1,2,...|{30, [[1.29949889...|                0.0|
|{2, (900,[0,1,2,3...|{6, [[5.3498536E7...|0.21621621621621623|
|{2, (900,[0,1,2,3...|{2, [[1.29120096E...|                0.0|
|{2, (900,[0,1,2,3...|{12, [[1.29120096...| 0.2564102564102564|
|{0, (900,[0,1,2,3...|{12, [[1.29120096...| 0.2564102564102564|
|{10, (900,[0,1,2,...|{10, [[6.1796466E.

In [157]:
similar_pairs.count()

44

In [158]:
similar_pairs = similar_pairs.select("datasetA.Process ID", "datasetB.Process ID", 
                     "Jaccard Distance").filter((col("datasetA.Process ID") != col("datasetB.Process ID")))

In [159]:
similar_pairs.show()

+----------+----------+-------------------+
|Process ID|Process ID|   Jaccard Distance|
+----------+----------+-------------------+
|        12|         2| 0.2564102564102564|
|        30|        20|              0.375|
|         0|         6|0.21621621621621623|
|         8|         0|0.21621621621621623|
|         2|         6|0.21621621621621623|
|         2|        12| 0.2564102564102564|
|         0|        12| 0.2564102564102564|
|         2|         0|0.21621621621621623|
|         6|         2|0.21621621621621623|
|         8|         6|0.21621621621621623|
|         6|        12| 0.2564102564102564|
|         0|         2|0.21621621621621623|
|         0|         8|0.21621621621621623|
|         6|         0|0.21621621621621623|
|         8|         2|0.21621621621621623|
|        12|         0| 0.2564102564102564|
|         6|         8|0.21621621621621623|
|         8|        12| 0.2564102564102564|
|        12|         6| 0.2564102564102564|
|        12|         8| 0.256410