In [1]:
import pyspark
from pyspark import SparkContext, SparkConf
from pyspark.sql import SparkSession
import pandas as pd 
import numpy as np 
from pyspark.sql.functions import split, col, regexp_replace, collect_list, explode, concat, collect_set, array_union, element_at
from pyspark.ml.feature import CountVectorizer
from pyspark.ml.feature import MinHashLSH
from pyspark.ml.linalg import Vectors

In [2]:
spark = SparkSession.builder \
    .appName("Projet-Task-1") \
    .master("local[*]") \
    .config("spark.driver.memory", "2G") \
    .config("spark.driver.maxResultSize", "2g") \
    .getOrCreate()
spark

24/06/15 17:10:09 WARN Utils: Your hostname, abha-ThinkPad-P14s-Gen-4 resolves to a loopback address: 127.0.1.1; using 192.168.178.94 instead (on interface wlp2s0)
24/06/15 17:10:09 WARN Utils: Set SPARK_LOCAL_IP if you need to bind to another address
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
24/06/15 17:10:10 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


In [3]:
# spark.stop()

In [4]:
df = spark.read.text("data.txt").toDF("Log")

In [5]:
df

DataFrame[Log: string]

In [6]:
df = df.withColumn("Log", regexp_replace(col("Log"), "[<>]", ""))
df = df.withColumn("Log", regexp_replace(col("Log"), ",", ""))
df = df.withColumn("Log", split(col("Log"), " "))

df.printSchema()
df.show()

columns = ["First Server", "Second Server", "Communication Type", "Process ID"]

for i in range(len(columns)):
    print(columns[i])
    df = df.withColumn(columns[i], col("Log")[i])

df.printSchema()
df.show()

root
 |-- Log: array (nullable = true)
 |    |-- element: string (containsNull = false)

+--------------------+
|                 Log|
+--------------------+
|[null, S-1.2, Req...|
|[S-1.2, null, Res...|
|[null, S-1.3, Req...|
|[S-1.3, S-48.2, R...|
|[S-48.2, S-27.3, ...|
|[S-27.3, S-48.2, ...|
|[S-48.2, S-1.3, R...|
|[S-1.3, null, Res...|
|[null, S-1.2, Req...|
|[S-1.2, null, Res...|
|[null, S-1.3, Req...|
|[S-1.3, S-24.3, R...|
|[S-24.3, S-17.2, ...|
|[S-17.2, S-28.3, ...|
|[S-28.3, S-29.3, ...|
|[S-29.3, S-32.1, ...|
|[S-32.1, S-29.3, ...|
|[S-29.3, S-32.2, ...|
|[S-32.2, S-29.3, ...|
|[S-29.3, S-28.3, ...|
+--------------------+
only showing top 20 rows

First Server
Second Server
Communication Type
Process ID
root
 |-- Log: array (nullable = true)
 |    |-- element: string (containsNull = false)
 |-- First Server: string (nullable = true)
 |-- Second Server: string (nullable = true)
 |-- Communication Type: string (nullable = true)
 |-- Process ID: string (nullable = true)

+-----

In [7]:
grouped_df = df.groupBy("Process ID").agg(collect_list("Log").alias("Log"), 
                                          collect_list("First Server").alias("First Server"),
                                          collect_list("Second Server").alias("Second Server"),
                                          collect_list("Communication Type").alias("FCommunication Type"))

grouped_df.printSchema()
grouped_df.show()

root
 |-- Process ID: string (nullable = true)
 |-- Log: array (nullable = false)
 |    |-- element: array (containsNull = false)
 |    |    |-- element: string (containsNull = false)
 |-- First Server: array (nullable = false)
 |    |-- element: string (containsNull = false)
 |-- Second Server: array (nullable = false)
 |    |-- element: string (containsNull = false)
 |-- FCommunication Type: array (nullable = false)
 |    |-- element: string (containsNull = false)

+----------+--------------------+--------------------+--------------------+--------------------+
|Process ID|                 Log|        First Server|       Second Server| FCommunication Type|
+----------+--------------------+--------------------+--------------------+--------------------+
|         7|[[null, S-1.1, Re...|[null, S-1.1, S-2...|[S-1.1, S-23.1, S...|[Request, Request...|
|        15|[[null, S-1.1, Re...|[null, S-1.1, S-3...|[S-1.1, S-38.1, S...|[Request, Request...|
|        11|[[null, S-1.3, Re...|[null, S-1

In [8]:
df.show()

+--------------------+------------+-------------+------------------+----------+
|                 Log|First Server|Second Server|Communication Type|Process ID|
+--------------------+------------+-------------+------------------+----------+
|[null, S-1.2, Req...|        null|        S-1.2|           Request|         0|
|[S-1.2, null, Res...|       S-1.2|         null|          Response|         0|
|[null, S-1.3, Req...|        null|        S-1.3|           Request|         1|
|[S-1.3, S-48.2, R...|       S-1.3|       S-48.2|           Request|         1|
|[S-48.2, S-27.3, ...|      S-48.2|       S-27.3|           Request|         1|
|[S-27.3, S-48.2, ...|      S-27.3|       S-48.2|          Response|         1|
|[S-48.2, S-1.3, R...|      S-48.2|        S-1.3|          Response|         1|
|[S-1.3, null, Res...|       S-1.3|         null|          Response|         1|
|[null, S-1.2, Req...|        null|        S-1.2|           Request|         2|
|[S-1.2, null, Res...|       S-1.2|     

In [9]:
distinct_servers_df = df.groupBy("Process ID").agg(collect_set("First Server").alias("First Server"),
                                                   collect_set("Second Server").alias("Second Server"))

distinct_servers_df = distinct_servers_df.withColumn("Servers", array_union("First Server", "Second Server"))

distinct_servers_df.printSchema()
distinct_servers_df.show()

root
 |-- Process ID: string (nullable = true)
 |-- First Server: array (nullable = false)
 |    |-- element: string (containsNull = false)
 |-- Second Server: array (nullable = false)
 |    |-- element: string (containsNull = false)
 |-- Servers: array (nullable = false)
 |    |-- element: string (containsNull = false)

+----------+--------------------+--------------------+--------------------+
|Process ID|        First Server|       Second Server|             Servers|
+----------+--------------------+--------------------+--------------------+
|         7|[S-23.1, S-1.1, S...|[S-23.1, S-1.1, S...|[S-23.1, S-1.1, S...|
|        15|[S-1.1, S-38.1, n...|[S-1.1, S-38.1, n...|[S-1.1, S-38.1, n...|
|        11|[S-44.4, S-2.3, S...|[S-44.4, S-2.3, S...|[S-44.4, S-2.3, S...|
|         3|[S-1.3, S-28.3, S...|[S-1.3, S-28.3, S...|[S-1.3, S-28.3, S...|
|         8|       [S-1.2, null]|       [S-1.2, null]|       [S-1.2, null]|
|        16|[S-1.1, S-23.3, n...|[S-1.1, S-23.3, n...|[S-1.1, S-23.3,

In [10]:
characteristics = CountVectorizer(inputCol="Servers", outputCol="Characteristic Matrix")

model = characteristics.fit(distinct_servers_df)
char_matrix = model.transform(distinct_servers_df).select("Process ID", "Characteristic Matrix")

char_matrix.printSchema()
char_matrix.show()

servers = model.vocabulary
print("Rows of Characteristic Matrix: ", servers)

root
 |-- Process ID: string (nullable = true)
 |-- Characteristic Matrix: vector (nullable = true)

+----------+---------------------+
|Process ID|Characteristic Matrix|
+----------+---------------------+
|         7| (45,[0,3,4,5,27,3...|
|        15| (45,[0,3,11],[1.0...|
|        11| (45,[0,2,5,16,22,...|
|         3| (45,[0,2,10,19,20...|
|         8| (45,[0,1],[1.0,1.0])|
|        16| (45,[0,3,34],[1.0...|
|         0| (45,[0,1],[1.0,1.0])|
|         5| (45,[0,2,13,15],[...|
|        18| (45,[0,2,12,41],[...|
|        17| (45,[0,2,8,21,23,...|
|         6| (45,[0,1],[1.0,1.0])|
|        19| (45,[0,3,4,7,14,1...|
|         9| (45,[0,1,6,25,38]...|
|         1| (45,[0,2,6,31],[1...|
|        10| (45,[0,3],[1.0,1.0])|
|         4| (45,[0,2,9],[1.0,...|
|        12| (45,[0,1],[1.0,1.0])|
|        13| (45,[0,1,24],[1.0...|
|        14| (45,[0,1,17,26],[...|
|         2| (45,[0,1],[1.0,1.0])|
+----------+---------------------+

Rows of Characteristic Matrix:  ['null', 'S-1.2', 'S-1.3',

In [11]:
minhash = MinHashLSH(inputCol="Characteristic Matrix", outputCol="Buckets", numHashTables=2)
model = minhash.fit(char_matrix)
buckets = model.transform(char_matrix)

buckets.show()

+----------+---------------------+--------------------+
|Process ID|Characteristic Matrix|             Buckets|
+----------+---------------------+--------------------+
|         7| (45,[0,3,4,5,27,3...|[[4.93447561E8], ...|
|        15| (45,[0,3,11],[1.0...|[[3.7583083E7], [...|
|        11| (45,[0,2,5,16,22,...|[[1.76222501E8], ...|
|         3| (45,[0,2,10,19,20...|[[1.00245088E8], ...|
|         8| (45,[0,1],[1.0,1.0])|[[7.97357213E8], ...|
|        16| (45,[0,3,34],[1.0...|[[3.28177327E8], ...|
|         0| (45,[0,1],[1.0,1.0])|[[7.97357213E8], ...|
|         5| (45,[0,2,13,15],[...|[[7.213798E8], [8...|
|        18| (45,[0,2,12,41],[...|[[7.213798E8], [2...|
|        17| (45,[0,2,8,21,23,...|[[2.52199914E8], ...|
|         6| (45,[0,1],[1.0,1.0])|[[7.97357213E8], ...|
|        19| (45,[0,3,4,7,14,1...|[[3.41492735E8], ...|
|         9| (45,[0,1,6,25,38]...|[[2.4267675E7], [...|
|         1| (45,[0,2,6,31],[1...|[[4.17470148E8], ...|
|        10| (45,[0,3],[1.0,1.0])|[[6.45402387E8

In [12]:
similar_pairs = model.approxSimilarityJoin(buckets, buckets, 1, distCol="JaccardDistance")
similar_pairs.show()

+--------------------+--------------------+-------------------+
|            datasetA|            datasetB|    JaccardDistance|
+--------------------+--------------------+-------------------+
|{16, (45,[0,3,34]...|{15, [[3.7583083E...|                0.5|
|{14, (45,[0,1,17,...|{8, [[7.97357213E...|                0.5|
|{15, (45,[0,3,11]...|{16, [[3.28177327...|                0.5|
|{3, (45,[0,2,10,1...|{1, [[4.17470148E...|                0.8|
|{4, (45,[0,2,9],[...|{1, [[4.17470148E...|                0.6|
|{18, (45,[0,2,12,...|{1, [[4.17470148E...| 0.6666666666666667|
|{14, (45,[0,1,17,...|{2, [[7.97357213E...|                0.5|
|{2, (45,[0,1],[1....|{13, [[7.97357213...|0.33333333333333337|
|{8, (45,[0,1],[1....|{14, [[7.97357213...|                0.5|
|{12, (45,[0,1],[1...|{0, [[7.97357213E...|                0.0|
|{4, (45,[0,2,9],[...|{18, [[7.213798E8...|                0.6|
|{17, (45,[0,2,8,2...|{17, [[2.52199914...|                0.0|
|{3, (45,[0,2,10,1...|{4, [[1.89537909E.

In [13]:
zero_distance_processes = similar_pairs.filter(col("JaccardDistance") == 0)
zero_distance_processes.show()

+--------------------+--------------------+---------------+
|            datasetA|            datasetB|JaccardDistance|
+--------------------+--------------------+---------------+
|{12, (45,[0,1],[1...|{0, [[7.97357213E...|            0.0|
|{17, (45,[0,2,8,2...|{17, [[2.52199914...|            0.0|
|{0, (45,[0,1],[1....|{0, [[7.97357213E...|            0.0|
|{3, (45,[0,2,10,1...|{3, [[1.00245088E...|            0.0|
|{13, (45,[0,1,24]...|{13, [[7.97357213...|            0.0|
|{12, (45,[0,1],[1...|{12, [[7.97357213...|            0.0|
|{12, (45,[0,1],[1...|{6, [[7.97357213E...|            0.0|
|{8, (45,[0,1],[1....|{2, [[7.97357213E...|            0.0|
|{6, (45,[0,1],[1....|{12, [[7.97357213...|            0.0|
|{12, (45,[0,1],[1...|{8, [[7.97357213E...|            0.0|
|{6, (45,[0,1],[1....|{0, [[7.97357213E...|            0.0|
|{15, (45,[0,3,11]...|{15, [[3.7583083E...|            0.0|
|{8, (45,[0,1],[1....|{8, [[7.97357213E...|            0.0|
|{16, (45,[0,3,34]...|{16, [[3.28177327.

In [14]:
process_a_b = zero_distance_processes.select(
    col("datasetA.Process ID").alias("ProcessA"),
    col("datasetB.Process ID").alias("ProcessB"),
)
process_a_b.show()

+--------+--------+
|ProcessA|ProcessB|
+--------+--------+
|      12|       0|
|      17|      17|
|       0|       0|
|       3|       3|
|      13|      13|
|      12|      12|
|      12|       6|
|       8|       2|
|       6|      12|
|      12|       8|
|       6|       0|
|      15|      15|
|       8|       8|
|      16|      16|
|       0|       2|
|       0|       6|
|       2|       6|
|       4|       4|
|       2|       2|
|       0|      12|
+--------+--------+
only showing top 20 rows



In [15]:
grouped = process_a_b.groupBy("ProcessA").agg(collect_set("ProcessB").alias("Process Set"))
grouped.show()

+--------+----------------+
|ProcessA|     Process Set|
+--------+----------------+
|       7|             [7]|
|      15|            [15]|
|      11|            [11]|
|       3|             [3]|
|       8|[12, 2, 8, 6, 0]|
|      16|            [16]|
|       0|[12, 2, 8, 6, 0]|
|       5|             [5]|
|      18|            [18]|
|      17|            [17]|
|       6|[12, 2, 8, 6, 0]|
|      19|            [19]|
|       9|             [9]|
|       1|             [1]|
|      10|            [10]|
|       4|             [4]|
|      12|[12, 2, 8, 6, 0]|
|      13|            [13]|
|      14|            [14]|
|       2|[12, 2, 8, 6, 0]|
+--------+----------------+



In [16]:
# TODO:
# this currently only works because currently self distance is being calculated (= 0)
# if this is fixed, may need to first create a column merging process A and Process Set then grabbing distinct 
unique_processes_set = grouped.select(col("Process Set")).distinct()
unique_processes_set.show()

+----------------+
|     Process Set|
+----------------+
|             [7]|
|            [15]|
|            [11]|
|             [3]|
|            [16]|
|             [5]|
|            [18]|
|            [17]|
|[12, 2, 8, 6, 0]|
|            [19]|
|             [9]|
|             [1]|
|            [10]|
|             [4]|
|            [13]|
|            [14]|
+----------------+



In [17]:
unique_processes_set_log = unique_processes_set.join(grouped_df, element_at(col("Process Set"), 1) == col("Process ID"),
    "inner").select(unique_processes_set['Process Set'], grouped_df['Log'])

In [18]:
# perhaps for part1Output.txt
unique_processes_set_log.show()

+----------------+--------------------+
|     Process Set|                 Log|
+----------------+--------------------+
|             [7]|[[null, S-1.1, Re...|
|            [15]|[[null, S-1.1, Re...|
|            [11]|[[null, S-1.3, Re...|
|             [3]|[[null, S-1.3, Re...|
|            [16]|[[null, S-1.1, Re...|
|             [5]|[[null, S-1.3, Re...|
|            [18]|[[null, S-1.3, Re...|
|            [17]|[[null, S-1.3, Re...|
|[12, 2, 8, 6, 0]|[[null, S-1.2, Re...|
|            [19]|[[null, S-1.1, Re...|
|             [9]|[[null, S-1.2, Re...|
|             [1]|[[null, S-1.3, Re...|
|            [10]|[[null, S-1.1, Re...|
|             [4]|[[null, S-1.3, Re...|
|            [13]|[[null, S-1.2, Re...|
|            [14]|[[null, S-1.2, Re...|
+----------------+--------------------+



In [19]:
# for part1Observation.txt
all_processes = unique_processes_set.withColumn("Process ID", explode(col("Process Set")))
joined_log = all_processes.join(grouped_df, "Process ID", "inner").select("Process Set", "Process ID", "Log").collect()

In [20]:
with open("part1Observations.txt", "w") as file:
    current_group = None
    for row in joined_log:
        process_set = row["Process Set"]
        process_id = row["Process ID"]
        log = row["Log"]
        
        if process_set != current_group:
            file.write("\n")
            process_set_string = ', '.join(str(x) for x in process_set)
            file.write(f"Group: {{{process_set_string}}}\n")
            current_group = process_set
        
        file.write(f"\n{process_id}:\n")
        for l in log:
            log_concat = ', '.join(str(x) for x in l)
            file.write(f"<{log_concat}>\n")

24/06/15 17:10:28 WARN GarbageCollectionMetrics: To enable non-built-in garbage collector(s) List(G1 Concurrent GC), users should configure it(them) to spark.eventLog.gcMetrics.youngGenerationGarbageCollectors or spark.eventLog.gcMetrics.oldGenerationGarbageCollectors
