# Task 2

In [1]:
import pyspark
from pyspark import SparkContext, SparkConf
from pyspark.sql import SparkSession
import pandas as pd 
import numpy as np 
from pyspark.sql.functions import split, col, regexp_replace, collect_list, explode, concat, collect_set, array_union, flatten
from pyspark.ml.feature import CountVectorizer
from pyspark.ml.feature import MinHashLSH
from pyspark.ml.linalg import Vectors
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, udf, first, count
from pyspark.sql.types import ArrayType, StringType, BooleanType
from pyspark.ml.feature import VectorAssembler
from pyspark.ml.clustering import KMeans, BisectingKMeans
from pyspark.ml.evaluation import ClusteringEvaluator

import subprocess

In [2]:
partition = 1000

In [3]:
spark = SparkSession.builder \
    .appName("Projet-Task-2") \
    .master("local[*]") \
    .config("spark.driver.memory", "14G") \
    .config("spark.driver.maxResultSize", "2g") \
    .config("spark.executor.memory", "10G") \
    .config("spark.executor.memoryOverhead", "2G") \
    .config("spark.executor.extraJavaOptions", "-XX:+UseG1GC -XX:MaxGCPauseMillis=500 -XX:InitiatingHeapOccupancyPercent=35") \
    .config("spark.driver.extraJavaOptions", "-XX:+UseG1GC -XX:MaxGCPauseMillis=500 -XX:InitiatingHeapOccupancyPercent=35") \
    .config("spark.sql.codegen.wholeStage", "false") \
    .getOrCreate()
# spark.sparkContext.setLogLevel("DEBUG")

spark.conf.set("spark.sql.shuffle.partitions", partition)
spark.conf.set("spark.sql.debug.maxToStringFields", 100)
spark

24/06/27 00:15:32 WARN Utils: Your hostname, abha-ThinkPad-P14s-Gen-4 resolves to a loopback address: 127.0.1.1; using 192.168.178.94 instead (on interface wlp2s0)
24/06/27 00:15:32 WARN Utils: Set SPARK_LOCAL_IP if you need to bind to another address
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
24/06/27 00:15:32 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
24/06/27 00:15:33 WARN Utils: Service 'SparkUI' could not bind on port 4040. Attempting port 4041.


In [4]:
df = spark.read.text("output/part1Output.txt").toDF("Log")

In [5]:
df = df.withColumn("Log", regexp_replace(col("Log"), r"^\d+:\n", ""))
df = df.withColumn("Log", regexp_replace(col("Log"), r", \d+\n", "\n"))
df = df.withColumn("Log", regexp_replace(col("Log"), "[<>]", ""))
df = df.withColumn("Log", regexp_replace(col("Log"), ",", ""))
df = df.withColumn("Log", split(col("Log"), " "))

# df.printSchema()
# df.show()

columns = ["First Server", "Second Server", "Communication Type", "Process ID"]

for i in range(len(columns)):
    # print(columns[i])
    df = df.withColumn(columns[i], col("Log")[i])

df = df.filter(col("Process ID").isNotNull())

df.printSchema()
# df.show()

root
 |-- Log: array (nullable = true)
 |    |-- element: string (containsNull = false)
 |-- First Server: string (nullable = true)
 |-- Second Server: string (nullable = true)
 |-- Communication Type: string (nullable = true)
 |-- Process ID: string (nullable = true)



In [6]:
# Remove Process ID from request 
log = udf(lambda x: x[:-1], ArrayType(StringType())) 
df = df.withColumn('Refined Log', log('Log')) 

In [7]:
df.show()

                                                                                

+--------------------+------------+-------------+------------------+--------------+--------------------+
|                 Log|First Server|Second Server|Communication Type|    Process ID|         Refined Log|
+--------------------+------------+-------------+------------------+--------------+--------------------+
|[null, S-3, Reque...|        null|          S-3|           Request|17194363075607|[null, S-3, Request]|
|[S-3, S-9.1, Requ...|         S-3|        S-9.1|           Request|17194363075607|[S-3, S-9.1, Requ...|
|[S-9.1, S-12, Req...|       S-9.1|         S-12|           Request|17194363075607|[S-9.1, S-12, Req...|
|[S-12, S-14, Requ...|        S-12|         S-14|           Request|17194363075607|[S-12, S-14, Requ...|
|[S-14, S-17, Requ...|        S-14|         S-17|           Request|17194363075607|[S-14, S-17, Requ...|
|[S-17, S-14, Resp...|        S-17|         S-14|          Response|17194363075607|[S-17, S-14, Resp...|
|[S-14, S-12, Resp...|        S-14|         S-12|      

In [8]:
grouped_df = df.groupBy("Process ID").agg(collect_list("Refined Log").alias("Refined Log"), 
                                          collect_list("First Server").alias("First Server"),
                                          collect_list("Second Server").alias("Second Server"),
                                          collect_list("Communication Type").alias("FCommunication Type"),
                                          collect_list("Log").alias("Log"))

grouped_df.printSchema()
# grouped_df.show()

root
 |-- Process ID: string (nullable = true)
 |-- Refined Log: array (nullable = false)
 |    |-- element: array (containsNull = false)
 |    |    |-- element: string (containsNull = true)
 |-- First Server: array (nullable = false)
 |    |-- element: string (containsNull = false)
 |-- Second Server: array (nullable = false)
 |    |-- element: string (containsNull = false)
 |-- FCommunication Type: array (nullable = false)
 |    |-- element: string (containsNull = false)
 |-- Log: array (nullable = false)
 |    |-- element: array (containsNull = false)
 |    |    |-- element: string (containsNull = false)



In [9]:
distinct_servers_df = df.groupBy("Process ID").agg(collect_set("First Server").alias("First Server"),
                                                   collect_set("Second Server").alias("Second Server"))

distinct_servers_df = distinct_servers_df.withColumn("Servers", array_union("First Server", "Second Server"))

distinct_servers_df.printSchema()
# distinct_servers_df.show()

root
 |-- Process ID: string (nullable = true)
 |-- First Server: array (nullable = false)
 |    |-- element: string (containsNull = false)
 |-- Second Server: array (nullable = false)
 |    |-- element: string (containsNull = false)
 |-- Servers: array (nullable = false)
 |    |-- element: string (containsNull = false)



In [10]:
characteristics = CountVectorizer(inputCol="Servers", outputCol="Characteristic Matrix")

model = characteristics.fit(distinct_servers_df)
char_matrix = model.transform(distinct_servers_df).select("Process ID", "Characteristic Matrix")

char_matrix.printSchema()
# char_matrix.show()

servers = model.vocabulary
print("Rows of Characteristic Matrix: ", servers)

root
 |-- Process ID: string (nullable = true)
 |-- Characteristic Matrix: vector (nullable = true)

Rows of Characteristic Matrix:  ['S-3', 'null', 'S-5', 'S-8', 'S-9.2', 'S-9.1', 'S-18', 'S-12', 'S-17', 'S-4', 'S-1', 'S-2', 'S-13', 'S-14', 'S-16', 'S-6', 'S-15', 'S-10.2', 'S-10.1', 'S-11', 'S-19.1', 'S-19.2', 'S-7']


In [11]:
minhash = MinHashLSH(inputCol="Characteristic Matrix", outputCol="Signatures", numHashTables=5)

# MinHash produces the signatures for the Characteritic matrix 
# numvHashTables is the number of the hush functioms that we want to use and the lenght of the signature 
model = minhash.fit(char_matrix)
signatures = model.transform(char_matrix)

# signatures.show()

In [12]:
# approxSimilarityJoin uses autmatically LSH to find rows that it is most likely 
# to have same "Signatures"
# threshold: pairs with Jaccard Distance lower than threshlod
similar_pairs = model.approxSimilarityJoin(signatures, signatures, threshold=0.4, distCol="Jaccard Distance")
# similar_pairs.show()

In [13]:
similar_pairs = similar_pairs.select("datasetA.Process ID", "datasetB.Process ID", 
                     "Jaccard Distance") \
                    # .filter((col("datasetA.Process ID") != col("datasetB.Process ID")))

In [14]:
new_cols = ["Process ID A", "Process ID B", "Jaccard Distance"]
similar_pairs = similar_pairs.toDF(*new_cols)

In [15]:
# Creating a dataframe with the pairwise similarity metric 
similarities = similar_pairs.groupBy("Process ID A").pivot("Process ID B")\
                                                .agg(first("Jaccard Distance")).fillna(1.0)

                                                                                

In [16]:
# Create a dataframe with each Process ID and the corresponding feature vectors
feature_columns = similarities.columns[1:]  

assembler = VectorAssembler(inputCols=feature_columns, outputCol="Feature Vectors")
features = assembler.transform(similarities).select("Process ID A", "Feature Vectors")

## Clustering

In [17]:
cluster_number = 50

kmeans = KMeans(k=cluster_number, seed=33, featuresCol="Feature Vectors", predictionCol="Cluster Number")
model = kmeans.fit(features)

# bisecting_kmeans = BisectingKMeans(k=cluster_number, seed=33, featuresCol="Feature Vectors", predictionCol="Cluster Number")
# model = bisecting_kmeans.fit(features)

predictions = model.transform(features)

# predictions.select("Process ID A","Cluster Number").show()

24/06/27 00:15:42 WARN SparkStringUtils: Truncated the string representation of a plan since it was too large. This behavior can be adjusted by setting 'spark.sql.debug.maxToStringFields'.
24/06/27 00:15:43 WARN DAGScheduler: Broadcasting large task binary with size 1290.0 KiB
24/06/27 00:15:46 WARN GarbageCollectionMetrics: To enable non-built-in garbage collector(s) List(G1 Concurrent GC), users should configure it(them) to spark.eventLog.gcMetrics.youngGenerationGarbageCollectors or spark.eventLog.gcMetrics.oldGenerationGarbageCollectors
24/06/27 00:15:47 WARN DAGScheduler: Broadcasting large task binary with size 1936.5 KiB
24/06/27 00:15:48 WARN DAGScheduler: Broadcasting large task binary with size 1045.4 KiB
24/06/27 00:15:54 WARN DAGScheduler: Broadcasting large task binary with size 1046.0 KiB
24/06/27 00:15:54 WARN DAGScheduler: Broadcasting large task binary with size 1046.7 KiB
24/06/27 00:15:54 WARN DAGScheduler: Broadcasting large task binary with size 1047.1 KiB
24/06/27

In [18]:
evaluator = ClusteringEvaluator(featuresCol="Feature Vectors", predictionCol="Cluster Number")

silhouette = evaluator.evaluate(predictions, {evaluator.metricName: "silhouette"})
print("Silhouette score: ", silhouette)

24/06/27 00:16:00 WARN DAGScheduler: Broadcasting large task binary with size 1608.0 KiB
24/06/27 00:16:04 WARN DAGScheduler: Broadcasting large task binary with size 1859.5 KiB
[Stage 83:>                                                         (0 + 1) / 1]

Silhouette score:  0.3475636226975157


24/06/27 00:16:07 WARN DAGScheduler: Broadcasting large task binary with size 2.4 MiB
                                                                                

In [19]:
predictions = predictions.join(grouped_df, predictions["Process ID A"] == col("Process ID"))\
                            .select("Process ID", "Refined Log", "Log","Cluster Number")

In [20]:
clusters = predictions.groupBy("Cluster Number").agg(collect_set("Process ID").alias("Process ID"),
                                                     collect_set("Refined Log").alias("Refined Log"),
                                                     collect_set("Log").alias("Log"), 
                                                     count("Process ID").alias("Members Count")) \
                                                     .orderBy(col("Members Count").desc())

# clusters.show()

In [55]:
clusters.show()

24/06/27 00:35:30 WARN DAGScheduler: Broadcasting large task binary with size 1866.2 KiB
24/06/27 00:35:34 WARN DAGScheduler: Broadcasting large task binary with size 2.5 MiB
                                                                                

+--------------+--------------------+--------------------+--------------------+-------------+
|Cluster Number|          Process ID|         Refined Log|                 Log|Members Count|
+--------------+--------------------+--------------------+--------------------+-------------+
|            48|[17194363074062, ...|[[[null, S-3, Req...|[[[null, S-3, Req...|          126|
|             5|[17194363071246, ...|[[[null, S-3, Req...|[[[null, S-3, Req...|           81|
|            33|[17194363073385, ...|[[[null, S-3, Req...|[[[null, S-3, Req...|           61|
|            41|[17194363071426, ...|[[[null, S-3, Req...|[[[null, S-3, Req...|           61|
|            15|[17194363072690, ...|[[[null, S-3, Req...|[[[null, S-3, Req...|           48|
|            29|[17194363071846, ...|[[[null, S-3, Req...|[[[null, S-3, Req...|           48|
|            34|[17194363077324, ...|[[[null, S-3, Req...|[[[null, S-3, Req...|           43|
|            14|[17194363079516, ...|[[[null, S-3, Req...|[[

In [99]:
def format_group(process_set):
    process_set_string = ', '.join(str(x) for x in process_set)
    return f"Group: {{{process_set_string}}}"

def format_log(logs):
    log_formatted = ""
    for log in logs:
        log_concat = '\n'.join(str(x) for x in log)
        log_formatted += f"{log_concat}\n\n"
    return log_formatted

def format_group_logs(group, logs):
    formatted = f"{group}\n\n" + "\n".join(logs) 
    return formatted

In [100]:
# UDFs for Formatting Output - part1Observations.txt
format_group_udf = udf(format_group, StringType())
format_udf = udf(format_log, StringType())
final_format_udf = udf(format_group_logs, StringType())

In [101]:
formatted_group = clusters.withColumn("Group", format_group_udf(col("Process ID")))
formatted_df = formatted_group.withColumn("Formatted Log", format_udf(col("Log")))
grouped_logs = formatted_df.groupBy("Group").agg(collect_list("Formatted Log").alias("Group Log"))
final_formatted = grouped_logs.withColumn("Formatted", final_format_udf(col("Group"), col("Group Log"))).select("Formatted")

In [102]:
final_formatted.coalesce(partition).write.mode('overwrite').text('part2Observations')

24/06/27 00:53:00 WARN DAGScheduler: Broadcasting large task binary with size 1861.7 KiB
24/06/27 00:53:03 WARN DAGScheduler: Broadcasting large task binary with size 2.5 MiB
24/06/27 00:53:03 WARN DAGScheduler: Broadcasting large task binary with size 2.5 MiB
24/06/27 00:53:03 WARN DAGScheduler: Broadcasting large task binary with size 2.5 MiB
24/06/27 00:53:03 WARN DAGScheduler: Broadcasting large task binary with size 2.7 MiB


In [103]:
subprocess.run("mkdir -p output && cat part2Observations/part-* > output/part2Observations.txt", shell=True)
subprocess.run("find part2Observations/ -name 'part-*' -delete", shell=True)
subprocess.run("rm -f part2Observations/.*", shell=True)
subprocess.run("rm -f part2Observations/_SUCCESS", shell=True) 
subprocess.run("rmdir part2Observations", shell=True)

rm: cannot remove 'part2Observations/.': Is a directory
rm: cannot remove 'part2Observations/..': Is a directory


CompletedProcess(args='rmdir part2Observations', returncode=0)

In [25]:
# clusters = clusters.select("Log")
# # clusters = clusters.withColumn("Cluster Number", col("Cluster Number").cast("string"))
# # clusters = clusters.withColumn("Process ID", col("Process ID").cast("string"))
# clusters = clusters.withColumn("Log", col("Log").cast("string"))
# clusters.write.mode('overwrite').text("task2")

In [26]:
clusters.printSchema()

root
 |-- Cluster Number: integer (nullable = false)
 |-- Process ID: array (nullable = false)
 |    |-- element: string (containsNull = false)
 |-- Refined Log: array (nullable = false)
 |    |-- element: array (containsNull = false)
 |    |    |-- element: array (containsNull = false)
 |    |    |    |-- element: string (containsNull = true)
 |-- Log: array (nullable = false)
 |    |-- element: array (containsNull = false)
 |    |    |-- element: array (containsNull = false)
 |    |    |    |-- element: string (containsNull = false)
 |-- Members Count: long (nullable = false)

