In [1]:
import pyspark
from pyspark import SparkContext, SparkConf
from pyspark.sql import SparkSession
import pandas as pd 
import numpy as np 
from pyspark.sql.functions import split, col, regexp_replace, collect_list, explode, concat, collect_set, array_union, flatten
from pyspark.ml.feature import CountVectorizer
from pyspark.ml.feature import MinHashLSH
from pyspark.ml.linalg import Vectors
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, udf, first, count
from pyspark.sql.types import ArrayType, StringType, BooleanType
from pyspark.ml.feature import VectorAssembler
from pyspark.ml.clustering import KMeans, BisectingKMeans
from pyspark.ml.evaluation import ClusteringEvaluator

In [2]:
spark = SparkSession.builder \
    .appName("Projet-Task-1") \
    .master("local[*]") \
    .config("spark.driver.memory", "4G") \
    .config("spark.driver.maxResultSize", "4g") \
    .getOrCreate()
spark

Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
24/06/25 13:02:02 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
24/06/25 13:02:02 WARN Utils: Service 'SparkUI' could not bind on port 4040. Attempting port 4041.


In [3]:
df = spark.read.text("part1Output.txt").toDF("Log")

In [4]:
df = df.withColumn("Log", regexp_replace(col("Log"), "[<>]", ""))
df = df.withColumn("Log", regexp_replace(col("Log"), ",", ""))
df = df.withColumn("Log", split(col("Log"), " "))

# df.printSchema()
# df.show()

columns = ["First Server", "Second Server", "Communication Type", "Process ID"]

for i in range(len(columns)):
    # print(columns[i])
    df = df.withColumn(columns[i], col("Log")[i])

df = df.filter(col("Process ID").isNotNull())

df.printSchema()
# df.show()

root
 |-- Log: array (nullable = true)
 |    |-- element: string (containsNull = false)
 |-- First Server: string (nullable = true)
 |-- Second Server: string (nullable = true)
 |-- Communication Type: string (nullable = true)
 |-- Process ID: string (nullable = true)



In [5]:
# Remove Process ID from request 
log = udf(lambda x: x[:-1], ArrayType(StringType())) 
df = df.withColumn('Log', log('Log')) 

In [6]:
grouped_df = df.groupBy("Process ID").agg(collect_list("Log").alias("Log"), 
                                          collect_list("First Server").alias("First Server"),
                                          collect_list("Second Server").alias("Second Server"),
                                          collect_list("Communication Type").alias("FCommunication Type"))

grouped_df.printSchema()
# grouped_df.show()

root
 |-- Process ID: string (nullable = true)
 |-- Log: array (nullable = false)
 |    |-- element: array (containsNull = false)
 |    |    |-- element: string (containsNull = true)
 |-- First Server: array (nullable = false)
 |    |-- element: string (containsNull = false)
 |-- Second Server: array (nullable = false)
 |    |-- element: string (containsNull = false)
 |-- FCommunication Type: array (nullable = false)
 |    |-- element: string (containsNull = false)



In [7]:
distinct_servers_df = df.groupBy("Process ID").agg(collect_set("First Server").alias("First Server"),
                                                   collect_set("Second Server").alias("Second Server"))

distinct_servers_df = distinct_servers_df.withColumn("Servers", array_union("First Server", "Second Server"))

distinct_servers_df.printSchema()
# distinct_servers_df.show()

root
 |-- Process ID: string (nullable = true)
 |-- First Server: array (nullable = false)
 |    |-- element: string (containsNull = false)
 |-- Second Server: array (nullable = false)
 |    |-- element: string (containsNull = false)
 |-- Servers: array (nullable = false)
 |    |-- element: string (containsNull = false)



In [8]:
characteristics = CountVectorizer(inputCol="Servers", outputCol="Characteristic Matrix")

model = characteristics.fit(distinct_servers_df)
char_matrix = model.transform(distinct_servers_df).select("Process ID", "Characteristic Matrix")

char_matrix.printSchema()
# char_matrix.show()

servers = model.vocabulary
print("Rows of Characteristic Matrix: ", servers)

root
 |-- Process ID: string (nullable = true)
 |-- Characteristic Matrix: vector (nullable = true)

Rows of Characteristic Matrix:  ['null', 'S-1', 'S-12.2', 'S-12.1', 'S-7.1', 'S-7.2', 'S-9', 'S-14', 'S-17', 'S-11', 'S-15', 'S-16', 'S-8', 'S-5', 'S-6', 'S-3', 'S-13', 'S-10', 'S-18.2', 'S-4', 'S-2', 'S-18.1', 'S-19']


In [9]:
minhash = MinHashLSH(inputCol="Characteristic Matrix", outputCol="Signatures", numHashTables=5)

# MinHash produces the signatures for the Characteritic matrix 
# numvHashTables is the number of the hush functioms that we want to use and the lenght of the signature 
model = minhash.fit(char_matrix)
signatures = model.transform(char_matrix)

# signatures.show()

In [10]:
# approxSimilarityJoin uses autmatically LSH to find rows that it is most likely 
# to have same "Signatures"
# threshold: pairs with Jaccard Distance lower than threshlod
similar_pairs = model.approxSimilarityJoin(signatures, signatures, threshold=0.4, distCol="Jaccard Distance")
# similar_pairs.show()

In [11]:
similar_pairs = similar_pairs.select("datasetA.Process ID", "datasetB.Process ID", 
                     "Jaccard Distance") \
                    # .filter((col("datasetA.Process ID") != col("datasetB.Process ID")))

In [12]:
new_cols = ["Process ID A", "Process ID B", "Jaccard Distance"]
similar_pairs = similar_pairs.toDF(*new_cols)

In [13]:
grouped_df.show()

+----------+--------------------+--------------------+--------------------+--------------------+
|Process ID|                 Log|        First Server|       Second Server| FCommunication Type|
+----------+--------------------+--------------------+--------------------+--------------------+
|     10000|[[S-1, null, Resp...|         [S-1, null]|         [null, S-1]| [Response, Request]|
|     10001|[[null, S-1, Requ...|[null, S-1, S-12....|[S-1, S-12.1, S-1...|[Request, Request...|
|     10002|[[null, S-1, Requ...|[null, S-1, S-12....|[S-1, S-12.1, S-1...|[Request, Request...|
|     10003|[[null, S-1, Requ...|[null, S-1, S-12....|[S-1, S-12.1, S-1...|[Request, Request...|
|     10004|[[null, S-1, Requ...|[null, S-1, S-12....|[S-1, S-12.1, S-1...|[Request, Request...|
|     10005|[[null, S-1, Requ...|[null, S-1, S-12....|[S-1, S-12.1, S-1...|[Request, Request...|
|     10006|[[null, S-1, Requ...|[null, S-1, S-12....|[S-1, S-12.1, S-1...|[Request, Request...|
|     10007|[[null, S-1, Requ.

In [14]:
# Creating a dataframe with the pairwise similarity metric 
similarities = similar_pairs.groupBy("Process ID A").pivot("Process ID B")\
                                                .agg(first("Jaccard Distance")).fillna(1.0)

                                                                                

In [15]:
# Create a dataframe with each Process ID and the corresponding feature vectors
feature_columns = similarities.columns[1:]  

assembler = VectorAssembler(inputCols=feature_columns, outputCol="Feature Vectors")
features = assembler.transform(similarities).select("Process ID A", "Feature Vectors")

## Clustering

In [17]:
cluster_number = 50

kmeans = KMeans(k=cluster_number, seed=33, featuresCol="Feature Vectors", predictionCol="Cluster Number")
model = kmeans.fit(features)

# bisecting_kmeans = BisectingKMeans(k=cluster_number, seed=33, featuresCol="Feature Vectors", predictionCol="Cluster Number")
# model = bisecting_kmeans.fit(features)

predictions = model.transform(features)

# predictions.select("Process ID A","Cluster Number").show()

24/06/25 13:02:34 ERROR CodeGenerator: failed to compile: org.codehaus.commons.compiler.InternalCompilerException: Compiling "GeneratedClass" in File 'generated.java', Line 1, Column 1: File 'generated.java', Line 3078, Column 14: Compiling "project_doConsume_0(InternalRow inputadapter_row_0, ArrayData project_expr_0_0)"
org.codehaus.commons.compiler.InternalCompilerException: Compiling "GeneratedClass" in File 'generated.java', Line 1, Column 1: File 'generated.java', Line 3078, Column 14: Compiling "project_doConsume_0(InternalRow inputadapter_row_0, ArrayData project_expr_0_0)"
	at org.codehaus.janino.UnitCompiler.compile2(UnitCompiler.java:402)
	at org.codehaus.janino.UnitCompiler.access$000(UnitCompiler.java:236)
	at org.codehaus.janino.UnitCompiler$2.visitCompilationUnit(UnitCompiler.java:363)
	at org.codehaus.janino.UnitCompiler$2.visitCompilationUnit(UnitCompiler.java:361)
	at org.codehaus.janino.Java$CompilationUnit.accept(Java.java:371)
	at org.codehaus.janino.UnitCompiler.co

In [18]:
evaluator = ClusteringEvaluator(featuresCol="Feature Vectors", predictionCol="Cluster Number")

silhouette = evaluator.evaluate(predictions, {evaluator.metricName: "silhouette"})
print("Silhouette score: ", silhouette)

24/06/25 13:03:38 ERROR CodeGenerator: failed to compile: org.codehaus.commons.compiler.InternalCompilerException: Compiling "GeneratedClass" in File 'generated.java', Line 1, Column 1: File 'generated.java', Line 3037, Column 14: Compiling "project_doConsume_0(InternalRow inputadapter_row_0, ArrayData project_expr_0_0)"
org.codehaus.commons.compiler.InternalCompilerException: Compiling "GeneratedClass" in File 'generated.java', Line 1, Column 1: File 'generated.java', Line 3037, Column 14: Compiling "project_doConsume_0(InternalRow inputadapter_row_0, ArrayData project_expr_0_0)"
	at org.codehaus.janino.UnitCompiler.compile2(UnitCompiler.java:402)
	at org.codehaus.janino.UnitCompiler.access$000(UnitCompiler.java:236)
	at org.codehaus.janino.UnitCompiler$2.visitCompilationUnit(UnitCompiler.java:363)
	at org.codehaus.janino.UnitCompiler$2.visitCompilationUnit(UnitCompiler.java:361)
	at org.codehaus.janino.Java$CompilationUnit.accept(Java.java:371)
	at org.codehaus.janino.UnitCompiler.co

KeyboardInterrupt: 

                                                                                

In [19]:
predictions = predictions.join(grouped_df, predictions["Process ID A"] == col("Process ID"))\
                            .select("Process ID", "Log","Cluster Number")

In [None]:
predictions.show()

+----------+--------------------+--------------+
|Process ID|                 Log|Cluster Number|
+----------+--------------------+--------------+
|        15|[[null, S-1.1, Re...|             0|
|        11|[[null, S-1.3, Re...|             1|
|        16|[[null, S-1.2, Re...|             2|
|        10|[[null, S-1.2, Re...|             2|
|        12|[[null, S-1.1, Re...|             0|
|        13|[[null, S-1.3, Re...|             1|
|        14|[[null, S-1.3, Re...|             1|
+----------+--------------------+--------------+



In [31]:
clusters = predictions.groupBy("Cluster Number").agg(collect_set("Process ID").alias("Process ID"),
                                                     collect_set("Log").alias("Log"),
                                                     count("Process ID").alias("Members Count")) \
                                                     .orderBy(col("Members Count").desc())

clusters.show()

24/06/25 13:17:18 ERROR CodeGenerator: failed to compile: org.codehaus.commons.compiler.InternalCompilerException: Compiling "GeneratedClass" in File 'generated.java', Line 1, Column 1: File 'generated.java', Line 2958, Column 16: Compiling "processNext()"
org.codehaus.commons.compiler.InternalCompilerException: Compiling "GeneratedClass" in File 'generated.java', Line 1, Column 1: File 'generated.java', Line 2958, Column 16: Compiling "processNext()"
	at org.codehaus.janino.UnitCompiler.compile2(UnitCompiler.java:402)
	at org.codehaus.janino.UnitCompiler.access$000(UnitCompiler.java:236)
	at org.codehaus.janino.UnitCompiler$2.visitCompilationUnit(UnitCompiler.java:363)
	at org.codehaus.janino.UnitCompiler$2.visitCompilationUnit(UnitCompiler.java:361)
	at org.codehaus.janino.Java$CompilationUnit.accept(Java.java:371)
	at org.codehaus.janino.UnitCompiler.compileUnit(UnitCompiler.java:361)
	at org.codehaus.janino.SimpleCompiler.cook(SimpleCompiler.java:264)
	at org.codehaus.janino.ClassB

KeyboardInterrupt: 

In [35]:
clusters = clusters.select("Log")
# clusters = clusters.withColumn("Cluster Number", col("Cluster Number").cast("string"))
# clusters = clusters.withColumn("Process ID", col("Process ID").cast("string"))
clusters = clusters.withColumn("Log", col("Log").cast("string"))
clusters.write.mode('overwrite').text("/Users/themisgrigorakis/Desktop/Data Intensive Systems/Project/INFOMDIS-Data-Intensive-Systems/output2.txt")

24/06/25 13:19:39 ERROR CodeGenerator: failed to compile: org.codehaus.commons.compiler.InternalCompilerException: Compiling "GeneratedClass" in File 'generated.java', Line 1, Column 1: File 'generated.java', Line 2958, Column 16: Compiling "processNext()"
org.codehaus.commons.compiler.InternalCompilerException: Compiling "GeneratedClass" in File 'generated.java', Line 1, Column 1: File 'generated.java', Line 2958, Column 16: Compiling "processNext()"
	at org.codehaus.janino.UnitCompiler.compile2(UnitCompiler.java:402)
	at org.codehaus.janino.UnitCompiler.access$000(UnitCompiler.java:236)
	at org.codehaus.janino.UnitCompiler$2.visitCompilationUnit(UnitCompiler.java:363)
	at org.codehaus.janino.UnitCompiler$2.visitCompilationUnit(UnitCompiler.java:361)
	at org.codehaus.janino.Java$CompilationUnit.accept(Java.java:371)
	at org.codehaus.janino.UnitCompiler.compileUnit(UnitCompiler.java:361)
	at org.codehaus.janino.SimpleCompiler.cook(SimpleCompiler.java:264)
	at org.codehaus.janino.ClassB

In [33]:
clusters.printSchema()

root
 |-- Cluster Number: integer (nullable = false)
 |-- Process ID: array (nullable = false)
 |    |-- element: string (containsNull = false)
 |-- Log: array (nullable = false)
 |    |-- element: array (containsNull = false)
 |    |    |-- element: array (containsNull = false)
 |    |    |    |-- element: string (containsNull = true)
 |-- Members Count: long (nullable = false)

