In [1]:
import pyspark
from pyspark import SparkContext, SparkConf
from pyspark.sql import SparkSession
import pandas as pd 
import numpy as np 
from pyspark.sql.functions import split, col, regexp_replace, collect_list, explode, concat, collect_set, array_union, flatten
from pyspark.ml.feature import CountVectorizer
from pyspark.ml.feature import MinHashLSH
from pyspark.ml.linalg import Vectors
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, udf
from pyspark.sql.types import ArrayType, StringType, BooleanType

In [2]:
spark = SparkSession.builder \
    .appName("Projet-Task-1") \
    .master("local[*]") \
    .config("spark.driver.memory", "4G") \
    .config("spark.driver.maxResultSize", "4g") \
    .getOrCreate()
spark

Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
24/06/25 12:56:31 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


In [3]:
# spark.stop()

In [4]:
df = spark.read.text("data.txt").toDF("Log")

In [5]:
df = df.withColumn("Log", regexp_replace(col("Log"), "[<>]", ""))
df = df.withColumn("Log", regexp_replace(col("Log"), ",", ""))
df = df.withColumn("Log", split(col("Log"), " "))

# df.printSchema()
# df.show()

columns = ["First Server", "Second Server", "Communication Type", "Process ID"]

for i in range(len(columns)):
    # print(columns[i])
    df = df.withColumn(columns[i], col("Log")[i])

df.printSchema()
# df.show()

root
 |-- Log: array (nullable = true)
 |    |-- element: string (containsNull = false)
 |-- First Server: string (nullable = true)
 |-- Second Server: string (nullable = true)
 |-- Communication Type: string (nullable = true)
 |-- Process ID: string (nullable = true)



In [6]:
# Remove Process ID from request 
log = udf(lambda x: x[:-1], ArrayType(StringType())) 
df = df.withColumn('Log', log('Log')) 

In [7]:
grouped_df = df.groupBy("Process ID").agg(collect_list("Log").alias("Log"), 
                                          collect_list("First Server").alias("First Server"),
                                          collect_list("Second Server").alias("Second Server"),
                                          collect_list("Communication Type").alias("FCommunication Type"))

grouped_df.printSchema()
# grouped_df.show()

root
 |-- Process ID: string (nullable = true)
 |-- Log: array (nullable = false)
 |    |-- element: array (containsNull = false)
 |    |    |-- element: string (containsNull = true)
 |-- First Server: array (nullable = false)
 |    |-- element: string (containsNull = false)
 |-- Second Server: array (nullable = false)
 |    |-- element: string (containsNull = false)
 |-- FCommunication Type: array (nullable = false)
 |    |-- element: string (containsNull = false)



In [8]:
# df.show()

In [9]:
distinct_servers_df = df.groupBy("Process ID").agg(collect_set("First Server").alias("First Server"),
                                                   collect_set("Second Server").alias("Second Server"))

distinct_servers_df = distinct_servers_df.withColumn("Servers", array_union("First Server", "Second Server"))

distinct_servers_df.printSchema()
# distinct_servers_df.show()

root
 |-- Process ID: string (nullable = true)
 |-- First Server: array (nullable = false)
 |    |-- element: string (containsNull = false)
 |-- Second Server: array (nullable = false)
 |    |-- element: string (containsNull = false)
 |-- Servers: array (nullable = false)
 |    |-- element: string (containsNull = false)



In [10]:
characteristics = CountVectorizer(inputCol="Servers", outputCol="Characteristic Matrix")

model = characteristics.fit(distinct_servers_df)
char_matrix = model.transform(distinct_servers_df).select("Process ID", "Characteristic Matrix")

char_matrix.printSchema()
# char_matrix.show()

servers = model.vocabulary
print("Rows of Characteristic Matrix: ", servers)

root
 |-- Process ID: string (nullable = true)
 |-- Characteristic Matrix: vector (nullable = true)

Rows of Characteristic Matrix:  ['null', 'S-1', 'S-12.2', 'S-12.1', 'S-7.1', 'S-7.2', 'S-9', 'S-14', 'S-17', 'S-11', 'S-15', 'S-8', 'S-16', 'S-5', 'S-6', 'S-3', 'S-13', 'S-10', 'S-18.2', 'S-4', 'S-2', 'S-18.1', 'S-19']


                                                                                

In [11]:
minhash = MinHashLSH(inputCol="Characteristic Matrix", outputCol="Signatures", numHashTables=5)

# MinHash produces the signatures for the Characteritic matrix 
# numvHashTables is the number of the hush functioms that we want to use and the lenght of the signature 
model = minhash.fit(char_matrix)
signatures = model.transform(char_matrix)

# signatures.show()

In [12]:
# approxSimilarityJoin uses autmatically LSH to find rows that it is most likely 
# to have same "Signatures"
# threshold: pairs with Jaccard Distance lower than threshlod
similar_pairs = model.approxSimilarityJoin(signatures, signatures, threshold=0.01, distCol="Jaccard Distance")
# similar_pairs.show()

In [13]:
# similar_pairs.count()

In [14]:
similar_pairs = similar_pairs.select("datasetA.Process ID", "datasetB.Process ID", 
                     "Jaccard Distance") \
                    # .filter((col("datasetA.Process ID") != col("datasetB.Process ID")))

In [15]:
new_cols = ["Process ID A", "Process ID B", "Jaccard Distance"]
similar_pairs = similar_pairs.toDF(*new_cols)

In [16]:
# similar_pairs.show()

In [17]:
pairs = similar_pairs.join(grouped_df, similar_pairs["Process ID A"] == col("Process ID")) \
                     .select(col("Process ID A"), col("Process ID B"), col("Log").alias("Log A")) \
                     .join(grouped_df, similar_pairs["Process ID B"] == col("Process ID")) \
                     .select(col("Process ID A"), col("Process ID B"), col("Log A"), col("Log").alias("Log B"))

In [18]:
# pairs.show()

In [19]:
def original_check(x,y):
    return x==y

orifinal_checking = udf(original_check, BooleanType())
same_pairs = pairs.filter(orifinal_checking(col("Log A"), col("Log B")))

In [20]:
from pyspark.sql.functions import col, count
same_pairs = same_pairs.groupBy("Log A").agg(collect_set("Process ID A").alias("Process Set"))

In [21]:
same_pairs.printSchema()

root
 |-- Log A: array (nullable = false)
 |    |-- element: array (containsNull = false)
 |    |    |-- element: string (containsNull = true)
 |-- Process Set: array (nullable = false)
 |    |-- element: string (containsNull = false)



### Output - part1Observations.txt

In [22]:
same_pairs_col = same_pairs.collect()

24/06/25 12:56:45 WARN GarbageCollectionMetrics: To enable non-built-in garbage collector(s) List(G1 Concurrent GC), users should configure it(them) to spark.eventLog.gcMetrics.youngGenerationGarbageCollectors or spark.eventLog.gcMetrics.oldGenerationGarbageCollectors
                                                                                

In [23]:
with open("part1Observations.txt", "w") as file:
    current_group = None
    for row in same_pairs_col:
        process_set = row["Process Set"]
        # process_id = row["Process ID"]
        log = row["Log A"]
        
        if process_set != current_group:
            file.write("\n")
            process_set_string = ', '.join(str(x) for x in process_set)
            file.write(f"Group: {{{process_set_string}}}\n")
            current_group = process_set
        
        for process in process_set:
            file.write(f"\n{process}:\n")
            for l in log:
                log_concat = ', '.join(str(x) for x in l)
                file.write(f"<{log_concat}>\n")

### Output - part1Output.txt

In [24]:
df.dtypes

[('Log', 'array<string>'),
 ('First Server', 'string'),
 ('Second Server', 'string'),
 ('Communication Type', 'string'),
 ('Process ID', 'string')]

In [25]:
from pyspark.sql.types import IntegerType

df = df.withColumn('pid_integer', df['Process ID'].cast(IntegerType()))
max_process_id = df.agg({"pid_integer": "max"}).collect()[0][0]
# print(max_process_id)

In [26]:
with open("part1Output.txt", "w") as file:
    current_group = None
    process_id = max_process_id+1

    for row in same_pairs_col:
        log = row["Log A"]
        
        file.write(f"\n{process_id}:\n")
        for l in log:
            log_concat = ', '.join(str(x) for x in l)
            log_concat = f"{log_concat}, {process_id}"
            file.write(f"<{log_concat}>\n")
        process_id+=1

In [27]:
from pyspark.sql.types import IntegerType


# Shingling

In [28]:
def k_shingling(text, k):
    shingles = set()
    for i in range(len(text) - k + 1):
        shingle = text[i:i + k]
        shingles.add(shingle)
    return list(shingles)

k_shingling_udf = udf(lambda text: k_shingling(text, 5), ArrayType(StringType()))

In [29]:
df = spark.read.text("data.txt").toDF("Log")
log = udf(lambda x: x[1:-4], StringType()) 
df = df.withColumn('Log_split', log('Log')) 
df.collect()
df_shingles = df.withColumn("Shingles", k_shingling_udf(df["Log_split"]))
# df_shingles.show()

In [30]:
# df_shingles.collect()

In [31]:
df_shingles = df_shingles.withColumn("Log", regexp_replace(col("Log"), "[<>]", ""))
df_shingles = df_shingles.withColumn("Log", regexp_replace(col("Log"), ",", ""))
df_shingles = df_shingles.withColumn("Log", split(col("Log"), " "))

df_shingles.printSchema()
# df_shingles.show()

columns = ["First Server", "Second Server", "Communication Type", "Process ID"]

for i in range(len(columns)):
    print(columns[i])
    df_shingles = df_shingles.withColumn(columns[i], col("Log")[i])

df_shingles.printSchema()
# df_shingles.show()

root
 |-- Log: array (nullable = true)
 |    |-- element: string (containsNull = false)
 |-- Log_split: string (nullable = true)
 |-- Shingles: array (nullable = true)
 |    |-- element: string (containsNull = true)

First Server
Second Server
Communication Type
Process ID
root
 |-- Log: array (nullable = true)
 |    |-- element: string (containsNull = false)
 |-- Log_split: string (nullable = true)
 |-- Shingles: array (nullable = true)
 |    |-- element: string (containsNull = true)
 |-- First Server: string (nullable = true)
 |-- Second Server: string (nullable = true)
 |-- Communication Type: string (nullable = true)
 |-- Process ID: string (nullable = true)



In [32]:
log = udf(lambda x: x[:-1], ArrayType(StringType())) 
df_shingles = df_shingles.withColumn('Log', log('Log')) 

In [33]:
grouped_df = df_shingles.groupBy("Process ID").agg(collect_set("Shingles").alias("Shingles"),collect_list("Log").alias("Log"))
grouped_df = grouped_df.withColumn("Flat shingles", flatten(col("Shingles")))
grouped_df.printSchema()

root
 |-- Process ID: string (nullable = true)
 |-- Shingles: array (nullable = false)
 |    |-- element: array (containsNull = false)
 |    |    |-- element: string (containsNull = true)
 |-- Log: array (nullable = false)
 |    |-- element: array (containsNull = false)
 |    |    |-- element: string (containsNull = true)
 |-- Flat shingles: array (nullable = false)
 |    |-- element: string (containsNull = true)



In [34]:
characteristics = CountVectorizer(inputCol="Flat shingles", outputCol="Characteristic Matrix")

model = characteristics.fit(grouped_df)
char_matrix = model.transform(grouped_df).select("Process ID", "Characteristic Matrix")

char_matrix.printSchema()
# char_matrix.show()

shingles = model.vocabulary
print("Rows of Characteristic Matrix: ", shingles)

root
 |-- Process ID: string (nullable = true)
 |-- Characteristic Matrix: vector (nullable = true)

Rows of Characteristic Matrix:  ['S-1, ', ', S-1', 'espon', ' Resp', 'Respo', ' Requ', 'ponse', 'eques', ', Res', 'quest', ', Req', 'Reque', 'spons', 'onse,', 'uest,', 'est, ', 'nse, ', '1, Re', 'null,', 'ull, ', '-1, R', ' S-1,', '1, S-', 'S-12.', 'l, Re', 'll, R', ' null', ', nul', '1, nu', 'l, S-', '-1, n', 'll, S', '2, S-', '2, Re', '.2, R', '.2, S', '-1, S', '12.2,', '-12.2', '2.2, ', ' S-12', '.1, S', '.1, R', '-12.1', '12.1,', '2.1, ', '7.1, ', 'S-7.1', '-7.1,', ', S-7', ' S-7.', '7.2, ', '-7.2,', 'S-7.2', 'S-9, ', 'S-14,', '-14, ', '-17, ', 'S-17,', 'S-11,', '-11, ', 'st, 7', 'se, 7', 'se, 5', 'st, 5', 'st, 8', 'se, 8', 'st, 1', 'se, 1', 'st, 2', 'se, 2', 'se, 3', 'st, 3', 'st, 6', 'st, 9', 'se, 9', 'se, 6', 'se, 4', 'st, 4', '-15, ', 'S-15,', '9, S-', '9, Re', '4, Re', '4, S-', '-9, S', '-9, R', ' S-9,', ', S-9', '14, R', '14, S', ' S-14', '5, S-', '5, Re', 'S-8, ', '-16, ', 'S

In [35]:
minhash = MinHashLSH(inputCol="Characteristic Matrix", outputCol="Signatures", numHashTables=5)

# MinHash produces the signatures for the Characteritic matrix 
# numvHashTables is the number of the hush functioms that we want to use and the lenght of the signature 
model = minhash.fit(char_matrix)
signatures = model.transform(char_matrix)

# signatures.show()

In [36]:
# approxSimilarityJoin uses autmatically LSH to find rows that it is most likely 
# to have same "Signatures"
# threshold: pairs with Jaccard Distance lower than threshlod
similar_pairs = model.approxSimilarityJoin(signatures, signatures, threshold=0.2, distCol="Jaccard Distance")
# similar_pairs.show()

In [37]:
# similar_pairs.count()

In [38]:
similar_pairs = similar_pairs.select("datasetA.Process ID", "datasetB.Process ID", 
                     "Jaccard Distance")\
                        # .filter((col("datasetA.Process ID") != col("datasetB.Process ID")))

In [39]:
# similar_pairs.show()

In [40]:
new_cols = ["Process ID A", "Process ID B", "Jaccard Distance"]
similar_pairs = similar_pairs.toDF(*new_cols)

In [41]:
# similar_pairs.show()

In [42]:
# grouped_df.show()

In [43]:
pairs = similar_pairs.join(grouped_df, similar_pairs["Process ID A"] == col("Process ID")) \
                     .select(col("Process ID A"), col("Process ID B"), col("Log").alias("Log A")) \
                     .join(grouped_df, similar_pairs["Process ID B"] == col("Process ID")) \
                     .select(col("Process ID A"), col("Process ID B"), col("Log A"), col("Log").alias("Log B"))

In [44]:
# pairs.collect()

In [45]:
def original_check(x,y):
    return x==y

orifinal_checking = udf(original_check, BooleanType())
same_pairs = pairs.filter(orifinal_checking(col("Log A"), col("Log B")))

In [46]:
same_pairs = same_pairs.groupBy("Log A").agg(collect_set("Process ID A"))

In [47]:
# same_pairs.select("collect_set(Process ID A)").collect()