In [1]:
import pyspark
from pyspark import SparkContext, SparkConf
from pyspark.sql import SparkSession
import pandas as pd 
import numpy as np 
from pyspark.sql.functions import split, col, regexp_replace, collect_list, explode, concat, collect_set, array_union, flatten
from pyspark.ml.feature import CountVectorizer
from pyspark.ml.feature import MinHashLSH
from pyspark.ml.linalg import Vectors
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, udf
from pyspark.sql.types import ArrayType, StringType, BooleanType

In [2]:
spark = SparkSession.builder \
    .appName("Projet-Task-1") \
    .master("local[*]") \
    .config("spark.driver.memory", "4G") \
    .config("spark.driver.maxResultSize", "2g") \
    .config("spark.executor.memory", "16G") \
    .config("spark.executor.memoryOverhead", "4G") \
    .config("spark.executor.extraJavaOptions", "-XX:+UseG1GC -XX:MaxGCPauseMillis=500 -XX:InitiatingHeapOccupancyPercent=35") \
    .config("spark.driver.extraJavaOptions", "-XX:+UseG1GC -XX:MaxGCPauseMillis=500 -XX:InitiatingHeapOccupancyPercent=35") \
    .getOrCreate()
# spark.sparkContext.setLogLevel("DEBUG")
spark.conf.set("spark.sql.shuffle.partitions", "700")
spark

24/06/24 10:57:57 WARN Utils: Your hostname, abha-ThinkPad-P14s-Gen-4 resolves to a loopback address: 127.0.1.1; using 192.168.178.94 instead (on interface wlp2s0)
24/06/24 10:57:57 WARN Utils: Set SPARK_LOCAL_IP if you need to bind to another address
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
24/06/24 10:57:58 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


In [3]:
# spark.stop()

In [4]:
df = spark.read.text("data.txt").toDF("Log")

In [5]:
df = df.withColumn("Log", regexp_replace(col("Log"), "[<>]", ""))
df = df.withColumn("Log", regexp_replace(col("Log"), ",", ""))
df = df.withColumn("Log", split(col("Log"), " "))

# df.printSchema()
# df.show()

columns = ["First Server", "Second Server", "Communication Type", "Process ID"]

for i in range(len(columns)):
    # print(columns[i])
    df = df.withColumn(columns[i], col("Log")[i])

df.printSchema()
# df.show()

root
 |-- Log: array (nullable = true)
 |    |-- element: string (containsNull = false)
 |-- First Server: string (nullable = true)
 |-- Second Server: string (nullable = true)
 |-- Communication Type: string (nullable = true)
 |-- Process ID: string (nullable = true)



In [6]:
# Remove Process ID from request 
log = udf(lambda x: x[:-1], ArrayType(StringType())) 
df = df.withColumn('Log', log('Log')) 

In [7]:
grouped_df = df.groupBy("Process ID").agg(collect_list("Log").alias("Log"), 
                                          collect_list("First Server").alias("First Server"),
                                          collect_list("Second Server").alias("Second Server"),
                                          collect_list("Communication Type").alias("FCommunication Type"))

grouped_df.printSchema()
# grouped_df.show()

root
 |-- Process ID: string (nullable = true)
 |-- Log: array (nullable = false)
 |    |-- element: array (containsNull = false)
 |    |    |-- element: string (containsNull = true)
 |-- First Server: array (nullable = false)
 |    |-- element: string (containsNull = false)
 |-- Second Server: array (nullable = false)
 |    |-- element: string (containsNull = false)
 |-- FCommunication Type: array (nullable = false)
 |    |-- element: string (containsNull = false)



In [8]:
# df.show()

In [9]:
distinct_servers_df = df.groupBy("Process ID").agg(collect_set("First Server").alias("First Server"),
                                                   collect_set("Second Server").alias("Second Server"))

distinct_servers_df = distinct_servers_df.withColumn("Servers", array_union("First Server", "Second Server"))

distinct_servers_df.printSchema()
# distinct_servers_df.show()

root
 |-- Process ID: string (nullable = true)
 |-- First Server: array (nullable = false)
 |    |-- element: string (containsNull = false)
 |-- Second Server: array (nullable = false)
 |    |-- element: string (containsNull = false)
 |-- Servers: array (nullable = false)
 |    |-- element: string (containsNull = false)



In [10]:
characteristics = CountVectorizer(inputCol="Servers", outputCol="Characteristic Matrix")

model = characteristics.fit(distinct_servers_df)
char_matrix = model.transform(distinct_servers_df).select("Process ID", "Characteristic Matrix")

char_matrix.printSchema()
# char_matrix.show()

servers = model.vocabulary
print("Rows of Characteristic Matrix: ", servers)

                                                                                

root
 |-- Process ID: string (nullable = true)
 |-- Characteristic Matrix: vector (nullable = true)

Rows of Characteristic Matrix:  ['null', 'S-1.1', 'S-1.6', 'S-1.2', 'S-1.7', 'S-1.5', 'S-1.4', 'S-1.3', 'S-85.1', 'S-85.3', 'S-85.2', 'S-85.4', 'S-82.4', 'S-82.3', 'S-82.2', 'S-93.5', 'S-54.5', 'S-32.1', 'S-93.2', 'S-9.3', 'S-27.3', 'S-31.2', 'S-93.3', 'S-45.1', 'S-32.2', 'S-29.5', 'S-45.4', 'S-41.1', 'S-82.5', 'S-40.2', 'S-40.3', 'S-40.4', 'S-93.1', 'S-61.5', 'S-27.2', 'S-25.1', 'S-21.4', 'S-20.1', 'S-29.2', 'S-31.4', 'S-25.2', 'S-31.3', 'S-32.3', 'S-38.4', 'S-29.1', 'S-29.3', 'S-16.5', 'S-20.2', 'S-20.4', 'S-32.4', 'S-83.1', 'S-61.4', 'S-50.5', 'S-60.5', 'S-27.1', 'S-25.5', 'S-93.4', 'S-17.5', 'S-20.3', 'S-38.2', 'S-54.3', 'S-68.2', 'S-45.3', 'S-81.5', 'S-29.4', 'S-19.3', 'S-20.5', 'S-27.4', 'S-36.2', 'S-45.2', 'S-52.1', 'S-69.2', 'S-24.6', 'S-50.4', 'S-50.3', 'S-71.1', 'S-9.1', 'S-31.5', 'S-5.1', 'S-81.6', 'S-45.5', 'S-52.4', 'S-32.5', 'S-86.3', 'S-76.1', 'S-5.5', 'S-40.5', 'S-55.2',

In [11]:
minhash = MinHashLSH(inputCol="Characteristic Matrix", outputCol="Signatures", numHashTables=5)

# MinHash produces the signatures for the Characteritic matrix 
# numvHashTables is the number of the hush functioms that we want to use and the lenght of the signature 
model = minhash.fit(char_matrix)
signatures = model.transform(char_matrix)

# signatures.show()

In [12]:
# approxSimilarityJoin uses autmatically LSH to find rows that it is most likely 
# to have same "Signatures"
# threshold: pairs with Jaccard Distance lower than threshlod
similar_pairs = model.approxSimilarityJoin(signatures, signatures, threshold=0.01, distCol="Jaccard Distance")
# similar_pairs.show()

In [13]:
# similar_pairs.count()

In [14]:
similar_pairs = similar_pairs.select("datasetA.Process ID", "datasetB.Process ID", 
                     "Jaccard Distance") \
                    # .filter((col("datasetA.Process ID") != col("datasetB.Process ID")))

In [15]:
new_cols = ["Process ID A", "Process ID B", "Jaccard Distance"]
similar_pairs = similar_pairs.toDF(*new_cols)

In [16]:
# similar_pairs.show()

In [17]:
pairs = similar_pairs.join(grouped_df, similar_pairs["Process ID A"] == col("Process ID")) \
                     .select(col("Process ID A"), col("Process ID B"), col("Log").alias("Log A")) \
                     .join(grouped_df, similar_pairs["Process ID B"] == col("Process ID")) \
                     .select(col("Process ID A"), col("Process ID B"), col("Log A"), col("Log").alias("Log B"))

In [18]:
# pairs.show()

In [19]:
def original_check(x,y):
    return x==y

orifinal_checking = udf(original_check, BooleanType())
same_pairs = pairs.filter(orifinal_checking(col("Log A"), col("Log B")))

In [20]:
from pyspark.sql.functions import col, count
same_pairs = same_pairs.groupBy("Log A").agg(collect_set("Process ID A").alias("Process Set"))

In [21]:
same_pairs.printSchema()

root
 |-- Log A: array (nullable = false)
 |    |-- element: array (containsNull = false)
 |    |    |-- element: string (containsNull = true)
 |-- Process Set: array (nullable = false)
 |    |-- element: string (containsNull = false)



### Output - part1Observations.txt

In [22]:
# same_pairs.show()

In [23]:
same_pairs_explode = same_pairs.select(same_pairs["Log A"], same_pairs["Process Set"], explode(same_pairs["Process Set"]).alias("Process ID"))
# same_pairs_explode.show()

In [24]:
# same_pairs_explode.first()[0]

In [25]:
def format_group(process_set):
    process_set_string = ', '.join(str(x) for x in process_set)
    return f"Group: {{{process_set_string}}}"

def format_log(log, process_id):
    # log_final_string = f"\n"
    log_formatted = ""
    for l in log:
        log_concat = ', '.join(str(x) for x in l)
        log_formatted += f"<{log_concat}, {process_id}>\n"
    return log_formatted

format_group_udf = udf(format_group, StringType())
formatted_group = same_pairs_explode.withColumn("Group", format_group_udf(col("Process Set")))
# formatted_group.show()

format_udf = udf(format_log, StringType())
formatted_df = formatted_group.withColumn("Formatted Log", format_udf(col("Log A"), col("Process ID")))
# formatted_df.show()

grouped_logs = formatted_df.groupBy("Group").agg(collect_list("Formatted Log").alias("Group Log"))
# grouped_logs.show()
# grouped_logs.first()[1]
# print(grouped_logs.dtypes)

def format_group_logs(group, logs):
    formatted = f"{group}\n\n" + "\n".join(logs) 
    return formatted

final_format_udf = udf(format_group_logs, StringType())
final_formatted = grouped_logs.withColumn("Formatted", final_format_udf(col("Group"), col("Group Log"))).select("Formatted")
# final_formatted.show()

In [26]:
# final_formatted.rdd.getNumPartitions()

In [27]:
# partitions = 100
final_formatted.coalesce(700).write.mode('overwrite').text('output')
# final_formatted.write.mode('overwrite').text('output')

24/06/24 10:58:13 WARN GarbageCollectionMetrics: To enable non-built-in garbage collector(s) List(G1 Concurrent GC), users should configure it(them) to spark.eventLog.gcMetrics.youngGenerationGarbageCollectors or spark.eventLog.gcMetrics.oldGenerationGarbageCollectors
                                                                                

In [28]:
import subprocess
subprocess.run("cat output/part-* > output/part1Observations.txt", shell=True)
subprocess.run("find output/ -name 'part-*' -delete", shell=True)
subprocess.run("rm -f output/.*", shell=True)
subprocess.run("rm -f output/_SUCCESS", shell=True) 

rm: cannot remove 'output/.': Is a directory
rm: cannot remove 'output/..': Is a directory


CompletedProcess(args='rm -f output/_SUCCESS', returncode=0)

In [27]:
# same_pairs_col = same_pairs.collect()

### HAVENT CHANGED CODE BELOW

In [None]:
# with open("part1Observations.txt", "w") as file:
#     current_group = None
#     for row in same_pairs_col:
#         process_set = row["Process Set"]
#         # process_id = row["Process ID"]
#         log = row["Log A"]
        
#         if process_set != current_group:
#             file.write("\n")
#             process_set_string = ', '.join(str(x) for x in process_set)
#             file.write(f"Group: {{{process_set_string}}}\n")
#             current_group = process_set
        
#         for process in process_set:
#             file.write(f"\n{process}:\n")
#             for l in log:
#                 log_concat = ', '.join(str(x) for x in l)
#                 file.write(f"<{log_concat}>\n")

### Output - part1Output.txt

In [None]:
df.dtypes

[('Log', 'array<string>'),
 ('First Server', 'string'),
 ('Second Server', 'string'),
 ('Communication Type', 'string'),
 ('Process ID', 'string')]

In [None]:
from pyspark.sql.types import IntegerType

df = df.withColumn('pid_integer', df['Process ID'].cast(IntegerType()))
max_process_id = df.agg({"pid_integer": "max"}).collect()[0][0]
# print(max_process_id)

In [None]:
with open("part1Output.txt", "w") as file:
    current_group = None
    process_id = max_process_id+1

    for row in same_pairs_col:
        log = row["Log A"]
        
        file.write(f"\n{process_id}:\n")
        for l in log:
            log_concat = ', '.join(str(x) for x in l)
            log_concat = f"{log_concat}, {process_id}"
            file.write(f"<{log_concat}>\n")
        process_id+=1

In [None]:
from pyspark.sql.types import IntegerType


# Shingling

In [None]:
def k_shingling(text, k):
    shingles = set()
    for i in range(len(text) - k + 1):
        shingle = text[i:i + k]
        shingles.add(shingle)
    return list(shingles)

k_shingling_udf = udf(lambda text: k_shingling(text, 5), ArrayType(StringType()))

In [None]:
df = spark.read.text("data.txt").toDF("Log")
log = udf(lambda x: x[1:-4], StringType()) 
df = df.withColumn('Log_split', log('Log')) 
df.collect()
df_shingles = df.withColumn("Shingles", k_shingling_udf(df["Log_split"]))
# df_shingles.show()

In [None]:
# df_shingles.collect()

In [None]:
df_shingles = df_shingles.withColumn("Log", regexp_replace(col("Log"), "[<>]", ""))
df_shingles = df_shingles.withColumn("Log", regexp_replace(col("Log"), ",", ""))
df_shingles = df_shingles.withColumn("Log", split(col("Log"), " "))

df_shingles.printSchema()
# df_shingles.show()

columns = ["First Server", "Second Server", "Communication Type", "Process ID"]

for i in range(len(columns)):
    print(columns[i])
    df_shingles = df_shingles.withColumn(columns[i], col("Log")[i])

df_shingles.printSchema()
# df_shingles.show()

root
 |-- Log: array (nullable = true)
 |    |-- element: string (containsNull = false)
 |-- Log_split: string (nullable = true)
 |-- Shingles: array (nullable = true)
 |    |-- element: string (containsNull = true)

First Server
Second Server
Communication Type
Process ID
root
 |-- Log: array (nullable = true)
 |    |-- element: string (containsNull = false)
 |-- Log_split: string (nullable = true)
 |-- Shingles: array (nullable = true)
 |    |-- element: string (containsNull = true)
 |-- First Server: string (nullable = true)
 |-- Second Server: string (nullable = true)
 |-- Communication Type: string (nullable = true)
 |-- Process ID: string (nullable = true)



In [None]:
log = udf(lambda x: x[:-1], ArrayType(StringType())) 
df_shingles = df_shingles.withColumn('Log', log('Log')) 

In [None]:
grouped_df = df_shingles.groupBy("Process ID").agg(collect_set("Shingles").alias("Shingles"),collect_list("Log").alias("Log"))
grouped_df = grouped_df.withColumn("Flat shingles", flatten(col("Shingles")))
grouped_df.printSchema()

root
 |-- Process ID: string (nullable = true)
 |-- Shingles: array (nullable = false)
 |    |-- element: array (containsNull = false)
 |    |    |-- element: string (containsNull = true)
 |-- Log: array (nullable = false)
 |    |-- element: array (containsNull = false)
 |    |    |-- element: string (containsNull = true)
 |-- Flat shingles: array (nullable = false)
 |    |-- element: string (containsNull = true)



In [None]:
characteristics = CountVectorizer(inputCol="Flat shingles", outputCol="Characteristic Matrix")

model = characteristics.fit(grouped_df)
char_matrix = model.transform(grouped_df).select("Process ID", "Characteristic Matrix")

char_matrix.printSchema()
# char_matrix.show()

shingles = model.vocabulary
print("Rows of Characteristic Matrix: ", shingles)

root
 |-- Process ID: string (nullable = true)
 |-- Characteristic Matrix: vector (nullable = true)

Rows of Characteristic Matrix:  ['espon', ' Resp', 'Respo', 'ponse', ' Requ', ', Res', 'eques', 'quest', 'spons', ', Req', 'Reque', 'uest,', 'onse,', 'est, ', 'nse, ', ', S-1', 'null,', 'ull, ', ' S-1.', 'S-1.1', 'S-1.2', '1, Re', '2, Re', '3, Re', 'l, Re', 'll, R', ' null', 'll, S', ', nul', 'l, S-', '1, S-', '4, Re', '5, Re', '2, S-', '3, S-', ', S-6', '6, Re', '8, Re', ', S-5', '4, S-', ', S-8', ', S-7', ', S-3', '7, Re', '5, S-', '9, Re', ', S-2', ', S-4', '0, Re', '6, S-', '8, S-', '7, S-', ', S-9', '9, S-', 'S-1.3', '0, S-', '.13, ', '.10, ', '.28, ', '.25, ', 'st, 6', 'se, 6', '.11, ', '.23, ', '.21, ', '.15, ', '.26, ', '.24, ', '.29, ', 'se, 7', 'st, 7', '.12, ', '.20, ', 'st, 2', 'se, 2', '.19, ', '.22, ', '.17, ', '.18, ', '.14, ', '.27, ', '.16, ', 'se, 9', 'st, 9', 'se, 1', 'st, 1', 'se, 8', 'st, 8', '.30, ', 'se, 3', 'st, 3', 'st, 4', 'se, 4', '.31, ', 'se, 5', 'st, 5', '.

In [None]:
minhash = MinHashLSH(inputCol="Characteristic Matrix", outputCol="Signatures", numHashTables=5)

# MinHash produces the signatures for the Characteritic matrix 
# numvHashTables is the number of the hush functioms that we want to use and the lenght of the signature 
model = minhash.fit(char_matrix)
signatures = model.transform(char_matrix)

# signatures.show()

In [None]:
# approxSimilarityJoin uses autmatically LSH to find rows that it is most likely 
# to have same "Signatures"
# threshold: pairs with Jaccard Distance lower than threshlod
similar_pairs = model.approxSimilarityJoin(signatures, signatures, threshold=0.2, distCol="Jaccard Distance")
# similar_pairs.show()

In [None]:
# similar_pairs.count()

In [None]:
similar_pairs = similar_pairs.select("datasetA.Process ID", "datasetB.Process ID", 
                     "Jaccard Distance")\
                        # .filter((col("datasetA.Process ID") != col("datasetB.Process ID")))

In [None]:
# similar_pairs.show()

In [None]:
new_cols = ["Process ID A", "Process ID B", "Jaccard Distance"]
similar_pairs = similar_pairs.toDF(*new_cols)

In [None]:
# similar_pairs.show()

In [None]:
# grouped_df.show()

In [None]:
pairs = similar_pairs.join(grouped_df, similar_pairs["Process ID A"] == col("Process ID")) \
                     .select(col("Process ID A"), col("Process ID B"), col("Log").alias("Log A")) \
                     .join(grouped_df, similar_pairs["Process ID B"] == col("Process ID")) \
                     .select(col("Process ID A"), col("Process ID B"), col("Log A"), col("Log").alias("Log B"))

In [None]:
# pairs.collect()

In [None]:
def original_check(x,y):
    return x==y

orifinal_checking = udf(original_check, BooleanType())
same_pairs = pairs.filter(orifinal_checking(col("Log A"), col("Log B")))

In [None]:
same_pairs = same_pairs.groupBy("Log A").agg(collect_set("Process ID A"))

In [None]:
# same_pairs.select("collect_set(Process ID A)").collect()