In [104]:
import pyspark
from pyspark import SparkContext, SparkConf
from pyspark.sql import SparkSession
import pandas as pd 
import numpy as np 
from pyspark.sql.functions import split, col, regexp_replace, collect_list

In [12]:
# Build (or reuse, if one already exists) a Spark session running in local mode
# on all available cores, with explicit driver memory and driver result-size
# settings.
spark = (
    SparkSession.builder
    .appName("Projet-Task-1")
    .master("local[*]")
    .config("spark.driver.memory", "2G")
    .config("spark.driver.maxResultSize", "2g")
    .getOrCreate()
)
# Leave the session as the last expression so the notebook displays it.
spark

In [11]:
# Shut down the active Spark session.
# NOTE(review): the cells below still use `spark`, so running the notebook
# top-to-bottom would reach them with a stopped session. Presumably this cell is
# meant to run before the session is (re)created, or at the very end of the
# notebook — confirm the intended order.
spark.stop()

In [144]:
# Load data.txt as text (one row per line) and name the single column "Log".
df = (
    spark.read
    .text("data.txt")
    .toDF("Log")
)

In [145]:
# Clean and tokenize every raw line: strip the angle brackets, strip the commas,
# then split what is left on single spaces so "Log" becomes an array of strings.
# NOTE: this cell reassigns `df`; "Log" is an array afterwards, so re-run the
# load cell before re-running this one.
df = (
    df
    .withColumn("Log", regexp_replace(col("Log"), "[<>]", ""))
    .withColumn("Log", regexp_replace(col("Log"), ",", ""))
    .withColumn("Log", split(col("Log"), " "))
)

df.printSchema()
df.show()

# Names given to the first four tokens of each line, in positional order.
columns = ["First Server", "Second Server", "Communication Type", "Process ID"]

# Copy the token at each position of the "Log" array into its own named column.
for position, column_name in enumerate(columns):
    print(column_name)
    df = df.withColumn(column_name, col("Log")[position])

df.printSchema()
df.show()

root
 |-- Log: array (nullable = true)
 |    |-- element: string (containsNull = false)

+--------------------+
|                 Log|
+--------------------+
|[null, S-1, Reque...|
|[S-1, S-15, Reque...|
|[S-15, S-20, Requ...|
|[S-20, S-15, Resp...|
|[S-15, S-26, Requ...|
|[S-26, S-2, Reque...|
|[S-2, S-26, Respo...|
|[S-26, S-15, Resp...|
|[S-15, S-1, Respo...|
|[S-1, null, Respo...|
|[null, S-1, Reque...|
|[S-1, S-48, Reque...|
|[S-48, S-46, Requ...|
|[S-46, S-14, Requ...|
|[S-14, S-12.1, Re...|
|[S-12.1, S-14, Re...|
|[S-14, S-46, Resp...|
|[S-46, S-48, Resp...|
|[S-48, S-1, Respo...|
|[S-1, null, Respo...|
+--------------------+
only showing top 20 rows

First Server
Second Server
Communication Type
Process ID
root
 |-- Log: array (nullable = true)
 |    |-- element: string (containsNull = false)
 |-- First Server: string (nullable = true)
 |-- Second Server: string (nullable = true)
 |-- Communication Type: string (nullable = true)
 |-- Process ID: string (nullable = true)

+-----

In [152]:
# Source column -> name of the list column produced for it after grouping.
# NOTE(review): the alias "FCommunication Type" looks like a typo for
# "Communication Type". It is kept unchanged here because other cells may refer
# to the existing column name — confirm and rename everywhere at once.
aggregated_columns = {
    "Log": "Log",
    "First Server": "First Server",
    "Second Server": "Second Server",
    "Communication Type": "FCommunication Type",
}

# One row per "Process ID", with each listed column gathered into a list.
# NOTE(review): collect_list makes no ordering guarantee after a shuffle; if the
# per-process message sequence matters, confirm that an explicit ordering is
# applied somewhere.
grouped_df = df.groupBy("Process ID").agg(
    *[
        collect_list(source_name).alias(list_name)
        for source_name, list_name in aggregated_columns.items()
    ]
)

grouped_df.printSchema()
grouped_df.show()

root
 |-- Process ID: string (nullable = true)
 |-- Log: array (nullable = false)
 |    |-- element: array (containsNull = false)
 |    |    |-- element: string (containsNull = false)
 |-- First Server: array (nullable = false)
 |    |-- element: string (containsNull = false)
 |-- Second Server: array (nullable = false)
 |    |-- element: string (containsNull = false)
 |-- FCommunication Type: array (nullable = false)
 |    |-- element: string (containsNull = false)

+----------+--------------------+--------------------+--------------------+--------------------+
|Process ID|                 Log|        First Server|       Second Server| FCommunication Type|
+----------+--------------------+--------------------+--------------------+--------------------+
|         3|[[null, S-1, Requ...|         [null, S-1]|         [S-1, null]| [Request, Response]|
|         0|[[null, S-1, Requ...|[null, S-1, S-15,...|[S-1, S-15, S-20,...|[Request, Request...|
|         1|[[null, S-1, Requ...|[null, S-1