# PySpark

## Init Engine

In [1]:
from pyspark.sql import SparkSession

# getOrCreate() hands back an already-running session when one exists and
# only otherwise starts a new one under this application name.
spark = (
    SparkSession.builder
    .appName('pyspark-jaeger')
    .getOrCreate()
)

# Set the session time zone (not a timestamp) to Asia/Shanghai.
# This stays the last expression of the cell, so the DataFrame returned by
# the SET statement is still what the cell displays.
spark.sql("set spark.sql.session.timeZone=Asia/Shanghai")

Starting Spark application


ID,YARN Application ID,Kind,State,Spark UI,Driver log,Current session?
0,,pyspark,idle,,,✔


SparkSession available as 'spark'.
DataFrame[key: string, value: string]

# Data loading

## Create DataFrame from CSV file

In [2]:
# Load the CSV file: the first row supplies the column names, Spark infers
# the column types, fields are comma-separated, and the double quote is the
# escape character.
df = (
    spark.read
    .option("header", "true")
    .option("inferSchema", "true")
    .option("delimiter", ",")
    .option("escape", "\"")
    .csv("/data/7c1452a231ddf8a6.csv")
)

# Show the inferred schema so the column types can be checked by eye.
df.printSchema()

root
 |-- _c0: integer (nullable = true)
 |-- operationName: string (nullable = true)
 |-- startTime: long (nullable = true)
 |-- duration: integer (nullable = true)
 |-- spanID: string (nullable = true)
 |-- references: string (nullable = true)
 |-- tags: string (nullable = true)
 |-- logs: string (nullable = true)

## Normalize schema

In [3]:
from pyspark.sql.functions import *

# Columns removed from the normalized frame. 'warnings' is not part of the
# schema printed above; DataFrame.drop() is a no-op for names it cannot find,
# so listing it is harmless.
columns_to_drop = ['_c0', 'warnings']

# startTime is treated as microseconds since the Unix epoch (it is divided by
# 1,000,000 to obtain seconds). The previous from_unixtime() call formatted
# the value as a 'yyyy-MM-dd HH:mm:ss' string first, which silently dropped
# the sub-second part. Casting the seconds value straight to a timestamp
# keeps it; going through a decimal rather than a double keeps the
# microsecond digits exact.
normDf = (
    df
    .withColumn(
        'startTime',
        (col('startTime').cast('decimal(20,0)') / 1000000).cast('timestamp'),
    )
    .drop(*columns_to_drop)
)

normDf.printSchema()

root
 |-- operationName: string (nullable = true)
 |-- startTime: timestamp (nullable = true)
 |-- duration: integer (nullable = true)
 |-- spanID: string (nullable = true)
 |-- references: string (nullable = true)
 |-- tags: string (nullable = true)
 |-- logs: string (nullable = true)

# Data cleaning

## Sampling

In [None]:
# Print a single row without truncating long cell values.
normDf.show(n=1, truncate=False)

In [None]:
# Count rows per operation name and list the most frequent first, without
# truncating the names.
(
    normDf
    .groupBy("operationName")
    .count()
    .orderBy(desc("count"))
    .show(truncate=False)
)

### Data preparation & ordering

In [8]:
import ast
import json


def parse_json(array_str):
    """Yield one ``(key, type, value)`` tuple per object in a JSON array.

    :param array_str: strict JSON text holding an array of objects, each with
        the members ``key``, ``type`` and ``value``.
    :raises ValueError: if ``array_str`` is not valid JSON.
    :raises KeyError: if an object lacks one of the three members.
    """
    for item in json.loads(array_str):
        yield (item["key"], item["type"], item["value"])


def _load_tag_objects(array_str):
    """Decode ``array_str`` into a Python list, trying three encodings in turn."""
    # 1. Strict JSON. Tried first so that valid JSON containing apostrophes
    #    is never rewritten.
    try:
        return json.loads(array_str)
    except ValueError:
        pass
    # 2. Python literal syntax (single-quoted strings). literal_eval only
    #    evaluates literals, never arbitrary code, and copes with double
    #    quotes embedded in a value.
    try:
        return ast.literal_eval(array_str)
    except (ValueError, SyntaxError):
        pass
    # 3. Legacy behaviour: blindly swap the quote characters. Kept so every
    #    input that parsed before still parses; raises ValueError otherwise.
    return json.loads(array_str.replace("\'", "\""))


def parse_all(array_str):
    """Return the ``(key, type, value)`` tuples encoded in ``array_str`` as a list.

    Accepts strict JSON as well as the single-quoted, Python-style rendering
    of the same structure.

    :param array_str: text encoding an array of objects, or ``None``.
    :returns: list of ``(key, type, value)`` tuples; ``[]`` when ``array_str``
        is ``None``, empty or whitespace only.
    :raises ValueError: if the text cannot be decoded by any strategy.
    :raises KeyError: if an object lacks ``key``, ``type`` or ``value``.
    """
    # A missing value carries no entries; report that as an empty list
    # instead of failing on None.replace(...).
    if array_str is None or not array_str.strip():
        return []
    return [
        (item["key"], item["type"], item["value"])
        for item in _load_tag_objects(array_str)
    ]

In [12]:
# Hand-written sample in single-quoted form used to try the parser out
# (presumably mirrors how the `tags` column is encoded; verify against the data).
json_str = '''[{'key': 'internal.span.format', 'type': 'string', 'value': 'proto'}, {'key': 'sampler.param', 'type': 'bool', 'value': 'True'}, {'key': 'sampler.type', 'type': 'string', 'value': 'const'}]'''

# Expect one (key, type, value) tuple per object in the sample.
parsed_sample = parse_all(json_str)
print(parsed_sample)

[(u'internal.span.format', u'string', u'proto'), (u'sampler.param', u'bool', u'True'), (u'sampler.type', u'string', u'const')]

In [None]:
from pyspark.sql.functions import udf

# https://kontext.tech/column/spark/284/pyspark-convert-json-string-column-to-array-of-object-structtype-in-data-frame

# Define the schema
from pyspark.sql.types import *

# Return type of the UDF: an array of structs with three non-nullable
# string fields, matching the (key, type, value) tuples from parse_all().
json_schema = ArrayType(StructType([
    StructField('key', StringType(), nullable=False),
    StructField('type', StringType(), nullable=False),
    StructField('value', StringType(), nullable=False),
]))

# The `tags` column is nullable, so the UDF can receive None. Map missing or
# empty input to an empty array instead of letting the parser raise and fail
# the whole job. The parameter is named `tags` so it does not shadow the
# builtin `str`.
udf_parse_json = udf(lambda tags: parse_all(tags) if tags else [], json_schema)

# Add the parsed tags as a new column and show one row untruncated.
cleanDf = normDf.withColumn("tags_json", udf_parse_json(col("tags")))
cleanDf.show(1, truncate=False)