In [1]:
from pyspark.sql import SparkSession

# Build the notebook's Spark session, or reuse one that is already active.
spark = (
    SparkSession.builder
    .master("local")
    .appName("json_to_csv")
    .getOrCreate()
)

In [5]:
from pyspark.sql.functions import col, translate


# Load the product-view events; the schema is inferred from the JSON records.
# NOTE(review): this path is rooted at the current drive and is machine-specific —
# confirm the intended location or make it configurable.
product_views = spark.read.json(r"\data\product-views.json")

# Flatten the nested `properties` and `context` columns into plain string columns.
# Casting a nested value to string wraps it in brackets: "[...]" for arrays and for
# structs on Spark <= 3.0, "{...}" for structs on Spark >= 3.1. Both bracket styles
# are stripped so the result does not depend on the Spark version.
# `event`, `messageid` and `userid` are already top-level columns and pass through as-is.
product_views = (
    product_views
    .withColumn("productid", translate(col("properties").cast("string"), "[]{}", ""))
    .withColumn("source", translate(col("context").cast("string"), "[]{}", ""))
    .drop("context", "properties")
)

product_views.show(5, truncate=False)

# mode="overwrite" keeps the cell re-runnable: the default save mode raises an error
# as soon as the output directory from a previous run exists.
product_views.write.csv('product_views.csv', header=True, mode="overwrite")

+-----------+------------------------------------+--------+-----------+----------+
|event      |messageid                           |userid  |productid  |source    |
+-----------+------------------------------------+--------+-----------+----------+
|ProductView|6b1291ea-e50d-425b-9940-44c2aff089c1|user-78 |product-173|desktop   |
|ProductView|b8b14eb3-8e39-42a3-9528-a323b10a7686|user-74 |product-230|desktop   |
|ProductView|23db4f01-1576-415d-83cd-dce2d1bd28a2|user-155|product-38 |mobile-app|
|ProductView|9a01da3e-d1ea-43da-826e-333a6867ef57|user-228|product-42 |mobile-app|
|ProductView|b057f6a7-44f7-4f6c-a9e7-d17df9ee6088|user-346|product-784|desktop   |
+-----------+------------------------------------+--------+-----------+----------+
only showing top 5 rows



In [7]:
from pyspark.sql.functions import col, explode, translate


# Load the order events; the schema is inferred from the JSON records.
# NOTE(review): absolute, user-specific path — this cell only runs on the original
# author's machine; confirm the intended location or make it configurable.
orders = spark.read.json(r"C:\Users\DELL\Desktop\case_studies\hepsiburada_cases\data_engineer_exam\data\orders.json")

# Produce one output row per line item: explode() emits a row for every element of
# the `lineitems` array while the other selected columns are repeated.
# Each exploded element is then rendered as a plain string. Casting a nested value to
# string wraps it in brackets: "[...]" for arrays and for structs on Spark <= 3.0,
# "{...}" for structs on Spark >= 3.1. Both styles are stripped so the result does not
# depend on the Spark version.
orders = (
    orders
    .select(
        orders.event,
        orders.messageid,
        orders.userid,
        explode(orders.lineitems).alias("lineitems"),
        orders.orderid,
    )
    .withColumn("lineitems", translate(col("lineitems").cast("string"), "[]{}", ""))
)

orders.show(5, truncate=False)
orders.printSchema()

# mode="overwrite" keeps the cell re-runnable: the default save mode raises an error
# as soon as the output directory from a previous run exists.
orders.write.csv('orders.csv', header=True, mode="overwrite")

+----------+------------------------------------+--------+--------------+-------+
|event     |messageid                           |userid  |lineitems     |orderid|
+----------+------------------------------------+--------+--------------+-------+
|OrderEvent|2db62eb5-de95-4ce8-8161-ab7552dc2fd7|user-346|product-784, 3|50000  |
|OrderEvent|2db62eb5-de95-4ce8-8161-ab7552dc2fd7|user-346|product-173, 1|50000  |
|OrderEvent|f05643d0-29e7-47ca-a1b3-12ba3e8642e3|user-391|product-424, 1|50001  |
|OrderEvent|30d5cb63-63bb-42a2-b27a-673f3a4e7925|user-120|product-393, 3|50002  |
|OrderEvent|60c21fb3-a115-4f10-b595-6f07998de71e|user-79 |product-369, 3|50003  |
+----------+------------------------------------+--------+--------------+-------+
only showing top 5 rows

root
 |-- event: string (nullable = true)
 |-- messageid: string (nullable = true)
 |-- userid: string (nullable = true)
 |-- lineitems: string (nullable = true)
 |-- orderid: long (nullable = true)

