<a href="https://colab.research.google.com/github/dubeyabhi07/big-data-spark/blob/master/src/main/pyspark/event/SimpleToComplex.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Install the headless Java 8 JDK (JAVA_HOME is pointed at it in the next cell).
# -qq plus the redirect to /dev/null keeps apt's output out of the notebook.
!apt-get install openjdk-8-jdk-headless -qq > /dev/null

In [None]:
import os

# Expose the Java 8 install location to child processes through JAVA_HOME.
os.environ.update(JAVA_HOME="/usr/lib/jvm/java-8-openjdk-amd64")

In [None]:
!pip install pyspark
from pyspark.sql import SparkSession
spark = SparkSession.builder.master("local[*]").getOrCreate()

Collecting pyspark
[?25l  Downloading https://files.pythonhosted.org/packages/e9/e4/5c15ab8d354c4e3528510821865e6748209a9b0ff6a1788f4cd36cc2a5dc/pyspark-2.4.6.tar.gz (218.4MB)
[K     |████████████████████████████████| 218.4MB 61kB/s 
[?25hCollecting py4j==0.10.7
[?25l  Downloading https://files.pythonhosted.org/packages/e3/53/c737818eb9a7dc32a7cd4f1396e787bd94200c3997c72c1dbe028587bd76/py4j-0.10.7-py2.py3-none-any.whl (197kB)
[K     |████████████████████████████████| 204kB 42.8MB/s 
[?25hBuilding wheels for collected packages: pyspark
  Building wheel for pyspark (setup.py) ... [?25l[?25hdone
  Created wheel for pyspark: filename=pyspark-2.4.6-py2.py3-none-any.whl size=218814406 sha256=93e2f1bcb442a76187b974c3333cb2013d79c3e757ec271c845d82a28eeaf9e2
  Stored in directory: /root/.cache/pip/wheels/1e/5e/6a/17e906c94ec7246f260330a66e44a06a0809033ba2738a74a8
Successfully built pyspark
Installing collected packages: py4j, pyspark
Successfully installed py4j-0.10.7 pyspark-2.4.6


In [None]:
from pyspark.sql.functions import *


# Processing schedule.csv (was saved in ComplexToSimple.scala):
# fold the flat (event_id, city, cost, date) rows back into one nested
# "schedule" column per event.
schDf = (
    spark.read
    .option("header", True)
    .option("inferSchema", True)
    .csv("schedule.csv")
    # pack each row's cost/date pair into a single struct column
    .withColumn("detail", struct("cost", "date"))
    .drop("cost", "date")
    # gather all details belonging to one (event, city) pair
    .groupBy("event_id", "city")
    .agg(collect_list("detail").alias("details"))
    # wrap them as a single-entry map {city: [detail, ...]}
    .select("event_id", create_map("city", "details").alias("schedule_per_city"))
    # and finally collect the per-city maps into one array per event
    .groupBy("event_id")
    .agg(collect_list("schedule_per_city").alias("schedule"))
)

schDf.printSchema()


root
 |-- event_id: string (nullable = true)
 |-- schedule: array (nullable = true)
 |    |-- element: map (containsNull = true)
 |    |    |-- key: string
 |    |    |-- value: array (valueContainsNull = true)
 |    |    |    |-- element: struct (containsNull = true)
 |    |    |    |    |-- cost: string (nullable = true)
 |    |    |    |    |-- date: string (nullable = true)



In [None]:
from pyspark.sql.types import *

from pyspark.sql.functions import udf
# Target type for from_json on the post-processed schedule string, built from
# the innermost level outwards:
#   scheduleSchema : map<string, mapSchema>
#   mapSchema      : map<string, arrSchema>
#   arrSchema      : array<innerMapSchema>
#   innerMapSchema : map<string, string>
innerMapSchema = MapType(keyType=StringType(), valueType=StringType(), valueContainsNull=True)
arrSchema = ArrayType(elementType=innerMapSchema, containsNull=True)
mapSchema = MapType(keyType=StringType(), valueType=arrSchema, valueContainsNull=True)
scheduleSchema = MapType(keyType=StringType(), valueType=mapSchema, valueContainsNull=True)

def listToMapUdfFunc(val):
  """Merge a JSON array of maps into one JSON object stored under "schedule".

  The input is expected to be the JSON text of an array of objects, e.g.
  '[{"A": [...]}, {"B": [...]}]'. The objects are merged key by key and the
  result is returned as the JSON text '{"schedule": {"A": [...], "B": [...]}}'.

  The string is parsed and re-serialised rather than edited with substring
  replacement, so the output is always strict (double-quoted) JSON, values
  that happen to contain "]},{" are left intact, and an empty array yields
  an empty "schedule" object.

  Args:
    val: JSON text of an array of objects, or None.

  Returns:
    The merged JSON text, or None when ``val`` is None or does not decode to
    a JSON array.
  """
  # Imported locally so the function does not rely on a notebook-level import.
  import json

  if val is None:
    return None

  parsed = json.loads(val)
  if not isinstance(parsed, list):
    # Not the array-of-maps shape this helper is written for.
    return None

  merged = {}
  for cityMap in parsed:
    merged.update(cityMap)
  return json.dumps({"schedule": merged})


# Spark UDF wrapper around listToMapUdfFunc; the UDF returns a string column.
listToMapUdf = udf(listToMapUdfFunc, returnType=StringType())


In [None]:
# Convert the array-of-maps "schedule" column into a single nested map:
# serialise to JSON, rewrite the text with the UDF, then parse it back using
# scheduleSchema.
# NOTE: this rebinds schDf from itself, so the cell is not idempotent. Re-run
# the cell that builds schDf from schedule.csv before re-running this one.
schDf = (
    schDf
    .select("event_id", to_json("schedule").alias("scheduleStr"))
    .withColumn("processedScheduleStr", listToMapUdf("scheduleStr"))
    .withColumn("schedule", from_json("processedScheduleStr", scheduleSchema))
    # the two helper string columns are only needed during the conversion
    .drop("processedScheduleStr", "scheduleStr")
)

print("consolidated schedule Schema .........................................")
schDf.show(10)
schDf.printSchema()

consolidated schedule Schema .........................................
+--------------------+--------+
|            event_id|schedule|
+--------------------+--------+
|66729580-d036-475...|    null|
|d644ef66-fe38-4d5...|    null|
|522ef782-1a56-466...|    null|
|c12d9de6-82a8-417...|    null|
|0f815487-fddc-415...|    null|
|7dbafd73-563e-412...|    null|
|1a8672ec-97c3-466...|    null|
|d997f98b-cbb2-47b...|    null|
|e182c7aa-55c7-41e...|    null|
|5ad93a2d-c540-412...|    null|
+--------------------+--------+
only showing top 10 rows

root
 |-- event_id: string (nullable = true)
 |-- schedule: map (nullable = true)
 |    |-- key: string
 |    |-- value: map (valueContainsNull = true)
 |    |    |-- key: string
 |    |    |-- value: array (valueContainsNull = true)
 |    |    |    |-- element: map (containsNull = true)
 |    |    |    |    |-- key: string
 |    |    |    |    |-- value: string (valueContainsNull = true)



In [None]:
# Processing reserved.json (was saved in ComplexToSimple.scala).
# multiLine lets a single JSON record span several lines of the file.
resDf = spark.read.option("multiLine", True).json("../reserved.json")
resDf.show(5)

+--------------+--------------------+--------------------+---------------------+--------------------+-------------+
|confirmed_city|             details|            event_id|total_confirmed_slots|total_waitlist_slots|waitlist_city|
+--------------+--------------------+--------------------+---------------------+--------------------+-------------+
|         Delhi|[[3485 Trailsway ...|da347978-162e-4f5...|                    3|                   5|        Delhi|
|          null|                null|c12d9de6-82a8-417...|                 null|                   5|        Delhi|
|     Bengaluru|[[354 8th Crossin...|002ce8ce-3609-490...|                    4|                   4|    Bengaluru|
|          null|                null|661db387-0bed-435...|                 null|                   5|      Kolkata|
|       Chennai|[[20159 Pepper Wo...|9dbd178d-c4f4-4ae...|                    5|                   2|      Chennai|
+--------------+--------------------+--------------------+---------------------+--------------------+-------------+

In [None]:
def processConfirmPart(resDf):
  """Collect the confirmed reservations of each event into one array column.

  Rows without a confirmed_city are ignored. Every element of the remaining
  rows' "details" array becomes a struct (city, address, slots), and the
  structs are gathered per event_id.

  Args:
    resDf: DataFrame with the columns event_id, details, confirmed_city and
      total_confirmed_slots.

  Returns:
    DataFrame with the columns event_id and confirmed (array of structs).
  """
  confirmedRows = (
      resDf
      # total_confirmed_slots is read here but not carried into the result
      .select("event_id", "details", "confirmed_city", "total_confirmed_slots")
      .where(col("confirmed_city").isNotNull())
  )

  # explode produces one output row per element of the details array
  perDetail = confirmedRows.select(
      "event_id",
      col("confirmed_city").alias("city"),
      explode("details").alias("detail"),
  )

  perEvent = perDetail.select(
      "event_id",
      struct("city", "detail.address", "detail.slots").alias("confirmed"),
  )

  return perEvent.groupBy("event_id").agg(collect_list("confirmed").alias("confirmed"))
  
  
def processWaitlistPart(resDf):
  """Collect the waitlist entries of each event into one array column.

  Rows without a waitlist_city are ignored. Each remaining row contributes a
  struct (city, slots) built from waitlist_city and total_waitlist_slots, and
  the structs are gathered per event_id.

  Args:
    resDf: DataFrame with the columns event_id, waitlist_city and
      total_waitlist_slots.

  Returns:
    DataFrame with the columns event_id and waitlist (array of structs).
  """
  waitlistedRows = resDf.where(col("waitlist_city").isNotNull())

  perCity = waitlistedRows.select(
      "event_id",
      struct(
          col("waitlist_city").alias("city"),
          col("total_waitlist_slots").alias("slots"),
      ).alias("detail"),
  )

  return perCity.groupBy("event_id").agg(collect_list("detail").alias("waitlist"))
  

In [None]:
# Build the per-event "confirmed" array and inspect a sample plus its schema.
# resDf must still be the frame read from reserved.json here: a later cell
# rebinds the name resDf to the consolidated result.
confirmedDf = processConfirmPart(resDf)
confirmedDf.show(5)
confirmedDf.printSchema()

+--------------------+--------------------+
|            event_id|           confirmed|
+--------------------+--------------------+
|66729580-d036-475...|[[Delhi, 83403 Do...|
|522ef782-1a56-466...|[[Bengaluru, 0415...|
|d644ef66-fe38-4d5...|[[Mumbai, 18969 H...|
|c12d9de6-82a8-417...|[[Kolkata, 061 My...|
|0f815487-fddc-415...|[[Mumbai, 0308 Hu...|
+--------------------+--------------------+
only showing top 5 rows

root
 |-- event_id: string (nullable = true)
 |-- confirmed: array (nullable = true)
 |    |-- element: struct (containsNull = true)
 |    |    |-- city: string (nullable = true)
 |    |    |-- address: string (nullable = true)
 |    |    |-- slots: long (nullable = true)



In [None]:
# Build the per-event "waitlist" array and inspect a sample plus its schema.
# resDf must still be the frame read from reserved.json here: a later cell
# rebinds the name resDf to the consolidated result.
waitlistDf = processWaitlistPart(resDf)
waitlistDf.show(5)
waitlistDf.printSchema()

+--------------------+--------------------+
|            event_id|            waitlist|
+--------------------+--------------------+
|66729580-d036-475...|[[Kolkata, 1], [D...|
|d644ef66-fe38-4d5...|[[Chennai, 5], [M...|
|522ef782-1a56-466...|[[Bengaluru, 1], ...|
|c12d9de6-82a8-417...|[[Delhi, 5], [Kol...|
|0f815487-fddc-415...|[[Delhi, 3], [Mum...|
+--------------------+--------------------+
only showing top 5 rows

root
 |-- event_id: string (nullable = true)
 |-- waitlist: array (nullable = true)
 |    |-- element: struct (containsNull = true)
 |    |    |-- city: string (nullable = true)
 |    |    |-- slots: long (nullable = true)



In [None]:
# Combine both halves into one "reserved" struct per event. The join is an
# inner join, so only events present in both frames are kept.
# NOTE: this rebinds resDf, replacing the frame that was read from
# reserved.json.
resDf = (
    confirmedDf
    .join(waitlistDf, on="event_id")
    .select("event_id", struct("confirmed", "waitlist").alias("reserved"))
)

print("consolidated reserved Schema .........................................")
resDf.show(5)
resDf.printSchema()


consolidated reserved Schema .........................................
+--------------------+--------------------+
|            event_id|            reserved|
+--------------------+--------------------+
|66729580-d036-475...|[[[Delhi, 83403 D...|
|522ef782-1a56-466...|[[[Bengaluru, 041...|
|d644ef66-fe38-4d5...|[[[Mumbai, 18969 ...|
|c12d9de6-82a8-417...|[[[Kolkata, 061 M...|
|0f815487-fddc-415...|[[[Mumbai, 0308 H...|
+--------------------+--------------------+
only showing top 5 rows

root
 |-- event_id: string (nullable = true)
 |-- reserved: struct (nullable = false)
 |    |-- confirmed: array (nullable = true)
 |    |    |-- element: struct (containsNull = true)
 |    |    |    |-- city: string (nullable = true)
 |    |    |    |-- address: string (nullable = true)
 |    |    |    |-- slots: long (nullable = true)
 |    |-- waitlist: array (nullable = true)
 |    |    |-- element: struct (containsNull = true)
 |    |    |    |-- city: string (nullable = true)
 |    |    |    |-- slots: long (nullable = true)

In [None]:
# Join reservations with schedules (inner join on event_id) and wrap
# everything in a single "event_data" struct. "schedule.schedule" takes the
# value stored under the "schedule" key of the schedule map column.
finalDf = (
    resDf
    .join(schDf, on="event_id")
    .select("event_id", "reserved", "schedule")
    .select(
        struct(
            col("event_id"),
            col("schedule.schedule"),
            col("reserved"),
        ).alias("event_data")
    )
)

print("Final Output .........................................")
finalDf.show(5)
finalDf.printSchema()

Final Output .........................................
+--------------------+
|          event_data|
+--------------------+
|[66729580-d036-47...|
|[522ef782-1a56-46...|
|[d644ef66-fe38-4d...|
|[c12d9de6-82a8-41...|
|[0f815487-fddc-41...|
+--------------------+
only showing top 5 rows

root
 |-- event_data: struct (nullable = false)
 |    |-- event_id: string (nullable = true)
 |    |-- schedule: map (nullable = true)
 |    |    |-- key: string
 |    |    |-- value: array (valueContainsNull = true)
 |    |    |    |-- element: map (containsNull = true)
 |    |    |    |    |-- key: string
 |    |    |    |    |-- value: string (valueContainsNull = true)
 |    |-- reserved: struct (nullable = false)
 |    |    |-- confirmed: array (nullable = true)
 |    |    |    |-- element: struct (containsNull = true)
 |    |    |    |    |-- city: string (nullable = true)
 |    |    |    |    |-- address: string (nullable = true)
 |    |    |    |    |-- slots: long (nullable = true)
 |    |    |--