# Use Case 1 - Summary of All Flight

## Initialization

In [8]:
import os, json
import pandas as pd
import pprint
from pyspark.sql import SparkSession, Window
from pyspark.sql.types import IntegerType, StringType, FloatType, DoubleType, ArrayType, LongType
from pyspark.sql.functions import col, concat, concat_ws, max, min, explode, arrays_zip, to_timestamp, expr, unix_timestamp, lit

## Assumption

##### 1. One file is 1 flight id, so there is no way multiple flight in single file
##### 2. All files are json format and located under specific folder

## Load Dataset

In [2]:
# Initialize Spark Session
sparkMaster = 'spark://192.168.1.4:7077'
sparkAppName = 'Py-RevalueNature-Case2'
spark = SparkSession \
    .builder \
    .master(sparkMaster) \
    .appName(sparkAppName) \
    .getOrCreate()

In [3]:
# Read all dataset
folder_path = 'D:\\00 Project\\00 My Project\\Dataset\\Revalue Nature\\Case 2\\' # Assuming this where all json file will be stored
file_name = '*.json' # Naming file convention
df_source = spark.read.json(folder_path + file_name, multiLine=True) # Read all json files
df_source.printSchema()

root
 |-- alt: long (nullable = true)
 |-- centre_ctrl: array (nullable = true)
 |    |-- element: struct (containsNull = true)
 |    |    |-- centre_id: long (nullable = true)
 |    |    |-- start_time: string (nullable = true)
 |-- centre_id: string (nullable = true)
 |-- fpl: struct (nullable = true)
 |    |-- fpl_arr: array (nullable = true)
 |    |    |-- element: struct (containsNull = true)
 |    |    |    |-- approach_clearance: boolean (nullable = true)
 |    |    |    |-- arrival_runway: string (nullable = true)
 |    |    |    |-- ata: string (nullable = true)
 |    |    |    |-- missed_approach_flag: boolean (nullable = true)
 |    |    |    |-- star: string (nullable = true)
 |    |    |    |-- time_stamp: string (nullable = true)
 |    |-- fpl_base: array (nullable = true)
 |    |    |-- element: struct (containsNull = true)
 |    |    |    |-- adar: string (nullable = true)
 |    |    |    |-- adep: string (nullable = true)
 |    |    |    |-- ades: string (nullable = tr

## Extract Data Function

In [9]:
def extract_data(df_source):
    # Extract important columns
    df_result1 = (df_source
                  .withColumn('id', col('id')) \
                  .withColumn('list_altitude', col('plots.I062/380.subitem7.altitude').cast(ArrayType(DoubleType())))
                  .withColumn('list_baro_vert_rate', col('plots.I062/380.subitem13.baro_vert_rate').cast(ArrayType(DoubleType())))
                  .withColumn('list_mach', col('plots.I062/380.subitem27.mach').cast(ArrayType(DoubleType())))
                  .withColumn('list_measured_flight_level', col('plots.I062/136.measured_flight_level').cast(ArrayType(DoubleType())))
                  .withColumn('list_start_time', col('centre_ctrl.start_time').cast(ArrayType(StringType())))
                  .withColumn('list_time_of_track', col('plots.time_of_track').cast(ArrayType(StringType())))
                  .withColumn('list_aircraft_type', col('fpl.fpl_base.aircraft_type'))
                  .withColumn('list_flight_rules', col('fpl.fpl_base.flight_rules'))
                )

    # Transform into new columns
    df_result2 = (df_result1
                  .withColumn("tmp", arrays_zip("list_altitude", "list_baro_vert_rate", "list_mach", "list_measured_flight_level", "list_time_of_track")) # ZIP array columns
                  .withColumn("tmp", explode("tmp")) # explode the array
                  .withColumn('plots_max_altitude', max(col("tmp.list_altitude")).over(Window.partitionBy('id'))) # get max altitude by id
                  .withColumn('plots_max_baro_vert_rate', max(col("tmp.list_baro_vert_rate")).over(Window.partitionBy('id'))) # get max baro_vert_rate by id
                  .withColumn('plots_max_mach', max(col("tmp.list_mach")).over(Window.partitionBy('id'))) # get max mach by id
                  .withColumn('plots_max_measured_flight_level', max(col("tmp.list_measured_flight_level")).over(Window.partitionBy('id'))) # get max measured_flight_level by id
                  .withColumn('end_time', max(to_timestamp(col("tmp.list_time_of_track"))).over(Window.partitionBy('id'))) # get end time
                  .withColumn("start_time", to_timestamp(df_result1["list_start_time"].getItem(0))) # get start time
                  .withColumn("duration_s", unix_timestamp("end_time") - unix_timestamp("start_time")) # get durations in second
                  .withColumn("duration_h", concat(expr("duration_s / 3600").cast(IntegerType()), lit("h")))
                  .withColumn("duration_m", concat(expr("(duration_s % 3600) / 60").cast(IntegerType()), lit("m")))
                  .withColumn("duration_s", concat(expr("duration_s % 60").cast(IntegerType()), lit("s")))
                  .withColumn("plots_duration", concat_ws(' ',col("duration_h"), col("duration_m"), col("duration_s"))) # get all durations
                  .withColumn('aircraft_type', df_result1["list_aircraft_type"].getItem(0)) # get aircraft type
                  .withColumn('flight_rules', df_result1["list_flight_rules"].getItem(0)) # get aircraft type
                 )
    
    df_result2 = df_result2.select('id', 'plots_duration', 'plots_max_altitude', 'plots_max_baro_vert_rate', 'plots_max_mach', 'plots_max_measured_flight_level', 'aircraft_type', 'flight_rules') \
                .distinct()
    return df_result2

df_test_input = spark.read.json(folder_path + '10000*.json', multiLine=True) # Read all json files
df_test_result = extract_data(df_test_input)
print(df_test_result.show())

+------+--------------+------------------+------------------------+--------------+-------------------------------+-------------+------------+
|    id|plots_duration|plots_max_altitude|plots_max_baro_vert_rate|plots_max_mach|plots_max_measured_flight_level|aircraft_type|flight_rules|
+------+--------------+------------------+------------------------+--------------+-------------------------------+-------------+------------+
|100000|     0h 6m 26s|           33000.0|                  3325.0|         0.728|                          252.0|         B738|           I|
|100001|     0h 32m 5s|           35000.0|                  381.25|         0.848|                          350.0|         B772|           I|
|100002|     0h 47m 0s|           41000.0|                   162.5|         0.808|                          410.0|         B737|           I|
|100003|    0h 49m 52s|           37800.0|                 3006.25|         0.792|                          340.0|         CRJ9|           I|
|10000

## Prepare Final Result

In [16]:
def prepare_result_flights(df_source):
    # Select flights columns
    df_flights = df_source.select('id', 'plots_duration', 'plots_max_altitude', 'plots_max_baro_vert_rate', 'plots_max_mach', 'plots_max_measured_flight_level')
    # print(df_flights.show())
    
    # Convert to json format
    json_flights = df_flights.toJSON().collect()
    return json_flights

def prepare_result_aircraft_type(df_source):
    # Select unique_aircraft_type columns
    df_aircraft_type = df_source.select('aircraft_type')
    # print(df_aircraft_type.show())
    
    # Convert to json format
    list_unique_aircraft_type = df_aircraft_type.rdd.flatMap(lambda x: x).collect()
    return list_unique_aircraft_type

def prepare_result_flight_rules(df_source):
    # Select unique_flight_rules columns
    df_flight_rules = df_source.select('flight_rules')
    # print(df_flight_rules.show())
    
    # Convert to json format
    list_unique_flight_rules = df_flight_rules.rdd.flatMap(lambda x: x).collect()
    return list_unique_flight_rules

def build_dict(flights, aircraft_type, flight_rules):
    # Initialize Dictionary Result
    dict_result = {"flights": '', "unique_aircraft_type": '', "unique_flight_rules": ''}
    
    # Test dictionary
    dict_result["flights"] = flights
    dict_result["unique_aircraft_type"] = aircraft_type
    dict_result["unique_flight_rules"] = flight_rules

    return dict_result

# This is for testing purpose
df_flights = prepare_result_flights(df_test_result)
df_aircraft_type = prepare_result_aircraft_type(df_test_result)
df_flight_rules = prepare_result_flight_rules(df_test_result)
dict_result = build_dict(df_flights, df_aircraft_type, df_flight_rules)
pprint.pprint(dict_result)

{'flights': ['{"id":100000,"plots_duration":"0h 6m '
             '26s","plots_max_altitude":33000.0,"plots_max_baro_vert_rate":3325.0,"plots_max_mach":0.728,"plots_max_measured_flight_level":252.0}',
             '{"id":100001,"plots_duration":"0h 32m '
             '5s","plots_max_altitude":35000.0,"plots_max_baro_vert_rate":381.25,"plots_max_mach":0.848,"plots_max_measured_flight_level":350.0}',
             '{"id":100002,"plots_duration":"0h 47m '
             '0s","plots_max_altitude":41000.0,"plots_max_baro_vert_rate":162.5,"plots_max_mach":0.808,"plots_max_measured_flight_level":410.0}',
             '{"id":100003,"plots_duration":"0h 49m '
             '52s","plots_max_altitude":37800.0,"plots_max_baro_vert_rate":3006.25,"plots_max_mach":0.792,"plots_max_measured_flight_level":340.0}',
             '{"id":100004,"plots_duration":"0h 18m '
             '45s","plots_max_altitude":19000.0,"plots_max_baro_vert_rate":993.75,"plots_max_mach":0.408,"plots_max_measured_flight_level":19

## Extract and Transforms All Data

In [17]:
df_extraction = extract_data(df_source)
print("Total Data: ",df_extraction.count())

Total Data:  13138


In [23]:
df_flights = prepare_result_flights(df_extraction)
print("Done Preparing Flights Table")

Done Preparing Flights Table


In [24]:
df_aircraft_type = prepare_result_aircraft_type(df_extraction)
print("Done Preparing Aircraft Type Table")

Done Preparing Aircraft Type Table


In [25]:
df_flight_rules = prepare_result_flight_rules(df_extraction)
print("Done Preparing Flight Rules Table")

Done Preparing Flight Rules Table


In [26]:
dict_result = build_dict(df_flights, df_aircraft_type, df_flight_rules)
print("Done Preparing Final Dictionary")

Done Preparing Final Dictionary


In [27]:
pprint.pprint(dict_result)

{'flights': ['{"id":100012,"plots_duration":"0h 49m '
             '47s","plots_max_altitude":37000.0,"plots_max_baro_vert_rate":543.75,"plots_max_mach":0.792,"plots_max_measured_flight_level":370.25}',
             '{"id":100016,"plots_duration":"0h 44m '
             '44s","plots_max_altitude":37000.0,"plots_max_baro_vert_rate":418.75,"plots_max_mach":0.8,"plots_max_measured_flight_level":370.0}',
             '{"id":100017,"plots_duration":"1h 2m '
             '5s","plots_max_altitude":16000.0,"plots_max_baro_vert_rate":225.0,"plots_max_mach":0.44,"plots_max_measured_flight_level":160.25}',
             '{"id":100030,"plots_duration":"1h 6m '
             '19s","plots_max_baro_vert_rate":1981.25,"plots_max_mach":0.504,"plots_max_measured_flight_level":190.0}',
             '{"id":100031,"plots_duration":"0h 16m '
             '59s","plots_max_altitude":34000.0,"plots_max_baro_vert_rate":162.5,"plots_max_mach":0.856,"plots_max_measured_flight_level":340.0}',
             '{"id":1000

IOPub data rate exceeded.
The Jupyter server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--ServerApp.iopub_data_rate_limit`.

Current values:
ServerApp.iopub_data_rate_limit=1000000.0 (bytes/sec)
ServerApp.rate_limit_window=3.0 (secs)

