In [1]:
from pyspark.sql import SparkSession  # Create a SparkSession

import pyspark.sql.functions as F
import pyspark.sql.types as T

In [2]:
spark = (
    SparkSession.builder.appName("SparkSQLExampleApp").master("local[4]").getOrCreate()
)

spark.conf.set("spark.sql.shuffle.partition", 4)
# spark.conf.set("spark.sql.session.timeZone", "UTC")

22/06/07 13:00:46 WARN Utils: Your hostname, emif-MacBook-Pro.local resolves to a loopback address: 127.0.0.1; using 192.168.1.171 instead (on interface en0)
22/06/07 13:00:46 WARN Utils: Set SPARK_LOCAL_IP if you need to bind to another address
Using Spark's default log4j profile: org/apache/spark/log4j-defaults.properties
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
22/06/07 13:00:47 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


In [3]:
# Path to data set
csv_file = "/Users/emif/Documents/spark-lab/datasets/departuredelays.csv"

schema = """date STRING, 
            delay INT, 
            distance INT,
            origin STRING,
            destination STRING
"""

# Read and create a temporary view
df = spark.read.format("csv").schema(schema).option("header", "true").load(csv_file)
# create temporal view
df.createOrReplaceTempView("us_delay_flights_tbl")

In [4]:
df.describe().show()

                                                                                

+-------+-----------------+------------------+-----------------+-------+-----------+
|summary|             date|             delay|         distance| origin|destination|
+-------+-----------------+------------------+-----------------+-------+-----------+
|  count|          1391578|           1391578|          1391578|1391578|    1391578|
|   mean|2180446.584000322|12.079802928761449|690.5508264718184|   null|       null|
| stddev|838031.1536740973|38.807733749856475|513.6628153663352|   null|       null|
|    min|         01010005|              -112|               21|    ABE|        ABE|
|    max|         03312359|              1642|             4330|    YUM|        YUM|
+-------+-----------------+------------------+-----------------+-------+-----------+



In [5]:
df.printSchema()

root
 |-- date: string (nullable = true)
 |-- delay: integer (nullable = true)
 |-- distance: integer (nullable = true)
 |-- origin: string (nullable = true)
 |-- destination: string (nullable = true)



In [6]:
df.show()

+--------+-----+--------+------+-----------+
|    date|delay|distance|origin|destination|
+--------+-----+--------+------+-----------+
|01011245|    6|     602|   ABE|        ATL|
|01020600|   -8|     369|   ABE|        DTW|
|01021245|   -2|     602|   ABE|        ATL|
|01020605|   -4|     602|   ABE|        ATL|
|01031245|   -4|     602|   ABE|        ATL|
|01030605|    0|     602|   ABE|        ATL|
|01041243|   10|     602|   ABE|        ATL|
|01040605|   28|     602|   ABE|        ATL|
|01051245|   88|     602|   ABE|        ATL|
|01050605|    9|     602|   ABE|        ATL|
|01061215|   -6|     602|   ABE|        ATL|
|01061725|   69|     602|   ABE|        ATL|
|01061230|    0|     369|   ABE|        DTW|
|01060625|   -3|     602|   ABE|        ATL|
|01070600|    0|     369|   ABE|        DTW|
|01071725|    0|     602|   ABE|        ATL|
|01071230|    0|     369|   ABE|        DTW|
|01070625|    0|     602|   ABE|        ATL|
|01071219|    0|     569|   ABE|        ORD|
|01080600|

In [7]:
(
    df.select("delay", "origin", "destination")
    .withColumn(
        "Flight_Delays",
        F.when(F.col("delay") == 0, F.lit("No Delays"))
        .when((F.col("delay") > 0) & (F.col("delay") < 60), F.lit("Tolerable Delays"))
        .when((F.col("delay") > 60) & (F.col("delay") < 120), F.lit("Short Delays"))
        .when((F.col("delay") > 120) & (F.col("delay") < 360), F.lit("Long Delays"))
        .when(F.col("delay") > 360, F.lit("Very Long Delays"))
        .otherwise(F.lit("Early")),
    )
    .orderBy(F.col("origin").asc(), F.col("delay").desc())
    .show(10)
)

                                                                                

+-----+------+-----------+-------------+
|delay|origin|destination|Flight_Delays|
+-----+------+-----------+-------------+
|  333|   ABE|        ATL|  Long Delays|
|  305|   ABE|        ATL|  Long Delays|
|  275|   ABE|        ATL|  Long Delays|
|  257|   ABE|        ATL|  Long Delays|
|  247|   ABE|        DTW|  Long Delays|
|  247|   ABE|        ATL|  Long Delays|
|  219|   ABE|        ORD|  Long Delays|
|  211|   ABE|        ATL|  Long Delays|
|  197|   ABE|        DTW|  Long Delays|
|  192|   ABE|        ORD|  Long Delays|
+-----+------+-----------+-------------+
only showing top 10 rows



In [8]:
spark.sql(
    """SELECT   delay, 
                origin, 
                destination,
                CASE
                    WHEN delay > 360 THEN 'Very Long Delays'
                    WHEN delay > 120 AND delay < 360 THEN 'Long Delays'
                    WHEN delay > 60 AND delay < 120 THEN 'Short Delays'
                    WHEN delay > 0 and delay < 60  THEN  'Tolerable Delays'
                    WHEN delay = 0 THEN 'No Delays'
                    ELSE 'Early'
                END AS Flight_Delays
        FROM us_delay_flights_tbl
        ORDER BY origin, delay DESC
    """
).show(10)

+-----+------+-----------+-------------+
|delay|origin|destination|Flight_Delays|
+-----+------+-----------+-------------+
|  333|   ABE|        ATL|  Long Delays|
|  305|   ABE|        ATL|  Long Delays|
|  275|   ABE|        ATL|  Long Delays|
|  257|   ABE|        ATL|  Long Delays|
|  247|   ABE|        ATL|  Long Delays|
|  247|   ABE|        DTW|  Long Delays|
|  219|   ABE|        ORD|  Long Delays|
|  211|   ABE|        ATL|  Long Delays|
|  197|   ABE|        DTW|  Long Delays|
|  192|   ABE|        ORD|  Long Delays|
+-----+------+-----------+-------------+
only showing top 10 rows



22/06/07 15:46:17 WARN HeartbeatReceiver: Removing executor driver with no recent heartbeats: 270363 ms exceeds timeout 120000 ms
22/06/07 15:46:17 WARN SparkContext: Killing executors is not supported by current scheduler.
