In [2]:
from pyspark.sql.functions import expr
from pyspark.sql import SparkSession

# Obtain the notebook's SparkSession (application name 'test'); getOrCreate()
# hands back an already-running session instead of starting a second one.
spark = SparkSession.builder.appName('test').getOrCreate()

# Input files, given relative to the notebook's working directory.
tripdelaysFilePath = 'departuredelays.csv'
airportsnaFilePath = 'airport-codes-na.txt'

# Airport lookup table: tab-separated text with a header row; column types
# are inferred by Spark rather than declared.
airportsna = (spark.read
              .format('csv')
              .option('header', 'true')
              .option('inferSchema', 'true')
              .option('sep', '\t')
              .load(airportsnaFilePath))

# Make the lookup table reachable from spark.sql() as 'airport_na'.
airportsna.createOrReplaceTempView('airport_na')

# Flight delay records. The CSV is read without schema inference, so every
# column arrives as a string; that keeps `date` values such as '01011245'
# intact (leading zero preserved) for the `like '01010%'` filters used later.
# Only `delay` and `distance` are converted to integers.
departureDelays = (spark.read
                   .format('csv')
                   .options(header='true')
                   .load(tripdelaysFilePath)
                   # withColumn() already names the resulting column, so the
                   # CAST expression needs no inner "AS <name>" alias.
                   .withColumn('delay', expr('CAST(delay AS INT)'))
                   .withColumn('distance', expr('CAST(distance AS INT)')))

# Make the typed frame reachable from spark.sql() as 'departureDelays'.
departureDelays.createOrReplaceTempView('departureDelays')

# Small working sample: SEA -> SFO flights whose date starts with '01010'
# and whose delay is positive.
foo = departureDelays.where(
    expr("""origin == 'SEA' and destination == 'SFO' and date like '01010%' and delay > 0""")
)

# Make the sample reachable from spark.sql() as 'foo'.
foo.createOrReplaceTempView('foo')

In [3]:
# Preview ten rows of the airport lookup view.
(spark
 .sql('select * from airport_na limit 10')
 .show())

+-----------+-----+-------+----+
|       City|State|Country|IATA|
+-----------+-----+-------+----+
| Abbotsford|   BC| Canada| YXX|
|   Aberdeen|   SD|    USA| ABR|
|    Abilene|   TX|    USA| ABI|
|      Akron|   OH|    USA| CAK|
|    Alamosa|   CO|    USA| ALS|
|     Albany|   GA|    USA| ABY|
|     Albany|   NY|    USA| ALB|
|Albuquerque|   NM|    USA| ABQ|
| Alexandria|   LA|    USA| AEX|
|  Allentown|   PA|    USA| ABE|
+-----------+-----+-------+----+



In [4]:
# Preview ten rows of the flight delay view.
(spark
 .sql('select * from departureDelays limit 10')
 .show())

+--------+-----+--------+------+-----------+
|    date|delay|distance|origin|destination|
+--------+-----+--------+------+-----------+
|01011245|    6|     602|   ABE|        ATL|
|01020600|   -8|     369|   ABE|        DTW|
|01021245|   -2|     602|   ABE|        ATL|
|01020605|   -4|     602|   ABE|        ATL|
|01031245|   -4|     602|   ABE|        ATL|
|01030605|    0|     602|   ABE|        ATL|
|01041243|   10|     602|   ABE|        ATL|
|01040605|   28|     602|   ABE|        ATL|
|01051245|   88|     602|   ABE|        ATL|
|01050605|    9|     602|   ABE|        ATL|
+--------+-----+--------+------+-----------+



Union

In [5]:
# union() appends foo's rows to departureDelays without removing duplicates,
# so the three SEA -> SFO rows held in foo appear twice in the filtered output.
bar = departureDelays.union(foo)
bar.createOrReplaceTempView('bar')

(bar
 .where(expr("""origin == 'SEA' and destination =='SFO'
                and date like '01010%' and delay >0 """))
 .show())


+--------+-----+--------+------+-----------+
|    date|delay|distance|origin|destination|
+--------+-----+--------+------+-----------+
|01010710|   31|     590|   SEA|        SFO|
|01010955|  104|     590|   SEA|        SFO|
|01010730|    5|     590|   SEA|        SFO|
|01010710|   31|     590|   SEA|        SFO|
|01010955|  104|     590|   SEA|        SFO|
|01010730|    5|     590|   SEA|        SFO|
+--------+-----+--------+------+-----------+



In [6]:
# Same SEA -> SFO / date / positive-delay filter, expressed in SQL against
# the 'bar' temp view.
(spark
 .sql('''
        select *
        from bar
        where origin = 'SEA'
          and destination = 'SFO'
          and date like '01010%'
          and delay > 0''')
 .show())

+--------+-----+--------+------+-----------+
|    date|delay|distance|origin|destination|
+--------+-----+--------+------+-----------+
|01010710|   31|     590|   SEA|        SFO|
|01010955|  104|     590|   SEA|        SFO|
|01010730|    5|     590|   SEA|        SFO|
|01010710|   31|     590|   SEA|        SFO|
|01010955|  104|     590|   SEA|        SFO|
|01010730|    5|     590|   SEA|        SFO|
+--------+-----+--------+------+-----------+



JOIN

In [9]:
# Match each flight in foo to its origin airport by IATA code (join() with no
# `how` argument is an inner join) and show the airport's city/state next to
# the flight details.
(foo
 .join(airportsna, on=(airportsna['IATA'] == foo['origin']))
 .select('City', 'State', 'date', 'delay', 'distance', 'destination')
 .show())

+-------+-----+--------+-----+--------+-----------+
|   City|State|    date|delay|distance|destination|
+-------+-----+--------+-----+--------+-----------+
|Seattle|   WA|01010710|   31|     590|        SFO|
|Seattle|   WA|01010955|  104|     590|        SFO|
|Seattle|   WA|01010730|    5|     590|        SFO|
+-------+-----+--------+-----+--------+-----------+



In [11]:
# SQL form of the origin-airport join, run against the 'foo' and 'airport_na'
# temp views. This projection lists City but not State.
(spark
 .sql('''
        select a.City, f.date, f.delay, f.distance, f.destination
        from foo f join airport_na a on a.IATA = f.origin''')
 .show())

+-------+--------+-----+--------+-----------+
|   City|    date|delay|distance|destination|
+-------+--------+-----+--------+-----------+
|Seattle|01010710|   31|     590|        SFO|
|Seattle|01010955|  104|     590|        SFO|
|Seattle|01010730|    5|     590|        SFO|
+-------+--------+-----+--------+-----------+



Window

In [20]:
# Sum of delay per (origin, destination) pair, restricted to three origins
# and seven destinations; this aggregate is the input for the ranking query.
departureDelaysWindow = spark.sql(
    '''select origin, destination, sum(delay) as TotalDelays
                                    from departureDelays
                                    where origin IN ('SEA','SFO','JFK')
                                        and destination IN ('SEA','SFO','JFK','DEN', 'ORD', 'LAX', 'ATL')
                                    group by origin, destination'''
)

# Make the aggregate reachable from spark.sql() as 'departureDelaysWindow'.
departureDelaysWindow.createOrReplaceTempView('departureDelaysWindow')

dense_rank 함수

In [23]:
# For each origin, rank destinations by TotalDelays (highest first) with
# dense_rank() and keep the rows whose rank is 3 or lower.
(spark
 .sql('''
        select origin, destination, TotalDelays, rank
        from (
            select origin, destination, TotalDelays,
                    dense_rank() OVER (PARTITION BY origin ORDER BY TotalDelays DESC) as rank
            from departureDelaysWindow) t
        where rank <= 3''')
 .show())

+------+-----------+-----------+----+
|origin|destination|TotalDelays|rank|
+------+-----------+-----------+----+
|   JFK|        LAX|      35755|   1|
|   JFK|        SFO|      35619|   2|
|   JFK|        ATL|      12141|   3|
|   SEA|        SFO|      22293|   1|
|   SEA|        DEN|      13645|   2|
|   SEA|        ORD|      10041|   3|
|   SFO|        LAX|      40798|   1|
|   SFO|        ORD|      27412|   2|
|   SFO|        JFK|      24100|   3|
+------+-----------+-----------+----+



<h3>수정</h3>

In [24]:
# Display the SEA -> SFO sample as the starting point for the column edits
# in the following cells.
foo.show()

+--------+-----+--------+------+-----------+
|    date|delay|distance|origin|destination|
+--------+-----+--------+------+-----------+
|01010710|   31|     590|   SEA|        SFO|
|01010955|  104|     590|   SEA|        SFO|
|01010730|    5|     590|   SEA|        SFO|
+--------+-----+--------+------+-----------+



열 추가

In [25]:
# Add a 'status' column: 'On-time' when delay is below 10, otherwise 'Delayed'.
# The SQL string literals use single quotes (standard SQL); double quotes are
# identifier quotes under ANSI rules and would be read as column names when
# spark.sql.ansi.doubleQuotedIdentifiers is enabled.
foo2 = foo.withColumn(
    'status',
    expr("CASE WHEN delay < 10 THEN 'On-time' ELSE 'Delayed' END"),
)
foo2.show()

+--------+-----+--------+------+-----------+-------+
|    date|delay|distance|origin|destination| status|
+--------+-----+--------+------+-----------+-------+
|01010710|   31|     590|   SEA|        SFO|Delayed|
|01010955|  104|     590|   SEA|        SFO|Delayed|
|01010730|    5|     590|   SEA|        SFO|On-time|
+--------+-----+--------+------+-----------+-------+



열 삭제

In [26]:
# Remove the numeric 'delay' column from foo2; the result is bound to a new
# name, so foo2 itself still has the column.
foo3 = foo2.drop('delay')
foo3.show()

+--------+--------+------+-----------+-------+
|    date|distance|origin|destination| status|
+--------+--------+------+-----------+-------+
|01010710|     590|   SEA|        SFO|Delayed|
|01010955|     590|   SEA|        SFO|Delayed|
|01010730|     590|   SEA|        SFO|On-time|
+--------+--------+------+-----------+-------+



컬럼명 바꾸기

In [27]:
# Rename 'status' to 'flight_status'; every other column is carried over as-is.
foo4 = foo3.withColumnRenamed(existing='status', new='flight_status')
foo4.show()

+--------+--------+------+-----------+-------------+
|    date|distance|origin|destination|flight_status|
+--------+--------+------+-----------+-------------+
|01010710|     590|   SEA|        SFO|      Delayed|
|01010955|     590|   SEA|        SFO|      Delayed|
|01010730|     590|   SEA|        SFO|      On-time|
+--------+--------+------+-----------+-------------+



In [28]:
# Shut down the SparkSession now that the walkthrough is finished.
spark.stop()