In [1]:
# Reload imported modules automatically before each cell executes, so edits to
# the local helper modules are picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2

In [2]:
from modules.my_pyspark import *
from modules.my_drawer import MyDrawer
from pyspark.sql.functions import *

In [3]:
# Project helper used below for readFile() and sqlQuery(); presumably it wraps
# a SparkSession and SQL entry point — confirm in modules/my_pyspark.
spark = MyPySpark(session=True, sql=True)
# NOTE(review): `drawer` is not referenced by any of the visible cells.
drawer = MyDrawer()

#### Task 1

In [4]:
# Location of the raw flight data that is loaded into `df`.
file_path = './data/AA_data'

In [5]:
# Load the raw data through the project reader. No format argument is passed,
# so the reader's default applies (NOTE(review): confirm what readFile defaults to).
df = spark.readFile(file_path)

#### Task 2

In [6]:
# Number of rows in the raw dataset.
df.count()

583718

In [7]:
# Column names, types and nullability of the loaded data.
df.printSchema()

root
 |-- Date (MM/DD/YYYY): string (nullable = true)
 |-- Flight Number: integer (nullable = true)
 |-- Destination Airport: string (nullable = true)
 |-- Actual elapsed time (Minutes): integer (nullable = true)



In [8]:
# Preview the first five rows of the raw data.
df.show(5)

+-----------------+-------------+-------------------+-----------------------------+
|Date (MM/DD/YYYY)|Flight Number|Destination Airport|Actual elapsed time (Minutes)|
+-----------------+-------------+-------------------+-----------------------------+
|       01/01/2014|            5|                HNL|                          519|
|       01/01/2014|            7|                OGG|                          505|
|       01/01/2014|           35|                SLC|                          174|
|       01/01/2014|           43|                DTW|                          153|
|       01/01/2014|           52|                PIT|                          137|
+-----------------+-------------+-------------------+-----------------------------+
only showing top 5 rows



#### Task 3

_Kiểm tra dữ liệu NaN_

In [9]:
# Per-column NaN counters, one aggregate expression per column.
# NaN can only occur in float/double columns; calling isnan() on string or
# integer columns relies on an implicit cast to double, which is meaningless
# for those types and may fail under ANSI SQL mode. Non-floating columns
# therefore get a condition that is never true, so they still appear in the
# result with a count of 0 and the select remains a single-row aggregate.
float_columns = {name for name, dtype in df.dtypes if dtype in ('float', 'double')}
mask = [
    count(when(isnan(c) if c in float_columns else lit(False), c)).alias(c)
    for c in df.columns
]

In [10]:
# Evaluate the per-column NaN counters; the aggregation yields a single row.
df.select(mask).show(5)

+-----------------+-------------+-------------------+-----------------------------+
|Date (MM/DD/YYYY)|Flight Number|Destination Airport|Actual elapsed time (Minutes)|
+-----------------+-------------+-------------------+-----------------------------+
|                0|            0|                  0|                            0|
+-----------------+-------------+-------------------+-----------------------------+



> * Không có giá trị NaN

_Kiểm tra dữ liệu null_

In [11]:
# Per-column null counters: count() ignores nulls, so each expression counts
# only the rows where when() fires, i.e. where the column is null.
mask1 = [
    count(when(col(column_name).isNull(), column_name)).alias(column_name)
    for column_name in df.columns
]

In [12]:
# Evaluate the per-column null counters; the aggregation yields a single row.
df.select(mask1).show()

+-----------------+-------------+-------------------+-----------------------------+
|Date (MM/DD/YYYY)|Flight Number|Destination Airport|Actual elapsed time (Minutes)|
+-----------------+-------------+-------------------+-----------------------------+
|                0|            0|                  0|                            0|
+-----------------+-------------+-------------------+-----------------------------+



* Không có giá trị null

#### Task 4

In [13]:
# Total row count, used as the baseline for the distinct-row comparison.
df.count()

583718

In [14]:
# Row count after removing fully identical rows; if it equals the total row
# count, the data contains no duplicate rows.
df.distinct().count()

583718

> * Không có dòng trùng lặp (duplicate)

#### Task 5

In [15]:
# Add `airport`: the destination airport code converted to lower case.
df = df.withColumn('airport', lower(col('Destination Airport')))

In [16]:
# Remove the original column now that `airport` carries the lower-cased value.
# Note: `df` is reassigned, so cells that read 'Destination Airport' cannot be
# re-run after this one without reloading the data.
df = df.drop('Destination Airport')

In [17]:
# Confirm that `airport` was added and 'Destination Airport' is gone.
df.show(5)

+-----------------+-------------+-----------------------------+-------+
|Date (MM/DD/YYYY)|Flight Number|Actual elapsed time (Minutes)|airport|
+-----------------+-------------+-----------------------------+-------+
|       01/01/2014|            5|                          519|    hnl|
|       01/01/2014|            7|                          505|    ogg|
|       01/01/2014|           35|                          174|    slc|
|       01/01/2014|           43|                          153|    dtw|
|       01/01/2014|           52|                          137|    pit|
+-----------------+-------------+-----------------------------+-------+
only showing top 5 rows



#### Task 6

In [18]:
# Copy the date column under the short name `date`. The values are copied
# unchanged (still MM/DD/YYYY strings); no date parsing happens here.
df = df.withColumn('date', col('Date (MM/DD/YYYY)'))

In [19]:
# Remove the original, verbosely named date column; `date` replaces it.
df = df.drop('Date (MM/DD/YYYY)')

In [20]:
# Confirm that `date` was added and 'Date (MM/DD/YYYY)' is gone.
df.show(5)

+-------------+-----------------------------+-------+----------+
|Flight Number|Actual elapsed time (Minutes)|airport|      date|
+-------------+-----------------------------+-------+----------+
|            5|                          519|    hnl|01/01/2014|
|            7|                          505|    ogg|01/01/2014|
|           35|                          174|    slc|01/01/2014|
|           43|                          153|    dtw|01/01/2014|
|           52|                          137|    pit|01/01/2014|
+-------------+-----------------------------+-------+----------+
only showing top 5 rows



#### Task 7

In [21]:
# Give the two remaining verbose columns short snake_case names.
df = (
    df
    .withColumnRenamed('Flight Number', 'flight_num')
    .withColumnRenamed('Actual elapsed time (Minutes)', 'actual_time')
)

In [22]:
# Preview the cleaned frame with its final column names.
df.show(5)

+----------+-----------+-------+----------+
|flight_num|actual_time|airport|      date|
+----------+-----------+-------+----------+
|         5|        519|    hnl|01/01/2014|
|         7|        505|    ogg|01/01/2014|
|        35|        174|    slc|01/01/2014|
|        43|        153|    dtw|01/01/2014|
|        52|        137|    pit|01/01/2014|
+----------+-----------+-------+----------+
only showing top 5 rows



#### Task 8

In [26]:
# Output location for the cleaned data in Parquet format.
file_path1 = './data/AA_DFW_ALL.parquet'

In [27]:
# Persist the cleaned frame as Parquet; mode='overwrite' replaces any existing
# output at this path, so the cell can be re-run safely.
df.write.parquet(file_path1, mode='overwrite')

#### Task 9

In [28]:
# Read the Parquet output back, passing the format explicitly to the project reader.
df_new = spark.readFile(file_path1, 'parquet')

#### Task 10

_Tạo view `flights`_

In [29]:
# Name of the temporary view; the SQL text in `query` refers to this name
# literally, so the two must be kept in sync.
view = 'flights'

In [30]:
# Register the reloaded frame as a temporary view so it can be queried with SQL.
df_new.createOrReplaceTempView(view)

In [31]:
# SQL computing the average of `actual_time` over every row of the `flights` view.
query = '''
    select avg(f.actual_time)
    from flights as f
'''

In [33]:
# Run the query through the project wrapper and display the result table.
spark.sqlQuery(query).show()

+------------------+
|  avg(actual_time)|
+------------------+
|147.59399915712726|
+------------------+



In [40]:
# Run the query again and pull the average out as a plain Python value:
# the first field of the first collected row.
result_rows = spark.sqlQuery(query).collect()
result_rows[0][0]

147.59399915712726