In [1]:
import findspark
findspark.init()

from pyspark.sql import DataFrame
import pyspark.sql.functions as F
import pyspark.sql.types as T

import pandas as pd
# Let pandas render up to 1000 rows before it truncates a displayed frame.
pd.set_option('display.max_rows', 1000)

from etl import SparkETL

In [2]:
# Build the project's ETL helper and obtain its Spark handle. Later cells use
# `etl` for data-source paths and clean-table I/O, and `spark` for reading.
etl = SparkETL()
spark = etl.get_spark()

22/05/15 13:31:57 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
Using Spark's default log4j profile: org/apache/spark/log4j-defaults.properties
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
22/05/15 13:31:58 WARN Utils: Service 'SparkUI' could not bind on port 4040. Attempting port 4041.
22/05/15 13:31:58 WARN Utils: Service 'SparkUI' could not bind on port 4041. Attempting port 4042.
22/05/15 13:31:58 WARN Utils: Service 'SparkUI' could not bind on port 4042. Attempting port 4043.


In [3]:
# Explicit schema for the raw airports CSV: every column is declared as a
# nullable string. The order of the names below is the order of the fields
# in the schema, so keep it aligned with the source file's column order.
airport_schema = T.StructType([
    T.StructField(column_name, T.StringType(), True)
    for column_name in (
        'ident',
        'type',
        'name',
        'elevation_ft',
        'continent',
        'iso_country',
        'iso_region',
        'municipality',
        'gps_code',
        'iata_code',
        'local_code',
        'coordinates',
    )
])

In [4]:
# Staging frame for the raw airports file: CSV with a header row, read with
# the explicit all-string schema instead of schema inference.
airport_staging = spark.read.load(
    etl.data_sources['airports'],
    format='csv',
    schema=airport_schema,
    header='true',
)

In [5]:
def filter_us_iso_countries(df):
    """Keep only rows whose ``iso_country`` is one of the US-related codes.

    Rows with any other (or a null) ``iso_country`` are dropped, which is the
    same row set an inner join against the code list would produce.

    Implemented as a plain ``isin`` filter so the function depends only on
    its argument: no notebook-global ``spark`` session, no pandas-to-Spark
    round trip, and no join plus helper-column cleanup.

    Args:
        df: Spark DataFrame that has an ``iso_country`` column.

    Returns:
        A DataFrame with the same columns as ``df``, restricted to the
        country codes listed in ``us_iso_countries``.
    """
    us_iso_countries = ['US', 'AS', 'FM', 'GU', 'MH', 'MP', 'PR', 'PW', 'VI']

    # Pass a list (not a tuple): Column.isin unpacks a single list argument.
    return df.filter(df['iso_country'].isin(us_iso_countries))

In [6]:
def project_state(df):
    """Add a ``state_id`` column derived from ``iso_region``.

    When ``iso_region`` starts with ``US``, ``state_id`` is everything from
    the fourth character on (the part after the ``US-`` prefix). Otherwise it
    is the first two characters of ``iso_region``.

    Args:
        df: Spark DataFrame that has an ``iso_region`` column.

    Returns:
        ``df`` with the extra ``state_id`` column.
    """
    # Spark SQL's SUBSTR treats a start position of 0 like 1, so
    # SUBSTR(iso_region, 0, 2) yields the first two characters.
    state_id_sql = """
            IF(
                SUBSTR(iso_region, 0, 2) = 'US',
                SUBSTR(iso_region, 4),
                SUBSTR(iso_region, 0, 2)
            )
        """
    return df.withColumn('state_id', F.expr(state_id_sql))

In [7]:
def project_type_id(df):
    """Add a numeric ``type_id`` column encoding the airport ``type``.

    The mapping runs from 0 (``closed``) to 6 (``large_airport``). The CASE
    has no ELSE branch, so any ``type`` not listed yields a null ``type_id``.

    Args:
        df: Spark DataFrame that has a ``type`` column.

    Returns:
        ``df`` with the extra ``type_id`` column.
    """
    type_id_case = F.expr("""
                            CASE type
                                WHEN 'closed' THEN 0
                                WHEN 'balloonport' THEN 1
                                WHEN 'heliport' THEN 2
                                WHEN 'seaplane_base' THEN 3
                                WHEN 'small_airport' THEN 4
                                WHEN 'medium_airport' THEN 5
                                WHEN 'large_airport' THEN 6
                            END
                        """)
    return df.withColumn('type_id', type_id_case)

In [8]:
def project_international(df):
    """Add a boolean ``international`` column based on the airport name.

    The flag is true when the lower-cased ``name`` contains the substring
    ``international``.

    Args:
        df: Spark DataFrame that has a ``name`` column.

    Returns:
        ``df`` with the extra ``international`` column.
    """
    name_mentions_international = F.expr("LOWER(name) LIKE '%international%'")
    return df.withColumn('international', name_mentions_international)

In [15]:
def project_schema(df):
    """Select and rename the columns that make up the clean airport table.

    ``ident``, ``iata_code`` and ``municipality`` are renamed to
    ``airport_id``, ``airport_iata`` and ``city``; the remaining columns keep
    their names. Columns not listed here are dropped.

    Args:
        df: Spark DataFrame that already carries the derived ``state_id``,
            ``international`` and ``type_id`` columns.

    Returns:
        A DataFrame with exactly the output columns, in the order below.
    """
    output_columns = [
        F.col('ident').alias('airport_id'),
        F.col('iata_code').alias('airport_iata'),
        'state_id',
        F.col('municipality').alias('city'),
        'name',
        'international',
        'type_id',
        'type',
        'coordinates',
    ]
    return df.select(*output_columns)

In [16]:
def clean_airport(df):
    """Run the full airport cleaning pipeline on ``df``.

    Steps, in order: keep US-related countries, derive ``state_id``, derive
    ``type_id``, derive the ``international`` flag, then project the final
    output columns.

    The pipeline starts from the ``df`` argument. It previously started from
    the notebook-global ``airport_staging`` and silently ignored whatever
    frame the caller passed in.

    Args:
        df: Staged airports DataFrame with the raw airport columns.

    Returns:
        The cleaned airport DataFrame produced by ``project_schema``.
    """
    # NOTE(review): relies on DataFrame.pipe being available; presumably the
    # project's etl module adds it -- confirm.
    return (
        df
        .pipe(filter_us_iso_countries)
        .pipe(project_state)
        .pipe(project_type_id)
        .pipe(project_international)
        .pipe(project_schema)
    )

In [17]:
# Clean the staged airports and hand the result to the ETL helper to be
# saved under the table name 'airport'.
etl.save_clean_table(airport_staging.pipe(clean_airport), 'airport')

In [18]:
# Read the saved table back to eyeball the result.
# NOTE(review): toPandas() collects every row to the driver; consider adding
# .limit(n) before it if this table grows.
etl.read_clean_table('airport').toPandas()

Unnamed: 0,airport_id,airport_iata,state_id,city,name,international,type_id,type,coordinates
0,ZNC,ZNC,AK,Nyac,Nyac Airport,False,4,small_airport,"-159.994003296, 60.9807014465"
1,Z95,,AZ,Cibecue,Cibecue Airport,False,4,small_airport,"-110.44400024414062, 34.003299713134766"
2,Z93,CZC,AK,Copper Center,Copper Center 2 Airport,False,4,small_airport,"-145.294006348, 61.9412002563"
3,Z92,,MI,Harsens Island,Harsens Island Airport,False,4,small_airport,"-82.57640075683594, 42.589698791503906"
4,Z91,KBC,AK,Birch Creek,Birch Creek Airport,False,4,small_airport,"-145.824005127, 66.2740020752"
...,...,...,...,...,...,...,...,...,...
22887,NSAS,OFU,AS,Ofu Village,Ofu Village Airport,False,4,small_airport,"-169.669998, -14.1844"
22888,AS-TAV,TAV,AS,Tau Village,Tau Airport,False,4,small_airport,"-169.511001587, -14.2292003632"
22889,PTRO,ROR,PW,Babelthuap Island,Babelthuap Airport,False,5,medium_airport,"134.544236, 7.36731"
22890,C23,,PW,Babelthuap Island,Peleliu Airport,False,4,small_airport,"134.23300170898438, 7"
