In [19]:
import findspark
findspark.init()

from pyspark.sql import DataFrame
import pyspark.sql.functions as F
import pyspark.sql.types as T

import pandas as pd
pd.set_option('display.max_rows', 1000)

from etl import SparkETL

In [26]:
# Create the project's ETL helper and obtain its Spark handle.
# Later cells use `etl` for data-source paths and clean-table I/O, and
# `spark` for reading the raw data (presumably a SparkSession -- confirm in etl.py).
etl = SparkETL()
spark = etl.get_spark()

In [14]:
# Explicit schema for the airports CSV: every column is declared as a
# nullable string, in the order listed below.
airport_schema = T.StructType([
    T.StructField(column_name, T.StringType(), True)
    for column_name in (
        'ident',
        'type',
        'name',
        'elevation_ft',
        'continent',
        'iso_country',
        'iso_region',
        'municipality',
        'gps_code',
        'iata_code',
        'local_code',
        'coordinates',
    )
])

In [28]:
# Load the raw airports CSV (first line is a header) using the explicit
# all-string schema, without any cleaning applied yet.
airport_staging = spark.read.load(
    etl.data_sources['airports'],
    format='csv',
    schema=airport_schema,
    header='true',
)

In [49]:
def filter_us_iso_countries(df):
    """Keep only rows whose ``iso_country`` is one of the US-related codes.

    The accepted codes are ``US`` plus ``AS``, ``FM``, ``GU``, ``MH``, ``MP``,
    ``PR``, ``PW`` and ``VI``.

    Parameters
    ----------
    df : pyspark.sql.DataFrame
        Must contain an ``iso_country`` column.

    Returns
    -------
    pyspark.sql.DataFrame
        The rows of ``df`` whose ``iso_country`` matches one of the codes,
        with the same columns as the input. Rows with a null ``iso_country``
        are dropped.
    """
    us_iso_countries = ['US', 'AS', 'FM', 'GU', 'MH', 'MP', 'PR', 'PW', 'VI']

    # A membership predicate selects the same rows as an inner join against a
    # one-column lookup table of these (unique) codes, but needs no join, no
    # pandas round trip and no reference to the notebook-global `spark`.
    return df.where(df['iso_country'].isin(us_iso_countries))

In [62]:
def project_state(df):
    """Add a ``state_id`` column derived from ``iso_region``.

    When ``iso_region`` starts with ``US`` (e.g. ``US-AK``), ``state_id`` is
    everything from the 4th character onward (``AK``). Otherwise ``state_id``
    is the first two characters of ``iso_region`` (e.g. ``PW``).

    Parameters
    ----------
    df : pyspark.sql.DataFrame
        Must contain an ``iso_region`` column.

    Returns
    -------
    pyspark.sql.DataFrame
        ``df`` with the additional ``state_id`` column.
    """
    # Spark SQL SUBSTR positions are 1-based; a start of 0 is only tolerated
    # as an alias for 1, so the explicit 1-based form is used here.
    return (
        df
        .withColumn('state_id', F.expr("""
            IF(
                SUBSTR(iso_region, 1, 2) = 'US',
                SUBSTR(iso_region, 4),
                SUBSTR(iso_region, 1, 2)
            )
        """))
    )

In [69]:
def project_schema(df):
    """Select and rename the columns that make up the clean airport table.

    Output columns, in order: ``airport_id`` (from ``ident``), ``state_id``,
    ``city`` (from ``municipality``), ``name``, ``type``, ``coordinates``.
    """
    airport_id = F.col('ident').alias('airport_id')
    city = F.col('municipality').alias('city')

    return df.select(airport_id, 'state_id', city, 'name', 'type', 'coordinates')

In [71]:
def clean_airport(df):
    """Run the full airport cleaning pipeline on ``df``.

    Applies, in order: ``filter_us_iso_countries``, ``project_state`` and
    ``project_schema``.

    Parameters
    ----------
    df : pyspark.sql.DataFrame
        Staged airport data with at least the ``ident``, ``type``, ``name``,
        ``iso_country``, ``iso_region``, ``municipality`` and ``coordinates``
        columns.

    Returns
    -------
    pyspark.sql.DataFrame
        The cleaned airport table produced by ``project_schema``.
    """
    # Operate on the DataFrame that was passed in, not on the notebook-global
    # `airport_staging`, so the function works for any input frame.
    # NOTE(review): `.pipe` is not a built-in Spark DataFrame method; it is
    # presumably attached by the `etl` module -- confirm.
    return (
        df
        .pipe(filter_us_iso_countries)
        .pipe(project_state)
        .pipe(project_schema)
    )

In [72]:
# Run the cleaning pipeline on the staged airports and hand the result to the
# ETL helper to be saved under the table name 'airport'.
etl.save_clean_table(airport_staging.pipe(clean_airport), 'airport')

22/05/06 14:05:45 WARN MemoryManager: Total allocation exceeds 95.00% (1,020,054,720 bytes) of heap memory
Scaling row group sizes to 95.00% for 8 writers
22/05/06 14:05:45 WARN MemoryManager: Total allocation exceeds 95.00% (1,020,054,720 bytes) of heap memory
Scaling row group sizes to 84.44% for 9 writers
22/05/06 14:05:45 WARN MemoryManager: Total allocation exceeds 95.00% (1,020,054,720 bytes) of heap memory
Scaling row group sizes to 76.00% for 10 writers
22/05/06 14:05:45 WARN MemoryManager: Total allocation exceeds 95.00% (1,020,054,720 bytes) of heap memory
Scaling row group sizes to 84.44% for 9 writers
22/05/06 14:05:45 WARN MemoryManager: Total allocation exceeds 95.00% (1,020,054,720 bytes) of heap memory
Scaling row group sizes to 95.00% for 8 writers
                                                                                

In [73]:
# Read the saved 'airport' table back and convert it to pandas for display,
# as a check on what was written.
# NOTE(review): toPandas() collects the entire table onto the driver; consider
# adding .limit(n) before it if only a preview is needed.
etl.read_clean_table('airport').toPandas()

Unnamed: 0,airport_id,state_id,city,name,type,coordinates
0,ZNC,AK,Nyac,Nyac Airport,small_airport,"-159.994003296, 60.9807014465"
1,Z95,AZ,Cibecue,Cibecue Airport,small_airport,"-110.44400024414062, 34.003299713134766"
2,Z93,AK,Copper Center,Copper Center 2 Airport,small_airport,"-145.294006348, 61.9412002563"
3,Z92,MI,Harsens Island,Harsens Island Airport,small_airport,"-82.57640075683594, 42.589698791503906"
4,Z91,AK,Birch Creek,Birch Creek Airport,small_airport,"-145.824005127, 66.2740020752"
...,...,...,...,...,...,...
22887,NSAS,AS,Ofu Village,Ofu Village Airport,small_airport,"-169.669998, -14.1844"
22888,AS-TAV,AS,Tau Village,Tau Airport,small_airport,"-169.511001587, -14.2292003632"
22889,PTRO,PW,Babelthuap Island,Babelthuap Airport,medium_airport,"134.544236, 7.36731"
22890,C23,PW,Babelthuap Island,Peleliu Airport,small_airport,"134.23300170898438, 7"
