In [1]:
import findspark
findspark.init()

import pyspark.sql.types as T
import pyspark.sql.functions as F

import pandas as pd
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)

from etl import SparkETL

In [2]:
etl = SparkETL()
spark = etl.get_spark()

22/05/09 11:46:26 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
Using Spark's default log4j profile: org/apache/spark/log4j-defaults.properties
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
22/05/09 11:46:27 WARN Utils: Service 'SparkUI' could not bind on port 4040. Attempting port 4041.
22/05/09 11:46:27 WARN Utils: Service 'SparkUI' could not bind on port 4041. Attempting port 4042.
22/05/09 11:46:27 WARN Utils: Service 'SparkUI' could not bind on port 4042. Attempting port 4043.
22/05/09 11:46:27 WARN Utils: Service 'SparkUI' could not bind on port 4043. Attempting port 4044.


In [45]:
immigration = etl.read_clean_table('immigration')

In [4]:
def route_nk(df):
    return (
        df
        .select('airline', 'flight_number', 'port_id')
        .drop_duplicates()
    )

In [5]:
def missing_routes(df):
    
    route_dim = etl.read_dim_table('route_dim')
    
    return (
        df
        .join(
            route_dim,
            on=(
                (df['airline'] == route_dim['airline'])
                & (df['flight_number'] == route_dim['flight_number'])
                & (df['port_id'] == route_dim['port_id'])
            ),
            how='leftanti'
        )
    )

In [6]:
def coordinate_expr(index):
    return F.expr(f"""
            CAST(
                ELEMENT_AT(
                    SPLIT(dst_coordinates, ','), 
                    {index}
                ) 
                AS DOUBLE
            )
        """)

In [7]:
def fill_sk(df):
    return df.withColumn('route_id', F.monotonically_increasing_id())

In [8]:
def fill_airport(df):
    
    airports = (
        etl.read_clean_table('airport')
        .withColumnRenamed('airport_id', 'dst_airport_id')
    )
    
    ports_to_airports = (
        etl.read_clean_table('port_to_airport')
        .withColumnRenamed('port_id', 'airport_port_id')
    )
    
    return (
        df
        .join(
            ports_to_airports,
            on=df['port_id'] == ports_to_airports['airport_port_id'],
            how='left'
        )
        .drop('airport_port_id')
        .join(
            airports,
            on=ports_to_airports['airport_id'] == airports['dst_airport_id'],
            how='left'
        )
        .drop('airport_id')
        .withColumnRenamed('city', 'dst_city')
        .withColumnRenamed('state_id', 'dst_state_id')
        .withColumnRenamed('name', 'dst_airport_name')
        .withColumnRenamed('international', 'dst_airport_international')
        .withColumnRenamed('type_id', 'dst_airport_type_id')
        .withColumnRenamed('type', 'dst_airport_type')
        .withColumnRenamed('coordinates', 'dst_coordinates')
        .withColumn('dst_longitude', coordinate_expr(1))
        .withColumn('dst_latitude', coordinate_expr(2))
    )

In [9]:
def fill_demographics(df):
    
    demographics = etl.read_clean_table('demographics')
    
    return (
        df
        .join(
            demographics,
            on=(
                (df['dst_state_id'] == demographics['state_id'])
                & (df['dst_city'] == demographics['city'])
            ),
            how='left'
        )
        .drop('state_id', 'city')
        .withColumnRenamed('asian', 'dst_asian')
        .withColumnRenamed('black', 'dst_black')
        .withColumnRenamed('latino', 'dst_latino')
        .withColumnRenamed('native', 'dst_native')
        .withColumnRenamed('white', 'dst_white')
        .withColumnRenamed('ethnicity_id', 'dst_ethnicity_id')
        .withColumnRenamed('ethnicity', 'dst_ethnicity')
        .withColumnRenamed('population', 'dst_population')
        .withColumnRenamed('size_id', 'dst_size_id')
        .withColumnRenamed('size', 'dst_size')
    )

In [58]:
def fill_state(df):
    
    states = etl.read_clean_table('state')
    
    return (
        df
        .join(states, on=df['dst_state_id'] == states['state_id'], how='left')
        .drop('state_id')
        .withColumnRenamed('name', 'dst_state_name')
        .withColumnRenamed('type_id', 'dst_state_type_id')
        .withColumnRenamed('type', 'dst_state_type')
    )

In [62]:
def fill_temperature(df):
    
    temperatures = etl.read_clean_table('temperature')
    
    return (
        df
        .join(temperatures, on=df['dst_state_id'] == temperatures['state_id'], how='left')
        .drop('state_id')
        .withColumnRenamed('climate_id', 'dst_state_climate_id')
        .withColumnRenamed('climate', 'dst_state_climate')
    )

In [60]:
def fill_missing_routes(df):
    return (
        df
        .pipe(missing_routes)
        .pipe(route_nk)
        .pipe(fill_sk)
        .pipe(fill_airport)
        .pipe(fill_demographics)
        .pipe(fill_state)
        .pipe(fill_temperature)
    )

In [66]:
etl.save_dim_table(
    immigration.pipe(fill_missing_routes),
    'route_dim'
)

22/05/09 12:25:31 WARN MemoryManager: Total allocation exceeds 95.00% (1,020,054,720 bytes) of heap memory
Scaling row group sizes to 95.00% for 8 writers
22/05/09 12:25:31 WARN MemoryManager: Total allocation exceeds 95.00% (1,020,054,720 bytes) of heap memory
Scaling row group sizes to 84.44% for 9 writers
22/05/09 12:25:31 WARN MemoryManager: Total allocation exceeds 95.00% (1,020,054,720 bytes) of heap memory
Scaling row group sizes to 95.00% for 8 writers
                                                                                

In [67]:
etl.read_dim_table('route_dim').toPandas()

Unnamed: 0,route_id,airline,flight_number,port_id,dst_airport_id,dst_coordinates,dst_longitude,dst_latitude,dst_city,dst_port,dst_airport_name,dst_airport_international,dst_airport_type_id,dst_airport_type,dst_asian,dst_black,dst_latino,dst_native,dst_white,dst_ethnicity_id,dst_ethnicity,dst_population,dst_size_id,dst_size,dst_state_id,dst_state_name,dst_state_type_id,dst_state_type,dst_state_climate_id,dst_state_climate
0,515396075520,LX,00008,CHI,KDPA,"-88.24859619, 41.90779877",-88.248596,41.907799,Chicago/West Chicago,,Dupage Airport,False,5.0,medium_airport,,,,,,,,,,,IL,Illinois,0.0,State,2.0,temperate
1,515396075521,FI,00615,NYC,6N6,"-73.81620025634766, 40.84590148925781",-73.8162,40.845901,New York,,Evers Seaplane Base,False,3.0,seaplane_base,0.152573,0.256391,0.290644,0.010634,0.448602,2.0,"white, latino minority",8550405.0,3.0,"very large (> 1,5M)",NY,New York,0.0,State,1.0,continental
2,515396075522,AR,01302,LOS,KLAX,"-118.4079971, 33.94250107",-118.407997,33.942501,Los Angeles,,Los Angeles International Airport,True,6.0,large_airport,0.129157,0.101933,0.487609,0.016052,0.548265,2.0,"white, latino minority",3971896.0,3.0,"very large (> 1,5M)",CA,California,0.0,State,3.0,warm
3,515396075523,WK,00006,LVG,KVGT,"-115.19400024414, 36.21070098877",-115.194,36.210701,Las Vegas,,North Las Vegas Airport,False,5.0,medium_airport,0.089368,0.136248,0.328508,0.013556,0.687982,2.0,"white, latino minority",623769.0,2.0,"large (500K - 1,5M)",NV,Nevada,0.0,State,2.0,temperate
4,515396075524,AA,731,CLT,99NA,"-80.845989, 35.060633",-80.845989,35.060633,Charlotte,,Fly I Heliport,False,2.0,heliport,0.066978,0.3646,0.137502,0.010574,0.540181,0.0,"white, black minority",827121.0,2.0,"large (500K - 1,5M)",NC,North Carolina,0.0,State,3.0,warm
5,515396075525,NH,00178,PHO,KPHX,"-112.01200103759766, 33.43429946899414",-112.012001,33.434299,Phoenix,,Phoenix Sky Harbor International Airport,True,6.0,large_airport,0.042484,0.085054,0.428608,0.02671,0.743093,2.0,"white, latino minority",1563001.0,3.0,"very large (> 1,5M)",AZ,Arizona,0.0,State,3.0,warm
6,515396075526,B6,00950,NYC,6N6,"-73.81620025634766, 40.84590148925781",-73.8162,40.845901,New York,,Evers Seaplane Base,False,3.0,seaplane_base,0.152573,0.256391,0.290644,0.010634,0.448602,2.0,"white, latino minority",8550405.0,3.0,"very large (> 1,5M)",NY,New York,0.0,State,1.0,continental
7,515396075527,BR,00018,WAS,DC52,"-77.11389923095703, 38.938899993896484",-77.113899,38.9389,Washington,,Sibley Memorial Hospital Heliport,False,2.0,heliport,0.052173,0.489099,0.105811,0.009119,0.424561,4.0,"black, white minority",672228.0,2.0,"large (500K - 1,5M)",DC,District of Columbia,1.0,Federal District,,
8,515396075528,AB,07210,OGG,,,,,,,,,,,,,,,,,,,,,,,,,,
9,326417514496,WN,00812,BAL,KBWI,"-76.668297, 39.1754",-76.668297,39.1754,Baltimore,,Baltimore/Washington International Thurgood Ma...,True,6.0,large_airport,0.032421,0.637245,0.048168,0.012431,0.333319,4.0,"black, white minority",621849.0,2.0,"large (500K - 1,5M)",MD,Maryland,0.0,State,2.0,temperate


NOTE: use entity resolution for airport_to_city

Exhibit that doesn't match any city
```
Chicago/West Chicago airport municipality
```