In [1]:
import findspark
findspark.init()

import pyspark.sql.functions as F

import pandas as pd
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', 1000)

from etl import SparkETL

In [2]:
# Create the project's ETL helper and fetch its Spark handle
# (presumably the SparkSession - confirm in etl.py).
etl = SparkETL()
spark = etl.get_spark()

22/05/09 14:01:21 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
Using Spark's default log4j profile: org/apache/spark/log4j-defaults.properties
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
22/05/09 14:01:22 WARN Utils: Service 'SparkUI' could not bind on port 4040. Attempting port 4041.


In [3]:
# Load the 'immigration' table via the ETL helper's clean-table reader;
# it is the input of the fact-table pipeline defined below.
immigration = etl.read_clean_table('immigration')

In [4]:
# Natural-key columns used to group visitors and to join against the
# foreign visitor dimension.
visitor_dim_nk = [
    'citizenship_id',
    'residence_id',
    'age_id',
    'gender_id',
    'visa_id',
    'stay_id',
    'address_id',
]

In [5]:
def aggregates(df):
    """Aggregate ``df`` to one row per combination of dimension natural keys.

    Rows are grouped by ``arrival_date`` (time_dim natural key), by
    ``airline`` / ``flight_number`` / ``port_id`` (route_dim natural key) and
    by the columns listed in ``visitor_dim_nk``.

    Returns a DataFrame holding the grouping columns plus:

    * ``num_visitors`` - count of non-null ``count`` values in the group
    * ``age_avg`` / ``age_std`` - ``F.avg`` and ``F.stddev`` of ``age``
    * ``stay_avg`` / ``stay_std`` - ``F.avg`` and ``F.stddev`` of ``stay``

    The ``*_std`` columns may be NULL (the pipeline replaces those with 0.0
    afterwards via ``replace_null_stddev``).
    """
    # The former helper column 'day' was neither grouped on nor aggregated,
    # so it never reached the result; it is no longer computed.
    return (
        df
        .groupby(
            'arrival_date',  # time_dim nk
            'airline', 'flight_number', 'port_id',  # route_dim nk
            *visitor_dim_nk
        )
        .agg(
            F.count('count').alias('num_visitors'),
            F.avg('age').alias('age_avg'),
            F.stddev('age').alias('age_std'),
            F.avg('stay').alias('stay_avg'),
            F.stddev('stay').alias('stay_std'),
        )
    )

In [6]:
def replace_null(colname, value):
    """Build a column expression that yields ``value`` where ``colname`` is NULL.

    Both arguments are spliced verbatim into a Spark SQL ``IF(...)``
    expression, so ``value`` is interpreted as SQL text rather than as a
    bound Python literal.

    Returns the result of ``F.expr`` on that SQL text.
    """
    sql_text = f"""
            IF(
                {colname} IS NULL,
                {value},
                {colname}
            )
            """
    return F.expr(sql_text)



In [7]:
def replace_null_stddev(df, colname):
    """Return ``df`` with column ``colname`` overwritten so NULLs become 0.0.

    The replacement expression comes from ``replace_null(colname, 0.0)``.
    """
    zero_when_null = replace_null(colname, 0.0)
    return df.withColumn(colname, zero_when_null)

In [8]:
def project_time_sk(df):
    """Replace the ``arrival_date`` natural key of ``df`` with ``time_id``.

    Reads ``time_dim`` through the ETL helper, inner-joins it on
    ``arrival_date == date`` and drops both date columns, leaving ``time_id``.
    Because the join is an inner join, rows of ``df`` without a matching
    ``date`` are not part of the result.
    """
    time_dim = etl.read_dim_table('time_dim')

    time_keys = time_dim.select('time_id', 'date')
    same_date = df['arrival_date'] == time_dim['date']

    joined = df.join(time_keys, on=same_date, how='inner')
    return joined.drop('arrival_date', 'date')

In [9]:
def project_route_sk(df):
    """Replace the route natural key of ``df`` with ``route_id``.

    Reads ``route_dim`` through the ETL helper, inner-joins it on equality of
    ``airline``, ``flight_number`` and ``port_id``, then drops those three
    columns so that ``route_id`` remains. Because the join is an inner join,
    rows of ``df`` without a matching route are not part of the result.
    """
    route_dim = etl.read_dim_table('route_dim')

    route_keys = route_dim.select('route_id', 'airline', 'flight_number', 'port_id')

    # AND the per-column equalities together, left to right.
    same_route = df['airline'] == route_dim['airline']
    for key in ('flight_number', 'port_id'):
        same_route = same_route & (df[key] == route_dim[key])

    joined = df.join(route_keys, on=same_route, how='inner')
    return joined.drop('airline', 'flight_number', 'port_id')

In [10]:
def project_visitor_sk_on(df, visitor_dim, keys=visitor_dim_nk):
    """Build the join condition between ``df`` and ``visitor_dim``.

    The condition is the conjunction (``&``) of ``df[k] == visitor_dim[k]``
    for every ``k`` in ``keys``, combined left to right.

    Parameters:
        df: left-hand DataFrame of the join.
        visitor_dim: right-hand (dimension) DataFrame of the join.
        keys: column names present on both sides; defaults to
            ``visitor_dim_nk``.

    Returns:
        The combined column condition.

    Raises:
        ValueError: if ``keys`` is empty, since no condition can be built.
    """
    if not keys:
        raise ValueError("keys must contain at least one column name")

    # Start from the first *requested* key; the earlier version always used
    # visitor_dim_nk[0] here, which ignored a caller-supplied ``keys``.
    first, *rest = keys
    condition = (df[first] == visitor_dim[first])
    for key in rest:
        condition = condition & (df[key] == visitor_dim[key])
    return condition

In [11]:
def project_visitor_sk(df):
    """Replace the visitor natural key of ``df`` with ``visitor_id``.

    Reads ``foreign_visitor_dim`` through the ETL helper, inner-joins it on
    the condition built by ``project_visitor_sk_on`` over ``visitor_dim_nk``,
    then drops the natural-key columns so that ``visitor_id`` remains.
    Because the join is an inner join, rows of ``df`` without a matching
    visitor are not part of the result.
    """
    visitor_dim = etl.read_dim_table('foreign_visitor_dim')

    visitor_keys = visitor_dim.select('visitor_id', *visitor_dim_nk)
    same_visitor = project_visitor_sk_on(df, visitor_dim, visitor_dim_nk)

    joined = df.join(visitor_keys, same_visitor, 'inner')
    return joined.drop(*visitor_dim_nk)

In [19]:
def project_schema(df):
    """Select the fact-table columns from ``df`` in their final order.

    The surrogate keys come first, followed by the measures.
    """
    surrogate_keys = ['time_id', 'route_id', 'visitor_id']
    measures = ['num_visitors', 'age_avg', 'age_std', 'stay_avg', 'stay_std']

    return df.select(*surrogate_keys, *measures)

In [20]:
def flight_fact(df):
    """Build the flight fact table from the DataFrame ``df``.

    Steps, in order:

    1. ``aggregates`` - group by the dimension natural keys and compute
       the measures.
    2. ``replace_null_stddev`` - replace NULL ``age_std`` / ``stay_std``
       with 0.0.
    3. ``project_time_sk`` / ``project_route_sk`` / ``project_visitor_sk`` -
       swap natural keys for the dimensions' surrogate keys.
    4. ``project_schema`` - select the final columns in order.

    NOTE(review): relies on ``DataFrame.pipe``, which is not defined in this
    notebook - presumably provided by the ``etl`` module; confirm.
    """
    # Start from the argument: the earlier version ignored ``df`` and always
    # read the notebook-global ``immigration``.
    return (
        df
        .pipe(aggregates)
        .pipe(replace_null_stddev, 'age_std')
        .pipe(replace_null_stddev, 'stay_std')
        .pipe(project_time_sk)
        .pipe(project_route_sk)
        .pipe(project_visitor_sk)
        .pipe(project_schema)
    )

In [21]:
# Run the fact pipeline on the immigration table and hand the result to the
# ETL helper's fact-table writer under the name 'flight_fact'.
etl.save_fact_table(
    immigration.pipe(flight_fact),
    'flight_fact'
)

22/05/09 14:06:38 WARN MemoryManager: Total allocation exceeds 95.00% (1,020,054,720 bytes) of heap memory
Scaling row group sizes to 95.00% for 8 writers
                                                                                

In [22]:
# Preview only a handful of rows: toPandas() collects everything it is given
# onto the driver, so the read is bounded with limit() first.
etl.read_fact_table('flight_fact').limit(10).toPandas()

Unnamed: 0,time_id,route_id,visitor_id,num_visitors,age_avg,age_std,stay_avg,stay_std
0,635655159808,1331439861761,266287972354,1,29.0,0.0,7.0,0.0
1,498216206336,755914244096,936302870529,1,4.0,0.0,4.0,0.0
2,1279900254208,1529008357378,858993459204,1,44.0,0.0,13.0,0.0
3,377957122048,1271310319617,42949672962,1,32.0,0.0,7.0,0.0
4,979252543488,515396075528,1365799600132,1,60.0,0.0,32.0,0.0
5,188978561024,326417514501,2,1,14.0,0.0,10.0,0.0
6,188978561024,1125281431553,1039382085633,1,28.0,0.0,3.0,0.0
7,188978561024,1494648619014,489626271748,1,21.0,0.0,45.0,0.0
8,592705486848,1005022347267,146028888068,1,48.0,0.0,3.0,0.0
9,1374389534720,1039382085633,438086664194,1,73.0,0.0,74.0,0.0


TODO: Handle NULL `stay` values (visitors who have not left yet).

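TODO: Handle NULLs in natural keys.
- Reason: an equality comparison involving NULL never evaluates to true, so rows with a NULL natural key are excluded from the inner-join results and are missing from the fact table.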