In [1]:
import findspark
findspark.init()

import pyspark.sql.functions as F

from etl import SparkETL
from dim import TimeDim

In [2]:
# Project ETL helper (from the local `etl` module); it is used below to read
# the clean/dimension tables and to save the dimension.
etl = SparkETL()
# Presumably the SparkSession managed by the helper — confirm in etl.py.
spark = etl.get_spark()
# Helper from the local `dim` module; the cells below use its `on_nk` and
# `gen_sk_expr` methods to build the join condition and the surrogate key.
dim_helper = TimeDim()

22/05/13 19:00:49 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
Using Spark's default log4j profile: org/apache/spark/log4j-defaults.properties
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
22/05/13 19:00:50 WARN Utils: Service 'SparkUI' could not bind on port 4040. Attempting port 4041.


In [3]:
# Source table for the time dimension; the functions below select its
# `arrival_date` column. This name is read as a global by `time_dim_append`.
immigration = etl.read_clean_table('immigration')

[Stage 0:>                                                          (0 + 1) / 1]                                                                                

In [4]:
def time_dim_nk(df):
    """Return the distinct natural-key values for the time dimension.

    The natural key is the single column ``arrival_date``: the result is
    ``df`` projected onto that column with duplicate rows removed.
    """
    arrival_dates = df.select('arrival_date')
    return arrival_dates.distinct()

In [5]:
def join_immigration_time_dim(df):
    """Drop the rows of ``df`` that already have a match in ``time_dim``.

    Reads the current ``time_dim`` dimension table and left-anti joins ``df``
    against it, so only rows of ``df`` WITHOUT a match are returned. The join
    condition is produced by ``dim_helper.on_nk`` for the key
    ``arrival_date`` (presumably an equality on the natural key — confirm in
    the ``dim`` module).
    """
    existing_dim = etl.read_dim_table('time_dim')
    nk_condition = dim_helper.on_nk(df, existing_dim, df_keys=['arrival_date'])
    return df.join(existing_dim, on=nk_condition, how='leftanti')

In [6]:
def fill_time_dim(df):
    """Project ``df`` onto the columns of the time dimension.

    ``df`` must have an ``arrival_date`` column. Output columns, in order:
    ``time_id`` (surrogate key expression from ``dim_helper.gen_sk_expr``),
    ``date``, ``year``, ``month_id``, ``day``, ``weekday_id``, ``month``,
    ``weekday`` and ``weekend``.
    """
    # (output column name, Spark SQL expression over arrival_date)
    derived = [
        ('year', 'YEAR(arrival_date)'),
        ('month_id', 'MONTH(arrival_date)'),
        ('day', 'DAY(arrival_date)'),
        # In the recorded output Monday maps to 0 and Sunday to 6.
        ('weekday_id', 'WEEKDAY(arrival_date)'),
        ('month', "DATE_FORMAT(arrival_date, 'MMM')"),
        ('weekday', "DATE_FORMAT(arrival_date, 'E')"),
        # Boolean flag: true when the abbreviated day name is Sat or Sun.
        ('weekend', "DATE_FORMAT(arrival_date, 'E') IN ('Sat', 'Sun')"),
    ]

    time_id = F.expr(dim_helper.gen_sk_expr(df_keys=['arrival_date'])).alias('time_id')
    date = F.col('arrival_date').alias('date')
    derived_columns = [F.expr(sql).alias(name) for name, sql in derived]

    return df.select(time_id, date, *derived_columns)

In [8]:
def time_dim_append(date):
    """Build the time-dimension rows to add for the slice selected by ``date``.

    Reads the notebook-level ``immigration`` DataFrame and pipes it through:

    1. ``SparkETL.filter_one_month`` with ``date`` (presumably keeps the
       month that ``date`` falls in — confirm in the ``etl`` module),
    2. ``time_dim_nk`` — distinct ``arrival_date`` values,
    3. ``join_immigration_time_dim`` — drops dates already in ``time_dim``,
    4. ``fill_time_dim`` — derives the dimension columns.
    """
    month_slice = immigration.pipe(SparkETL.filter_one_month, date)
    unseen_dates = month_slice.pipe(time_dim_nk).pipe(join_immigration_time_dim)
    return unseen_dates.pipe(fill_time_dim)

In [13]:
from datetime import datetime

# Populate time_dim one calendar month of 2016 at a time. Each pass saves its
# rows before the next pass starts, and time_dim_append re-reads time_dim on
# every call, so the order of the two statements inside the loop matters.
for month in range(1, 13):
    date = f"2016-{month:02d}-01"
    print(date)
    month_rows = time_dim_append(date)
    etl.save_dim_table(month_rows, 'time_dim')

2016-01-01
2016-02-01
2016-03-01
2016-04-01
2016-05-01
2016-06-01
2016-07-01
2016-08-01
2016-09-01
2016-10-01
2016-11-01
2016-12-01


In [20]:
import pandas as pd
# Raise the pandas row-display limit to 1000 so the tables shown below are
# not cut off when rendered.
pd.set_option('display.max_rows', 1000)

In [25]:
# Inspect the saved dimension: convert the whole time_dim table to a pandas
# DataFrame and display it as the cell output.
etl.read_dim_table('time_dim').toPandas()

Unnamed: 0,time_id,date,year,month_id,day,weekday_id,month,weekday,weekend
0,f664c47e8d2ba2e11914650867a5bc73,2016-12-20,2016,12,20,1,Dec,Tue,False
1,69020354d93aaeb5602d2207c14e210f,2016-12-05,2016,12,5,0,Dec,Mon,False
2,59f958febbbe6df090111c5375ad42fd,2016-12-06,2016,12,6,1,Dec,Tue,False
3,bcebdc65c6d67ba5df54861e717d0577,2016-12-11,2016,12,11,6,Dec,Sun,True
4,90184858ef21ebb00231b46bd9efa56c,2016-12-13,2016,12,13,1,Dec,Tue,False
5,50cdd6c4df4ee96e61a8975c8cfef964,2016-12-18,2016,12,18,6,Dec,Sun,True
6,a0c070d7f4564ddf51ea6555526c5179,2016-12-15,2016,12,15,3,Dec,Thu,False
7,5fc496aaf4b193339236fde45ca13ef0,2016-12-16,2016,12,16,4,Dec,Fri,False
8,8f95cb79987e8864a34a8d26a46bff83,2016-12-01,2016,12,1,3,Dec,Thu,False
9,c49b2534fe83c6065ff909e19da76b48,2016-12-23,2016,12,23,4,Dec,Fri,False


In [14]:
# Sanity check: number of time_dim rows per month_id, in ascending month order.
# NOTE(review): the recorded output goes from month_id 5 to 7 — confirm
# whether month 6 is missing from the source data.
(
    etl.read_dim_table('time_dim')
    .groupby('month_id')
    .count()
    .sort('month_id')
    .toPandas()
)

Unnamed: 0,month_id,count
0,1,31
1,2,29
2,3,31
3,4,30
4,5,31
5,7,31
6,8,31
7,9,30
8,10,31
9,11,30
