In [1]:
import findspark
findspark.init()

import pyspark.sql.functions as F
import pyspark.sql.types as T

from etl import SparkETL
from age import Age
from stay import Stay

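Possible (but unlikely) **combinatorial explosion** for `foreign_visitor_dim`: each row is one distinct combination of the columns below, so the row count is bounded by the product of their approximate cardinalities: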
```
            'citizenship_id', 200 values
            'residence_id',   200 values
            'age_id',         5 values
            'gender_id',      3 values
            'visa_id',        3 values
            'address_id',     50 values
            'stay_id'         4 values
```

In [2]:
# Worst case: every combination of the natural-key values listed above occurs.
print(
    'potentially num_rows ~= %e' % (
        200      # citizenship_id
        * 200    # residence_id
        * 5      # age_id
        * 3      # gender_id
        * 3      # visa_id
        * 50     # address_id
        * 4      # stay_id
    )
)

potentially num_rows ~= 3.600000e+08


In [3]:
# Instantiate the project's SparkETL helper (imported from `etl`); every table
# read/write in this notebook goes through it.
etl = SparkETL()
# Handle returned by get_spark() — presumably the SparkSession; confirm in etl.py.
spark = etl.get_spark()

22/05/09 13:08:02 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
Using Spark's default log4j profile: org/apache/spark/log4j-defaults.properties
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
22/05/09 13:08:03 WARN Utils: Service 'SparkUI' could not bind on port 4040. Attempting port 4041.
22/05/09 13:08:03 WARN Utils: Service 'SparkUI' could not bind on port 4041. Attempting port 4042.
22/05/09 13:08:03 WARN Utils: Service 'SparkUI' could not bind on port 4042. Attempting port 4043.
22/05/09 13:08:03 WARN Utils: Service 'SparkUI' could not bind on port 4043. Attempting port 4044.
22/05/09 13:08:03 WARN Utils: Service 'SparkUI' could not bind on port 4044. Attempting port 4045.
22/05/09 13:08:03 WARN Utils: Service 'SparkUI' could not bind on port 4045. Attempting port 4046.
22/05/09 13:08:03 WARN Utils: Service 'SparkUI' could

In [4]:
# Source frame for the dimension: the 'immigration' table as returned by the
# ETL helper's read_clean_table().
immigration = etl.read_clean_table('immigration')

                                                                                

In [5]:
# Current contents of the 'foreign_visitor_dim' dimension; used below as the
# right-hand side of the anti join that finds natural keys not yet present.
# NOTE(review): assumes the dimension table can already be read — confirm how
# read_dim_table behaves on a first run.
visitor_dim = etl.read_dim_table('foreign_visitor_dim')

In [6]:
def visitor_dim_nk(df):
    """Return the distinct natural-key combinations found in ``df``.

    Projects ``df`` onto the seven natural-key columns of the visitor
    dimension and removes duplicate rows.
    """
    natural_key = [
        'citizenship_id',
        'residence_id',
        'age_id',
        'gender_id',
        'visa_id',
        'address_id',
        'stay_id',
    ]
    keys_only = df.select(*natural_key)
    return keys_only.drop_duplicates()

In [7]:
def join_immigration_with_visitor_dim(df):
    """Keep only the rows of ``df`` whose natural key is absent from ``visitor_dim``.

    Performs a left anti join against the notebook-level ``visitor_dim`` frame
    on the seven natural-key columns, so the result holds only columns of
    ``df``.

    Key columns are compared with null-safe equality (``eqNullSafe``). A plain
    ``==`` evaluates to NULL when either side is NULL, so a combination with a
    NULL key column (e.g. a missing ``address_id``) would never match an
    existing dimension row and would be reported as missing on every run.
    """
    key_columns = (
        'citizenship_id',
        'residence_id',
        'age_id',
        'gender_id',
        'visa_id',
        'address_id',
        'stay_id',
    )

    # AND together one null-safe comparison per key column.
    condition = None
    for name in key_columns:
        same_key = df[name].eqNullSafe(visitor_dim[name])
        condition = same_key if condition is None else condition & same_key

    return df.join(visitor_dim, on=condition, how='leftanti')

In [8]:
def fill_pk(df):
    """Add a ``visitor_id`` column generated by ``monotonically_increasing_id``.

    NOTE(review): these ids are generated independently of any ids already
    stored in the dimension — confirm they cannot collide when new rows are
    added to an existing table.
    """
    generated_id = F.monotonically_increasing_id()
    return df.withColumn('visitor_id', generated_id)

In [9]:
def fill_country(df, country, left_on, alias):
    """Attach the country name matching ``df[left_on]`` under the column ``alias``.

    Left-joins ``country`` on ``country_id``, renames the joined ``country``
    column to ``alias`` and drops the now redundant ``country_id`` column.
    """
    same_country = df[left_on] == country['country_id']
    joined = df.join(country, on=same_country, how='left')
    renamed = joined.withColumnRenamed('country', alias)
    return renamed.drop('country_id')

In [10]:
@F.udf(T.StringType())
def fill_age_udf(age_id):
    """Spark UDF: look up the description for ``age_id`` in ``Age.descriptions``.

    NOTE(review): an ``age_id`` missing from ``Age.descriptions`` (including
    NULL) presumably raises inside the UDF — confirm against age.py.
    """
    description = Age.descriptions[age_id]
    return description

In [11]:
def fill_age(df):
    """Add an ``age`` column computed from ``age_id`` via ``fill_age_udf``."""
    age_description = fill_age_udf(F.col('age_id'))
    return df.withColumn('age', age_description)

In [12]:
def fill_gender(df):
    """Add a ``gender`` column holding the label for ``gender_id``.

    'F' becomes 'Female' and 'M' becomes 'Male'; any other value is passed
    through unchanged.
    """
    gender_label = F.expr("""
            CASE gender_id
                WHEN 'F' THEN 'Female'
                WHEN 'M' THEN 'Male'
                ELSE gender_id
            END
        """)
    return df.withColumn('gender', gender_label)

In [13]:
def fill_visa(df):
    """Add a ``visa`` column holding the description of ``visa_id``.

    Codes 1, 2 and 3 map to 'Business', 'Pleasure' and 'Student'; any other
    value is passed through as text.

    The fallback branch casts ``visa_id`` to STRING explicitly so that every
    CASE branch has the same type. Without the cast the result type depends on
    Spark's implicit coercion between the string literals and ``visa_id``,
    which differs between the legacy and ANSI SQL modes.
    """
    return df.withColumn(
        'visa',
        F.expr("""
            CASE visa_id
                WHEN 1 THEN 'Business'
                WHEN 2 THEN 'Pleasure'
                WHEN 3 THEN 'Student'
                ELSE CAST(visa_id AS STRING)
            END
        """)
    )

In [14]:
def fill_state(df, state):
    """Attach the state attributes for ``address_id`` to ``df``.

    Left-joins ``state`` on ``state_id``, drops ``state_id`` and renames the
    joined ``name``, ``type_id`` and ``type`` columns with an ``address_``
    prefix (``name`` becomes ``address_state``).
    """
    renames = (
        ('name', 'address_state'),
        ('type_id', 'address_type_id'),
        ('type', 'address_type'),
    )

    same_state = df['address_id'] == state['state_id']
    result = df.join(state, on=same_state, how='left').drop('state_id')
    for old_name, new_name in renames:
        result = result.withColumnRenamed(old_name, new_name)
    return result

In [15]:
def fill_climate(df, temperature):
    """Attach the climate attributes for ``address_id`` to ``df``.

    Left-joins ``temperature`` on ``state_id``, drops ``state_id`` and renames
    ``climate_id`` / ``climate`` to ``address_climate_id`` / ``address_climate``.
    """
    same_state = df['address_id'] == temperature['state_id']
    result = df.join(temperature, on=same_state, how='left').drop('state_id')
    for old_name, new_name in (
        ('climate_id', 'address_climate_id'),
        ('climate', 'address_climate'),
    ):
        result = result.withColumnRenamed(old_name, new_name)
    return result

In [16]:
@F.udf(T.StringType())
def fill_stay_udf(stay_id):
    """Spark UDF: look up the description for ``stay_id`` in ``Stay.descriptions``.

    NOTE(review): a ``stay_id`` missing from ``Stay.descriptions`` (including
    NULL) presumably raises inside the UDF — confirm against stay.py.
    """
    description = Stay.descriptions[stay_id]
    return description

In [17]:
def fill_stay(df):
    """Add a ``stay`` column computed from ``stay_id`` via ``fill_stay_udf``."""
    stay_description = fill_stay_udf(F.col('stay_id'))
    return df.withColumn('stay', stay_description)

In [18]:
def project_schema(df):
    """Select the final dimension columns in their output order.

    The surrogate key comes first, then the natural-key ids, then the
    descriptive attributes.
    """
    surrogate_key = ['visitor_id']
    natural_key = [
        'citizenship_id',
        'residence_id',
        'age_id',
        'gender_id',
        'visa_id',
        'address_id',
        'stay_id',
    ]
    attributes = [
        'citizenship',
        'residence',
        'age',
        'gender',
        'visa',
        'address_state',
        'address_type_id',
        'address_type',
        'address_climate_id',
        'address_climate',
        'stay',
    ]
    return df.select(*(surrogate_key + natural_key + attributes))

In [19]:
def missing_visitor(df):
    """Build the visitor-dimension rows for natural keys in ``df`` not yet stored.

    Reduces ``df`` to its distinct natural keys, keeps those absent from the
    existing dimension, assigns a ``visitor_id`` and fills in the descriptive
    columns from the 'country', 'state' and 'temperature' tables before
    projecting the final schema.

    The pipeline starts from the ``df`` argument; previously the parameter was
    ignored and the notebook-level ``immigration`` frame was always used.
    """
    # Lookup tables used to decorate the keys with descriptive columns.
    country = etl.read_clean_table('country')
    state = etl.read_clean_table('state')
    temperature = etl.read_clean_table('temperature')

    return (
        df
        .pipe(visitor_dim_nk)
        .pipe(join_immigration_with_visitor_dim)
        .pipe(fill_pk)
        .pipe(fill_country, country, 'citizenship_id', 'citizenship')
        .pipe(fill_country, country, 'residence_id', 'residence')
        .pipe(fill_age)
        .pipe(fill_gender)
        .pipe(fill_visa)
        .pipe(fill_state, state)
        .pipe(fill_climate, temperature)
        .pipe(fill_stay)
        .pipe(project_schema)
    )

In [20]:
# Build the rows produced by missing_visitor from the immigration frame, then
# hand them to the ETL helper to save under 'foreign_visitor_dim'.
new_visitor_rows = immigration.pipe(missing_visitor)
etl.save_dim_table(new_visitor_rows, 'foreign_visitor_dim')

22/05/09 13:08:21 WARN MemoryManager: Total allocation exceeds 95.00% (1,020,054,720 bytes) of heap memory
Scaling row group sizes to 95.00% for 8 writers
22/05/09 13:08:21 WARN MemoryManager: Total allocation exceeds 95.00% (1,020,054,720 bytes) of heap memory
Scaling row group sizes to 84.44% for 9 writers
22/05/09 13:08:21 WARN MemoryManager: Total allocation exceeds 95.00% (1,020,054,720 bytes) of heap memory
Scaling row group sizes to 76.00% for 10 writers
22/05/09 13:08:21 WARN MemoryManager: Total allocation exceeds 95.00% (1,020,054,720 bytes) of heap memory
Scaling row group sizes to 69.09% for 11 writers
22/05/09 13:08:21 WARN MemoryManager: Total allocation exceeds 95.00% (1,020,054,720 bytes) of heap memory
Scaling row group sizes to 63.33% for 12 writers
22/05/09 13:08:21 WARN MemoryManager: Total allocation exceeds 95.00% (1,020,054,720 bytes) of heap memory
Scaling row group sizes to 69.09% for 11 writers
22/05/09 13:08:21 WARN MemoryManager: Total allocation exceeds 95.

In [21]:
# Preview only: cap the rows pulled to the driver, because toPandas() collects
# everything it is given and the dimension could in principle grow very large
# (see the combinatorial estimate at the top of the notebook).
etl.read_dim_table('foreign_visitor_dim').limit(1000).toPandas()

Unnamed: 0,visitor_id,citizenship_id,residence_id,age_id,gender_id,visa_id,stay_id,address_id,address_climate_id,citizenship,residence,age,gender,visa,stay,address,address_climate
0,781684047872,245,245,2,M,1,3,NJ,2.0,"CHINA, PRC","CHINA, PRC",Millennial (23-38),Male,Business,>30 days,,temperate
1,781684047873,689,689,2,F,2,2,MA,1.0,BRAZIL,BRAZIL,Millennial (23-38),Female,Pleasure,8-30 days,,continental
2,781684047874,268,268,3,M,2,2,CA,3.0,TAIWAN,TAIWAN,Gen X (40-54),Male,Pleasure,8-30 days,,warm
3,781684047875,694,526,2,F,2,1,OH,2.0,PERU,CAYMAN ISLANDS,Millennial (23-38),Female,Pleasure,2-7 days,,temperate
4,781684047876,148,112,3,M,1,2,TN,3.0,,GERMANY,Gen X (40-54),Male,Business,8-30 days,,warm
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
850,300647710720,582,582,3,F,3,3,TX,3.0,MEXICO,MEXICO,Gen X (40-54),Female,Student,>30 days,,warm
851,1108101562368,577,577,3,M,2,3,GA,3.0,GUATEMALA,GUATEMALA,Gen X (40-54),Male,Pleasure,>30 days,,warm
852,1108101562369,130,130,4,F,1,2,NV,2.0,SWEDEN,SWEDEN,Boomer (55-75),Female,Business,8-30 days,,temperate
853,1529008357376,135,135,4,F,2,1,,,UNITED KINGDOM,UNITED KINGDOM,Boomer (55-75),Female,Pleasure,2-7 days,,


**TODO:** `address_type_id` and `address_climate_id` should be int instead of float (`address_climate_id` shows up as e.g. `2.0` in the output above).