In [91]:
import findspark
findspark.init()

import pyspark.sql.functions as F
import pyspark.sql.types as T

from etl import SparkETL
from age import Age
from stay import Stay

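Possible (but unlikely) **combinatorial explosion** for `foreign_visitor_dim`: in the worst case the dimension holds one row per combination of the key columns below, i.e. the product of their cardinalities: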
```
            'citizenship_id', 200 values
            'residence_id',   200 values
            'age_id',         5 values
            'gender_id',      3 values
            'visa_id',        3 values
            'address_id',     50 values
            'stay_id'         4 values
```

In [20]:
# Worst-case dimension size: product of the per-column cardinalities listed above.
print('potentially num_rows ~= %e' % (
    200      # citizenship_id
    * 200    # residence_id
    * 5      # age_id
    * 3      # gender_id
    * 3      # visa_id
    * 50     # address_id
    * 4      # stay_id
))

potentially num_rows ~= 3.600000e+08


In [2]:
# Project ETL helper; every table read in this notebook goes through it.
etl = SparkETL()
# Presumably the SparkSession managed by the helper (Spark start-up logs follow this cell).
spark = etl.get_spark()

22/05/08 10:25:48 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
Using Spark's default log4j profile: org/apache/spark/log4j-defaults.properties
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
22/05/08 10:25:49 WARN Utils: Service 'SparkUI' could not bind on port 4040. Attempting port 4041.


In [102]:
# 'immigration' table loaded through the project's read_clean_table helper;
# it is the input of the visitor-dimension pipeline at the end of the notebook.
immigration = etl.read_clean_table('immigration')

In [103]:
# Current contents of 'foreign_visitor_dim', loaded through read_dim_table.
# NOTE: join_immigration_with_visitor_dim reads this as a notebook-level global.
visitor_dim = etl.read_dim_table('foreign_visitor_dim')

In [104]:
def visitor_dim_nk(df):
    """Return the distinct combinations of the visitor key columns found in ``df``.

    The result keeps only the seven ``*_id`` columns (the same columns the
    anti-join against ``foreign_visitor_dim`` matches on) with duplicate
    rows removed.
    """
    natural_key = [
        'citizenship_id',
        'residence_id',
        'age_id',
        'gender_id',
        'visa_id',
        'address_id',
        'stay_id',
    ]
    return df.select(*natural_key).drop_duplicates()

In [105]:
def join_immigration_with_visitor_dim(df):
    """Return the rows of ``df`` whose key combination is not yet in ``visitor_dim``.

    Despite the name, this is a left *anti* join: only rows of ``df`` without a
    matching row in the notebook-level ``visitor_dim`` DataFrame are returned,
    and only ``df``'s columns are kept.

    Keys are compared with null-safe equality. With plain ``==`` a NULL key
    (e.g. a missing ``gender_id``) never equals the NULL stored in the
    dimension, so such rows would be reported as "new" on every run and end up
    duplicated in the dimension.

    Args:
        df: DataFrame containing the seven visitor key columns.

    Returns:
        DataFrame with the rows of ``df`` that have no match in ``visitor_dim``.
    """
    natural_key = (
        'citizenship_id',
        'residence_id',
        'age_id',
        'gender_id',
        'visa_id',
        'address_id',
        'stay_id',
    )

    # AND together one null-safe comparison per key column.
    condition = None
    for name in natural_key:
        same_key = df[name].eqNullSafe(visitor_dim[name])
        condition = same_key if condition is None else condition & same_key

    return df.join(visitor_dim, on=condition, how='leftanti')

In [106]:
def fill_pk(df, start_id=0):
    """Add a ``visitor_id`` surrogate-key column to ``df``.

    ``F.monotonically_increasing_id()`` produces ids that are unique and
    increasing within one DataFrame but not consecutive, and the numbering
    starts again for every new DataFrame. When the rows are appended to a
    dimension that already contains data, pass a ``start_id`` larger than the
    highest existing ``visitor_id`` so that new keys cannot collide with keys
    assigned in an earlier run.

    Args:
        df: DataFrame to receive the key column.
        start_id: Offset added to every generated id. Defaults to ``0``,
            which leaves the generated ids untouched.

    Returns:
        ``df`` with an additional ``visitor_id`` column.
    """
    visitor_id = F.monotonically_increasing_id()
    if start_id:
        # Shift the generated ids past the keys that already exist.
        visitor_id = visitor_id + F.lit(start_id)
    return df.withColumn('visitor_id', visitor_id)

In [107]:
def fill_country(df, country, left_on, alias):
    """Look up a country name for ``df[left_on]`` and expose it as ``alias``.

    Args:
        df: DataFrame holding the country id column ``left_on``.
        country: Lookup DataFrame with ``country_id`` and ``country`` columns.
        left_on: Name of the column in ``df`` matched against ``country_id``.
        alias: Name given to the looked-up ``country`` column.

    Returns:
        ``df`` left-joined with ``country``, with ``country`` renamed to
        ``alias`` and the lookup's ``country_id`` column dropped.
    """
    matches_country = df[left_on] == country['country_id']
    with_country = df.join(country, on=matches_country, how='left')
    renamed = with_country.withColumnRenamed('country', alias)
    return renamed.drop('country_id')

In [108]:
@F.udf(T.StringType())
def fill_age_udf(age_id):
    """Translate an ``age_id`` into its description from ``Age.descriptions``.

    Returns ``None`` when ``age_id`` is NULL, so a missing id yields a NULL
    description instead of failing the Spark job inside the Python worker.
    """
    # Spark hands SQL NULLs to Python UDFs as None; looking None up would raise.
    if age_id is None:
        return None
    return Age.descriptions[age_id]

In [109]:
def fill_age(df):
    """Add an ``age`` column holding the description of each row's ``age_id``."""
    age_description = fill_age_udf(F.col('age_id'))
    return df.withColumn('age', age_description)

In [110]:
def fill_gender(df):
    """Add a ``gender`` column with a readable label for ``gender_id``.

    ``'F'`` becomes ``'Female'`` and ``'M'`` becomes ``'Male'``; any other
    value (including NULL) is passed through unchanged.
    """
    gender_label = F.expr("""
            CASE gender_id
                WHEN 'F' THEN 'Female'
                WHEN 'M' THEN 'Male'
                ELSE gender_id
            END
        """)
    return df.withColumn('gender', gender_label)

In [111]:
def fill_visa(df):
    """Add a ``visa`` column with a readable label for ``visa_id``.

    Ids ``1``, ``2`` and ``3`` map to ``'Business'``, ``'Pleasure'`` and
    ``'Student'``. Any other id is kept, explicitly cast to a string so that
    every branch of the ``CASE`` has the same type instead of depending on
    Spark's implicit numeric-to-string coercion.

    Args:
        df: DataFrame containing a ``visa_id`` column.

    Returns:
        ``df`` with an additional string column ``visa``.
    """
    return df.withColumn(
        'visa',
        F.expr("""
            CASE visa_id
                WHEN 1 THEN 'Business'
                WHEN 2 THEN 'Pleasure'
                WHEN 3 THEN 'Student'
                ELSE CAST(visa_id AS STRING)
            END
        """)
    )

In [146]:
def fill_state(df, state):
    """Attach state attributes to ``df`` by matching ``address_id`` to ``state_id``.

    Args:
        df: DataFrame containing an ``address_id`` column.
        state: Lookup DataFrame keyed by ``state_id``; its ``name``,
            ``type_id`` and ``type`` columns are renamed with an
            ``address_`` prefix.

    Returns:
        ``df`` left-joined with ``state``, without ``state_id`` and with the
        lookup columns renamed to ``address_state``, ``address_type_id`` and
        ``address_type``.
    """
    renames = (
        ('name', 'address_state'),
        ('type_id', 'address_type_id'),
        ('type', 'address_type'),
    )
    same_state = df['address_id'] == state['state_id']
    result = df.join(state, on=same_state, how='left').drop('state_id')
    for old_name, new_name in renames:
        result = result.withColumnRenamed(old_name, new_name)
    return result

In [113]:
def fill_climate(df, temperature):
    """Attach climate attributes to ``df`` by matching ``address_id`` to ``state_id``.

    Args:
        df: DataFrame containing an ``address_id`` column.
        temperature: Lookup DataFrame keyed by ``state_id``; its
            ``climate_id`` and ``climate`` columns are renamed with an
            ``address_`` prefix.

    Returns:
        ``df`` left-joined with ``temperature``, without ``state_id`` and with
        the lookup columns renamed to ``address_climate_id`` and
        ``address_climate``.
    """
    same_state = df['address_id'] == temperature['state_id']
    joined = df.join(temperature, on=same_state, how='left')
    without_key = joined.drop('state_id')
    with_climate_id = without_key.withColumnRenamed('climate_id', 'address_climate_id')
    return with_climate_id.withColumnRenamed('climate', 'address_climate')

In [114]:
@F.udf(T.StringType())
def fill_stay_udf(stay_id):
    """Translate a ``stay_id`` into its description from ``Stay.descriptions``.

    Returns ``None`` when ``stay_id`` is NULL, so a missing id yields a NULL
    description instead of failing the Spark job inside the Python worker.
    """
    # Spark hands SQL NULLs to Python UDFs as None; looking None up would raise.
    if stay_id is None:
        return None
    return Stay.descriptions[stay_id]

In [115]:
def fill_stay(df):
    """Add a ``stay`` column holding the description of each row's ``stay_id``."""
    stay_description = fill_stay_udf(F.col('stay_id'))
    return df.withColumn('stay', stay_description)

In [148]:
def project_schema(df):
    """Select the final dimension columns in their output order.

    The order is: the ``visitor_id`` key, the seven ``*_id`` key columns,
    then the descriptive columns added by the ``fill_*`` steps.
    """
    surrogate_key = ['visitor_id']
    id_columns = [
        'citizenship_id',
        'residence_id',
        'age_id',
        'gender_id',
        'visa_id',
        'address_id',
        'stay_id',
    ]
    descriptive_columns = [
        'citizenship',
        'residence',
        'age',
        'gender',
        'visa',
        'address_state',
        'address_type_id',
        'address_type',
        'address_climate_id',
        'address_climate',
        'stay',
    ]
    return df.select(*surrogate_key, *id_columns, *descriptive_columns)

In [149]:
# Lookup tables used by the fill_country / fill_state / fill_climate steps below.
country = etl.read_clean_table('country')
state = etl.read_clean_table('state')
temperature = etl.read_clean_table('temperature')

# Build the candidate rows for foreign_visitor_dim and preview them in pandas.
# Nothing is written back to the dimension here; the result is only displayed.
(
    immigration
    # Distinct key combinations present in the immigration data ...
    .pipe(visitor_dim_nk)
    # ... minus the combinations already stored in the dimension (anti-join).
    .pipe(join_immigration_with_visitor_dim)
    # Assign the visitor_id key before the lookups add more columns.
    .pipe(fill_pk)
    # The same country lookup is applied twice: citizenship and residence.
    .pipe(fill_country, country, 'citizenship_id', 'citizenship')
    .pipe(fill_country, country, 'residence_id', 'residence')
    .pipe(fill_age)
    .pipe(fill_gender)
    .pipe(fill_visa)
    .pipe(fill_state, state)
    .pipe(fill_climate, temperature)
    .pipe(fill_stay)
    # Keep only the final columns, in their output order.
    .pipe(project_schema)
).toPandas()

                                                                                

Unnamed: 0,visitor_id,citizenship_id,residence_id,age_id,gender_id,visa_id,address_id,stay_id,citizenship,residence,age,gender,visa,address_state,address_type_id,address_type,address_climate_id,address_climate,stay
0,0,135,135,2,F,2,FL,2,UNITED KINGDOM,UNITED KINGDOM,Millennial (23-38),Female,Pleasure,Florida,0.0,State,4.0,tropical,8-30 days
1,1,111,111,3,M,2,FL,2,FRANCE,FRANCE,Gen X (40-54),Male,Pleasure,Florida,0.0,State,4.0,tropical,8-30 days
2,2,687,687,1,M,2,NY,2,ARGENTINA,ARGENTINA,Gen Z (7-22),Male,Pleasure,New York,0.0,State,1.0,continental,8-30 days
3,8589934592,124,124,2,M,2,CA,2,NORWAY,NORWAY,Millennial (23-38),Male,Pleasure,California,0.0,State,3.0,warm,8-30 days
4,8589934593,135,135,3,,2,CA,0,UNITED KINGDOM,UNITED KINGDOM,Gen X (40-54),,Pleasure,California,0.0,State,3.0,warm,1 day
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
884,1700807049221,582,582,3,M,2,DC,1,MEXICO,MEXICO,Gen X (40-54),Male,Pleasure,District of Columbia,1.0,Federal District,,,2-7 days
885,1709396983808,135,135,3,F,2,NE,1,UNITED KINGDOM,UNITED KINGDOM,Gen X (40-54),Female,Pleasure,Nebraska,0.0,State,2.0,temperate,2-7 days
886,1709396983809,602,602,2,M,2,FL,1,SURINAME,SURINAME,Millennial (23-38),Male,Pleasure,Florida,0.0,State,4.0,tropical,2-7 days
887,1709396983810,218,135,2,F,1,NY,1,CYPRUS,UNITED KINGDOM,Millennial (23-38),Female,Business,New York,0.0,State,1.0,continental,2-7 days


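TODO: `address_type_id` and `address_climate_id` should be integers rather than floats (possibly a side effect of the nulls introduced by the left joins when converting to pandas; to be confirmed).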