In [1]:
import findspark
findspark.init()

import pyspark.sql.types as T
import pyspark.sql.functions as F

import pandas as pd
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', 1000)

from datetime import datetime, timedelta
import math

from etl import SparkETL
from age import Age
from stay import Stay

In [2]:
# Create the project's ETL helper and take the Spark session from it; the cells
# below use `spark` for reading and `etl` for saving/reading clean tables.
etl = SparkETL()
spark = etl.get_spark()

22/05/12 20:41:49 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
Using Spark's default log4j profile: org/apache/spark/log4j-defaults.properties
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).


In [3]:
# Load the raw immigration dataset. No explicit format or schema is passed,
# so the reader's defaults apply.
# Earlier variants kept for reference:
#   .schema(parquet_schema)
#   .load(etl.data_sources['immigration'])
# NOTE(review): the path below is absolute and machine-specific — consider
# sourcing it from configuration so the notebook runs elsewhere.
immigration_staging = spark.read.load('/Users/charly/DataEng2022/de-capstone/datalake/raw/immigration')

[Stage 0:>                                                          (0 + 1) / 1]                                                                                

In [4]:
# Day 0 of the SAS date scale used by `arrdate`.
sas_epoc = datetime(1960, 1, 1)
# Sentinel returned for missing dates so the result is never null.
null_date = datetime(9999, 9, 9)

def convert_sas_date(arrdate):
    """
    Convert a SAS date (days since 1960-01-01) to a `datetime`.

    Parameters
    ----------
    arrdate : int | float | None
        Number of days since the SAS epoch. May be fractional.

    Returns
    -------
    datetime
        `sas_epoc` shifted by `arrdate` days, or `null_date` when the input
        is missing (`None` or NaN).
    """
    # Spark hands SQL NULL to Python UDFs as None; math.isnan(None) would raise
    # TypeError, so None must be checked before the NaN test.
    if arrdate is None or math.isnan(arrdate):
        return null_date
    return sas_epoc + timedelta(days=arrdate)

In [5]:
@F.udf(T.DateType())
def convert_sas_date_udf(arrdate):
    """Spark UDF (DateType) that delegates to `convert_sas_date` for one value."""
    converted = convert_sas_date(arrdate)
    return converted

In [6]:


@F.udf(T.IntegerType())
def sas_date_to_day_udf(arrdate):
    """Spark UDF (IntegerType) returning the day-of-month of the converted SAS date."""
    converted = convert_sas_date(arrdate)
    return converted.day

In [7]:
@F.udf(T.IntegerType())
def convert_age_udf(age):
    """Spark UDF (IntegerType) returning `Age(age).group()` for one value."""
    age_wrapper = Age(age)
    return age_wrapper.group()

In [8]:
@F.udf(T.IntegerType())
def convert_stay_udf(arrdate, depdate):
    """Spark UDF (IntegerType) returning `Stay(arrdate, depdate).group()` for one row."""
    stay_wrapper = Stay(arrdate, depdate)
    return stay_wrapper.group()

In [9]:
def only_air(df):
    """
    Keep only the rows of `df` whose `i94mode` column equals 1.

    Judging by the function name, 1 is presumably the code for arrivals by
    air — confirm against the i94 data dictionary.
    """
    is_mode_one = F.col('i94mode') == 1
    return df.filter(is_mode_one)

In [10]:
def project_schema(df):
    """
    Project `df` onto the columns of the clean immigration table.

    `year`, `month_id` and `day` are passed through unchanged; every other
    output column is derived from a raw i94 field and renamed. The
    `SparkETL.ifnull_*_expr` helpers presumably replace nulls with a default
    and apply the alias given as second argument — confirm against SparkETL.

    Returns a new DataFrame with the selected columns, in the order listed.
    """
    # Columns taken as-is from the input.
    passthrough_columns = ['year', 'month_id', 'day']

    derived_columns = [
        convert_sas_date_udf(F.col('arrdate')).alias('arrival_date'),
        SparkETL.ifnull_str_expr('airline'),
        SparkETL.ifnull_str_expr('fltno', 'flight_number'),
        SparkETL.ifnull_str_expr('i94port', 'port_id'),
        SparkETL.ifnull_num_expr('i94cit', 'citizenship_id'),
        SparkETL.ifnull_num_expr('i94res', 'residence_id'),
        # not nk, ok if null
        F.col('i94bir').cast('int').alias('age'),
        convert_age_udf(F.col('i94bir')).alias('age_id'),
        SparkETL.ifnull_str_expr('gender', 'gender_id'),
        SparkETL.ifnull_num_expr('i94visa', 'visa_id'),
        SparkETL.ifnull_str_expr('i94addr', 'address_id'),
        # not nk, ok if null
        (F.col('depdate') - F.col('arrdate')).cast('int').alias('stay'),
        convert_stay_udf(F.col('arrdate'), F.col('depdate')).alias('stay_id'),
        # take no chances with nulls
        F.lit('1').cast('int').alias('count'),
    ]

    return df.select(*passthrough_columns, *derived_columns)

In [31]:
def clean_immigration(df, month='2016-12-01'):
    """
    Run the immigration cleaning pipeline on `df`.

    Steps, in order: `SparkETL.filter_one_month` (with `month`), `only_air`,
    `project_schema`.

    Parameters
    ----------
    df : DataFrame
        Staged immigration data. Must expose `.pipe`, as the original code
        already required.
    month : str, optional
        Value forwarded to `SparkETL.filter_one_month`. Defaults to
        '2016-12-01', the value that was previously hard-coded.

    Returns
    -------
    DataFrame
        The result of the last pipeline step.
    """
    # Operate on the argument: the previous version ignored `df` and always
    # read the notebook-global `immigration_staging`.
    return (
        df
        .pipe(SparkETL.filter_one_month, month)
        .pipe(only_air)
        .pipe(project_schema)
    )

In [32]:
# Build the clean immigration DataFrame by passing the staged data to the cleaning pipeline.
immigration = clean_immigration(immigration_staging)

In [13]:
# Preview a single cleaned row as a pandas DataFrame.
immigration.limit(1).toPandas()

                                                                                

Unnamed: 0,year,month_id,day,arrival_date,airline,flight_number,port_id,citizenship_id,residence_id,age,age_id,gender_id,visa_id,address_id,stay,stay_id,count
0,2016,2,14,2016-02-14,DL,241,ATL,101,101,21,1,F,3,MI,,4,1


In [14]:
# Number of rows in the cleaned DataFrame.
immigration.count()

2481567

In [16]:
# validation, can only have nulls in age and stay
# Count the nulls of every column in ONE aggregation instead of launching a
# separate Spark job (and re-evaluating every UDF) per column.
# `when` without `otherwise` yields NULL for non-matching rows, and `count`
# ignores NULLs, so each aggregate is exactly the number of null values.
null_counts = (
    immigration
    .agg(*[
        F.count(F.when(F.isnull(F.col(name)), 1)).alias(name)
        for name in immigration.columns
    ])
    .first()
)
for col in immigration.columns:
    count = null_counts[col]
    print(col, count)

year 0
month_id 0
day 0


                                                                                

arrival_date 0
airline 0
flight_number 0
port_id 0
citizenship_id 0
residence_id 0
age 803


                                                                                

age_id 0
gender_id 0
visa_id 0
address_id 0
stay 278644


[Stage 68:====>                                                   (1 + 11) / 12]

stay_id 0
count 0




In [33]:
# Persist the cleaned DataFrame as the 'immigration' clean table, partitioned
# by year and month_id.
# NOTE(review): mode='append' presumably adds to whatever is already stored, so
# re-running this cell for the same month would likely duplicate rows — confirm
# against SparkETL.save_clean_table before re-executing.
etl.save_clean_table(
    immigration,
    'immigration',
    partitions=['year', 'month_id'],
    mode='append'
)

22/05/12 20:44:11 WARN MemoryManager: Total allocation exceeds 95.00% (1,020,054,720 bytes) of heap memory
Scaling row group sizes to 95.00% for 8 writers
22/05/12 20:44:11 WARN MemoryManager: Total allocation exceeds 95.00% (1,020,054,720 bytes) of heap memory
Scaling row group sizes to 84.44% for 9 writers
22/05/12 20:44:11 WARN MemoryManager: Total allocation exceeds 95.00% (1,020,054,720 bytes) of heap memory
Scaling row group sizes to 76.00% for 10 writers
22/05/12 20:44:11 WARN MemoryManager: Total allocation exceeds 95.00% (1,020,054,720 bytes) of heap memory
Scaling row group sizes to 69.09% for 11 writers
22/05/12 20:44:11 WARN MemoryManager: Total allocation exceeds 95.00% (1,020,054,720 bytes) of heap memory
Scaling row group sizes to 63.33% for 12 writers
22/05/12 20:44:13 WARN MemoryManager: Total allocation exceeds 95.00% (1,020,054,720 bytes) of heap memory
Scaling row group sizes to 69.09% for 11 writers
22/05/12 20:44:13 WARN MemoryManager: Total allocation exceeds 95.

In [34]:
# Read the saved 'immigration' clean table back and count its rows.
etl.read_clean_table('immigration').count()

41810230

In [35]:
# Inspect up to 1000 rows of the saved 'immigration' clean table as a pandas DataFrame.
etl.read_clean_table('immigration').limit(1000).toPandas()

Unnamed: 0,day,arrival_date,airline,flight_number,port_id,citizenship_id,residence_id,age,age_id,gender_id,visa_id,address_id,stay,stay_id,count,year,month_id
0,7,2016-08-07,MT,00130,ORL,135,135,5,0,M,2,FL,14.0,2,1,2016,8
1,13,2016-08-13,HA,00460,HHW,254,276,49,3,F,2,HI,5.0,1,1,2016,8
2,8,2016-08-08,QR,755,ATL,258,258,65,4,F,2,AL,54.0,3,1,2016,8
3,15,2016-08-15,BA,67,PHI,213,213,31,2,F,2,PA,7.0,1,1,2016,8
4,7,2016-08-07,AF,00006,NYC,111,111,17,1,M,2,NY,5.0,1,1,2016,8
5,13,2016-08-13,IB,06171,LOS,129,129,34,2,UNKNOWN,2,CA,15.0,2,1,2016,8
6,8,2016-08-08,EK,235,CHI,373,373,49,3,F,2,IA,12.0,2,1,2016,8
7,15,2016-08-15,VS,49,ORL,135,135,56,4,M,2,FL,13.0,2,1,2016,8
8,7,2016-08-07,VS,00021,WAS,116,116,9,1,F,2,VA,22.0,2,1,2016,8
9,13,2016-08-13,KL,00603,LOS,123,123,33,2,F,2,CA,27.0,2,1,2016,8


----------------------------------------
Exception occurred during processing of request from ('127.0.0.1', 54307)
Traceback (most recent call last):
  File "/usr/local/anaconda3/lib/python3.9/socketserver.py", line 316, in _handle_request_noblock
    self.process_request(request, client_address)
  File "/usr/local/anaconda3/lib/python3.9/socketserver.py", line 347, in process_request
    self.finish_request(request, client_address)
  File "/usr/local/anaconda3/lib/python3.9/socketserver.py", line 360, in finish_request
    self.RequestHandlerClass(request, client_address, self)
  File "/usr/local/anaconda3/lib/python3.9/socketserver.py", line 747, in __init__
    self.handle()
  File "/usr/local/opt/apache-spark/libexec/python/pyspark/accumulators.py", line 262, in handle
    poll(accum_updates)
  File "/usr/local/opt/apache-spark/libexec/python/pyspark/accumulators.py", line 235, in poll
    if func():
  File "/usr/local/opt/apache-spark/libexec/python/pyspark/accumulators.py", line 