In [18]:
import findspark
findspark.init()

import pyspark.sql.types as T
import pyspark.sql.functions as F

from datetime import datetime, timedelta

from etl import SparkETL
from age import Age
from stay import Stay

In [19]:
# Build the project's ETL helper (SparkETL comes from the local `etl` module) and
# take the Spark handle from it; later cells read through `spark` and use `etl`
# for data-source paths and for saving/reading the cleaned table.
# NOTE(review): get_spark() presumably returns a SparkSession -- confirm in etl.py.
etl = SparkETL()
spark = etl.get_spark()

In [20]:
# Explicit schema for the raw immigration CSV. Every field is nullable.
# Only the columns named in _FLOAT_COLUMNS are read as floats; every other
# column is read as a string and cast where needed in clean_immigration.
_FLOAT_COLUMNS = {'arrdate', 'depdate', 'i94bir'}

# Column names in schema order.
_IMMIGRATION_COLUMNS = [
    '_c0', 'cicid', 'i94yr', 'i94mon', 'i94cit', 'i94res', 'i94port',
    'arrdate', 'i94mode', 'i94addr', 'depdate', 'i94bir',
    'i94visa', 'count', 'dtadfile', 'visapost', 'occup', 'entdepa',
    'entdepd', 'entdepu', 'matflag', 'biryear', 'dtaddto', 'gender',
    'insnum', 'airline', 'admnum', 'fltno', 'visatype',
]

schema = T.StructType([
    T.StructField(
        column,
        T.FloatType() if column in _FLOAT_COLUMNS else T.StringType(),
        True,
    )
    for column in _IMMIGRATION_COLUMNS
])

In [21]:
# Load the raw immigration CSV using the explicit `schema` defined earlier in
# this notebook. The first line of the file is treated as a header row.
# `.format('csv')` alone selects the data source; 'format' is not a CSV reader
# option, so it is not passed through `.option()`.
immigration_staging = (
    spark
    .read
    .format('csv')
    .schema(schema)
    .option('header', 'true')
    .load(etl.data_sources['immigration'])
)

In [27]:
# Day zero used when converting SAS day counts to calendar dates.
sas_epoc = datetime(1960, 1, 1)

@F.udf(T.DateType())
def convert_sas_date(arrdate):
    """Spark UDF: turn a SAS day count into a calendar date.

    Input: number of days since 1960-01-01 (float or int), or None.

    Output: the matching ``datetime.date``, or None when the input is None.
    """
    # The source column is nullable; timedelta(days=None) would raise a
    # TypeError inside the executor and fail the whole job.
    if arrdate is None:
        return None
    # Return a plain date so the value matches the declared DateType.
    return (sas_epoc + timedelta(days=arrdate)).date()

In [28]:
@F.udf(T.IntegerType())
def convert_age(age):
    """Spark UDF: map an age value to its group id.

    Input: the age value taken from the immigration record.

    Output: whatever ``Age(age).group()`` returns, declared to Spark as an
    integer. The grouping rule itself lives in the project-local ``Age``
    class (module ``age``).
    """
    age_bucket = Age(age)
    return age_bucket.group()

In [29]:
@F.udf(T.IntegerType())
def convert_stay(arrdate, depdate):
    """Spark UDF: map an arrival/departure pair to its stay group id.

    Input: the arrival and departure values of one immigration record.

    Output: whatever ``Stay(arrdate, depdate).group()`` returns, declared to
    Spark as an integer. The grouping rule itself lives in the project-local
    ``Stay`` class (module ``stay``).
    """
    stay_bucket = Stay(arrdate, depdate)
    return stay_bucket.group()

In [30]:
def clean_immigration(df):
    """
    Description: clean the raw immigration data

    Input: a dataframe with the raw immigration records

    Output: a dataframe with following schema:
        immigration(
            year              int     (i94yr)
            month_id          int     (i94mon)
            arrival_date      date    (arrdate converted by convert_sas_date)
            airline           string
            flight_number     string  (fltno)
            port_id           string  (i94port)
            citizenship_id    int     (i94cit)
            residence_id      int     (i94res)
            age               int     (i94bir)
            age_id            int     (convert_age applied to i94bir)
            gender_id         string  (gender)
            visa_id           int     (i94visa)
            address_id        string  (i94addr)
            stay              int     (depdate - arrdate)
            stay_id           int     (convert_stay applied to arrdate, depdate)
        )
    """
    return (
        df
        .select(
            # partition columns (the notebook saves the table partitioned by these)
            F.col('i94yr').cast('int').alias('year'),
            F.col('i94mon').cast('int').alias('month_id'),

            convert_sas_date(F.col('arrdate')).alias('arrival_date'),
            'airline',
            F.col('fltno').alias('flight_number'),
            F.col('i94port').alias('port_id'),
            F.col('i94cit').cast('int').alias('citizenship_id'),
            F.col('i94res').cast('int').alias('residence_id'),
            F.col('i94bir').cast('int').alias('age'),
            convert_age(F.col('i94bir')).alias('age_id'),
            F.col('gender').alias('gender_id'),
            F.col('i94visa').cast('int').alias('visa_id'),
            F.col('i94addr').alias('address_id'),
            # difference of the two day counts, i.e. the stay length in days
            (F.col('depdate') - F.col('arrdate')).cast('int').alias('stay'),
            convert_stay(F.col('arrdate'), F.col('depdate')).alias('stay_id')
        )
    )

In [31]:
# Preview the cleaned records as a pandas DataFrame (rendered by the notebook).
# NOTE(review): toPandas() collects every row on the driver; consider adding
# .limit(n) before it if the input grows beyond a small sample.
clean_immigration(immigration_staging).toPandas()

Unnamed: 0,year,month_id,arrival_date,airline,flight_number,port_id,citizenship_id,residence_id,age,age_id,gender_id,visa_id,address_id,stay,stay_id
0,2016,4,2016-04-22,JL,00782,HHW,209,209,61,4,F,2,HI,7.0,1
1,2016,4,2016-04-23,*GA,XBLNG,MCA,582,582,26,2,M,2,TX,1.0,0
2,2016,4,2016-04-07,LH,00464,OGG,148,112,76,5,M,2,FL,20.0,2
3,2016,4,2016-04-28,QR,00739,LOS,297,297,25,2,M,2,CA,9.0,2
4,2016,4,2016-04-06,,LAND,CHM,111,111,19,1,F,2,NY,3.0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
995,2016,4,2016-04-23,VS,00043,LVG,135,135,32,2,M,2,NV,5.0,1
996,2016,4,2016-04-16,SV,00041,PSP,261,261,35,2,M,1,HI,7.0,1
997,2016,4,2016-04-16,AF,00090,MIA,111,111,39,3,M,2,FL,6.0,1
998,2016,4,2016-04-12,EV,05510,ATL,582,582,35,2,M,1,WI,3.0,1


In [32]:
# Keep the cleaned dataframe under its own name so it can be saved in the next cell.
immigration = clean_immigration(immigration_staging)

In [33]:
# Save the cleaned dataframe as the 'immigration' table, passing year and month_id
# as partition columns (how they are applied is defined by save_clean_table in
# the project's etl module).
etl.save_clean_table(immigration, 'immigration', partitions=['year', 'month_id'])

In [34]:
# Read the saved table back as a round-trip check. In the recorded output the
# partition columns (year, month_id) appear as the last two columns.
etl.read_clean_table('immigration').toPandas()

Unnamed: 0,arrival_date,airline,flight_number,port_id,citizenship_id,residence_id,age,age_id,gender_id,visa_id,address_id,stay,stay_id,year,month_id
0,2016-04-22,JL,00782,HHW,209,209,61,4,F,2,HI,7.0,1,2016,4
1,2016-04-23,*GA,XBLNG,MCA,582,582,26,2,M,2,TX,1.0,0,2016,4
2,2016-04-07,LH,00464,OGG,148,112,76,5,M,2,FL,20.0,2,2016,4
3,2016-04-28,QR,00739,LOS,297,297,25,2,M,2,CA,9.0,2,2016,4
4,2016-04-06,,LAND,CHM,111,111,19,1,F,2,NY,3.0,1,2016,4
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
995,2016-04-23,VS,00043,LVG,135,135,32,2,M,2,NV,5.0,1,2016,4
996,2016-04-16,SV,00041,PSP,261,261,35,2,M,1,HI,7.0,1,2016,4
997,2016-04-16,AF,00090,MIA,111,111,39,3,M,2,FL,6.0,1,2016,4
998,2016-04-12,EV,05510,ATL,582,582,35,2,M,1,WI,3.0,1,2016,4
