In [38]:
import findspark
findspark.init()

import pyspark.sql.types as T
import pyspark.sql.functions as F

import pandas as pd
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', 1000)

from datetime import datetime, timedelta

from etl import SparkETL
from age import Age
from stay import Stay

In [2]:
# Create the project's ETL helper and get the Spark session it provides.
# The helper is also used below for data source paths and for saving/reading tables.
etl = SparkETL()
spark = etl.get_spark()

22/05/09 11:17:42 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
Using Spark's default log4j profile: org/apache/spark/log4j-defaults.properties
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
22/05/09 11:17:43 WARN Utils: Service 'SparkUI' could not bind on port 4040. Attempting port 4041.


In [35]:
# Explicit schema for the raw immigration CSV. Every field is nullable; only
# arrdate, i94mode, depdate, i94bir and count are read as floats, the rest as
# strings. '_c0' names the first CSV column, whose header cell is empty.
schema = T.StructType([
    T.StructField(column_name, column_type, True)
    for column_name, column_type in (
        ('_c0', T.StringType()),
        ('cicid', T.StringType()),
        ('i94yr', T.StringType()),
        ('i94mon', T.StringType()),
        ('i94cit', T.StringType()),
        ('i94res', T.StringType()),
        ('i94port', T.StringType()),
        ('arrdate', T.FloatType()),
        ('i94mode', T.FloatType()),
        ('i94addr', T.StringType()),
        ('depdate', T.FloatType()),
        ('i94bir', T.FloatType()),
        ('i94visa', T.StringType()),
        ('count', T.FloatType()),
        ('dtadfile', T.StringType()),
        ('visapost', T.StringType()),
        ('occup', T.StringType()),
        ('entdepa', T.StringType()),
        ('entdepd', T.StringType()),
        ('entdepu', T.StringType()),
        ('matflag', T.StringType()),
        ('biryear', T.StringType()),
        ('dtaddto', T.StringType()),
        ('gender', T.StringType()),
        ('insnum', T.StringType()),
        ('airline', T.StringType()),
        ('admnum', T.StringType()),
        ('fltno', T.StringType()),
        ('visatype', T.StringType()),
    )
])

In [36]:
# Load the raw immigration CSV using the explicit schema defined above.
# The data source is selected by .format('csv'); 'format' is not a CSV reader
# option, so it is not passed through .option().
immigration_staging = (
    spark
    .read
    .format('csv')
    .schema(schema)
    # The first line of the file holds column names, not data.
    .option('header', 'true')
    .load(etl.data_sources['immigration'])
)

In [39]:
# Bring a single staged row to the driver as a pandas frame to eyeball the parsed columns.
immigration_staging.limit(1).toPandas()

22/05/09 11:21:30 WARN CSVHeaderChecker: CSV header does not conform to the schema.
 Header: , cicid, i94yr, i94mon, i94cit, i94res, i94port, arrdate, i94mode, i94addr, depdate, i94bir, i94visa, count, dtadfile, visapost, occup, entdepa, entdepd, entdepu, matflag, biryear, dtaddto, gender, insnum, airline, admnum, fltno, visatype
 Schema: _c0, cicid, i94yr, i94mon, i94cit, i94res, i94port, arrdate, i94mode, i94addr, depdate, i94bir, i94visa, count, dtadfile, visapost, occup, entdepa, entdepd, entdepu, matflag, biryear, dtaddto, gender, insnum, airline, admnum, fltno, visatype
Expected: _c0 but found: 
CSV file: file:///Users/charly/DataEng2022/de-capstone/immigration_data_sample.csv


Unnamed: 0,_c0,cicid,i94yr,i94mon,i94cit,i94res,i94port,arrdate,i94mode,i94addr,depdate,i94bir,i94visa,count,dtadfile,visapost,occup,entdepa,entdepd,entdepu,matflag,biryear,dtaddto,gender,insnum,airline,admnum,fltno,visatype
0,2027561,4084316.0,2016.0,4.0,209.0,209.0,HHW,20566.0,1.0,HI,20573.0,61.0,2.0,1.0,20160422,,,G,O,,M,1955.0,7202016,F,,JL,56582674633.0,782,WT


In [52]:
# Day zero of the SAS date scale: SAS dates count days from 1960-01-01.
sas_epoc = datetime(1960, 1, 1)

def convert_sas_date(arrdate):
    """Convert a SAS day count into a datetime.

    Args:
        arrdate: Number of days since 1960-01-01 (int or float), or None.

    Returns:
        ``sas_epoc`` shifted by ``arrdate`` days, or None when ``arrdate`` is
        None, so that missing dates propagate instead of raising TypeError.
    """
    if arrdate is None:
        return None
    return sas_epoc + timedelta(days=arrdate)

In [53]:
@F.udf(T.DateType())
def convert_sas_date_udf(arrdate):
    """Spark UDF: turn a SAS day count column into a DateType value.

    Args:
        arrdate: SAS day count for the row, or None when the cell is null.

    Returns:
        The result of ``convert_sas_date`` for the row, or None for a null
        input so that one missing value does not fail the whole Spark task.
    """
    if arrdate is None:
        return None
    return convert_sas_date(arrdate)

In [54]:


@F.udf(T.IntegerType())
def sas_date_to_day_udf(arrdate):
    """Spark UDF: extract the day of the month from a SAS day count.

    Args:
        arrdate: SAS day count for the row, or None when the cell is null.

    Returns:
        The ``.day`` of the converted date, or None for a null input so that
        one missing value does not fail the whole Spark task.
    """
    if arrdate is None:
        return None
    return convert_sas_date(arrdate).day

In [55]:
@F.udf(T.IntegerType())
def convert_age_udf(age):
    """Spark UDF: map an age value to the integer returned by ``Age(age).group()``.

    ``Age`` comes from the project-local ``age`` module; its grouping rules
    are not visible in this notebook.
    """
    age_wrapper = Age(age)
    return age_wrapper.group()

In [56]:
@F.udf(T.IntegerType())
def convert_stay_udf(arrdate, depdate):
    """Spark UDF: map arrival/departure values to ``Stay(arrdate, depdate).group()``.

    ``Stay`` comes from the project-local ``stay`` module; its grouping rules
    (including how a missing departure is treated) are not visible in this
    notebook.
    """
    stay_wrapper = Stay(arrdate, depdate)
    return stay_wrapper.group()

In [57]:
def only_air(df):
    """Keep only the rows whose ``i94mode`` equals 1.

    The function name treats mode 1 as air travel.
    """
    is_mode_one = F.col('i94mode') == 1
    return df.filter(is_mode_one)

In [58]:
def project_schema(df):
    """Project the staged immigration frame onto the cleaned table's columns.

    Renames the raw ``i94*`` fields, casts the numeric codes to int, and derives
    the arrival date, its day of month, the age group, the length of stay
    (``depdate - arrdate``) and the stay group through the notebook's UDFs.

    Args:
        df: DataFrame with the raw immigration columns.

    Returns:
        DataFrame with the selected columns, in the order listed below.
    """
    # Source columns that feed more than one output column.
    arrival = F.col('arrdate')
    departure = F.col('depdate')
    raw_age = F.col('i94bir')

    projected = [
        # partition
        F.col('i94yr').cast('int').alias('year'),
        F.col('i94mon').cast('int').alias('month_id'),
        sas_date_to_day_udf(arrival).alias('day'),
        convert_sas_date_udf(arrival).alias('arrival_date'),
        'airline',
        F.col('fltno').alias('flight_number'),
        F.col('i94port').alias('port_id'),
        F.col('i94cit').cast('int').alias('citizenship_id'),
        F.col('i94res').cast('int').alias('residence_id'),
        raw_age.cast('int').alias('age'),
        convert_age_udf(raw_age).alias('age_id'),
        F.col('gender').alias('gender_id'),
        F.col('i94visa').cast('int').alias('visa_id'),
        F.col('i94addr').alias('address_id'),
        (departure - arrival).cast('int').alias('stay'),
        convert_stay_udf(arrival, departure).alias('stay_id'),
        'count',
    ]
    return df.select(*projected)

In [59]:
def clean_immigration(df):
    """Apply the cleaning steps to an immigration DataFrame.

    Filters the rows with ``only_air`` and then reshapes the columns with
    ``project_schema``.

    Args:
        df: Staged immigration DataFrame to clean.

    Returns:
        The filtered and projected DataFrame.
    """
    # Operate on the argument rather than the notebook-level
    # `immigration_staging`, so the function cleans whatever frame it is given.
    # NOTE(review): `.pipe` is not a built-in Spark DataFrame method; presumably
    # the project's etl module provides it — confirm.
    return (
        df
        .pipe(only_air)
        .pipe(project_schema)
    )

In [60]:
# Build the cleaned immigration DataFrame from the staged CSV data.
immigration = clean_immigration(immigration_staging)

In [62]:
# Persist the cleaned table through the ETL helper, passing year and month_id as the partitions.
etl.save_clean_table(immigration, 'immigration', partitions=['year', 'month_id'])

In [64]:
# Read the saved table back and preview only its first rows: an unbounded
# toPandas() would collect every row of the table onto the driver.
etl.read_clean_table('immigration').limit(10).toPandas()

Unnamed: 0,day,arrival_date,airline,flight_number,port_id,citizenship_id,residence_id,age,age_id,gender_id,visa_id,address_id,stay,stay_id,count,year,month_id
0,22,2016-04-22,JL,00782,HHW,209,209,61,4,F,2,HI,7.0,1,1.0,2016,4
1,23,2016-04-23,*GA,XBLNG,MCA,582,582,26,2,M,2,TX,1.0,0,1.0,2016,4
2,7,2016-04-07,LH,00464,OGG,148,112,76,5,M,2,FL,20.0,2,1.0,2016,4
3,28,2016-04-28,QR,00739,LOS,297,297,25,2,M,2,CA,9.0,2,1.0,2016,4
4,8,2016-04-08,DL,910,ATL,577,577,51,3,M,2,GA,54.0,3,1.0,2016,4
5,12,2016-04-12,CX,870,SFR,245,245,48,3,F,2,CA,79.0,3,1.0,2016,4
6,2,2016-04-02,BA,00117,NYC,113,135,33,2,F,2,NY,8.0,2,1.0,2016,4
7,28,2016-04-28,LX,00008,CHI,131,131,39,3,,2,IL,3.0,1,1.0,2016,4
8,1,2016-04-01,AA,00109,LOS,116,116,35,2,,2,CA,8.0,2,1.0,2016,4
9,7,2016-04-07,QF,00015,LOS,438,438,4,0,F,2,CA,14.0,2,1.0,2016,4
