In [3]:
import findspark
findspark.init()

import pyspark.sql.types as T
import pyspark.sql.functions as F

from datetime import datetime, timedelta

from etl import SparkETL

In [4]:
etl = SparkETL()

22/05/05 11:36:09 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
Using Spark's default log4j profile: org/apache/spark/log4j-defaults.properties
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).


In [5]:
schema = T.StructType([
    T.StructField('_c0', T.StringType(), True),
    T.StructField('cicid', T.StringType(), True),
    T.StructField('i94yr', T.StringType(), True),
    T.StructField('i94mon', T.StringType(), True),
    T.StructField('i94cit', T.StringType(), True),
    T.StructField('i94res', T.StringType(), True),
    T.StructField('i94port', T.StringType(), True),
    T.StructField('arrdate', T.FloatType(), True),
    T.StructField('i94mode', T.StringType(), True),
    T.StructField('i94addr', T.StringType(), True),
    T.StructField('depdate', T.FloatType(), True),
    T.StructField('i94bir', T.FloatType(), True),
    T.StructField('i94visa', T.StringType(), True),
    T.StructField('count', T.StringType(), True),
    T.StructField('dtadfile', T.StringType(), True),
    T.StructField('visapost', T.StringType(), True),
    T.StructField('occup', T.StringType(), True),
    T.StructField('entdepa', T.StringType(), True),
    T.StructField('entdepd', T.StringType(), True),
    T.StructField('entdepu', T.StringType(), True),
    T.StructField('matflag', T.StringType(), True),
    T.StructField('biryear', T.StringType(), True),
    T.StructField('dtaddto', T.StringType(), True),
    T.StructField('gender', T.StringType(), True),
    T.StructField('insnum', T.StringType(), True),
    T.StructField('airline', T.StringType(), True),
    T.StructField('admnum', T.StringType(), True),
    T.StructField('fltno', T.StringType(), True),
    T.StructField('visatype', T.StringType(), True)
])

In [6]:
immigration_staging = (
    spark
    .read
    .format('csv')
    .schema(schema)
    .option('format', 'csv')
    .option('header', 'true')
    .load("../../immigration_data_sample.csv")
)

NameError: name 'spark' is not defined

In [None]:
sas_epoc = datetime(1960, 1, 1)

@F.udf(T.DateType())
def convert_sas_date(arrdate):
    return sas_epoc + timedelta(days=arrdate)

In [None]:
class Age():
    
    descriptions = {
        0: 'Preschool (-7)',
        1: 'Gen Z (7-22)',
        2: 'Millennial (23-38)',
        3: 'Gen X (40-54)',
        4: 'Boomer (55-75)',
        5: 'Senior (>75)',
    }

    infant = 0
    genz = 1
    millenial = 2
    genx = 3
    boomer = 4
    senior = 5
    
    def __init__(self, age):
        self.age = age
        
    def group(self):
        if self.age <= 7:
            return self.infant
        if self.age <= 22:
            return self.genz
        elif self.age <= 38:
            return self.millenial
        elif self.age <=54:
            return self.genx
        elif self.age <= 75:
            return self.boomer
        else:
            return self.senior
        
    def description(self):
        return self.descriptions[self.group()]

In [None]:
class Stay():
    
    descriptions = {
        0: '1 day',
        1: '2-7 days',
        2: '8-30 days',
        3: '>30 days',
        4: 'not departed'
    }

    day = 0
    week = 1
    month = 2
    long = 3
    not_departed = 4
    
    def __init__(self, arrdate, depdate):
        self.arrdate = arrdate
        self.depdate = depdate
        
    def group(self):
        if not self.depdate:
            return self.not_departed
        else:
            self.stay = self.depdate - self.arrdate
            
        if self.stay <= 1:
            return self.day
        elif self.stay <= 7:
            return self.week
        elif self.stay <= 30:
            return self.month
        else:
            return self.long
        
    def description(self):
        return self.descriptions[self.group()]

In [None]:
@F.udf(T.IntegerType())
def convert_age(age):
    return Age(age).group()

In [None]:
@F.udf(T.IntegerType())
def convert_stay(arrdate, depdate):
    return Stay(arrdate, depdate).group()

In [8]:
def clean_immigration(df):
    return (
        df
        .select(
            # partition
            F.col('i94yr').cast('int').alias('year'),
            F.col('i94mon').cast('int').alias('month'),
            
            convert_sas_date(F.col('arrdate')).alias('arrival_date'),
            'airline',
            F.col('fltno').alias('flight_number'),
            F.col('i94port').alias('port_id'),
            F.col('i94cit').cast('int').alias('citizenship_id'),
            F.col('i94res').cast('int').alias('residence_id'),
            F.col('i94bir').cast('int').alias('age'),
            convert_age(F.col('i94bir')).alias('age_id'),
            F.col('gender').alias('gender_id'),
            F.col('i94visa').cast('int').alias('visa_id'),
            F.col('i94addr').alias('address_state_id'),
            (F.col('depdate') - F.col('arrdate')).cast('int').alias('stay'),
            convert_stay(F.col('arrdate'), F.col('depdate')).alias('stay_id')
        )
)

In [9]:
(
    clean_immigration(immigration_staging)

).show(1000)

NameError: name 'immigration_staging' is not defined

In [13]:
immigration = clean_immigration(immigration_staging)

In [14]:
etl.save_clean_table(immigration, 'immigration', partition=['year', 'month'])

[Stage 1:>                                                          (0 + 1) / 1]                                                                                

In [15]:
etl.read_clean_table('immigration').show(1)

+------------+-------+-------------+-------+--------------+------------+---+------+---------+-------+----------------+----+-------+----+-----+
|arrival_date|airline|flight_number|port_id|citizenship_id|residence_id|age|age_id|gender_id|visa_id|address_state_id|stay|stay_id|year|month|
+------------+-------+-------------+-------+--------------+------------+---+------+---------+-------+----------------+----+-------+----+-----+
|  2016-04-22|     JL|        00782|    HHW|           209|         209| 61|     4|        F|      2|              HI|   7|      1|2016|    4|
+------------+-------+-------------+-------+--------------+------------+---+------+---------+-------+----------------+----+-------+----+-----+
only showing top 1 row

