In [6]:
# Do all imports and installs here - Done
from pyspark.sql import SparkSession
from pyspark.sql.types import DateType
from pyspark.sql.functions import col, udf
from datetime import datetime
import pandas as pd
import re
import configparser
import os

In [7]:
config = configparser.ConfigParser()
config.read('etl.cfg')

['etl.cfg']

In [8]:
input_data_source = config.get('DIR','INPUT_DIR')
output_processed_data = config.get('DIR','OUTPUT_DIR')

i94immi_dataset = config.get('DATA','I94_IMMI')
worldtempe_dataset = config.get('DATA','WORLD_TEMPE')
citydemo_dataset = config.get('DATA','CITY_DEMOGRAPHIC')
airport_dataset = config.get('DATA','AIR_PORT')
saslabel_dataset = config.get('DATA','SAS_LABEL')

In [9]:
# Create Spark session - Using for droduction only
spark = SparkSession.builder\
            .config("spark.jars.repositories", "https://repos.spark-packages.org/")\
            .config("spark.jars.packages", "saurfang:spark-sas7bdat:2.0.0-s_2.11")\
            .enableHiveSupport()\
            .getOrCreate()

In [10]:
func =  udf (lambda x: datetime.strptime(x, '%Y-%m-%d'), DateType())

## Transform to dim_datetime tables

Read out from staging datasets

- I94 Immigration staging table: `i94immi_table`
- World Temperature staging table: `worldtempe_table`
- I94PORT staging table `i94port_table` from SAS_Labels_Description
- Dim table of date that immigration happen `dim_datetime`
- Dim table of airport that immigration allows `dim_port`
- Dim table of immigration records `dim_immi_traveller`
- Dim table of measure times `dim_us_temperature`

In [11]:
worldtempe_df = spark.read.csv("worldtempe_df_clean.csv", header=True)
worldtempe_df.show()

+----------+------------------+-----------------------------+-------+------------+-------------+
|        dt|averagetemperature|averagetemperatureuncertainty|   city|dt_converted|      country|
+----------+------------------+-----------------------------+-------+------------+-------------+
|1960-02-01|             4.995|                        0.325|ABILENE|  1960-02-01|United States|
|1960-03-01| 8.575000000000001|                        0.303|ABILENE|  1960-03-01|United States|
|1960-04-01|            18.452|                        0.282|ABILENE|  1960-04-01|United States|
|1960-05-01|            21.709|          0.28600000000000003|ABILENE|  1960-05-01|United States|
|1960-06-01|            27.714|                        0.387|ABILENE|  1960-06-01|United States|
|1960-07-01|            27.646|                        0.326|ABILENE|  1960-07-01|United States|
|1960-08-01|            27.481|                        0.341|ABILENE|  1960-08-01|United States|
|1960-09-01|            24.413

In [12]:
worldtempe_df.printSchema()

root
 |-- dt: string (nullable = true)
 |-- averagetemperature: string (nullable = true)
 |-- averagetemperatureuncertainty: string (nullable = true)
 |-- city: string (nullable = true)
 |-- dt_converted: string (nullable = true)
 |-- country: string (nullable = true)



In [13]:
worldtempe_df.toPandas().head()

Unnamed: 0,dt,averagetemperature,averagetemperatureuncertainty,city,dt_converted,country
0,1960-02-01,4.995,0.325,ABILENE,1960-02-01,United States
1,1960-03-01,8.575000000000001,0.303,ABILENE,1960-03-01,United States
2,1960-04-01,18.452,0.282,ABILENE,1960-04-01,United States
3,1960-05-01,21.709,0.286,ABILENE,1960-05-01,United States
4,1960-06-01,27.714,0.387,ABILENE,1960-06-01,United States


In [15]:
worldtempe_df.dtypes

[('dt', 'string'),
 ('averagetemperature', 'string'),
 ('averagetemperatureuncertainty', 'string'),
 ('city', 'string'),
 ('dt_converted', 'string'),
 ('country', 'string')]

In [16]:
worldtempe_df = worldtempe_df.withColumn('dt_converted', func(col('dt_converted')))
worldtempe_df.show(2)

+----------+------------------+-----------------------------+-------+------------+-------------+
|        dt|averagetemperature|averagetemperatureuncertainty|   city|dt_converted|      country|
+----------+------------------+-----------------------------+-------+------------+-------------+
|1960-02-01|             4.995|                        0.325|ABILENE|  1960-02-01|United States|
|1960-03-01| 8.575000000000001|                        0.303|ABILENE|  1960-03-01|United States|
+----------+------------------+-----------------------------+-------+------------+-------------+
only showing top 2 rows



In [17]:
worldtempe_df.dtypes

[('dt', 'string'),
 ('averagetemperature', 'string'),
 ('averagetemperatureuncertainty', 'string'),
 ('city', 'string'),
 ('dt_converted', 'date'),
 ('country', 'string')]

In [44]:
worldtempe_df.createOrReplaceTempView('worldtempe_table')

In [45]:
spark.sql("""
    SELECT COUNT(*) as amount_worldtempe_rows
    FROM worldtempe_table
""").show()

+----------------------+
|amount_worldtempe_rows|
+----------------------+
|                165508|
+----------------------+



In [50]:
spark.sql("""
        SELECT 
            dt_converted,
            MONTH(worldtempe_table.dt_converted) as tempe_month,
            YEAR(worldtempe_table.dt_converted) as tempe_year,
            dt,
            city,
            averagetemperature,
            averagetemperatureuncertainty
        FROM worldtempe_table
            """).createOrReplaceTempView('worldtempe_table')

In [20]:
i94immi_df = spark.read.options(inferSchema="true", delimiter=",", header = "true").csv("i94immi_df_clean")
i94immi_df.show()

+---------+------+------+-------------------+------+-------+-------+-------+-------------------+
|    cicid| i94yr|i94mon|       arrival_date|i94res|i94port|arrdate|i94addr|     departure_date|
+---------+------+------+-------------------+------+-------+-------+-------+-------------------+
|5341351.0|2016.0|   4.0|2016-04-28 00:00:00| 575.0|    DAL|20572.0|     NV|2016-05-03 00:00:00|
|5341352.0|2016.0|   4.0|2016-04-28 00:00:00| 575.0|    DAL|20572.0|     NV|2016-05-03 00:00:00|
|5341353.0|2016.0|   4.0|2016-04-28 00:00:00| 575.0|    DAL|20572.0|     NV|2016-05-03 00:00:00|
|5341354.0|2016.0|   4.0|2016-04-28 00:00:00| 575.0|    DAL|20572.0|     NV|2016-05-03 00:00:00|
|5341355.0|2016.0|   4.0|2016-04-28 00:00:00| 575.0|    DAL|20572.0|     NV|2016-05-03 00:00:00|
|5341356.0|2016.0|   4.0|2016-04-28 00:00:00| 575.0|    DAL|20572.0|     NY|2016-05-08 00:00:00|
|5341357.0|2016.0|   4.0|2016-04-28 00:00:00| 575.0|    DAL|20572.0|     TX|2016-05-01 00:00:00|
|5341358.0|2016.0|   4.0|2016-

In [21]:
i94immi_df.printSchema()

root
 |-- cicid: double (nullable = true)
 |-- i94yr: double (nullable = true)
 |-- i94mon: double (nullable = true)
 |-- arrival_date: timestamp (nullable = true)
 |-- i94res: double (nullable = true)
 |-- i94port: string (nullable = true)
 |-- arrdate: double (nullable = true)
 |-- i94addr: string (nullable = true)
 |-- departure_date: timestamp (nullable = true)



In [22]:
i94immi_df.createOrReplaceTempView('i94immi_table')

In [16]:
spark.sql("""
    SELECT COUNT(*) as amount_i94immi_rows
    FROM i94immi_table
""").show()

+-------------------+
|amount_i94immi_rows|
+-------------------+
|            2465314|
+-------------------+



In [23]:
i94port_df = spark.read.options(inferSchema="true", delimiter=",", header = "true").csv("i94port_staging")
i94port_df.show()

+------------------+--------------------+------------------+
|i94port_valid_code|   i94port_city_name|i94port_state_code|
+------------------+--------------------+------------------+
|               ALC|               ALCAN|                AK|
|               ANC|           ANCHORAGE|                AK|
|               BAR|BAKER AAF - BAKER...|                AK|
|               DAC|       DALTONS CACHE|                AK|
|               PIZ|DEW STATION PT LA...|                AK|
|               DTH|        DUTCH HARBOR|                AK|
|               EGL|               EAGLE|                AK|
|               FRB|           FAIRBANKS|                AK|
|               HOM|               HOMER|                AK|
|               HYD|               HYDER|                AK|
|               JUN|              JUNEAU|                AK|
|               5KE|           KETCHIKAN|                AK|
|               KET|           KETCHIKAN|                AK|
|               MOS|MOSE

In [18]:
i94port_df.printSchema()

root
 |-- i94port_valid_code: string (nullable = true)
 |-- i94port_city_name: string (nullable = true)
 |-- i94port_state_code: string (nullable = true)



In [24]:
i94port_df.createOrReplaceTempView('i94port_table')

In [20]:
spark.sql("""
    SELECT COUNT(*) as amount_i94port_rows
    FROM i94port_table
""").show()

+-------------------+
|amount_i94port_rows|
+-------------------+
|                583|
+-------------------+



Create tables from staging datasets

In [25]:
# i94immi_table as immi94
# worldtempe_table as wt
# i94port_table as port
spark.sql("""
        SELECT 
            immi94.departure_date as arrival_date
        FROM i94immi_table as immi94
        LEFT JOIN worldtempe_table as wt
                ON wt.dt_converted = immi94.departure_date
            """).createOrReplaceTempView('dim_datetime')

In [26]:
spark.sql("""
        SELECT 
            dim_datetime.arrival_date,
            MONTH(dim_datetime.arrival_date) as arrival_month, 
            YEAR(dim_datetime.arrival_date) as arrival_year
        FROM dim_datetime
            """).createOrReplaceTempView('dim_datetime')

In [42]:
spark.sql("""
        SELECT *
        FROM dim_datetime
            """).show()

+-------------------+-------------+------------+
|       arrival_date|arrival_month|arrival_year|
+-------------------+-------------+------------+
|2016-05-03 00:00:00|            5|        2016|
|2016-05-03 00:00:00|            5|        2016|
|2016-05-03 00:00:00|            5|        2016|
|2016-05-03 00:00:00|            5|        2016|
|2016-05-03 00:00:00|            5|        2016|
|2016-05-08 00:00:00|            5|        2016|
|2016-05-01 00:00:00|            5|        2016|
|2016-05-02 00:00:00|            5|        2016|
|2016-05-07 00:00:00|            5|        2016|
|2016-04-30 00:00:00|            4|        2016|
|2016-04-30 00:00:00|            4|        2016|
|2016-05-13 00:00:00|            5|        2016|
|2016-05-13 00:00:00|            5|        2016|
|2016-06-01 00:00:00|            6|        2016|
|2016-05-01 00:00:00|            5|        2016|
|2016-05-01 00:00:00|            5|        2016|
|2016-05-01 00:00:00|            5|        2016|
|2016-05-01 00:00:00

In [27]:
# i94immi_table as immi94
# worldtempe_table as wt
# i94port_table as port
spark.sql("""
        SELECT 
            i94port_table.i94port_valid_code as port_code,
            i94port_table.i94port_city_name as city_name, 
            i94port_table.i94port_state_code as state
        FROM i94port_table
            """).createOrReplaceTempView('dim_port')

In [46]:
spark.sql("""
        SELECT *
        FROM dim_port
            """).show()

+---------+--------------------+-----+
|port_code|           city_name|state|
+---------+--------------------+-----+
|      ALC|               ALCAN|   AK|
|      ANC|           ANCHORAGE|   AK|
|      BAR|BAKER AAF - BAKER...|   AK|
|      DAC|       DALTONS CACHE|   AK|
|      PIZ|DEW STATION PT LA...|   AK|
|      DTH|        DUTCH HARBOR|   AK|
|      EGL|               EAGLE|   AK|
|      FRB|           FAIRBANKS|   AK|
|      HOM|               HOMER|   AK|
|      HYD|               HYDER|   AK|
|      JUN|              JUNEAU|   AK|
|      5KE|           KETCHIKAN|   AK|
|      KET|           KETCHIKAN|   AK|
|      MOS|MOSES POINT INTER...|   AK|
|      NIK|             NIKISKI|   AK|
|      NOM|                 NOM|   AK|
|      PKC|         POKER CREEK|   AK|
|      ORI|      PORT LIONS SPB|   AK|
|      SKA|             SKAGWAY|   AK|
|      SNP|     ST. PAUL ISLAND|   AK|
+---------+--------------------+-----+
only showing top 20 rows



In [28]:
# i94immi_table as immi94
# worldtempe_table as wt
# i94port_table as port
# dim_datetime as ddate
spark.sql("""
        SELECT 
            immi94.cicid as immi_cicid,
            ddate.arrival_date as immi_datetime_iso,
            immi94.i94port as arr_port_code
        FROM i94immi_table as immi94
        JOIN dim_datetime as ddate
            ON ddate.arrival_date = immi94.arrival_date
            """).createOrReplaceTempView('dim_immi_traveller')

In [44]:
spark.sql("""
        SELECT *
        FROM dim_immi_traveller
            """).show()

+----------+-------------------+-------------+
|immi_cicid|  immi_datetime_iso|arr_port_code|
+----------+-------------------+-------------+
| 1633969.0|2016-04-10 00:00:00|          CHI|
| 1633969.0|2016-04-10 00:00:00|          CHI|
| 1633969.0|2016-04-10 00:00:00|          CHI|
| 1633969.0|2016-04-10 00:00:00|          CHI|
| 1633969.0|2016-04-10 00:00:00|          CHI|
| 1633969.0|2016-04-10 00:00:00|          CHI|
| 1633969.0|2016-04-10 00:00:00|          CHI|
| 1633969.0|2016-04-10 00:00:00|          CHI|
| 1633969.0|2016-04-10 00:00:00|          CHI|
| 1633969.0|2016-04-10 00:00:00|          CHI|
| 1633969.0|2016-04-10 00:00:00|          CHI|
| 1633969.0|2016-04-10 00:00:00|          CHI|
| 1633969.0|2016-04-10 00:00:00|          CHI|
| 1633969.0|2016-04-10 00:00:00|          CHI|
| 1633969.0|2016-04-10 00:00:00|          CHI|
| 1633969.0|2016-04-10 00:00:00|          CHI|
| 1633969.0|2016-04-10 00:00:00|          CHI|
| 1633969.0|2016-04-10 00:00:00|          CHI|
| 1633969.0|2

In [29]:
# i94immi_table as immi94
# worldtempe_table as wt
# i94port_table as port
# dim_datetime as ddate
spark.sql("""
        SELECT 
            immi94.cicid as immi_cicid,
            ddate.arrival_date as immi_datetime_iso,
            immi94.i94port as arr_port_code
        FROM i94immi_table as immi94
        JOIN dim_datetime as ddate
            ON ddate.arrival_date = immi94.arrival_date
            """).createOrReplaceTempView('dim_immi_traveller')

In [30]:
# i94immi_table as immi94
# worldtempe_table as wt
# i94port_table as port
# dim_datetime as ddate
spark.sql("""
        SELECT d_port.city_name as city
        FROM dim_port as d_port
        JOIN i94immi_table as immi94
            ON immi94.i94port = d_port.port_code
        """).show()

+----------------+
|            city|
+----------------+
|          DALLAS|
|          DALLAS|
|          DALLAS|
|          DALLAS|
|          DALLAS|
|          DALLAS|
|          DALLAS|
|          DALLAS|
|          DALLAS|
|NEWARK/TETERBORO|
|NEWARK/TETERBORO|
|NEWARK/TETERBORO|
|NEWARK/TETERBORO|
|NEWARK/TETERBORO|
|NEWARK/TETERBORO|
|NEWARK/TETERBORO|
|NEWARK/TETERBORO|
|NEWARK/TETERBORO|
|NEWARK/TETERBORO|
|NEWARK/TETERBORO|
+----------------+
only showing top 20 rows



In [69]:
# i94immi_table as immi94
# worldtempe_table as wt
# i94port_table as port
# dim_datetime as ddate
spark.sql("""
        SELECT *
        FROM dim_immi_traveller
     """).show()

+----------+-------------------+-------------+
|immi_cicid|  immi_datetime_iso|arr_port_code|
+----------+-------------------+-------------+
| 1633969.0|2016-04-10 00:00:00|          CHI|
| 1633969.0|2016-04-10 00:00:00|          CHI|
| 1633969.0|2016-04-10 00:00:00|          CHI|
| 1633969.0|2016-04-10 00:00:00|          CHI|
| 1633969.0|2016-04-10 00:00:00|          CHI|
| 1633969.0|2016-04-10 00:00:00|          CHI|
| 1633969.0|2016-04-10 00:00:00|          CHI|
| 1633969.0|2016-04-10 00:00:00|          CHI|
| 1633969.0|2016-04-10 00:00:00|          CHI|
| 1633969.0|2016-04-10 00:00:00|          CHI|
| 1633969.0|2016-04-10 00:00:00|          CHI|
| 1633969.0|2016-04-10 00:00:00|          CHI|
| 1633969.0|2016-04-10 00:00:00|          CHI|
| 1633969.0|2016-04-10 00:00:00|          CHI|
| 1633969.0|2016-04-10 00:00:00|          CHI|
| 1633969.0|2016-04-10 00:00:00|          CHI|
| 1633969.0|2016-04-10 00:00:00|          CHI|
| 1633969.0|2016-04-10 00:00:00|          CHI|
| 1633969.0|2

In [70]:
spark.sql("""
        SELECT *
        FROM dim_port
            """).show()

+---------+--------------------+-----+
|port_code|           city_name|state|
+---------+--------------------+-----+
|      ALC|               ALCAN|   AK|
|      ANC|           ANCHORAGE|   AK|
|      BAR|BAKER AAF - BAKER...|   AK|
|      DAC|       DALTONS CACHE|   AK|
|      PIZ|DEW STATION PT LA...|   AK|
|      DTH|        DUTCH HARBOR|   AK|
|      EGL|               EAGLE|   AK|
|      FRB|           FAIRBANKS|   AK|
|      HOM|               HOMER|   AK|
|      HYD|               HYDER|   AK|
|      JUN|              JUNEAU|   AK|
|      5KE|           KETCHIKAN|   AK|
|      KET|           KETCHIKAN|   AK|
|      MOS|MOSES POINT INTER...|   AK|
|      NIK|             NIKISKI|   AK|
|      NOM|                 NOM|   AK|
|      PKC|         POKER CREEK|   AK|
|      ORI|      PORT LIONS SPB|   AK|
|      SKA|             SKAGWAY|   AK|
|      SNP|     ST. PAUL ISLAND|   AK|
+---------+--------------------+-----+
only showing top 20 rows



In [32]:
# i94immi_table as immi94
# worldtempe_table as wt
# i94port_table as port
# dim_datetime as ddate
# dim_port as dport
spark.sql("""
        SELECT 
            dport.city_name as travel_city,
            d_travel.arr_port_code as arr_port_code,
            d_travel.immi_datetime_iso as immi_datetime_iso,
            d_travel.immi_cicid as immi_cicid
        FROM dim_port as dport
        JOIN dim_immi_traveller as d_travel
            ON dport.port_code = d_travel.arr_port_code
     """).createOrReplaceTempView('dim_immi_traveller')

In [None]:
# Crash program
spark.sql("""
        SELECT COUNT(*)
        FROM dim_immi_traveller
            """).show()

In [None]:
# i94immi_table as immi94
# worldtempe_table as wt
# i94port_table as port
# dim_datetime as ddate
# dim_port as dport
spark.sql("""
        SELECT 
            ddate.arrival_date as collected_datetime,
            ddate.arrival_month as tempe_month,
            wt.averagetemperature as avg_tempe,
            wt.city as city_tempe_collect
        FROM dim_datetime as ddate
        LEFT JOIN worldtempe_table as wt
            ON wt.tempe_month = ddate.arrival_month
     """).createOrReplaceTempView('dim_us_temperature')

In [None]:
spark.sql("""
        SELECT *
        FROM dim_us_temperature
     """).show(5)

In [None]:
spark.sql("""
        SELECT 
            city_tempe_collect,
            AVG(avg_tempe)
        FROM dim_us_temperature
        GROUP BY city_tempe_collect
     """).show(20)