In [156]:
# Do all imports and installs here - Done
from pyspark.sql import SparkSession
from pyspark.sql.types import DateType
from pyspark.sql.functions import col, udf
from pyspark.sql.types import DoubleType
from datetime import datetime
import pandas as pd
import re
import configparser
import os

In [157]:
config = configparser.ConfigParser()
config.read('etl.cfg')

['etl.cfg']

In [158]:
input_data_source = config.get('DIR','INPUT_DIR')
output_processed_data = config.get('DIR','OUTPUT_DIR')

i94immi_dataset = config.get('DATA','I94_IMMI')
worldtempe_dataset = config.get('DATA','WORLD_TEMPE')
citydemo_dataset = config.get('DATA','CITY_DEMOGRAPHIC')
airport_dataset = config.get('DATA','AIR_PORT')
saslabel_dataset = config.get('DATA','SAS_LABEL')

In [159]:
# Create Spark session - Using for droduction only
spark = SparkSession.builder\
            .config("spark.jars.repositories", "https://repos.spark-packages.org/")\
            .config("spark.jars.packages", "saurfang:spark-sas7bdat:2.0.0-s_2.11")\
            .enableHiveSupport()\
            .getOrCreate()

In [160]:
func =  udf (lambda x: datetime.strptime(x, '%Y-%m-%d'), DateType())

## Transform to dim_datetime tables

Read out from staging datasets

- I94 Immigration staging table: `i94immi_table`
- World Temperature staging table: `worldtempe_table`
- I94PORT staging table `i94port_table` from SAS_Labels_Description

- Fact table `fact_immi_weather` wraps informations from datasets to analyze relation between traveller traffic and weather on a specific city.
- Dim table of date that immigration happen `dim_datetime`.
- Dim table of airport that immigration allows `dim_port`.
- Dim table of immigration records `dim_immi_traveller`.
- Dim table of measure times `dim_us_temperature`.

In [161]:
i94immi_df = spark.read.options(inferSchema="true", delimiter=",", header = "true").csv("i94immi_df_clean")
i94immi_df.show()

+---------+------+------+-------------------+------+-------+-------+-------+-------------------+
|    cicid| i94yr|i94mon|       arrival_date|i94res|i94port|arrdate|i94addr|     departure_date|
+---------+------+------+-------------------+------+-------+-------+-------+-------------------+
|5341351.0|2016.0|   4.0|2016-04-28 00:00:00| 575.0|    DAL|20572.0|     NV|2016-05-03 00:00:00|
|5341352.0|2016.0|   4.0|2016-04-28 00:00:00| 575.0|    DAL|20572.0|     NV|2016-05-03 00:00:00|
|5341353.0|2016.0|   4.0|2016-04-28 00:00:00| 575.0|    DAL|20572.0|     NV|2016-05-03 00:00:00|
|5341354.0|2016.0|   4.0|2016-04-28 00:00:00| 575.0|    DAL|20572.0|     NV|2016-05-03 00:00:00|
|5341355.0|2016.0|   4.0|2016-04-28 00:00:00| 575.0|    DAL|20572.0|     NV|2016-05-03 00:00:00|
|5341356.0|2016.0|   4.0|2016-04-28 00:00:00| 575.0|    DAL|20572.0|     NY|2016-05-08 00:00:00|
|5341357.0|2016.0|   4.0|2016-04-28 00:00:00| 575.0|    DAL|20572.0|     TX|2016-05-01 00:00:00|
|5341358.0|2016.0|   4.0|2016-

In [71]:
i94immi_df.printSchema()

root
 |-- cicid: double (nullable = true)
 |-- i94yr: double (nullable = true)
 |-- i94mon: double (nullable = true)
 |-- arrival_date: timestamp (nullable = true)
 |-- i94res: double (nullable = true)
 |-- i94port: string (nullable = true)
 |-- arrdate: double (nullable = true)
 |-- i94addr: string (nullable = true)
 |-- departure_date: timestamp (nullable = true)



In [162]:
# i94immi_table here
i94immi_df.createOrReplaceTempView('i94immi_table')

In [18]:
spark.sql("""
    SELECT *
    FROM i94immi_table
""").show(3)

+---------+------+------+-------------------+------+-------+-------+-------+-------------------+
|    cicid| i94yr|i94mon|       arrival_date|i94res|i94port|arrdate|i94addr|     departure_date|
+---------+------+------+-------------------+------+-------+-------+-------+-------------------+
|5341351.0|2016.0|   4.0|2016-04-28 00:00:00| 575.0|    DAL|20572.0|     NV|2016-05-03 00:00:00|
|5341352.0|2016.0|   4.0|2016-04-28 00:00:00| 575.0|    DAL|20572.0|     NV|2016-05-03 00:00:00|
|5341353.0|2016.0|   4.0|2016-04-28 00:00:00| 575.0|    DAL|20572.0|     NV|2016-05-03 00:00:00|
+---------+------+------+-------------------+------+-------+-------+-------+-------------------+
only showing top 3 rows



In [163]:
spark.sql("""
        SELECT 
            cicid,
            arrival_date,
            YEAR(arrival_date) as i94yr,
            MONTH(arrival_date) as i94mon,
            i94res,
            i94port,
            arrdate,
            i94addr,
            departure_date
        FROM i94immi_table
            """).createOrReplaceTempView('i94immi_table')

In [32]:
spark.sql("""
    SELECT *
    FROM i94immi_table
""").show(3)

+---------+-------------------+-----+------+------+-------+-------+-------+-------------------+
|    cicid|       arrival_date|i94yr|i94mon|i94res|i94port|arrdate|i94addr|     departure_date|
+---------+-------------------+-----+------+------+-------+-------+-------+-------------------+
|5341351.0|2016-04-28 00:00:00| 2016|     4| 575.0|    DAL|20572.0|     NV|2016-05-03 00:00:00|
|5341352.0|2016-04-28 00:00:00| 2016|     4| 575.0|    DAL|20572.0|     NV|2016-05-03 00:00:00|
|5341353.0|2016-04-28 00:00:00| 2016|     4| 575.0|    DAL|20572.0|     NV|2016-05-03 00:00:00|
+---------+-------------------+-----+------+------+-------+-------+-------+-------------------+
only showing top 3 rows



In [33]:
spark.sql("""
    SELECT COUNT(*) as amount_i94immi_rows
    FROM i94immi_table
""").show()

+-------------------+
|amount_i94immi_rows|
+-------------------+
|            2465314|
+-------------------+



In [164]:
worldtempe_df = spark.read.csv("worldtempe_df_clean.csv", header=True)
worldtempe_df.show()

+----------+------------------+-----------------------------+-------+------------+-------------+
|        dt|averagetemperature|averagetemperatureuncertainty|   city|dt_converted|      country|
+----------+------------------+-----------------------------+-------+------------+-------------+
|1960-02-01|             4.995|                        0.325|ABILENE|  1960-02-01|United States|
|1960-03-01| 8.575000000000001|                        0.303|ABILENE|  1960-03-01|United States|
|1960-04-01|            18.452|                        0.282|ABILENE|  1960-04-01|United States|
|1960-05-01|            21.709|          0.28600000000000003|ABILENE|  1960-05-01|United States|
|1960-06-01|            27.714|                        0.387|ABILENE|  1960-06-01|United States|
|1960-07-01|            27.646|                        0.326|ABILENE|  1960-07-01|United States|
|1960-08-01|            27.481|                        0.341|ABILENE|  1960-08-01|United States|
|1960-09-01|            24.413

In [121]:
worldtempe_df.printSchema()

root
 |-- dt: string (nullable = true)
 |-- averagetemperature: string (nullable = true)
 |-- averagetemperatureuncertainty: string (nullable = true)
 |-- city: string (nullable = true)
 |-- dt_converted: string (nullable = true)
 |-- country: string (nullable = true)



In [165]:
worldtempe_df.toPandas().head()

Unnamed: 0,dt,averagetemperature,averagetemperatureuncertainty,city,dt_converted,country
0,1960-02-01,4.995,0.325,ABILENE,1960-02-01,United States
1,1960-03-01,8.575000000000001,0.303,ABILENE,1960-03-01,United States
2,1960-04-01,18.452,0.282,ABILENE,1960-04-01,United States
3,1960-05-01,21.709,0.286,ABILENE,1960-05-01,United States
4,1960-06-01,27.714,0.387,ABILENE,1960-06-01,United States


In [123]:
worldtempe_df.dtypes

[('dt', 'string'),
 ('averagetemperature', 'string'),
 ('averagetemperatureuncertainty', 'string'),
 ('city', 'string'),
 ('dt_converted', 'string'),
 ('country', 'string')]

In [166]:
worldtempe_df = worldtempe_df.withColumn("averagetemperature", worldtempe_df["averagetemperature"].cast(DoubleType()).alias("averagetemperature"))

In [167]:
worldtempe_df = worldtempe_df.withColumn('dt_converted', func(col('dt_converted')))
worldtempe_df.show(2)

+----------+------------------+-----------------------------+-------+------------+-------------+
|        dt|averagetemperature|averagetemperatureuncertainty|   city|dt_converted|      country|
+----------+------------------+-----------------------------+-------+------------+-------------+
|1960-02-01|             4.995|                        0.325|ABILENE|  1960-02-01|United States|
|1960-03-01| 8.575000000000001|                        0.303|ABILENE|  1960-03-01|United States|
+----------+------------------+-----------------------------+-------+------------+-------------+
only showing top 2 rows



In [126]:
worldtempe_df.dtypes

[('dt', 'string'),
 ('averagetemperature', 'double'),
 ('averagetemperatureuncertainty', 'string'),
 ('city', 'string'),
 ('dt_converted', 'date'),
 ('country', 'string')]

In [168]:
# worldtempe_table here
worldtempe_df.createOrReplaceTempView('worldtempe_table')

In [128]:
spark.sql("""
    SELECT COUNT(*) as amount_worldtempe_rows
    FROM worldtempe_table
""").show()

+----------------------+
|amount_worldtempe_rows|
+----------------------+
|                165508|
+----------------------+



In [169]:
spark.sql("""
        SELECT 
            dt_converted,
            MONTH(worldtempe_table.dt_converted) as tempe_month,
            YEAR(worldtempe_table.dt_converted) as tempe_year,
            dt,
            city,
            averagetemperature,
            averagetemperatureuncertainty
        FROM worldtempe_table
            """).createOrReplaceTempView('worldtempe_table')

In [130]:
spark.sql("""
    SELECT *
    FROM worldtempe_table
""").show(3)

+------------+-----------+----------+----------+-------+------------------+-----------------------------+
|dt_converted|tempe_month|tempe_year|        dt|   city|averagetemperature|averagetemperatureuncertainty|
+------------+-----------+----------+----------+-------+------------------+-----------------------------+
|  1960-02-01|          2|      1960|1960-02-01|ABILENE|             4.995|                        0.325|
|  1960-03-01|          3|      1960|1960-03-01|ABILENE| 8.575000000000001|                        0.303|
|  1960-04-01|          4|      1960|1960-04-01|ABILENE|            18.452|                        0.282|
+------------+-----------+----------+----------+-------+------------------+-----------------------------+
only showing top 3 rows



In [170]:
spark.sql("""
    SELECT 
        city,
        tempe_month,
        BROUND(AVG(averagetemperature),2) as averagetemperature,
        BROUND(AVG(averagetemperatureuncertainty),2) as averagetemperatureuncertainty,
        tempe_year,
        dt_converted
    FROM worldtempe_table
    GROUP BY city, tempe_month, tempe_year, dt_converted
""").createOrReplaceTempView('worldtempe_table')

In [146]:
spark.sql("""
    SELECT *
    FROM worldtempe_table
    WHERE tempe_month == 4
""").show(3)

+-----------+-----------+------------------+-----------------------------+----------+------------+
|       city|tempe_month|averagetemperature|averagetemperatureuncertainty|tempe_year|dt_converted|
+-----------+-----------+------------------+-----------------------------+----------+------------+
|ALBUQUERQUE|          4|              8.71|                         0.36|      1970|  1970-04-01|
|  ALLENTOWN|          4|              8.72|                         0.18|      1984|  1984-04-01|
|  ANCHORAGE|          4|             -1.29|                         0.43|      1987|  1987-04-01|
+-----------+-----------+------------------+-----------------------------+----------+------------+
only showing top 3 rows



In [171]:
i94port_df = spark.read.options(inferSchema="true", delimiter=",", header = "true").csv("i94port_staging")
i94port_df.show()

+------------------+--------------------+------------------+
|i94port_valid_code|   i94port_city_name|i94port_state_code|
+------------------+--------------------+------------------+
|               ALC|               ALCAN|                AK|
|               ANC|           ANCHORAGE|                AK|
|               BAR|BAKER AAF - BAKER...|                AK|
|               DAC|       DALTONS CACHE|                AK|
|               PIZ|DEW STATION PT LA...|                AK|
|               DTH|        DUTCH HARBOR|                AK|
|               EGL|               EAGLE|                AK|
|               FRB|           FAIRBANKS|                AK|
|               HOM|               HOMER|                AK|
|               HYD|               HYDER|                AK|
|               JUN|              JUNEAU|                AK|
|               5KE|           KETCHIKAN|                AK|
|               KET|           KETCHIKAN|                AK|
|               MOS|MOSE

In [73]:
i94port_df.printSchema()

root
 |-- i94port_valid_code: string (nullable = true)
 |-- i94port_city_name: string (nullable = true)
 |-- i94port_state_code: string (nullable = true)



In [172]:
# i94port_table
i94port_df.createOrReplaceTempView('i94port_table')

In [75]:
spark.sql("""
    SELECT COUNT(*) as amount_i94port_rows
    FROM i94port_table
""").show()

+-------------------+
|amount_i94port_rows|
+-------------------+
|                583|
+-------------------+



In [76]:
spark.sql("""
    SELECT *
    FROM i94port_table
""").show(3)

+------------------+--------------------+------------------+
|i94port_valid_code|   i94port_city_name|i94port_state_code|
+------------------+--------------------+------------------+
|               ALC|               ALCAN|                AK|
|               ANC|           ANCHORAGE|                AK|
|               BAR|BAKER AAF - BAKER...|                AK|
+------------------+--------------------+------------------+
only showing top 3 rows



Create dim and fact tables from staging datasets

In [173]:
# i94immi_table as immi94
# worldtempe_table as wt
# i94port_table as port
# month, year???
spark.sql("""
        SELECT 
            immi94.arrival_date as arrival_date
        FROM i94immi_table as immi94
        LEFT JOIN worldtempe_table as wt
                ON wt.dt_converted = immi94.arrival_date
            """).createOrReplaceTempView('dim_datetime')

In [174]:
spark.sql("""
        SELECT 
            dim_datetime.arrival_date,
            MONTH(dim_datetime.arrival_date) as arrival_month, 
            YEAR(dim_datetime.arrival_date) as arrival_year
        FROM dim_datetime
            """).createOrReplaceTempView('dim_datetime')

In [93]:
spark.sql("""
        SELECT *
        FROM dim_datetime
            """).show(3)

+-------------------+-------------+------------+
|       arrival_date|arrival_month|arrival_year|
+-------------------+-------------+------------+
|2016-04-28 00:00:00|            4|        2016|
|2016-04-28 00:00:00|            4|        2016|
|2016-04-28 00:00:00|            4|        2016|
+-------------------+-------------+------------+
only showing top 3 rows



In [175]:
# i94immi_table as immi94
# worldtempe_table as wt
# i94port_table as port
spark.sql("""
        SELECT 
            i94port_table.i94port_valid_code as port_code,
            i94port_table.i94port_city_name as city_name, 
            i94port_table.i94port_state_code as state
        FROM i94port_table
            """).createOrReplaceTempView('dim_port')

In [92]:
spark.sql("""
        SELECT *
        FROM dim_port
            """).show(3)

+---------+--------------------+-----+
|port_code|           city_name|state|
+---------+--------------------+-----+
|      ALC|               ALCAN|   AK|
|      ANC|           ANCHORAGE|   AK|
|      BAR|BAKER AAF - BAKER...|   AK|
+---------+--------------------+-----+
only showing top 3 rows



In [176]:
# i94immi_table as immi94
# worldtempe_table as wt
# i94port_table as port
# dim_datetime as ddate
spark.sql("""
        SELECT 
            immi94.cicid as immi_cicid,
            ddate.arrival_date as immi_datetime_iso,
            ddate.arrival_month as travel_month,
            ddate.arrival_year as travel_year,
            immi94.i94port as arr_port_code
        FROM i94immi_table as immi94
        JOIN dim_datetime as ddate
            ON ddate.arrival_month = immi94.i94mon
            """).createOrReplaceTempView('dim_immi_traveller')

In [91]:
spark.sql("""
        SELECT *
        FROM dim_immi_traveller
            """).show(3)

+----------+-------------------+------------+-----------+-------------+
|immi_cicid|  immi_datetime_iso|travel_month|travel_year|arr_port_code|
+----------+-------------------+------------+-----------+-------------+
| 5341351.0|2016-04-28 00:00:00|           4|       2016|          DAL|
| 5341351.0|2016-04-28 00:00:00|           4|       2016|          DAL|
| 5341351.0|2016-04-28 00:00:00|           4|       2016|          DAL|
+----------+-------------------+------------+-----------+-------------+
only showing top 3 rows



In [177]:
# i94immi_table as immi94
# worldtempe_table as wt
# i94port_table as port
# dim_datetime as ddate
# dim_port as dport
spark.sql("""
        SELECT 
            d_travel.immi_cicid as immi_cicid,
            d_travel.immi_datetime_iso as immi_datetime_iso,
            d_travel.travel_month as travel_month,
            d_travel.travel_year as travel_year,
            dport.city_name as travel_city,
            d_travel.arr_port_code as arr_port_code
        FROM dim_port as dport
        JOIN dim_immi_traveller as d_travel
            ON dport.port_code = d_travel.arr_port_code
     """).createOrReplaceTempView('dim_immi_traveller')

In [95]:
spark.sql("""
        SELECT *
        FROM dim_immi_traveller
            """).show(3)

+----------+-------------------+------------+-----------+-----------+-------------+
|immi_cicid|  immi_datetime_iso|travel_month|travel_year|travel_city|arr_port_code|
+----------+-------------------+------------+-----------+-----------+-------------+
| 5341351.0|2016-04-28 00:00:00|           4|       2016|     DALLAS|          DAL|
| 5341351.0|2016-04-28 00:00:00|           4|       2016|     DALLAS|          DAL|
| 5341351.0|2016-04-28 00:00:00|           4|       2016|     DALLAS|          DAL|
+----------+-------------------+------------+-----------+-----------+-------------+
only showing top 3 rows



In [178]:
# i94immi_table as immi94
# worldtempe_table as wt
# i94port_table as port
# dim_datetime as ddate
# dim_port as dport
spark.sql("""
        SELECT 
            ddate.arrival_date as collected_datetime,
            ddate.arrival_month as tempe_month,
            wt.averagetemperature as avg_tempe,
            wt.averagetemperatureuncertainty as avg_uncertain_tempe,
            wt.city as city_tempe_collect
        FROM dim_datetime as ddate
        LEFT JOIN worldtempe_table as wt
            ON wt.tempe_month = ddate.arrival_month
     """).createOrReplaceTempView('dim_us_temperature')

In [150]:
spark.sql("""
        SELECT *
        FROM dim_us_temperature
     """).show(5)

+-------------------+-----------+---------+-------------------+------------------+
| collected_datetime|tempe_month|avg_tempe|avg_uncertain_tempe|city_tempe_collect|
+-------------------+-----------+---------+-------------------+------------------+
|2016-04-28 00:00:00|          4|     9.02|               0.28|       WESTMINSTER|
|2016-04-28 00:00:00|          4|    10.73|               0.32|  WEST VALLEY CITY|
|2016-04-28 00:00:00|          4|     7.11|               0.28|  WEST VALLEY CITY|
|2016-04-28 00:00:00|          4|     11.3|                0.4|       WEST JORDAN|
|2016-04-28 00:00:00|          4|     11.2|               0.28|       WEST JORDAN|
+-------------------+-----------+---------+-------------------+------------------+
only showing top 5 rows



### Load to fact_immi_weather

In [179]:
spark.sql("""
        SELECT 
            d_travel.immi_cicid as traveller_cicid,
            d_travel.arr_port_code as arr_port_code,
            d_port.state as arr_state_code,
            d_travel.travel_city as arr_city,
            d_travel.travel_month as arr_month,
            d_travel.travel_year as arr_year
        FROM dim_immi_traveller as d_travel
        JOIN dim_port as d_port
            ON d_port.port_code = d_travel.arr_port_code
     """).createOrReplaceTempView('fact_immi_weather')

In [103]:
spark.sql("""
        SELECT *
        FROM fact_immi_weather
     """).show(3)

+---------------+-------------+--------------+--------+---------+--------+
|traveller_cicid|arr_port_code|arr_state_code|arr_city|arr_month|arr_year|
+---------------+-------------+--------------+--------+---------+--------+
|      5341351.0|          DAL|            TX|  DALLAS|        4|    2016|
|      5341351.0|          DAL|            TX|  DALLAS|        4|    2016|
|      5341351.0|          DAL|            TX|  DALLAS|        4|    2016|
+---------------+-------------+--------------+--------+---------+--------+
only showing top 3 rows



In [180]:
spark.sql("""
        SELECT
            fact.traveller_cicid as traveller_cicid,
            fact.arr_port_code as arr_port_code,
            fact.arr_state_code as arr_state_code,
            fact.arr_city as arr_city,
            fact.arr_month as arr_month,
            fact.arr_year as arr_year
        FROM fact_immi_weather as fact
     """).show(3)

+---------------+-------------+--------------+--------+---------+--------+
|traveller_cicid|arr_port_code|arr_state_code|arr_city|arr_month|arr_year|
+---------------+-------------+--------------+--------+---------+--------+
|      5341351.0|          DAL|            TX|  DALLAS|        4|    2016|
|      5341351.0|          DAL|            TX|  DALLAS|        4|    2016|
|      5341351.0|          DAL|            TX|  DALLAS|        4|    2016|
+---------------+-------------+--------------+--------+---------+--------+
only showing top 3 rows



In [181]:
spark.sql("""
        SELECT 
            fact.traveller_cicid as traveller_cicid,
            fact.arr_port_code as arr_port_code,
            fact.arr_state_code as arr_state_code,
            fact.arr_city as arr_city,
            d_tempe.avg_tempe as avg_tempe,
            d_tempe.avg_uncertain_tempe as avg_uncertain_tempe,
            fact.arr_month as arr_month,
            fact.arr_year as arr_year
        FROM fact_immi_weather as fact
        JOIN dim_us_temperature as d_tempe
            ON d_tempe.city_tempe_collect = fact.arr_city
     """).createOrReplaceTempView('fact_immi_weather')

In [None]:
spark.sql("""
        SELECT *
        FROM fact_immi_weather
     """).show(5)