In [28]:
# Do all imports and installs here - Done
from pyspark.sql import SparkSession
from pyspark.sql.functions import udf
import pandas as pd
import re
import configparser
import os

In [2]:
config = configparser.ConfigParser()
config.read('etl.cfg')

['etl.cfg']

In [3]:
input_data_source = config.get('DIR','INPUT_DIR')
output_processed_data = config.get('DIR','OUTPUT_DIR')

i94immi_dataset = config.get('DATA','I94_IMMI')
worldtempe_dataset = config.get('DATA','WORLD_TEMPE')
citydemo_dataset = config.get('DATA','CITY_DEMOGRAPHIC')
airport_dataset = config.get('DATA','AIR_PORT')
saslabel_dataset = config.get('DATA','SAS_LABEL')

In [4]:
# Create Spark session - Using for droduction only
spark = SparkSession.builder\
            .config("spark.jars.repositories", "https://repos.spark-packages.org/")\
            .config("spark.jars.packages", "saurfang:spark-sas7bdat:2.0.0-s_2.11")\
            .enableHiveSupport()\
            .getOrCreate()

## Transform to dim tables

Read out from staging datasets

In [5]:
worldtempe_df = spark.read.csv("worldtempe_df_clean.csv", header=True)
worldtempe_df.show()

+----------+------------------+-----------------------------+-------+------------+-------------+
|        dt|averagetemperature|averagetemperatureuncertainty|   city|dt_converted|      country|
+----------+------------------+-----------------------------+-------+------------+-------------+
|1960-02-01|             4.995|                        0.325|ABILENE|  1960-02-01|United States|
|1960-03-01| 8.575000000000001|                        0.303|ABILENE|  1960-03-01|United States|
|1960-04-01|            18.452|                        0.282|ABILENE|  1960-04-01|United States|
|1960-05-01|            21.709|          0.28600000000000003|ABILENE|  1960-05-01|United States|
|1960-06-01|            27.714|                        0.387|ABILENE|  1960-06-01|United States|
|1960-07-01|            27.646|                        0.326|ABILENE|  1960-07-01|United States|
|1960-08-01|            27.481|                        0.341|ABILENE|  1960-08-01|United States|
|1960-09-01|            24.413

In [23]:
worldtempe_df.printSchema()

root
 |-- dt: string (nullable = true)
 |-- averagetemperature: string (nullable = true)
 |-- averagetemperatureuncertainty: string (nullable = true)
 |-- city: string (nullable = true)
 |-- dt_converted: string (nullable = true)
 |-- country: string (nullable = true)



In [6]:
worldtempe_df.createOrReplaceTempView('worldtempe_table')

In [7]:
spark.sql("""
    SELECT COUNT(*) as amount_worldtempe_rows
    FROM worldtempe_table
""").show()

+-------------------+
|amount_i94immi_rows|
+-------------------+
|             165508|
+-------------------+



In [8]:
i94immi_df = spark.read.options(inferSchema="true", delimiter=",", header = "true").csv("i94immi_df_clean")
i94immi_df.show()

+---------+------+------+-------------------+------+-------+-------+-------+-------------------+
|    cicid| i94yr|i94mon|       arrival_date|i94res|i94port|arrdate|i94addr|     departure_date|
+---------+------+------+-------------------+------+-------+-------+-------+-------------------+
|5341351.0|2016.0|   4.0|2016-04-28 00:00:00| 575.0|    DAL|20572.0|     NV|2016-05-03 00:00:00|
|5341352.0|2016.0|   4.0|2016-04-28 00:00:00| 575.0|    DAL|20572.0|     NV|2016-05-03 00:00:00|
|5341353.0|2016.0|   4.0|2016-04-28 00:00:00| 575.0|    DAL|20572.0|     NV|2016-05-03 00:00:00|
|5341354.0|2016.0|   4.0|2016-04-28 00:00:00| 575.0|    DAL|20572.0|     NV|2016-05-03 00:00:00|
|5341355.0|2016.0|   4.0|2016-04-28 00:00:00| 575.0|    DAL|20572.0|     NV|2016-05-03 00:00:00|
|5341356.0|2016.0|   4.0|2016-04-28 00:00:00| 575.0|    DAL|20572.0|     NY|2016-05-08 00:00:00|
|5341357.0|2016.0|   4.0|2016-04-28 00:00:00| 575.0|    DAL|20572.0|     TX|2016-05-01 00:00:00|
|5341358.0|2016.0|   4.0|2016-

In [24]:
i94immi_df.printSchema()

root
 |-- cicid: double (nullable = true)
 |-- i94yr: double (nullable = true)
 |-- i94mon: double (nullable = true)
 |-- arrival_date: timestamp (nullable = true)
 |-- i94res: double (nullable = true)
 |-- i94port: string (nullable = true)
 |-- arrdate: double (nullable = true)
 |-- i94addr: string (nullable = true)
 |-- departure_date: timestamp (nullable = true)



In [9]:
i94immi_df.createOrReplaceTempView('i94immi_table')

In [10]:
spark.sql("""
    SELECT COUNT(*) as amount_i94immi_rows
    FROM i94immi_table
""").show()

+-------------------+
|amount_i94immi_rows|
+-------------------+
|            2465314|
+-------------------+



In [11]:
i94port_df = spark.read.options(inferSchema="true", delimiter=",", header = "true").csv("i94port_staging")
i94port_df.show()

+------------------+--------------------+------------------+
|i94port_valid_code|   i94port_city_name|i94port_state_code|
+------------------+--------------------+------------------+
|               ALC|               ALCAN|                AK|
|               ANC|           ANCHORAGE|                AK|
|               BAR|BAKER AAF - BAKER...|                AK|
|               DAC|       DALTONS CACHE|                AK|
|               PIZ|DEW STATION PT LA...|                AK|
|               DTH|        DUTCH HARBOR|                AK|
|               EGL|               EAGLE|                AK|
|               FRB|           FAIRBANKS|                AK|
|               HOM|               HOMER|                AK|
|               HYD|               HYDER|                AK|
|               JUN|              JUNEAU|                AK|
|               5KE|           KETCHIKAN|                AK|
|               KET|           KETCHIKAN|                AK|
|               MOS|MOSE

In [29]:
i94port_df.printSchema()

root
 |-- i94port_valid_code: string (nullable = true)
 |-- i94port_city_name: string (nullable = true)
 |-- i94port_state_code: string (nullable = true)



In [13]:
i94port_df.createOrReplaceTempView('i94port_table')

In [14]:
spark.sql("""
    SELECT COUNT(*) as amount_i94port_rows
    FROM i94port_table
""").show()

+-------------------+
|amount_i94port_rows|
+-------------------+
|                583|
+-------------------+



In [33]:
# i94immi_table as immi94
# worldtempe_table as wt
# i94port_table as port
spark.sql("""
        SELECT 
            immi.i94addr as state_code,
        FROM i94immi_table as immi94
            JOIN i94port_table as port
                ON port.i94port_state_code = immi94.i94addr
            """).show()

ParseException: "\nmismatched input 'as' expecting <EOF>(line 4, pos 27)\n\n== SQL ==\n\n        SELECT \n            immi.i94addr as state_code,\n        FROM i94immi_table as immi94\n---------------------------^^^\n            JOIN i94port_table as port\n                ON port.i94port_state_code = immi94.i94addr\n            \n"

Create tables from staging dataset