In [1]:
# Do all imports and installs here - Done
from pyspark.sql import SparkSession
import pyspark.sql.functions as F
from pyspark.sql.functions import col, udf
from pyspark.sql.types import StructType as R, StructField as Fld,\
    DoubleType as Dbl, StringType as Str, IntegerType as Int,\
    TimestampType as Timestamp, DateType as Date, LongType as Long
from pyspark.sql.types import DoubleType
from pyspark.sql.types import DateType
import pandas as pd
import re
import configparser
import os
import shutil
from pathlib import Path
from datetime import datetime

In [2]:
os.environ["JAVA_HOME"] = "/usr/lib/jvm/java-8-openjdk-amd64"
os.environ["PATH"] = "/opt/conda/bin:/opt/spark-2.4.3-bin-hadoop2.7/bin:/opt/conda/bin:/usr/local/sbin:/usr/local/bin:/usr/sbin:/usr/bin:/sbin:/bin:/usr/lib/jvm/java-8-openjdk-amd64/bin"
os.environ["SPARK_HOME"] = "/opt/spark-2.4.3-bin-hadoop2.7"
os.environ["HADOOP_HOME"] = "/opt/spark-2.4.3-bin-hadoop2.7"
config = configparser.ConfigParser()
config.read('etl.cfg')

os.environ["AWS_ACCESS_KEY_ID"] = config['AWS']['AWS_ACCESS_KEY_ID']
os.environ["AWS_SECRET_ACCESS_KEY"] = config['AWS']['AWS_SECRET_ACCESS_KEY']
AWS_ACCESS_KEY_ID = config['AWS']['AWS_ACCESS_KEY_ID']
AWS_SECRET_ACCESS_KEY = config['AWS']['AWS_SECRET_ACCESS_KEY']


spark = SparkSession.builder\
        .config("spark.jars.repositories", "https://repos.spark-packages.org/")\
        .config("spark.jars.packages", "org.apache.hadoop:hadoop-aws:2.7.0,saurfang:spark-sas7bdat:2.0.0-s_2.11")\
        .config("spark.hadoop.fs.s3a.access.key",AWS_ACCESS_KEY_ID)\
        .config("spark.hadoop.fs.s3a.secret.key",AWS_SECRET_ACCESS_KEY)\
        .enableHiveSupport().getOrCreate()

Our expectations with datasets from data modeling fact&dim tables:
- Relationship between amount of travel immigration and weather duration by month of city.
- Relationship between specific visa type used for a specific city immigration.
- Airline statistic traffic to specific city.
- Ranking US immigration volume from other countries

Explain the data quality checks you'll perform to ensure the pipeline ran as expected. These could include:

* Integrity constraints on the relational database (e.g., unique key, data type, etc.).
* Unit tests for the scripts to ensure they are doing the right thing.
* Source/Count checks to ensure completeness.

Run Quality Checks

In [3]:
# List of parquet files of fact & dim tables
parquet_outputs = './ws_parquet_outputs'

dim_i94addr_dir = './ws_parquet_outputs/dim_i94addr.parquet'
dim_i94port_dir = './ws_parquet_outputs/dim_i94port.parquet'
dim_immi_flight_dir = './ws_parquet_outputs/dim_immi_flight.parquet'
dim_immi_travaller_dir = './ws_parquet_outputs/dim_immi_travaller.parquet'
dim_visa_dir = './ws_parquet_outputs/dim_visa.parquet'
dim_visa_dir = './ws_parquet_outputs/dim_visa.parquet'
fact_i94immi_dir = './ws_parquet_outputs/fact_i94immi.parquet'
fact_worldtempe_dir = './ws_parquet_outputs/fact_worldtempe.parquet'

#### fact_i94immi quality check

In [53]:
fact_i94immi_df = spark.read.parquet(fact_i94immi_dir)

In [54]:
fact_i94immi_df.dtypes

[('travel_cicid', 'double'),
 ('from_country_code', 'double'),
 ('immi_country_code', 'double'),
 ('arrival_port_code', 'string'),
 ('immi_arrival_date', 'timestamp'),
 ('arrival_year', 'int'),
 ('arrival_month', 'int'),
 ('airline_mode_code', 'double'),
 ('immi_state_code', 'string'),
 ('departure_date', 'timestamp'),
 ('traveller_age', 'double'),
 ('visatype_by_number', 'double'),
 ('traveller_birth_year', 'double'),
 ('traveller_sex', 'string'),
 ('immi_flight_code', 'string'),
 ('visatype_by_code', 'string'),
 ('visa_type', 'string')]

In [55]:
fact_i94immi_df.show(3)

+------------+-----------------+-----------------+-----------------+-------------------+------------+-------------+-----------------+---------------+-------------------+-------------+------------------+--------------------+-------------+----------------+----------------+---------+
|travel_cicid|from_country_code|immi_country_code|arrival_port_code|  immi_arrival_date|arrival_year|arrival_month|airline_mode_code|immi_state_code|     departure_date|traveller_age|visatype_by_number|traveller_birth_year|traveller_sex|immi_flight_code|visatype_by_code|visa_type|
+------------+-----------------+-----------------+-----------------+-------------------+------------+-------------+-----------------+---------------+-------------------+-------------+------------------+--------------------+-------------+----------------+----------------+---------+
|   5748517.0|            245.0|            438.0|              LOS|2016-04-30 00:00:00|        2016|            4|              1.0|             CA|2016-

In [56]:
fact_i94immi_df.count()

2377896

In [57]:
# Verify 'travel_cicid' as primarykey
fact_i94immi_df.select(['travel_cicid']).distinct().count()

2377896

In [58]:
fact_i94immi_df.createOrReplaceTempView('fact_i94immi')

In [59]:
# Check query select *
spark.sql("""
        SELECT *
        FROM fact_i94immi
    """).show(3)

+------------+-----------------+-----------------+-----------------+-------------------+------------+-------------+-----------------+---------------+-------------------+-------------+------------------+--------------------+-------------+----------------+----------------+---------+
|travel_cicid|from_country_code|immi_country_code|arrival_port_code|  immi_arrival_date|arrival_year|arrival_month|airline_mode_code|immi_state_code|     departure_date|traveller_age|visatype_by_number|traveller_birth_year|traveller_sex|immi_flight_code|visatype_by_code|visa_type|
+------------+-----------------+-----------------+-----------------+-------------------+------------+-------------+-----------------+---------------+-------------------+-------------+------------------+--------------------+-------------+----------------+----------------+---------+
|   5748517.0|            245.0|            438.0|              LOS|2016-04-30 00:00:00|        2016|            4|              1.0|             CA|2016-

In [60]:
# Check query immigration traffic by city
spark.sql("""
        SELECT 
            arrival_port_code as immigration_city,
            immi_state_code as state,
            travel_cicid as count_immigration
        FROM fact_i94immi
        GROUP BY immigration_city, count_immigration, state
    """).show(10)

+----------------+-----+-----------------+
|immigration_city|state|count_immigration|
+----------------+-----+-----------------+
|             LOS|   CA|        5749653.0|
|             CHI|   CA|        5749741.0|
|             NYC|   NJ|        5750147.0|
|             DAL|   IL|        5750527.0|
|             DAL|   TX|        5750563.0|
|             SAI|   MP|        5750824.0|
|             HHW|   HI|        5751186.0|
|             HHW|   HI|        5751326.0|
|             HHW|   HI|        5751380.0|
|             HHW|   HI|        5751467.0|
+----------------+-----+-----------------+
only showing top 10 rows



In [61]:
# Check query amount immigration by a month of year
spark.sql("""
        SELECT 
            arrival_year as year,
            arrival_month as month,
            travel_cicid as count_immigration
        FROM fact_i94immi
        GROUP BY year, month, count_immigration
    """).show(10)

+----+-----+-----------------+
|year|month|count_immigration|
+----+-----+-----------------+
|2016|    4|        5748522.0|
|2016|    4|        5749622.0|
|2016|    4|        5749755.0|
|2016|    4|        5750351.0|
|2016|    4|        5750521.0|
|2016|    4|        5751193.0|
|2016|    4|        5751414.0|
|2016|    4|        5751432.0|
|2016|    4|        5751713.0|
|2016|    4|        5751716.0|
+----+-----+-----------------+
only showing top 10 rows



#### visa_dim quality check

In [4]:
# dim_visa_dir = './ws_parquet_outputs/dim_visa.parquet'
dim_visa_df = spark.read.parquet(dim_visa_dir)

In [5]:
dim_visa_df.count()

17

In [81]:
dim_visa_df.createOrReplaceTempView('dim_visa')

In [83]:
# Create dim table
dim_visa = spark.sql("""
    SELECT
        i94visa as visatype_by_number,
        visatype as visatype_by_code,
        visa_type as visa_category
    FROM dim_visa
    """)

In [84]:
spark.sql("""
    SELECT COUNT(*)
    FROM dim_visa
    """).show()

+--------+
|count(1)|
+--------+
|      17|
+--------+



#### flight_dim quality check

In [33]:
i94immi_cleaned_dataset = "./i94immi_df_clean"
flight_df = spark.read.options(inferSchema="true", delimiter=",", header = "true").csv(i94immi_cleaned_dataset)

In [34]:
flight_df.show(3)

+---------+------+------+-------+-------------------+-----+------+-------+-------+-------------------+------+-------+-----+--------+-------+--------+------+------+-------+--------------+-----+--------+---------+
|    cicid|i94cit|i94res|i94port|       arrival_date|i94yr|i94mon|i94mode|i94addr|     departure_date|i94bir|i94visa|count|dtadfile|biryear| dtaddto|gender|insnum|airline|        admnum|fltno|visatype|visa_type|
+---------+------+------+-------+-------------------+-----+------+-------+-------+-------------------+------+-------+-----+--------+-------+--------+------+------+-------+--------------+-----+--------+---------+
|5748517.0| 245.0| 438.0|    LOS|2016-04-30 00:00:00| 2016|     4|    1.0|     CA|2016-05-08 00:00:00|  40.0|    1.0|  1.0|20160430| 1976.0|10292016|     F|  null|     QF|9.495387003E10|00011|      B1| Business|
|5748518.0| 245.0| 438.0|    LOS|2016-04-30 00:00:00| 2016|     4|    1.0|     NV|2016-05-17 00:00:00|  32.0|    1.0|  1.0|20160430| 1984.0|10292016|   

In [38]:
flight_df.count()

2377896

In [39]:
flight_df = flight_df.select(['fltno', 'airline', 'i94port'])

In [40]:
flight_df.show(3)

+-----+-------+-------+
|fltno|airline|i94port|
+-----+-------+-------+
|00011|     QF|    LOS|
|00007|     VA|    LOS|
|00040|     DL|    LOS|
+-----+-------+-------+
only showing top 3 rows



In [43]:
flight_df = flight_df.dropDuplicates(['fltno'])

In [44]:
flight_df.count()

6122

In [45]:
flight_df.select(['fltno']).distinct().count()

6122

In [46]:
flight_df.show(20)

+-----+-------+-------+
|fltno|airline|i94port|
+-----+-------+-------+
|00332|    YNT|    MIA|
|00456|     LH|    LOS|
|00530|     DL|    ATL|
|00556|     AC|    VCV|
|02053|     UA|    AGA|
|02070|     BA|    MIA|
|02090|     UA|    WAS|
|03788|     CI|    AGA|
|04319|     YX|    MIA|
|09120|    537|    MIA|
|0990C|     Y4|    OAK|
| 1159|     UA|    SFR|
| 1512|     WS|    CLG|
|1857C|     RV|    MON|
| 2069|     UA|    NYC|
| 2294|     AA|    DAL|
|  296|     DL|    ATL|
| 3606|     WR|    TOR|
| 4032|     UA|    OTT|
| 4821|     OO|    VCV|
+-----+-------+-------+
only showing top 20 rows



In [68]:
dim_visa_df.dtypes

[('visatype_by_number', 'double'),
 ('visatype_by_code', 'string'),
 ('visa_category', 'string')]

In [64]:
dim_visa_df.show(3)

+------------------+----------------+-------------+
|visatype_by_number|visatype_by_code|visa_category|
+------------------+----------------+-------------+
|               1.0|              B1|     Business|
|               1.0|              B1|     Business|
|               1.0|              B1|     Business|
+------------------+----------------+-------------+
only showing top 3 rows



In [69]:
dim_visa_df.count()

2377896

In [66]:
# Verify 'travel_cicid' as primarykey
dim_visa_df.select(['visatype_by_code']).distinct().count()

17

In [None]:
dim_visa_df.createOrReplaceTempView('dim_visa')

In [None]:
# Check query select *
spark.sql("""
        SELECT *
        FROM dim_visa
    """).show(3)

In [None]:
# Check query immigration traffic by city
spark.sql("""
        SELECT 
            visa_type as visa_category,
            visatype as visa_code,
            i94visa as visa_category_by_number
        FROM dim_visa
        GROUP BY visa_category, visa_code, visa_category_by_number
    """).show(20)

#### flight_dim quality check

In [33]:
dim_immi_flight_df = spark.read.parquet(dim_immi_flight_dir)

In [None]:
dim_immi_flight_df.dtypes

In [46]:
dim_immi_flight_df.show(20)

+-----+-------+-------+
|fltno|airline|i94port|
+-----+-------+-------+
|00332|    YNT|    MIA|
|00456|     LH|    LOS|
|00530|     DL|    ATL|
|00556|     AC|    VCV|
|02053|     UA|    AGA|
|02070|     BA|    MIA|
|02090|     UA|    WAS|
|03788|     CI|    AGA|
|04319|     YX|    MIA|
|09120|    537|    MIA|
|0990C|     Y4|    OAK|
| 1159|     UA|    SFR|
| 1512|     WS|    CLG|
|1857C|     RV|    MON|
| 2069|     UA|    NYC|
| 2294|     AA|    DAL|
|  296|     DL|    ATL|
| 3606|     WR|    TOR|
| 4032|     UA|    OTT|
| 4821|     OO|    VCV|
+-----+-------+-------+
only showing top 20 rows



In [38]:
dim_immi_flight_df.count()

2377896

In [45]:
dim_immi_flight_df.select(['fltno']).distinct().count()

6122

In [None]:
dim_immi_flight_df.createOrReplaceTempView('dim_immi_flight')

In [None]:
spark.sql("""
        SELECT 
            flight_brand as airline_name,
            flight_number as filight_no,
            airport_city as destination_city
        FROM dim_immi_flight
        GROUP BY destination_city, airline_name, filight_no
    """).show(20)

+------------+----------+----------------+
|airline_name|filight_no|destination_city|
+------------+----------+----------------+
|         663|     00404|             SAJ|
|          DL|     00712|             DET|
|          VS|     00015|             ORL|
|         0FY|     00549|             PIE|
|          G7|      6296|             BOS|
|          ZX|     07290|             TOR|
|          YV|      5682|             DAL|
|          DL|       588|             SEA|
|          AA|     01259|             TOR|
|         *GA|     N12ND|             FTL|
|          UA|        28|             NEW|
|          UA|     03566|             TOR|
|         *GA|     1284C|             SNA|
|          BA|     01546|             NYC|
|          UA|     06078|             TOR|
|         *GA|     N175P|             ORL|
|          AF|     00436|             HOU|
|         *GA|     XAVMX|             LOS|
|          9E|     04118|             SPM|
|         *GA|     N104D|             FTL|
+----------

#### traveller_dim quality check

In [None]:
dim_immi_travaller_df = spark.read.parquet(dim_immi_travaller_dir)

In [38]:
dim_immi_travaller_df.count()

2377896

In [49]:
dim_immi_travaller_df.show(3)

+---------+------+------+-------+-------------------+-------+-------+------+-------+------+--------+
|    cicid|i94cit|i94res|i94port|       arrival_date|i94mode|i94addr|i94bir|biryear|gender|visatype|
+---------+------+------+-------+-------------------+-------+-------+------+-------+------+--------+
|5748517.0| 245.0| 438.0|    LOS|2016-04-30 00:00:00|    1.0|     CA|  40.0| 1976.0|     F|      B1|
|5748518.0| 245.0| 438.0|    LOS|2016-04-30 00:00:00|    1.0|     NV|  32.0| 1984.0|     F|      B1|
|5748519.0| 245.0| 438.0|    LOS|2016-04-30 00:00:00|    1.0|     WA|  29.0| 1987.0|     M|      B1|
+---------+------+------+-------+-------------------+-------+-------+------+-------+------+--------+
only showing top 3 rows



In [None]:
dim_immi_travaller_df.select(['cicid']).distinct().count()

6122

In [44]:
dim_immi_travaller_df.count()

6122

In [None]:
dim_immi_travaller_df.createOrReplaceTempView('dim_immi_traveller')

#### fact_worldtempe quality check

In [None]:
fact_worldtempe_df = spark.read.parquet(fact_worldtempe_dir)

In [None]:
fact_worldtempe_df.dtypes

In [None]:
fact_worldtempe_df.show(20)

In [None]:
fact_worldtempe_df.count()

2377896

In [None]:
fact_worldtempe_df.select(['measure_date']).distinct().count()

In [None]:
spark.sql("""
        SELECT 
            MAX(measure_date) as lastest_time,
            MIN(measure_date) as oldest_time,
        FROM fact_worldtempe
    """).show()

#### dim_i94port quality check

In [None]:
dim_i94port_df = spark.read.parquet(dim_i94port_dir)

In [None]:
dim_i94port_df.dtypes

In [None]:
dim_i94port_df.show(20)

+-----+-------+-------+
|fltno|airline|i94port|
+-----+-------+-------+
|00332|    YNT|    MIA|
|00456|     LH|    LOS|
|00530|     DL|    ATL|
|00556|     AC|    VCV|
|02053|     UA|    AGA|
|02070|     BA|    MIA|
|02090|     UA|    WAS|
|03788|     CI|    AGA|
|04319|     YX|    MIA|
|09120|    537|    MIA|
|0990C|     Y4|    OAK|
| 1159|     UA|    SFR|
| 1512|     WS|    CLG|
|1857C|     RV|    MON|
| 2069|     UA|    NYC|
| 2294|     AA|    DAL|
|  296|     DL|    ATL|
| 3606|     WR|    TOR|
| 4032|     UA|    OTT|
| 4821|     OO|    VCV|
+-----+-------+-------+
only showing top 20 rows



In [None]:
dim_i94port_df.count()

2377896

In [None]:
dim_i94port_df.select(['|I94PORT_valid_code|']).distinct().count()

6122

#### dim_i94addr quality check

In [None]:
dim_i94addr_df = spark.read.parquet(dim_i94addr_dir)

In [None]:
dim_i94addr_df.dtypes

In [None]:
dim_i94addr_df.show(20)

+-----+-------+-------+
|fltno|airline|i94port|
+-----+-------+-------+
|00332|    YNT|    MIA|
|00456|     LH|    LOS|
|00530|     DL|    ATL|
|00556|     AC|    VCV|
|02053|     UA|    AGA|
|02070|     BA|    MIA|
|02090|     UA|    WAS|
|03788|     CI|    AGA|
|04319|     YX|    MIA|
|09120|    537|    MIA|
|0990C|     Y4|    OAK|
| 1159|     UA|    SFR|
| 1512|     WS|    CLG|
|1857C|     RV|    MON|
| 2069|     UA|    NYC|
| 2294|     AA|    DAL|
|  296|     DL|    ATL|
| 3606|     WR|    TOR|
| 4032|     UA|    OTT|
| 4821|     OO|    VCV|
+-----+-------+-------+
only showing top 20 rows



In [None]:
dim_i94addr_df.count()

2377896

In [None]:
dim_i94addr_df.select(['|I94PORT_valid_code|']).distinct().count()

6122

#### Verify query