In [None]:
# Do all imports and installs here - Done
from pyspark.sql import SparkSession
import pyspark.sql.functions as F
from pyspark.sql.functions import col, udf
from pyspark.sql.types import StructType as R, StructField as Fld,\
    DoubleType as Dbl, StringType as Str, IntegerType as Int,\
    TimestampType as Timestamp, DateType as Date, LongType as Long
from pyspark.sql.types import DoubleType
from pyspark.sql.types import DateType
import pandas as pd
import re
import configparser
import os
import shutil
from pathlib import Path
from datetime import datetime

In [None]:
# Environment for the local Java / Spark / Hadoop install; these are set
# ahead of building the SparkSession further down in this cell.
os.environ["JAVA_HOME"] = "/usr/lib/jvm/java-8-openjdk-amd64"
os.environ["PATH"] = "/opt/conda/bin:/opt/spark-2.4.3-bin-hadoop2.7/bin:/opt/conda/bin:/usr/local/sbin:/usr/local/bin:/usr/sbin:/usr/bin:/sbin:/bin:/usr/lib/jvm/java-8-openjdk-amd64/bin"
os.environ["SPARK_HOME"] = "/opt/spark-2.4.3-bin-hadoop2.7"
os.environ["HADOOP_HOME"] = "/opt/spark-2.4.3-bin-hadoop2.7"

# AWS credentials come from 'etl.cfg' (resolved against the current working
# directory). ConfigParser.read() silently skips files it cannot open and
# returns the list of files it did parse, so check that list and fail with a
# clear message instead of a bare KeyError on the 'AWS' section below.
config = configparser.ConfigParser()
if not config.read('etl.cfg'):
    raise FileNotFoundError(
        "Could not read 'etl.cfg'; it must provide an [AWS] section with "
        "AWS_ACCESS_KEY_ID and AWS_SECRET_ACCESS_KEY."
    )

# Read each credential once, then publish it both as a module-level name and
# as an environment variable.
AWS_ACCESS_KEY_ID = config['AWS']['AWS_ACCESS_KEY_ID']
AWS_SECRET_ACCESS_KEY = config['AWS']['AWS_SECRET_ACCESS_KEY']
os.environ["AWS_ACCESS_KEY_ID"] = AWS_ACCESS_KEY_ID
os.environ["AWS_SECRET_ACCESS_KEY"] = AWS_SECRET_ACCESS_KEY


# Build (or reuse) the Spark session with the hadoop-aws and spark-sas7bdat
# packages, the S3A credentials loaded above, and Hive support enabled.
spark = SparkSession.builder\
        .config("spark.jars.repositories", "https://repos.spark-packages.org/")\
        .config("spark.jars.packages", "org.apache.hadoop:hadoop-aws:2.7.0,saurfang:spark-sas7bdat:2.0.0-s_2.11")\
        .config("spark.hadoop.fs.s3a.access.key",AWS_ACCESS_KEY_ID)\
        .config("spark.hadoop.fs.s3a.secret.key",AWS_SECRET_ACCESS_KEY)\
        .enableHiveSupport().getOrCreate()

Questions we expect to answer with the fact and dimension tables of this data model:
- The relationship between the volume of immigration travel to a city and that city's weather by month.
- The relationship between visa type and immigration to a specific city.
- Airline traffic statistics for a specific city.
- A ranking of US immigration volume by country of origin.

Explain the data quality checks you'll perform to ensure the pipeline ran as expected. These could include:

* Integrity constraints on the relational database (e.g., unique key, data type, etc.).
* Unit tests for the scripts to ensure they are doing the right thing.
* Source/Count checks to ensure completeness.

Run Quality Checks

In [1]:
# Locations of the parquet outputs for every fact and dimension table.
# All paths are built from the single base directory below.
parquet_outputs = './ws_parquet_outputs'

# Fact tables
fact_i94immi_parquet_outputs = '{}/fact_i94immi.parquet'.format(parquet_outputs)
fact_worldtempe_parquet_outputs = '{}/fact_worldtempe.parquet'.format(parquet_outputs)

# Dimension tables
dim_visa_parquet_outputs = '{}/dim_visa.parquet'.format(parquet_outputs)
dim_immi_flight_parquet_outputs = '{}/dim_immi_flight.parquet'.format(parquet_outputs)
dim_immi_travaller_parquet_outputs = '{}/dim_immi_travaller.parquet'.format(parquet_outputs)
dim_i94port_parquet_outputs = '{}/dim_i94port.parquet'.format(parquet_outputs)
dim_i94addr_parquet_outputs = '{}/dim_i94addr.parquet'.format(parquet_outputs)

#### View table structures

In [None]:
# Read the fact_i94immi parquet output into a DataFrame and register it as
# the temp view 'fact_i94immi' so it can be queried through Spark SQL.
fact_i94immi_parquet_outputs_df = spark.read.parquet(fact_i94immi_parquet_outputs)
fact_i94immi_parquet_outputs_df.createOrReplaceTempView('fact_i94immi')

# Query the view and print its first 3 rows.
fact_i94immi_preview = spark.sql("""
        SELECT *
        FROM fact_i94immi
    """)
fact_i94immi_preview.show(3)

In [None]:
# Read the dim_visa parquet output into a DataFrame and register it as
# the temp view 'dim_visa' so it can be queried through Spark SQL.
dim_visa_parquet_outputs_df = spark.read.parquet(dim_visa_parquet_outputs)
dim_visa_parquet_outputs_df.createOrReplaceTempView('dim_visa')

# Query the view and print its first 3 rows.
dim_visa_preview = spark.sql("""
        SELECT *
        FROM dim_visa
    """)
dim_visa_preview.show(3)

In [None]:
# Read the dim_immi_flight parquet output into a DataFrame and register it as
# the temp view 'dim_immi_flight' so it can be queried through Spark SQL.
dim_immi_flight_parquet_outputs_df = spark.read.parquet(dim_immi_flight_parquet_outputs)
dim_immi_flight_parquet_outputs_df.createOrReplaceTempView('dim_immi_flight')

# Query the view and print its first 3 rows.
dim_immi_flight_preview = spark.sql("""
        SELECT *
        FROM dim_immi_flight
    """)
dim_immi_flight_preview.show(3)

In [None]:
# Read the dim_immi_travaller parquet output into a DataFrame and register it
# as the temp view 'dim_immi_travaller' so it can be queried through Spark SQL.
dim_immi_travaller_parquet_outputs_df = spark.read.parquet(dim_immi_travaller_parquet_outputs)
dim_immi_travaller_parquet_outputs_df.createOrReplaceTempView('dim_immi_travaller')

# Query the view and print its first 3 rows.
dim_immi_travaller_preview = spark.sql("""
        SELECT *
        FROM dim_immi_travaller
    """)
dim_immi_travaller_preview.show(3)

In [None]:
# Read the fact_worldtempe parquet output into a DataFrame and register it as
# the temp view 'fact_worldtempe' so it can be queried through Spark SQL.
fact_worldtempe_parquet_outputs_df = spark.read.parquet(fact_worldtempe_parquet_outputs)
fact_worldtempe_parquet_outputs_df.createOrReplaceTempView('fact_worldtempe')

# Query the view and print its first 3 rows.
fact_worldtempe_preview = spark.sql("""
        SELECT *
        FROM fact_worldtempe
    """)
fact_worldtempe_preview.show(3)

In [None]:
# Read the dim_i94port parquet output into a DataFrame and register it as
# the temp view 'dim_i94port' so it can be queried through Spark SQL.
dim_i94port_parquet_outputs_df = spark.read.parquet(dim_i94port_parquet_outputs)
dim_i94port_parquet_outputs_df.createOrReplaceTempView('dim_i94port')

# Query the view and print its first 3 rows.
dim_i94port_preview = spark.sql("""
        SELECT *
        FROM dim_i94port
    """)
dim_i94port_preview.show(3)

In [None]:
# Read the dim_i94addr parquet output into a DataFrame and register it as
# the temp view 'dim_i94addr' so it can be queried through Spark SQL.
dim_i94addr_parquet_outputs_df = spark.read.parquet(dim_i94addr_parquet_outputs)
dim_i94addr_parquet_outputs_df.createOrReplaceTempView('dim_i94addr')

# Query the view and print its first 3 rows.
dim_i94addr_preview = spark.sql("""
        SELECT *
        FROM dim_i94addr
    """)
dim_i94addr_preview.show(3)

#### Verify primary key

In [None]:
# dim_visa: total number of rows in the dimension table.
# spark.sql() returns a DataFrame, not a number, so take the single value out
# of the one-row result. Without this, comparing total_column against an
# integer distinct count is never equal and the check always reports failure.
total_column = spark.sql("""
        SELECT COUNT(*)
        FROM dim_visa
    """).first()[0]

In [None]:
# Count the distinct ('airline', 'fltno') combinations in the dim_visa DataFrame.
# NOTE(review): 'airline' and 'fltno' read like flight attributes - confirm that
# dim_visa actually carries these columns and that they are its intended key.
distinct_column = (
    dim_visa_parquet_outputs_df
    .select('airline', 'fltno')
    .distinct()
    .count()
)

In [None]:
# Report the outcome of the key check: the table passes only when the total
# row count equals the distinct-key count (both set by earlier cells).
if total_column != distinct_column:
    print("Dim table not consistence!!! Please check again.!!!")
else:
    print("Total rows ({}) = unique rows ({})".format(total_column, distinct_column))
    print("Dim table ok")

#### Verify query