In [170]:
# Do all imports and installs here - Done
from pyspark.sql import SparkSession
import pyspark.sql.functions as F
from pyspark.sql.functions import udf
import pandas as pd
import re
import configparser
import os
import shutil
from pyspark.sql.types import StructType as R, StructField as Fld,\
    DoubleType as Dbl, StringType as Str, IntegerType as Int,\
    TimestampType as Timestamp, DateType as Date, LongType as Long
from pathlib import Path

In [171]:
# Build (or reuse) a Hive-enabled Spark session configured with the
# spark-sas7bdat package from the spark-packages repository.
spark = (
    SparkSession.builder
    .config("spark.jars.repositories", "https://repos.spark-packages.org/")
    .config("spark.jars.packages", "saurfang:spark-sas7bdat:2.0.0-s_2.11")
    .enableHiveSupport()
    .getOrCreate()
)

### Extract validation code/value pairs from the SAS labels description file

In [172]:
def get_validation_code_from_SAS_labels(sas_input_label,
                                        sas_file_path='I94_SAS_Labels_Descriptions.SAS'):
    '''
    Extract the (code, value) validation pairs of one label from a SAS labels description file.

    The section of a label starts at the first occurrence of the label name in the
    file and ends right before the next ';'. Inside that section every line that
    contains '=' is split on its FIRST '=' into a code and a value; surrounding
    whitespace and quote characters are stripped from both. Lines without '='
    (comments, the SAS "value ..." header, blank lines) are skipped.

    Parameters
    ----------
    sas_input_label : string
        The label name of the validation code dataset, e.g. one of
        I94RES (same as I94CIT), I94PORT, I94ADDR, I94MODE, I94VISA.
    sas_file_path : string, optional
        Path of the SAS labels description file to parse.
        Defaults to 'I94_SAS_Labels_Descriptions.SAS' in the working directory.

    Returns
    -------
    validation_code_list : list of tuple(str_valid_code, str_valid_value)
        The code/value pairs of the requested label, in file order.

    Raises
    ------
    ValueError
        If the label does not occur in the file, or its section is not
        terminated by ';'.
    '''

    # Read the whole SAS labels description file
    with open(sas_file_path) as sas_validation_code:
        labels_from_sas = sas_validation_code.read()

    # Cut out the section that belongs to the requested label
    section_start = labels_from_sas.find(sas_input_label)
    if section_start == -1:
        raise ValueError(
            "Label {!r} not found in {!r}".format(sas_input_label, sas_file_path))
    sas_labels = labels_from_sas[section_start:]

    section_end = sas_labels.find(';')
    if section_end == -1:
        raise ValueError(
            "Section of label {!r} in {!r} is not terminated by ';'".format(
                sas_input_label, sas_file_path))
    sas_labels = sas_labels[:section_end]

    # Process line by line: keep only "code = value" lines and clean both sides
    validation_code_list = []
    for line in sas_labels.splitlines():
        if '=' not in line:
            continue
        # Split on the first '=' only, so a value that itself contains '='
        # is kept instead of being silently dropped.
        valid_code, _, valid_value = line.partition('=')
        valid_code = valid_code.strip().strip("'").strip('"')
        valid_value = valid_value.strip().strip("'").strip('"').strip()
        validation_code_list.append((valid_code, valid_value))

    return validation_code_list

In [173]:
def rmdir(directory):
    '''
    Recursively delete a directory together with everything inside it.

    Regular files and symbolic links are unlinked, real sub-directories are
    removed recursively, and finally the (now empty) directory itself is removed.
    A symbolic link that points to a directory is only unlinked: the link target
    and its contents are left untouched.

    Parameters
    ----------
    directory : string or pathlib.Path
        Path of the directory to delete. This directory and all of its
        children will be deleted.
        Syntax note: rmdir(Path("target_path_to_dir")) or rmdir("target_path_to_dir")

    Returns
    -------
    None
    '''
    directory = Path(directory)
    for item in directory.iterdir():
        # is_dir() follows symlinks; without the is_symlink() guard a link to a
        # directory would have its target emptied and then fail on rmdir().
        if item.is_dir() and not item.is_symlink():
            rmdir(item)
        else:
            item.unlink()
    directory.rmdir()

In [174]:
def extract_staging_sas_label(label):
    '''
    Stage the validation code/value pairs of one SAS label as CSV and load them back.

    The pairs returned by get_validation_code_from_SAS_labels(label) are put into
    a two-column string DataFrame ("<label>_valid_code", "<label>_valid_value"),
    written as CSV with a header to the directory "<label>_sas_label_validation"
    (replacing any previous content), and read back with schema inference.
    The first rows, the row count and the distinct code count are printed.

    Parameters
    ----------
    label:
        a string input of specific label from "I94_SAS_Labels_Descriptions.SAS"

    Syntax note:
        input value in string datatype, need inside a pair of single quotes. Ex: 'I94RES', 'I94PORT'

    Returns
    -------
    Dataframe of input label, as read back from the staged CSV files
    (column types are inferred from the CSV content).
    '''
    valid_code = label + "_valid_code"
    valid_value = label + "_valid_value"
    csv_output = label + "_sas_label_validation"

    schema = R([
        Fld(valid_code, Str()),
        Fld(valid_value, Str())
    ])

    df = spark.createDataFrame(
        data=get_validation_code_from_SAS_labels(label),
        schema=schema
    )

    # The save mode must be set with .mode(); passed through .options() it is not
    # the writer's save mode. With 'overwrite' Spark replaces an existing output
    # directory itself, so no manual rmtree is needed and the first run (when the
    # directory does not exist yet) no longer fails.
    df.write.mode('overwrite').options(header='True', delimiter=',').csv(csv_output)

    # Read the staged files back so the returned DataFrame reflects what was staged
    df = spark.read.options(inferSchema="true", delimiter=",", header="true").csv(csv_output)

    print("Top 20 rows of {} ".format(csv_output))
    df.show()

    print("Count rows of {}: {} ".format(csv_output, df.count()))

    print("Check unique value of {}: {} ".format(csv_output, df.select(valid_code).distinct().count()))

    print("Staging csv files in: {}".format(csv_output))

    return df

### `I94RES` label - Extract and stage validation values from 'I94_SAS_Labels_Descriptions.SAS'

In [175]:
# Parse the I94RES code/value pairs, stage them as CSV and load them back as a DataFrame.
I94RES_df = extract_staging_sas_label('I94RES')

Top 20 rows of I94RES_sas_label_validation 
+-----------------+--------------------+
|I94RES_valid_code|  I94RES_valid_value|
+-----------------+--------------------+
|              527|TURKS AND CAICOS ...|
|              420|              TUVALU|
|              352|              UGANDA|
|              162|             UKRAINE|
|              296|UNITED ARAB EMIRATES|
|              135|      UNITED KINGDOM|
|              695|             URUGUAY|
|              163|          UZBEKISTAN|
|              410|             VANUATU|
|              696|           VENEZUELA|
|              266|             VIETNAM|
|              469|WALLIS AND FUTUNA...|
|              757|WEST INDIES (FRENCH)|
|              333|      WESTERN SAHARA|
|              465|       WESTERN SAMOA|
|              216|               YEMEN|
|              139|          YUGOSLAVIA|
|              301|               ZAIRE|
|              344|              ZAMBIA|
|              315|            ZIMBABWE|
+------------

In [142]:
# Collect the distinct I94RES codes to the driver for visual inspection.
# The column is referenced with its exact name so the lookup does not depend
# on Spark's case-insensitive column resolution.
I94RES_df.select('I94RES_valid_code').distinct().collect()

[Row(i94res_valid_code=471),
 Row(i94res_valid_code=737),
 Row(i94res_valid_code=296),
 Row(i94res_valid_code=300),
 Row(i94res_valid_code=731),
 Row(i94res_valid_code=914),
 Row(i94res_valid_code=501),
 Row(i94res_valid_code=333),
 Row(i94res_valid_code=319),
 Row(i94res_valid_code=190),
 Row(i94res_valid_code=696),
 Row(i94res_valid_code=139),
 Row(i94res_valid_code=944),
 Row(i94res_valid_code=54),
 Row(i94res_valid_code=266),
 Row(i94res_valid_code=163),
 Row(i94res_valid_code=849),
 Row(i94res_valid_code=505),
 Row(i94res_valid_code=695),
 Row(i94res_valid_code=583),
 Row(i94res_valid_code=503),
 Row(i94res_valid_code=739),
 Row(i94res_valid_code=485),
 Row(i94res_valid_code=301),
 Row(i94res_valid_code=757),
 Row(i94res_valid_code=100),
 Row(i94res_valid_code=719),
 Row(i94res_valid_code=791),
 Row(i94res_valid_code=162),
 Row(i94res_valid_code=238),
 Row(i94res_valid_code=720),
 Row(i94res_valid_code=465),
 Row(i94res_valid_code=277),
 Row(i94res_valid_code=400),
 Row(i94res_val

In [143]:
# Add a helper column with the character length of each code, then print the
# schema and the rows for inspection.
# NOTE(review): I94RES_df is rebound here, so the temp view registered from it
# afterwards also contains `length` - confirm that is intended.
I94RES_df = I94RES_df.withColumn("length", F.length("I94RES_valid_code"))
I94RES_df.printSchema()
I94RES_df.show(truncate=False)

root
 |-- I94RES_valid_code: integer (nullable = true)
 |-- I94RES_valid_value: string (nullable = true)
 |-- length: integer (nullable = true)

+-----------------+-------------------------+------+
|I94RES_valid_code|I94RES_valid_value       |length|
+-----------------+-------------------------+------+
|527              |TURKS AND CAICOS ISLANDS |3     |
|420              |TUVALU                   |3     |
|352              |UGANDA                   |3     |
|162              |UKRAINE                  |3     |
|296              |UNITED ARAB EMIRATES     |3     |
|135              |UNITED KINGDOM           |3     |
|695              |URUGUAY                  |3     |
|163              |UZBEKISTAN               |3     |
|410              |VANUATU                  |3     |
|696              |VENEZUELA                |3     |
|266              |VIETNAM                  |3     |
|469              |WALLIS AND FUTUNA ISLANDS|3     |
|757              |WEST INDIES (FRENCH)     |3     |
|333   

In [144]:
# Register the DataFrame as temp view 'i94res_table' so it can be queried with Spark SQL.
I94RES_df.createOrReplaceTempView('i94res_table')

In [145]:
# Verify the temp view that will be used for staging by counting its rows.
spark.sql("""
    SELECT COUNT(*) as amount_i94res_rows
    FROM i94res_table
""").show()

+------------------+
|amount_i94res_rows|
+------------------+
|               289|
+------------------+



### `I94PORT` label - Extract and stage validation values from 'I94_SAS_Labels_Descriptions.SAS'

In [176]:
# Parse the I94PORT code/value pairs, stage them as CSV and load them back as a DataFrame.
I94PORT_df = extract_staging_sas_label('I94PORT')

Top 20 rows of I94PORT_sas_label_validation 
+------------------+--------------------+
|I94PORT_valid_code| I94PORT_valid_value|
+------------------+--------------------+
|               ORO|        OROVILLE, WA|
|               PWB|           PASCO, WA|
|               PIR|   POINT ROBERTS, WA|
|               PNG|    PORT ANGELES, WA|
|               PTO|   PORT TOWNSEND, WA|
|               SEA|         SEATTLE, WA|
|               SPO|         SPOKANE, WA|
|               SUM|           SUMAS, WA|
|               TAC|          TACOMA, WA|
|               PSC|TRI-CITIES - PASC...|
|               VAN|       VANCOUVER, WA|
|               AGM|          ALGOMA, WI|
|               BAY|        BAYFIELD, WI|
|               GRB|       GREEN BAY, WI|
|               MNW|       MANITOWOC, WI|
|               MIL|       MILWAUKEE, WI|
|               MSN|TRUAX FIELD - DAN...|
|               CHS|      CHARLESTON, WV|
|               CLK|      CLARKSBURG, WV|
|               BLF|   MERCER C

In [177]:
# Collect the distinct I94PORT codes to the driver for visual inspection.
I94PORT_df.select('I94PORT_valid_code').distinct().collect()

[Row(I94PORT_valid_code='UIO'),
 Row(I94PORT_valid_code='FOK'),
 Row(I94PORT_valid_code='LND'),
 Row(I94PORT_valid_code='GRB'),
 Row(I94PORT_valid_code='CLG'),
 Row(I94PORT_valid_code='ZZZ'),
 Row(I94PORT_valid_code='NGL'),
 Row(I94PORT_valid_code='AUH'),
 Row(I94PORT_valid_code='NAS'),
 Row(I94PORT_valid_code='RIV'),
 Row(I94PORT_valid_code='NK'),
 Row(I94PORT_valid_code='VMB'),
 Row(I94PORT_valid_code='MSN'),
 Row(I94PORT_valid_code='MAL'),
 Row(I94PORT_valid_code='OTT'),
 Row(I94PORT_valid_code='DRV'),
 Row(I94PORT_valid_code='CLX'),
 Row(I94PORT_valid_code='ARB'),
 Row(I94PORT_valid_code='OSN'),
 Row(I94PORT_valid_code='GRU'),
 Row(I94PORT_valid_code='OLM'),
 Row(I94PORT_valid_code='TAC'),
 Row(I94PORT_valid_code='BUS'),
 Row(I94PORT_valid_code='MTH'),
 Row(I94PORT_valid_code='FOU'),
 Row(I94PORT_valid_code='IAG'),
 Row(I94PORT_valid_code='ADU'),
 Row(I94PORT_valid_code='888'),
 Row(I94PORT_valid_code='FSC'),
 Row(I94PORT_valid_code='PTO'),
 Row(I94PORT_valid_code='VCV'),
 Row(I94P

In [178]:
# Add a helper column with the character length of each code, then print the
# schema and the rows for inspection.
# NOTE(review): I94PORT_df is rebound here, so the temp view registered from it
# afterwards also contains `length` - confirm that is intended.
I94PORT_df = I94PORT_df.withColumn("length", F.length("I94PORT_valid_code"))
I94PORT_df.printSchema()
I94PORT_df.show(truncate=False)

root
 |-- I94PORT_valid_code: string (nullable = true)
 |-- I94PORT_valid_value: string (nullable = true)
 |-- length: integer (nullable = true)

+------------------+-----------------------------+------+
|I94PORT_valid_code|I94PORT_valid_value          |length|
+------------------+-----------------------------+------+
|ORO               |OROVILLE, WA                 |3     |
|PWB               |PASCO, WA                    |3     |
|PIR               |POINT ROBERTS, WA            |3     |
|PNG               |PORT ANGELES, WA             |3     |
|PTO               |PORT TOWNSEND, WA            |3     |
|SEA               |SEATTLE, WA                  |3     |
|SPO               |SPOKANE, WA                  |3     |
|SUM               |SUMAS, WA                    |3     |
|TAC               |TACOMA, WA                   |3     |
|PSC               |TRI-CITIES - PASCO, WA       |3     |
|VAN               |VANCOUVER, WA                |3     |
|AGM               |ALGOMA, WI            

In [179]:
# Register the DataFrame as temp view 'i94ports_table' so it can be queried with Spark SQL.
I94PORT_df.createOrReplaceTempView('i94ports_table')

In [180]:
# Verify the temp view that will be used for staging by counting its rows.
spark.sql("""
    SELECT COUNT(*) as amount_i94ports_rows
    FROM i94ports_table
""").show()

+--------------------+
|amount_i94ports_rows|
+--------------------+
|                 660|
+--------------------+



### `I94ADDR` label - Extract and stage validation values from 'I94_SAS_Labels_Descriptions.SAS'

In [181]:
# Parse the I94ADDR code/value pairs, stage them as CSV and load them back as a DataFrame.
I94ADDR_df = extract_staging_sas_label('I94ADDR')

Top 20 rows of I94ADDR_sas_label_validation 
+------------------+-------------------+
|I94ADDR_valid_code|I94ADDR_valid_value|
+------------------+-------------------+
|                PA|       PENNSYLVANIA|
|                PR|        PUERTO RICO|
|                RI|       RHODE ISLAND|
|                SC|        S. CAROLINA|
|                SD|          S. DAKOTA|
|                TN|          TENNESSEE|
|                TX|              TEXAS|
|                UT|               UTAH|
|                VT|            VERMONT|
|                VI|     VIRGIN ISLANDS|
|                VA|           VIRGINIA|
|                WV|        W. VIRGINIA|
|                WA|         WASHINGTON|
|                WI|          WISCONSON|
|                WY|            WYOMING|
|                99|    All Other Codes|
|                MO|           MISSOURI|
|                MT|            MONTANA|
|                NC|        N. CAROLINA|
|                ND|          N. DAKOTA|
+-----------

In [182]:
# Collect the distinct I94ADDR codes to the driver for visual inspection.
I94ADDR_df.select('I94ADDR_valid_code').distinct().collect()

[Row(I94ADDR_valid_code='SC'),
 Row(I94ADDR_valid_code='99'),
 Row(I94ADDR_valid_code='VA'),
 Row(I94ADDR_valid_code='RI'),
 Row(I94ADDR_valid_code='WY'),
 Row(I94ADDR_valid_code='WI'),
 Row(I94ADDR_valid_code='VT'),
 Row(I94ADDR_valid_code='VI'),
 Row(I94ADDR_valid_code='WA'),
 Row(I94ADDR_valid_code='TN'),
 Row(I94ADDR_valid_code='PA'),
 Row(I94ADDR_valid_code='SD'),
 Row(I94ADDR_valid_code='TX'),
 Row(I94ADDR_valid_code='WV'),
 Row(I94ADDR_valid_code='PR'),
 Row(I94ADDR_valid_code='UT'),
 Row(I94ADDR_valid_code='NJ'),
 Row(I94ADDR_valid_code='OR'),
 Row(I94ADDR_valid_code='NH'),
 Row(I94ADDR_valid_code='NV'),
 Row(I94ADDR_valid_code='NE'),
 Row(I94ADDR_valid_code='MT'),
 Row(I94ADDR_valid_code='NC'),
 Row(I94ADDR_valid_code='MO'),
 Row(I94ADDR_valid_code='ND'),
 Row(I94ADDR_valid_code='OH'),
 Row(I94ADDR_valid_code='NM'),
 Row(I94ADDR_valid_code='NY'),
 Row(I94ADDR_valid_code='OK'),
 Row(I94ADDR_valid_code='AZ'),
 Row(I94ADDR_valid_code='DC'),
 Row(I94ADDR_valid_code='CA'),
 Row(I94

In [183]:
# Add a helper column with the character length of each code, then print the
# schema and the rows for inspection.
# NOTE(review): I94ADDR_df is rebound here, so the temp view registered from it
# afterwards also contains `length` - confirm that is intended.
I94ADDR_df = I94ADDR_df.withColumn("length", F.length("I94ADDR_valid_code"))
I94ADDR_df.printSchema()
I94ADDR_df.show(truncate=False)

root
 |-- I94ADDR_valid_code: string (nullable = true)
 |-- I94ADDR_valid_value: string (nullable = true)
 |-- length: integer (nullable = true)

+------------------+-------------------+------+
|I94ADDR_valid_code|I94ADDR_valid_value|length|
+------------------+-------------------+------+
|PA                |PENNSYLVANIA       |2     |
|PR                |PUERTO RICO        |2     |
|RI                |RHODE ISLAND       |2     |
|SC                |S. CAROLINA        |2     |
|SD                |S. DAKOTA          |2     |
|TN                |TENNESSEE          |2     |
|TX                |TEXAS              |2     |
|UT                |UTAH               |2     |
|VT                |VERMONT            |2     |
|VI                |VIRGIN ISLANDS     |2     |
|VA                |VIRGINIA           |2     |
|WV                |W. VIRGINIA        |2     |
|WA                |WASHINGTON         |2     |
|WI                |WISCONSON          |2     |
|WY                |WYOMING           

In [184]:
# Register the DataFrame as temp view 'i94addr_table' so it can be queried with Spark SQL.
I94ADDR_df.createOrReplaceTempView('i94addr_table')

In [185]:
# Verify the temp view that will be used for staging by counting its rows.
spark.sql("""
        SELECT COUNT(*) as amount_i94addr_rows
        FROM i94addr_table
""").show()

+-------------------+
|amount_i94addr_rows|
+-------------------+
|                 55|
+-------------------+



### `I94MODE` label - Extract and stage validation values from 'I94_SAS_Labels_Descriptions.SAS'

In [186]:
# Parse the I94MODE code/value pairs, stage them as CSV and load them back as a DataFrame.
I94MODE_df = extract_staging_sas_label('I94MODE')

Top 20 rows of I94MODE_sas_label_validation 
+------------------+-------------------+
|I94MODE_valid_code|I94MODE_valid_value|
+------------------+-------------------+
|                 9|       Not reported|
|                 3|               Land|
|                 1|                Air|
|                 2|                Sea|
+------------------+-------------------+

Count rows of I94MODE_sas_label_validation: 4 
Check unique value of I94MODE_sas_label_validation: 4 
Staging csv files in: I94MODE_sas_label_validation


In [187]:
# Collect the distinct I94MODE codes to the driver for visual inspection.
I94MODE_df.select('I94MODE_valid_code').distinct().collect()

[Row(I94MODE_valid_code=9),
 Row(I94MODE_valid_code=3),
 Row(I94MODE_valid_code=1),
 Row(I94MODE_valid_code=2)]

In [188]:
# Add a helper column with the character length of each code, then print the
# schema and the rows for inspection.
# NOTE(review): I94MODE_df is rebound here, so the temp view registered from it
# afterwards also contains `length` - confirm that is intended.
I94MODE_df = I94MODE_df.withColumn("length", F.length("I94MODE_valid_code"))
I94MODE_df.printSchema()
I94MODE_df.show(truncate=False)

root
 |-- I94MODE_valid_code: integer (nullable = true)
 |-- I94MODE_valid_value: string (nullable = true)
 |-- length: integer (nullable = true)

+------------------+-------------------+------+
|I94MODE_valid_code|I94MODE_valid_value|length|
+------------------+-------------------+------+
|9                 |Not reported       |1     |
|3                 |Land               |1     |
|1                 |Air                |1     |
|2                 |Sea                |1     |
+------------------+-------------------+------+



In [189]:
# Register the DataFrame as temp view 'i94mode_table' so it can be queried with Spark SQL.
I94MODE_df.createOrReplaceTempView('i94mode_table')

In [190]:
# Verify the temp view that will be used for staging by counting its rows.
spark.sql("""
    SELECT COUNT(*) as amount_i94mode_rows
    FROM i94mode_table
""").show()

+-------------------+
|amount_i94mode_rows|
+-------------------+
|                  4|
+-------------------+



### `I94VISA` label - Extract and stage validation values from 'I94_SAS_Labels_Descriptions.SAS'

In [191]:
# Parse the I94VISA code/value pairs, stage them as CSV and load them back as a DataFrame.
I94VISA_df = extract_staging_sas_label('I94VISA')

Top 20 rows of I94VISA_sas_label_validation 
+------------------+-------------------+
|I94VISA_valid_code|I94VISA_valid_value|
+------------------+-------------------+
|                 1|           Business|
|                 2|           Pleasure|
|                 3|            Student|
+------------------+-------------------+

Count rows of I94VISA_sas_label_validation: 3 
Check unique value of I94VISA_sas_label_validation: 3 
Staging csv files in: I94VISA_sas_label_validation


In [192]:
# Collect the distinct I94VISA codes to the driver for visual inspection.
I94VISA_df.select('I94VISA_valid_code').distinct().collect()

[Row(I94VISA_valid_code=1),
 Row(I94VISA_valid_code=2),
 Row(I94VISA_valid_code=3)]

In [193]:
# Add a helper column with the character length of each code, then print the
# schema and the rows for inspection.
# NOTE(review): I94VISA_df is rebound here, so the temp view registered from it
# afterwards also contains `length` - confirm that is intended.
I94VISA_df = I94VISA_df.withColumn("length", F.length("I94VISA_valid_code"))
I94VISA_df.printSchema()
I94VISA_df.show(truncate=False)

root
 |-- I94VISA_valid_code: integer (nullable = true)
 |-- I94VISA_valid_value: string (nullable = true)
 |-- length: integer (nullable = true)

+------------------+-------------------+------+
|I94VISA_valid_code|I94VISA_valid_value|length|
+------------------+-------------------+------+
|1                 |Business           |1     |
|2                 |Pleasure           |1     |
|3                 |Student            |1     |
+------------------+-------------------+------+



In [194]:
# Register the DataFrame as temp view 'i94visa_table' so it can be queried with Spark SQL.
I94VISA_df.createOrReplaceTempView('i94visa_table')

In [195]:
# Verify the temp view that will be used for staging by counting its rows.
spark.sql("""
    SELECT COUNT(*) as amount_i94visa_rows
    FROM i94visa_table
""").show()

+-------------------+
|amount_i94visa_rows|
+-------------------+
|                  3|
+-------------------+

