In [87]:
# Do all imports and installs here - Done
from pyspark.sql import SparkSession
from pyspark.sql.functions import udf
import pandas as pd
import re
import configparser
import os
from pyspark.sql.types import StructType as R, StructField as Fld,\
    DoubleType as Dbl, StringType as Str, IntegerType as Int,\
    TimestampType as Timestamp, DateType as Date, LongType as Long
from pathlib import Path

In [77]:
# Run on production version
# Build (or reuse) a Hive-enabled Spark session; the spark-sas7bdat package is
# resolved from the spark-packages repository configured below.
spark = (
    SparkSession.builder
    .config("spark.jars.repositories", "https://repos.spark-packages.org/")
    .config("spark.jars.packages", "saurfang:spark-sas7bdat:2.0.0-s_2.11")
    .enableHiveSupport()
    .getOrCreate()
)

### Procedure to extract validation code/value pairs from the SAS Labels Description

In [78]:
def get_validation_code_from_SAS_labels(sas_input_label, sas_file_path='I94_SAS_Labels_Descriptions.SAS'):
    '''
    Extract the validation (code, value) pairs listed under one label of the SAS Labels Description file.

    The text between the first occurrence of `sas_input_label` and the next ';' is scanned
    line by line. Every line that contains exactly one '=' yields one pair; all other lines
    (comments, `value ...` headers, blank lines) are skipped.

    Parameters
    ----------
    sas_input_label : string
        The label name of the validation code dataset. It can be one of
        I94RES (same as I94CIT), I94PORT, I94ADDR, I94MODE, I94VISA.
    sas_file_path : string or path-like, optional
        Location of the SAS Labels Description file. Defaults to
        'I94_SAS_Labels_Descriptions.SAS' in the current working directory.

    Returns
    -------
    validation_code_list : list of tuple(str_valid_code, str_valid_value)
        The code/value pairs of the requested label, in file order, with surrounding
        whitespace and quote characters removed.

    Raises
    ------
    ValueError
        If the label does not occur in the file, or no ';' follows it.
    '''

    # Read input SAS Labels Descriptions
    with open(sas_file_path) as sas_validation_code:
        labels_from_sas = sas_validation_code.read()

    # Locate the section: from the first occurrence of the label up to the next ';'
    section_start = labels_from_sas.find(sas_input_label)
    if section_start == -1:
        raise ValueError(
            "Label %r not found in %s" % (sas_input_label, sas_file_path)
        )
    section_end = labels_from_sas.find(';', section_start)
    if section_end == -1:
        raise ValueError(
            "No ';' terminator found after label %r in %s" % (sas_input_label, sas_file_path)
        )
    sas_labels = labels_from_sas[section_start:section_end]

    # Process line by line, remove separator characters and append each value pair
    validation_code_list = []
    for line in sas_labels.splitlines():
        # Only "code = value" lines carry a pair; anything else is deliberately skipped
        # (explicit check instead of swallowing every exception).
        if line.count('=') != 1:
            continue
        valid_code, valid_value = line.split('=')
        valid_code = valid_code.strip().strip("'").strip('"')
        valid_value = valid_value.strip().strip("'").strip('"').strip()
        validation_code_list.append((valid_code, valid_value))

    return validation_code_list

In [98]:
def rmdir(directory):
    '''
    Recursively delete a directory together with everything inside it.

    Parameters
    ----------
    directory : string or pathlib.Path
        Path to the target directory. The directory and all of its children are deleted.
        If the path does not exist, nothing happens, so the call is safe on a first run
        before any output has been written.
        Syntax note: rmdir(Path("target_path_to_dir")) or rmdir("target_path_to_dir")

    Returns
    -------
    None
    '''
    directory = Path(directory)
    # Nothing to clean up (e.g. first run): do not fail
    if not directory.exists():
        return
    for item in directory.iterdir():
        # Symlinks are removed as links and never followed, so the content of a
        # linked directory outside the target is left untouched.
        if item.is_dir() and not item.is_symlink():
            rmdir(item)
        else:
            item.unlink()
    directory.rmdir()

In [96]:
# Shared schema of the (code, value) validation pairs; it is used for every label
# extracted from the SAS Labels Description. Both columns are strings.
schema = R([Fld(column_name, Str()) for column_name in ("valid_code", "valid_value")])

### Extract validation values from `I94RES` label

In [97]:
# Build the I94RES validation DataFrame from the pairs parsed out of the SAS labels file
i94res_df = spark.createDataFrame(get_validation_code_from_SAS_labels('I94RES'), schema)

In [81]:
# Preview the I94RES validation pairs (show() prints the first 20 rows by default)
i94res_df.show()

+----------+--------------------+
|valid_code|         valid_value|
+----------+--------------------+
|       582|MEXICO Air Sea, a...|
|       236|         AFGHANISTAN|
|       101|             ALBANIA|
|       316|             ALGERIA|
|       102|             ANDORRA|
|       324|              ANGOLA|
|       529|            ANGUILLA|
|       518|     ANTIGUA-BARBUDA|
|       687|           ARGENTINA|
|       151|             ARMENIA|
|       532|               ARUBA|
|       438|           AUSTRALIA|
|       103|             AUSTRIA|
|       152|          AZERBAIJAN|
|       512|             BAHAMAS|
|       298|             BAHRAIN|
|       274|          BANGLADESH|
|       513|            BARBADOS|
|       104|             BELGIUM|
|       581|              BELIZE|
+----------+--------------------+
only showing top 20 rows



In [82]:
# Row count of the I94RES validation DataFrame
i94res_df.count()

289

In [83]:
# Distinct I94RES codes, in order of first appearance
pd.unique(i94res_df.toPandas()['valid_code'])

array(['582', '236', '101', '316', '102', '324', '529', '518', '687',
       '151', '532', '438', '103', '152', '512', '298', '274', '513',
       '104', '581', '386', '509', '153', '242', '688', '717', '164',
       '336', '689', '525', '217', '105', '393', '243', '375', '310',
       '326', '526', '383', '384', '690', '245', '721', '270', '271',
       '691', '317', '385', '467', '575', '165', '584', '218', '140',
       '723', '108', '322', '519', '585', '240', '692', '368', '576',
       '399', '372', '109', '369', '604', '413', '110', '111', '601',
       '411', '387', '338', '758', '154', '112', '339', '143', '113',
       '520', '507', '577', '382', '327', '603', '586', '726', '149',
       '528', '206', '114', '115', '213', '759', '729', '204', '249',
       '250', '116', '251', '117', '388', '514', '209', '253', '201',
       '155', '340', '414', '732', '272', '156', '203', '118', '255',
       '335', '370', '381', '119', '120', '121', '214', '167', '320',
       '345', '273',

In [84]:
# Count the distinct I94RES codes inside Spark instead of collecting the whole DataFrame
# to the driver through pandas; equals count() when every code is unique.
i94res_df.select('valid_code').distinct().count()

289

In [94]:
# Export the I94RES pairs as CSV partitions with a header row.
# mode('overwrite') lets Spark replace a previous export and also works on a fresh run,
# when the output directory does not exist yet.
i94res_df.write.mode('overwrite').options(header='True', delimiter=',').csv("i94res_sas_label_validation")

### Staging cleaned `I94RES` from saved csv partitions

In [None]:
# Load the exported I94RES CSV partitions back into a staging DataFrame
# (first row is the header; column types are inferred by Spark)
i94res_df = (
    spark.read
    .option("inferSchema", "true")
    .option("delimiter", ",")
    .option("header", "true")
    .csv("i94res_sas_label_validation")
)

In [None]:
# Verify the DataFrame reloaded from the CSV partitions
i94res_df.show()

In [None]:
# Register the staging DataFrame as the temporary view 'i94res_table' for SQL queries
i94res_df.createOrReplaceTempView('i94res_table')

In [None]:
# Sanity check: count the rows of the temp view that will be used for staging
spark.sql("""
    SELECT COUNT(*) as amount_i94res_rows
    FROM i94res_table
""").show()

### Extract validation values from `I94PORT` label

In [11]:
# Build the I94PORT validation DataFrame from the pairs parsed out of the SAS labels file
i94ports_df = spark.createDataFrame(get_validation_code_from_SAS_labels('I94PORT'), schema)

In [12]:
# Preview the I94PORT validation pairs (show() prints the first 20 rows by default)
i94ports_df.show()

+----------+--------------------+
|valid_code|         valid_value|
+----------+--------------------+
|       ALC|           ALCAN, AK|
|       ANC|       ANCHORAGE, AK|
|       BAR|BAKER AAF - BAKER...|
|       DAC|   DALTONS CACHE, AK|
|       PIZ|DEW STATION PT LA...|
|       DTH|    DUTCH HARBOR, AK|
|       EGL|           EAGLE, AK|
|       FRB|       FAIRBANKS, AK|
|       HOM|           HOMER, AK|
|       HYD|           HYDER, AK|
|       JUN|          JUNEAU, AK|
|       5KE|       KETCHIKAN, AK|
|       KET|       KETCHIKAN, AK|
|       MOS|MOSES POINT INTER...|
|       NIK|         NIKISKI, AK|
|       NOM|             NOM, AK|
|       PKC|     POKER CREEK, AK|
|       ORI|  PORT LIONS SPB, AK|
|       SKA|         SKAGWAY, AK|
|       SNP| ST. PAUL ISLAND, AK|
+----------+--------------------+
only showing top 20 rows



In [16]:
# Row count of the I94PORT validation DataFrame
i94ports_df.count()

660

In [17]:
# Distinct I94PORT codes, in order of first appearance
pd.unique(i94ports_df.toPandas()['valid_code'])

array(['ALC', 'ANC', 'BAR', 'DAC', 'PIZ', 'DTH', 'EGL', 'FRB', 'HOM',
       'HYD', 'JUN', '5KE', 'KET', 'MOS', 'NIK', 'NOM', 'PKC', 'ORI',
       'SKA', 'SNP', 'TKI', 'WRA', 'HSV', 'MOB', 'LIA', 'ROG', 'DOU',
       'LUK', 'MAP', 'NAC', 'NOG', 'PHO', 'POR', 'SLU', 'SAS', 'TUC',
       'YUI', 'AND', 'BUR', 'CAL', 'CAO', 'FRE', 'ICP', 'LNB', 'LOS',
       'BFL', 'OAK', 'ONT', 'OTM', 'BLT', 'PSP', 'SAC', 'SLS', 'SDP',
       'SFR', 'SNJ', 'SLO', 'SLI', 'SPC', 'SYS', 'SAA', 'STO', 'TEC',
       'TRV', 'APA', 'ASE', 'COS', 'DEN', 'DRO', 'BDL', 'BGC', 'GRT',
       'HAR', 'NWH', 'NWL', 'TST', 'WAS', 'DOV', 'DVD', 'WLL', 'BOC',
       'SRQ', 'CAN', 'DAB', 'FRN', 'FTL', 'FMY', 'FPF', 'HUR', 'GNV',
       'JAC', 'KEY', 'LEE', 'MLB', 'MIA', 'APF', 'OPF', 'ORL', 'PAN',
       'PEN', 'PCF', 'PEV', 'PSJ', 'SFB', 'SGJ', 'SAU', 'FPR', 'SPE',
       'TAM', 'WPB', 'ATL', 'BRU', 'AGS', 'SAV', 'AGA', 'HHW', 'OGG',
       'KOA', 'LIH', 'CID', 'DSM', 'BOI', 'EPI', 'IDA', 'PTL', 'SPI',
       'CHI', 'DPA',

In [18]:
# Count the distinct I94PORT codes inside Spark instead of collecting the whole DataFrame
# to the driver through pandas; equals count() when every code is unique.
i94ports_df.select('valid_code').distinct().count()

660

In [19]:
# Export the I94PORT pairs as CSV partitions with a header row.
# mode('overwrite') lets Spark replace a previous export and also works on a fresh run,
# when the output directory does not exist yet.
i94ports_df.write.mode('overwrite').options(header='True', delimiter=',').csv("i94ports_sas_label_validation")

### Staging cleaned `I94PORT` from saved csv partitions

In [None]:
# Load the exported I94PORT CSV partitions back into a staging DataFrame
# (first row is the header; column types are inferred by Spark)
i94ports_df = (
    spark.read
    .option("inferSchema", "true")
    .option("delimiter", ",")
    .option("header", "true")
    .csv("i94ports_sas_label_validation")
)

In [None]:
# Verify the DataFrame reloaded from the CSV partitions
i94ports_df.show()

In [None]:
# Register the staging DataFrame as the temporary view 'i94ports_table' for SQL queries
i94ports_df.createOrReplaceTempView('i94ports_table')

In [None]:
# Sanity check: count the rows of the temp view that will be used for staging
spark.sql("""
    SELECT COUNT(*) as amount_i94ports_rows
    FROM i94ports_table
""").show()

### Extract validation values from `I94ADDR` label

In [20]:
# Build the I94ADDR validation DataFrame from the pairs parsed out of the SAS labels file
i94addr_df = spark.createDataFrame(get_validation_code_from_SAS_labels('I94ADDR'), schema)

In [21]:
# Preview the I94ADDR validation pairs (show() prints the first 20 rows by default)
i94addr_df.show()

+----------+-----------------+
|valid_code|      valid_value|
+----------+-----------------+
|        AL|          ALABAMA|
|        AK|           ALASKA|
|        AZ|          ARIZONA|
|        AR|         ARKANSAS|
|        CA|       CALIFORNIA|
|        CO|         COLORADO|
|        CT|      CONNECTICUT|
|        DE|         DELAWARE|
|        DC|DIST. OF COLUMBIA|
|        FL|          FLORIDA|
|        GA|          GEORGIA|
|        GU|             GUAM|
|        HI|           HAWAII|
|        ID|            IDAHO|
|        IL|         ILLINOIS|
|        IN|          INDIANA|
|        IA|             IOWA|
|        KS|           KANSAS|
|        KY|         KENTUCKY|
|        LA|        LOUISIANA|
+----------+-----------------+
only showing top 20 rows



In [22]:
# Row count of the I94ADDR validation DataFrame
i94addr_df.count()

55

In [23]:
# Distinct I94ADDR codes, in order of first appearance
pd.unique(i94addr_df.toPandas()['valid_code'])

array(['AL', 'AK', 'AZ', 'AR', 'CA', 'CO', 'CT', 'DE', 'DC', 'FL', 'GA',
       'GU', 'HI', 'ID', 'IL', 'IN', 'IA', 'KS', 'KY', 'LA', 'ME', 'MD',
       'MA', 'MI', 'MN', 'MS', 'MO', 'MT', 'NC', 'ND', 'NE', 'NV', 'NH',
       'NJ', 'NM', 'NY', 'OH', 'OK', 'OR', 'PA', 'PR', 'RI', 'SC', 'SD',
       'TN', 'TX', 'UT', 'VT', 'VI', 'VA', 'WV', 'WA', 'WI', 'WY', '99'],
      dtype=object)

In [24]:
# Count the distinct I94ADDR codes inside Spark instead of collecting the whole DataFrame
# to the driver through pandas; equals count() when every code is unique.
i94addr_df.select('valid_code').distinct().count()

55

In [25]:
# Export the I94ADDR pairs as CSV partitions with a header row.
# mode('overwrite') lets Spark replace a previous export and also works on a fresh run,
# when the output directory does not exist yet.
i94addr_df.write.mode('overwrite').options(header='True', delimiter=',').csv("i94addr_sas_label_validation")

### Staging cleaned `I94ADDR` from saved csv partitions

In [None]:
# Load the exported I94ADDR CSV partitions back into a staging DataFrame
# (first row is the header; column types are inferred by Spark)
i94addr_df = (
    spark.read
    .option("inferSchema", "true")
    .option("delimiter", ",")
    .option("header", "true")
    .csv("i94addr_sas_label_validation")
)

In [None]:
# Verify the DataFrame reloaded from the CSV partitions
i94addr_df.show()

In [None]:
# Register the staging DataFrame as the temporary view 'i94addr_table' for SQL queries
i94addr_df.createOrReplaceTempView('i94addr_table')

In [None]:
# Sanity check: count the rows of the temp view that will be used for staging
spark.sql("""
    SELECT COUNT(*) as amount_i94addr_rows
    FROM i94addr_table
""").show()

### Extract validation values from `I94MODE` label

In [26]:
# Build the I94MODE validation DataFrame from the pairs parsed out of the SAS labels file
i94mode_df = spark.createDataFrame(get_validation_code_from_SAS_labels('I94MODE'), schema)

In [27]:
# Display the I94MODE validation pairs
i94mode_df.show()

+----------+------------+
|valid_code| valid_value|
+----------+------------+
|         1|         Air|
|         2|         Sea|
|         3|        Land|
|         9|Not reported|
+----------+------------+



In [28]:
# Row count of the I94MODE validation DataFrame
i94mode_df.count()

4

In [29]:
# Distinct I94MODE codes, in order of first appearance
pd.unique(i94mode_df.toPandas()['valid_code'])

array(['1', '2', '3', '9'], dtype=object)

In [30]:
# Count the distinct I94MODE codes inside Spark instead of collecting the whole DataFrame
# to the driver through pandas; equals count() when every code is unique.
i94mode_df.select('valid_code').distinct().count()

4

In [31]:
# Export the I94MODE pairs as CSV partitions with a header row.
# mode('overwrite') lets Spark replace a previous export and also works on a fresh run,
# when the output directory does not exist yet.
i94mode_df.write.mode('overwrite').options(header='True', delimiter=',').csv("i94mode_sas_label_validation")

### Staging cleaned `I94MODE` from saved csv partitions

In [None]:
# Load the exported I94MODE CSV partitions back into a staging DataFrame
# (first row is the header; column types are inferred by Spark)
i94mode_df = (
    spark.read
    .option("inferSchema", "true")
    .option("delimiter", ",")
    .option("header", "true")
    .csv("i94mode_sas_label_validation")
)

In [None]:
# Verify the DataFrame reloaded from the CSV partitions
i94mode_df.show()

In [None]:
# Register the staging DataFrame as the temporary view 'i94mode_table' for SQL queries
i94mode_df.createOrReplaceTempView('i94mode_table')

In [None]:
# Sanity check: count the rows of the temp view that will be used for staging
spark.sql("""
    SELECT COUNT(*) as amount_i94mode_rows
    FROM i94mode_table
""").show()

### Extract validation values from `I94VISA` label

In [32]:
# Build the I94VISA validation DataFrame from the pairs parsed out of the SAS labels file
i94visa_df = spark.createDataFrame(get_validation_code_from_SAS_labels('I94VISA'), schema)

In [33]:
# Display the I94VISA validation pairs
i94visa_df.show()

+----------+-----------+
|valid_code|valid_value|
+----------+-----------+
|         1|   Business|
|         2|   Pleasure|
|         3|    Student|
+----------+-----------+



In [34]:
# Row count of the I94VISA validation DataFrame
i94visa_df.count()

3

In [35]:
# Distinct I94VISA codes, in order of first appearance
pd.unique(i94visa_df.toPandas()['valid_code'])

array(['1', '2', '3'], dtype=object)

In [36]:
# Count the distinct I94VISA codes inside Spark instead of collecting the whole DataFrame
# to the driver through pandas; equals count() when every code is unique.
i94visa_df.select('valid_code').distinct().count()

3

In [37]:
# Export the I94VISA pairs as CSV partitions with a header row.
# mode('overwrite') lets Spark replace a previous export and also works on a fresh run,
# when the output directory does not exist yet.
i94visa_df.write.mode('overwrite').options(header='True', delimiter=',').csv("i94visa_sas_label_validation")

### Staging cleaned `I94VISA` from saved csv partitions

In [75]:
# Load the exported I94VISA CSV partitions back into a staging DataFrame
# (first row is the header; column types are inferred by Spark)
i94visa_df = (
    spark.read
    .option("inferSchema", "true")
    .option("delimiter", ",")
    .option("header", "true")
    .csv("i94visa_sas_label_validation")
)

In [72]:
# Verify the DataFrame reloaded from the CSV partitions
i94visa_df.show()

+----------+-----------+
|valid_code|valid_value|
+----------+-----------+
|         1|   Business|
|         2|   Pleasure|
|         3|    Student|
+----------+-----------+



In [73]:
# Register the staging DataFrame as the temporary view 'i94visa_table' for SQL queries
i94visa_df.createOrReplaceTempView('i94visa_table')

In [74]:
# Sanity check: count the rows of the temp view that will be used for staging
spark.sql("""
    SELECT COUNT(*) as amount_i94visa_rows
    FROM i94visa_table
""").show()

+-------------------+
|amount_i94visa_rows|
+-------------------+
|                  3|
+-------------------+

