In [2]:
# Do all imports and installs here - Done
from pyspark.sql import SparkSession
import pyspark.sql.functions as F
from pyspark.sql.functions import udf
import pandas as pd
import re
import configparser
import os
import shutil
from pyspark.sql.types import StructType as R, StructField as Fld,\
    DoubleType as Dbl, StringType as Str, IntegerType as Int,\
    TimestampType as Timestamp, DateType as Date, LongType as Long
from pathlib import Path

In [3]:
# Run on production version
# Build (or reuse) the Spark session: registers the spark-packages repository,
# pulls in the spark-sas7bdat package and enables Hive support.
spark = (
    SparkSession.builder
    .config("spark.jars.repositories", "https://repos.spark-packages.org/")
    .config("spark.jars.packages", "saurfang:spark-sas7bdat:2.0.0-s_2.11")
    .enableHiveSupport()
    .getOrCreate()
)

### Extract validation code/value pairs from the SAS labels description file

In [4]:
def get_validation_code_from_SAS_labels(sas_input_label, sas_file_path='I94_SAS_Labels_Descriptions.SAS'):
    '''
    Extract the (code, value) validation pairs of one label from the SAS labels description file.

    The section for a label starts at the first occurrence of ``sas_input_label``
    in the file and ends at the next ``;``. Inside that section every line of the
    form ``code = value`` yields one pair; lines without an ``=`` (comment text,
    the ``value ...`` header, blank lines) are skipped.

    Parameters
    ----------
    sas_input_label : string
        The label name of the validation code dataset. It can be one of
        I94RES (same as I94CIT), I94PORT, I94ADDR, I94MODE, I94VISA.
    sas_file_path : string, optional
        Path of the SAS labels description file. Defaults to
        'I94_SAS_Labels_Descriptions.SAS' in the working directory.

    Returns
    -------
    validation_code_list : list of tuple(str_valid_code, str_valid_value)
        The code/value pairs of the requested label, in file order, with
        surrounding whitespace and quotes removed.

    Raises
    ------
    ValueError
        If the label does not occur in the file, or its section has no
        terminating ';'.
    '''

    # Read input SAS Labels Descriptions
    with open(sas_file_path) as sas_validation_code:
        labels_from_sas = sas_validation_code.read()

    # Locate the label's section: from the label name up to the next ';'
    section_start = labels_from_sas.find(sas_input_label)
    if section_start == -1:
        raise ValueError(
            "Label {!r} not found in {!r}".format(sas_input_label, sas_file_path)
        )
    sas_labels = labels_from_sas[section_start:]

    section_end = sas_labels.find(';')
    if section_end == -1:
        raise ValueError(
            "No ';' terminating the {!r} section in {!r}".format(sas_input_label, sas_file_path)
        )
    sas_labels = sas_labels[:section_end]

    # Process line by line, removing separator characters, and collect the pairs
    validation_code_list = []
    for line in sas_labels.splitlines():
        # Split on the FIRST '=' only, so a value that itself contains '=' is
        # kept instead of being silently dropped.
        raw_code, separator, raw_value = line.partition('=')
        if not separator:
            # Not a "code = value" line
            continue
        valid_code = raw_code.strip().strip("'").strip('"').strip()
        valid_value = raw_value.strip().strip("'").strip('"').strip()
        validation_code_list.append((valid_code, valid_value))

    return validation_code_list

In [5]:
def extract_staging_sas_label(label):
    '''
    Stage the validation pairs of one SAS label as CSV and return them as a Spark DataFrame.

    The pairs returned by get_validation_code_from_SAS_labels(label) are loaded
    into a two-column string DataFrame, written as headered CSV to the directory
    "<label>_sas_label_validation" (replacing any previous output), read back
    with schema inference, and summarised on stdout.

    Parameters
    ----------
    label:
        a string input of specific label from "SAS_Label_Descriptions.SAS"

    Syntax note:
        input value in string datatype, need inside a pair of single quotes. Ex: 'I94RES', 'I94PORTS'

    Returns
    -------
    Spark DataFrame read back from the staged CSV, with columns
    "<label>_valid_code" and "<label>_valid_value".
    '''
    valid_code = label + "_valid_code"
    valid_value = label + "_valid_value"
    csv_output = label + "_sas_label_validation"

    schema = R([
        Fld(valid_code, Str()),
        Fld(valid_value, Str())
    ])

    df = spark.createDataFrame(
        data=get_validation_code_from_SAS_labels(label),
        schema=schema
    )

    # Overwrite mode replaces earlier output and also works on the very first
    # run, when the output directory does not exist yet (an unconditional
    # shutil.rmtree would raise FileNotFoundError in that case).
    df.write.mode('overwrite').options(header='True', delimiter=',').csv(csv_output)

    # Read the staged files back so the returned frame reflects what is on disk
    df = spark.read.options(inferSchema="true", delimiter=",", header = "true").csv(csv_output)

    print("Top 20 rows of {} ".format(csv_output))
    df.show()

    print("Count rows of {}: {} ".format(csv_output, df.count()))

    # Distinct codes vs. total rows reveals duplicated codes
    print("Check unique value of {}: {} ".format(csv_output, df.select(valid_code).distinct().count()))

    print("Staging csv files in: {}".format(csv_output))

    return df

In [6]:
def convert_column_names(df):
    '''
    Rename the columns of a pandas DataFrame to snake case, e.g. customer_name, billing_address, total_price.

    Each column name is stripped of leading/trailing whitespace, lower-cased,
    and has every space and hyphen replaced by an underscore. Other characters
    are left unchanged, and runs of spaces become runs of underscores.

    Parameters
    ----------
    df : pandas.DataFrame
        The input dataframe. Its column names must be strings. The frame is
        modified in place.

    Returns
    -------
    The same DataFrame object, with its column names changed to snake_case
    format. Callers that relied on the in-place change can ignore the return value.
    '''
    df.columns = [
        col.strip().lower().replace(" ", "_").replace("-", "_")
        for col in df.columns
    ]
    return df

In [7]:
def rmdir(directory):
    '''
    Recursively delete a directory and everything inside it.

    Parameters
    ----------
    directory : str or pathlib.Path
        Path to the target directory. The directory and all of its children
        will be deleted. If the path does not exist, nothing happens.
        Syntax note: rmdir(Path("target_path_to_dir")) or rmdir("target_path_to_dir")

    Returns
    -------
    None
    '''
    directory = Path(directory)
    if not directory.exists():
        # Nothing to clean up, e.g. first run before any output was written
        return
    for item in directory.iterdir():
        # Only descend into real directories; a symlink to a directory is
        # unlinked itself so the link target's contents are left alone.
        if item.is_dir() and not item.is_symlink():
            rmdir(item)
        else:
            item.unlink()
    directory.rmdir()

### Extract and clean validation values from 'I94_SAS_Labels_Descriptions.SAS'

In [8]:
# `I94PORT` label: stage the label's code/value pairs with Spark, then pull
# them into pandas for the cleaning steps below
I94PORT_staging_df = extract_staging_sas_label('I94PORT').toPandas()
I94PORT_staging_df

Top 20 rows of I94PORT_sas_label_validation 
+------------------+--------------------+
|I94PORT_valid_code| I94PORT_valid_value|
+------------------+--------------------+
|               ORO|        OROVILLE, WA|
|               PWB|           PASCO, WA|
|               PIR|   POINT ROBERTS, WA|
|               PNG|    PORT ANGELES, WA|
|               PTO|   PORT TOWNSEND, WA|
|               SEA|         SEATTLE, WA|
|               SPO|         SPOKANE, WA|
|               SUM|           SUMAS, WA|
|               TAC|          TACOMA, WA|
|               PSC|TRI-CITIES - PASC...|
|               VAN|       VANCOUVER, WA|
|               AGM|          ALGOMA, WI|
|               BAY|        BAYFIELD, WI|
|               GRB|       GREEN BAY, WI|
|               MNW|       MANITOWOC, WI|
|               MIL|       MILWAUKEE, WI|
|               MSN|TRUAX FIELD - DAN...|
|               CHS|      CHARLESTON, WV|
|               CLK|      CLARKSBURG, WV|
|               BLF|   MERCER C

Unnamed: 0,I94PORT_valid_code,I94PORT_valid_value
0,ORO,"OROVILLE, WA"
1,PWB,"PASCO, WA"
2,PIR,"POINT ROBERTS, WA"
3,PNG,"PORT ANGELES, WA"
4,PTO,"PORT TOWNSEND, WA"
...,...,...
655,MRR,"MORRISTOWN, NY"
656,NYC,"NEW YORK, NY"
657,NIA,"NIAGARA FALLS, NY"
658,OGD,"OGDENSBURG, NY"


### Clean validation values from 'I94_SAS_Labels_Descriptions.SAS'

Split each value into ***I94PORT_code, City, State*** for ***dim_i94port***

In [9]:
# Trim leading and trailing whitespace from the code and value columns
for column_name in ("I94PORT_valid_code", "I94PORT_valid_value"):
    I94PORT_staging_df[column_name] = I94PORT_staging_df[column_name].str.strip()

In [10]:
# Split the value on commas once: the first piece is the city name, the second the state code.
# NOTE(review): a value with more than one comma puts its second piece (not its last) in the state code.
i94port_value_parts = I94PORT_staging_df["I94PORT_valid_value"].str.split(",")
I94PORT_staging_df["I94PORT_city_name"] = i94port_value_parts.str.get(0)
I94PORT_staging_df["I94PORT_state_code"] = i94port_value_parts.str.get(1)

In [11]:
# Trim leading and trailing whitespace left over from the comma split
for column_name in ("I94PORT_city_name", "I94PORT_state_code"):
    I94PORT_staging_df[column_name] = I94PORT_staging_df[column_name].str.strip()

In [12]:
# Inspect the frame now that the city and state columns have been added
I94PORT_staging_df

Unnamed: 0,I94PORT_valid_code,I94PORT_valid_value,I94PORT_city_name,I94PORT_state_code
0,ORO,"OROVILLE, WA",OROVILLE,WA
1,PWB,"PASCO, WA",PASCO,WA
2,PIR,"POINT ROBERTS, WA",POINT ROBERTS,WA
3,PNG,"PORT ANGELES, WA",PORT ANGELES,WA
4,PTO,"PORT TOWNSEND, WA",PORT TOWNSEND,WA
...,...,...,...,...
655,MRR,"MORRISTOWN, NY",MORRISTOWN,NY
656,NYC,"NEW YORK, NY",NEW YORK,NY
657,NIA,"NIAGARA FALLS, NY",NIAGARA FALLS,NY
658,OGD,"OGDENSBURG, NY",OGDENSBURG,NY


In [13]:
# Count how many port codes share each (title-cased) city name
I94PORT_staging_df["I94PORT_city_name"].str.title().value_counts()

Newport                     3
Yuma                        2
Lake Charles                2
Bellingham                  2
Wilmington                  2
                           ..
Tri City Arpt               1
Addison Airport- Addison    1
Amistad Dam                 1
Anzalduas                   1
Oswego                      1
Name: I94PORT_city_name, Length: 634, dtype: int64

In [14]:
# Drop rows with a missing I94PORT_state_code, e.g. values where the comma
# split above produced no second piece
I94PORT_staging_df = I94PORT_staging_df.dropna(subset = ["I94PORT_state_code"])
I94PORT_staging_df.head()

Unnamed: 0,I94PORT_valid_code,I94PORT_valid_value,I94PORT_city_name,I94PORT_state_code
0,ORO,"OROVILLE, WA",OROVILLE,WA
1,PWB,"PASCO, WA",PASCO,WA
2,PIR,"POINT ROBERTS, WA",POINT ROBERTS,WA
3,PNG,"PORT ANGELES, WA",PORT ANGELES,WA
4,PTO,"PORT TOWNSEND, WA",PORT TOWNSEND,WA


In [15]:
# Rename the columns to snake_case. convert_column_names assigns df.columns,
# so the frame is changed in place and no re-assignment is needed here.
convert_column_names(I94PORT_staging_df)
I94PORT_staging_df.columns

Index(['i94port_valid_code', 'i94port_valid_value', 'i94port_city_name',
       'i94port_state_code'],
      dtype='object')

In [18]:
# Keep only the code, city and state columns; the combined valid_value column
# is no longer needed once it has been split
select_cols = ['i94port_valid_code', 'i94port_city_name', 'i94port_state_code']
I94PORT_staging_df = I94PORT_staging_df[select_cols]
I94PORT_staging_df.head()

Unnamed: 0,i94port_valid_code,i94port_city_name,i94port_state_code
0,ORO,OROVILLE,WA
1,PWB,PASCO,WA
2,PIR,POINT ROBERTS,WA
3,PNG,PORT ANGELES,WA
4,PTO,PORT TOWNSEND,WA


In [None]:
# Convert the cleaned pandas frame to a Spark DataFrame. The name is rebound to
# the Spark frame, so only convert while it still refers to a pandas frame;
# this keeps the cell safe to re-run.
if isinstance(I94PORT_staging_df, pd.DataFrame):
    I94PORT_staging_df = spark.createDataFrame(I94PORT_staging_df)

In [21]:
# Check the schema Spark derived from the pandas frame and preview the first rows
I94PORT_staging_df.printSchema()
I94PORT_staging_df.show()

root
 |-- i94port_valid_code: string (nullable = true)
 |-- i94port_city_name: string (nullable = true)
 |-- i94port_state_code: string (nullable = true)

+------------------+--------------------+------------------+
|i94port_valid_code|   i94port_city_name|i94port_state_code|
+------------------+--------------------+------------------+
|               ORO|            OROVILLE|                WA|
|               PWB|               PASCO|                WA|
|               PIR|       POINT ROBERTS|                WA|
|               PNG|        PORT ANGELES|                WA|
|               PTO|       PORT TOWNSEND|                WA|
|               SEA|             SEATTLE|                WA|
|               SPO|             SPOKANE|                WA|
|               SUM|               SUMAS|                WA|
|               TAC|              TACOMA|                WA|
|               PSC|  TRI-CITIES - PASCO|                WA|
|               VAN|           VANCOUVER|           

### Stage validation values from 'I94_SAS_Labels_Descriptions.SAS'

In [None]:
# Remove any previous staging output. On a fresh run the directory does not
# exist yet, so only call rmdir when there is something to delete.
i94port_staging_dir = Path("i94port_staging")
if i94port_staging_dir.exists():
    rmdir(i94port_staging_dir)

In [23]:
# Write the staged port dimension as headered CSV. Overwrite mode lets the cell
# be re-run without failing because the output path already exists.
I94PORT_staging_df.write.mode('overwrite').options(header='True', delimiter=',').csv("i94port_staging")

In [55]:
# Staging `I94PORT` label
# I94PORT_staging_df.to_csv("i94port_staging.csv", index = False, header=True)

In [24]:
# Read the staged CSV directory back into Spark, using the header row for
# column names and letting Spark infer the column types
i94port_df = (
    spark.read
    .options(inferSchema="true", delimiter=",", header="true")
    .csv("i94port_staging")
)

In [25]:
# Preview the rows read back from the staging directory
i94port_df.show()

+------------------+--------------------+------------------+
|i94port_valid_code|   i94port_city_name|i94port_state_code|
+------------------+--------------------+------------------+
|               ORO|            OROVILLE|                WA|
|               PWB|               PASCO|                WA|
|               PIR|       POINT ROBERTS|                WA|
|               PNG|        PORT ANGELES|                WA|
|               PTO|       PORT TOWNSEND|                WA|
|               SEA|             SEATTLE|                WA|
|               SPO|             SPOKANE|                WA|
|               SUM|               SUMAS|                WA|
|               TAC|              TACOMA|                WA|
|               PSC|  TRI-CITIES - PASCO|                WA|
|               VAN|           VANCOUVER|                WA|
|               AGM|              ALGOMA|                WI|
|               BAY|            BAYFIELD|                WI|
|               GRB|    

In [26]:
# Register the dataframe as a temporary view named 'i94port_table' so it can be
# queried with spark.sql (replaces an existing view of the same name)
i94port_df.createOrReplaceTempView('i94port_table')

In [27]:
# Row count of the staged port view
i94port_count_query = """
    SELECT COUNT(*)
    FROM i94port_table
"""
spark.sql(i94port_count_query).show()

+--------+
|count(1)|
+--------+
|     583|
+--------+

