In [None]:
# Do all imports and installs here - Done
from pyspark.sql import SparkSession
import pyspark.sql.functions as F
from pyspark.sql.functions import udf
import pandas as pd
import re
import configparser
import os
import shutil
from pyspark.sql.types import StructType as R, StructField as Fld,\
    DoubleType as Dbl, StringType as Str, IntegerType as Int,\
    TimestampType as Timestamp, DateType as Date, LongType as Long
from pathlib import Path

In [None]:
# Build (or reuse) the Spark session for the production run.
# The spark-sas7bdat package is pulled from the spark-packages repository.
spark = (
    SparkSession.builder
    .config("spark.jars.repositories", "https://repos.spark-packages.org/")
    .config("spark.jars.packages", "saurfang:spark-sas7bdat:2.0.0-s_2.11")
    .enableHiveSupport()
    .getOrCreate()
)

### Extract validation code/value pairs from the SAS Labels Descriptions file

In [None]:
def get_validation_code_from_SAS_labels(sas_input_label, sas_file_path='I94_SAS_Labels_Descriptions.SAS'):
    '''
    Read the SAS labels description file and return the code/value pairs of one label section.

    The section starts at the first occurrence of ``sas_input_label`` in the file
    and ends at the next ``;``. Every line of that section containing an ``=`` is
    split on its first ``=`` into a code (left) and a value (right); surrounding
    whitespace and quote characters are stripped from both. Lines without ``=``
    (comments, the ``value ...`` header, blank lines) are skipped.

    Parameters
    ----------
    sas_input_label : string
        The label name of the validation code dataset, e.g. one of
        I94RES (same as I94CIT), I94PORT, I94ADDR, I94MODE, I94VISA.
    sas_file_path : string, optional
        Path of the SAS labels description file.
        Defaults to 'I94_SAS_Labels_Descriptions.SAS' in the working directory.

    Returns
    -------
    validation_code_list : list of tuple(str_valid_code, str_valid_value)
        The code/value pairs of the requested label, in file order.

    Raises
    ------
    ValueError
        If the label does not occur in the file, or the section found has no
        terminating ';'.
    '''

    # Read input SAS Labels Descriptions
    with open(sas_file_path) as sas_validation_code:
        labels_from_sas = sas_validation_code.read()

    # Locate the section that belongs to the requested label
    section_start = labels_from_sas.find(sas_input_label)
    if section_start == -1:
        raise ValueError(
            "Label '{}' not found in '{}'".format(sas_input_label, sas_file_path)
        )
    sas_labels = labels_from_sas[section_start:]

    section_end = sas_labels.find(';')
    if section_end == -1:
        raise ValueError(
            "No terminating ';' found after label '{}' in '{}'".format(sas_input_label, sas_file_path)
        )
    sas_labels = sas_labels[:section_end]

    # Process line by line, remove separator characters and then append the value pair
    validation_code_list = []
    for line in sas_labels.splitlines():
        # Split on the first '=' only, so a value that itself contains '=' is
        # kept instead of being silently dropped.
        valid_code, separator, valid_value = line.partition('=')
        if not separator:
            # Not a "code = value" line.
            continue
        valid_code = valid_code.strip().strip("'").strip('"')
        valid_value = valid_value.strip().strip("'").strip('"').strip()
        validation_code_list.append((valid_code, valid_value))

    return validation_code_list

In [None]:
def extract_staging_sas_label(label):
    '''
    Stage one label section of the SAS labels description file as CSV and load it back.

    The code/value pairs returned by ``get_validation_code_from_SAS_labels(label)``
    are put into a two-column Spark DataFrame, written as CSV with a header row
    into the directory ``<label>_sas_label_validation`` (replacing the output of
    any previous run), and then re-read from that directory with schema
    inference. A preview, the row count and the distinct-code count are printed.

    Parameters
    ----------
    label : string
        The specific label from the SAS labels description file,
        e.g. 'I94RES', 'I94PORT'.

    Returns
    -------
    Spark DataFrame of the input label as re-read from the staged CSV, with the
    columns ``<label>_valid_code`` and ``<label>_valid_value``. Column types are
    inferred on read, so they are not necessarily strings.
    '''
    valid_code = label + "_valid_code"
    valid_value = label + "_valid_value"
    csv_output = label + "_sas_label_validation"

    schema = R([
        Fld(valid_code, Str()),
        Fld(valid_value, Str())
    ])

    df = spark.createDataFrame(
        data=get_validation_code_from_SAS_labels(label),
        schema=schema
    )

    # 'overwrite' replaces the output of an earlier run and also works on a
    # first run when the directory does not exist yet (a strict rmtree would
    # raise FileNotFoundError there).
    df.write.mode('overwrite').options(header='True', delimiter=',').csv(csv_output)

    # Re-read the staged files so the returned frame reflects what is on disk.
    df = spark.read.options(inferSchema="true", delimiter=",", header = "true").csv(csv_output)

    print("Top 20 rows of {} ".format(csv_output))
    df.show()

    print("Count rows of {}: {} ".format(csv_output, df.count()))
    
    print("Check unique value of {}: {} ".format(csv_output, df.select(valid_code).distinct().count()))

    print("Staging csv files in: {}".format(csv_output))

    return df

### Extract validation values from 'I94_SAS_Labels_Descriptions.SAS'

In [None]:
# `I94RES` label: stage the section as CSV, then pull it into pandas for inspection
I94RES_staging_df = extract_staging_sas_label('I94RES').toPandas()
I94RES_staging_df

In [None]:
# `I94PORT` label: stage the section as CSV, then pull it into pandas for inspection
I94PORT_staging_df = extract_staging_sas_label('I94PORT').toPandas()
I94PORT_staging_df

In [None]:
# `I94ADDR` label: stage the section as CSV, then pull it into pandas for inspection
I94ADDR_staging_df = extract_staging_sas_label('I94ADDR').toPandas()
I94ADDR_staging_df

In [None]:
# `I94MODE` label: stage the section as CSV, then pull it into pandas for inspection
I94MODE_staging_df = extract_staging_sas_label('I94MODE').toPandas()
I94MODE_staging_df

In [None]:
# `I94VISA` label: stage the section as CSV, then pull it into pandas for inspection
I94VISA_staging_df = extract_staging_sas_label('I94VISA').toPandas()
I94VISA_staging_df

### Clean validation values from 'I94_SAS_Labels_Descriptions.SAS'

Split the `I94PORT` values into ***I94PORT_code, City, State*** for ***dim_i94port***

In [None]:
# Convert to pandas dataframe.
# The I94PORT extraction cell already converts the frame to pandas, and a pandas
# DataFrame has no .toPandas(); only convert when it is not a pandas frame yet,
# so this cell is safe to run (and re-run).
if not isinstance(I94PORT_staging_df, pd.DataFrame):
    I94PORT_staging_df = I94PORT_staging_df.toPandas()
I94PORT_staging_df

In [None]:
# Trim whitespace on both ends of the code and value columns
I94PORT_staging_df["I94PORT_valid_code"] = I94PORT_staging_df["I94PORT_valid_code"].str.strip()
I94PORT_staging_df["I94PORT_valid_value"] = I94PORT_staging_df["I94PORT_valid_value"].str.strip()

In [None]:
# Split the value on "," : first piece is the city name, second piece the state code
# (rows whose value has no "," get a missing state code)
I94PORT_staging_df["I94PORT_city_name"] = I94PORT_staging_df["I94PORT_valid_value"].str.split(",").str[0]
I94PORT_staging_df["I94PORT_state_code"] = I94PORT_staging_df["I94PORT_valid_value"].str.split(",").str[1]

In [None]:
# Clean leading and trailing white space of the split columns.
# The result is written back to I94PORT_staging_df, the frame used by every
# other cell; the name I94PORT_pandas_df is never defined in this notebook.
I94PORT_staging_df["I94PORT_city_name"] = I94PORT_staging_df["I94PORT_city_name"].str.strip()
I94PORT_staging_df["I94PORT_state_code"] = I94PORT_staging_df["I94PORT_state_code"].str.strip()

In [None]:
# Display the frame to inspect the city name / state code columns
I94PORT_staging_df

In [None]:
# Number of port entries per city name (names title-cased before counting)
(
    I94PORT_staging_df["I94PORT_city_name"]
    .str.title()
    .value_counts()
)

In [None]:
# Keep only the rows that have a value in I94PORT_state_code
I94PORT_staging_df = I94PORT_staging_df[I94PORT_staging_df["I94PORT_state_code"].notna()]
I94PORT_staging_df

### Stage validation values from 'I94_SAS_Labels_Descriptions.SAS'

In [None]:
# Write the cleaned `I94PORT` columns to the staging CSV (no index column)
I94PORT_staging_df.to_csv(
    "i94port_staging.csv",
    columns=["I94PORT_valid_code", "I94PORT_city_name", "I94PORT_state_code"],
    index=False,
    encoding="utf-8",
)