In [133]:
# Do all imports and installs here - Done
from pyspark.sql import SparkSession
from pyspark.sql.functions import udf
import pandas as pd
import re
import configparser
import os

In [134]:
config = configparser.ConfigParser()
config.read('etl.cfg')

['etl.cfg']

In [135]:
input_data_source = config.get('DIR','INPUT_DIR')
output_processed_data = config.get('DIR','OUTPUT_DIR')

i94immi_dataset = config.get('DATA','I94_IMMI')
worldtempe_dataset = config.get('DATA','WORLD_TEMPE')
citydemo_dataset = config.get('DATA','CITY_DEMOGRAPHIC')
airport_dataset = config.get('DATA','AIR_PORT')
saslabel_dataset = config.get('DATA','SAS_LABEL')

## I94 Immigration data cleaning and staging

For _**i94 immigration**_ format, we use ***spark.sql*** to cleaning and staging this dataset

In [136]:
# Create Spark session - Using for droduction only
spark = SparkSession.builder\
            .config("spark.jars.repositories", "https://repos.spark-packages.org/")\
            .config("spark.jars.packages", "saurfang:spark-sas7bdat:2.0.0-s_2.11")\
            .enableHiveSupport()\
            .getOrCreate()

In [152]:
i94immi_dataset

'../../data/18-83510-I94-Data-2016/i94_apr16_sub.sas7bdat'

In [153]:
# Using for production
# i94immi_dataset = '../../data/18-83510-I94-Data-2016/i94_apr16_sub.sas7bdat'
i94immi_df = spark.read.format('com.github.saurfang.sas.spark').load(i94immi_dataset)

# Using for local development
# i94immi_dataset = 'immigration_data_sample.csv'
# i94immi_df = pd.read_csv(i94immi_dataset,sep=",")

In [155]:
# rmdir(Path("i94immi_df_clean"))
# i94immi_df.write.csv("i94immi_df_clean.csv")
i94immi_df.write.options(header='True', delimiter=',').csv("i94immi_df_clean")
# i94immi_df.write.mode('overwrite').csv("i94immi_df_clean")
# i94immi_df.coalesce(1).write.csv("./i94immi_df_clean.csv")

In [157]:
i94immi_df = spark.read.options(inferSchema="True", delimiter=",", header = "True").csv("i94immi_df_clean")
#i94immi_df = spark.read.format("csv").load('./i94immi_df_clean.csv/*.csv')

In [158]:
i94immi_df.show()

+---------+------+------+------+------+-------+-------+-------+-------+-------+------+-------+-----+--------+--------+-----+-------+-------+-------+-------+-------+--------+------+------+-------+--------------+-----+--------+
|    cicid| i94yr|i94mon|i94cit|i94res|i94port|arrdate|i94mode|i94addr|depdate|i94bir|i94visa|count|dtadfile|visapost|occup|entdepa|entdepd|entdepu|matflag|biryear| dtaddto|gender|insnum|airline|        admnum|fltno|visatype|
+---------+------+------+------+------+-------+-------+-------+-------+-------+------+-------+-----+--------+--------+-----+-------+-------+-------+-------+-------+--------+------+------+-------+--------------+-----+--------+
|5748517.0|2016.0|   4.0| 245.0| 438.0|    LOS|20574.0|    1.0|     CA|20582.0|  40.0|    1.0|  1.0|20160430|     SYD| null|      G|      O|   null|      M| 1976.0|10292016|     F|  null|     QF|9.495387003E10|00011|      B1|
|5748518.0|2016.0|   4.0| 245.0| 438.0|    LOS|20574.0|    1.0|     NV|20591.0|  32.0|    1.0|  

In [159]:
i94immi_df.columns

['cicid',
 'i94yr',
 'i94mon',
 'i94cit',
 'i94res',
 'i94port',
 'arrdate',
 'i94mode',
 'i94addr',
 'depdate',
 'i94bir',
 'i94visa',
 'count',
 'dtadfile',
 'visapost',
 'occup',
 'entdepa',
 'entdepd',
 'entdepu',
 'matflag',
 'biryear',
 'dtaddto',
 'gender',
 'insnum',
 'airline',
 'admnum',
 'fltno',
 'visatype']

In [160]:
i94immi_df = i94immi_df.select('cicid',
 'i94yr',
 'i94mon',
 'i94res',
 'i94port',
 'arrdate',
 'i94addr',
 'depdate').show()

+---------+------+------+------+-------+-------+-------+-------+
|    cicid| i94yr|i94mon|i94res|i94port|arrdate|i94addr|depdate|
+---------+------+------+------+-------+-------+-------+-------+
|5748517.0|2016.0|   4.0| 438.0|    LOS|20574.0|     CA|20582.0|
|5748518.0|2016.0|   4.0| 438.0|    LOS|20574.0|     NV|20591.0|
|5748519.0|2016.0|   4.0| 438.0|    LOS|20574.0|     WA|20582.0|
|5748520.0|2016.0|   4.0| 438.0|    LOS|20574.0|     WA|20588.0|
|5748521.0|2016.0|   4.0| 438.0|    LOS|20574.0|     WA|20588.0|
|5748522.0|2016.0|   4.0| 464.0|    HHW|20574.0|     HI|20579.0|
|5748523.0|2016.0|   4.0| 464.0|    HHW|20574.0|     HI|20586.0|
|5748524.0|2016.0|   4.0| 464.0|    HHW|20574.0|     HI|20586.0|
|5748525.0|2016.0|   4.0| 464.0|    HOU|20574.0|     FL|20581.0|
|5748526.0|2016.0|   4.0| 464.0|    LOS|20574.0|     CA|20581.0|
|5748527.0|2016.0|   4.0| 504.0|    NEW|20574.0|     MA|20576.0|
|5748528.0|2016.0|   4.0| 504.0|    LOS|20574.0|   null|20575.0|
|5748529.0|2016.0|   4.0|

In [161]:
# Convert to pandas dataframe
i94immi_df.toPandas()

AttributeError: 'NoneType' object has no attribute 'toPandas'