In [69]:
# Do all imports and installs here
import pandas as pd
from pyspark.sql import SparkSession
from pyspark.sql.functions import udf
from pathlib import Path

In [3]:
from pyspark.sql import SparkSession

# Build (or reuse) a Hive-enabled session that pulls the spark-sas7bdat reader
# from the spark-packages repository.
spark = (
    SparkSession.builder
    .config("spark.jars.repositories", "https://repos.spark-packages.org/")
    .config("spark.jars.packages", "saurfang:spark-sas7bdat:2.0.0-s_2.11")
    .enableHiveSupport()
    .getOrCreate()
)

# Load the April 2016 I94 extract straight from the SAS file.
sas_reader = spark.read.format('com.github.saurfang.sas.spark')
df_spark = sas_reader.load('../../data/18-83510-I94-Data-2016/i94_apr16_sub.sas7bdat')


In [70]:
def rmdir(directory, missing_ok=True):
    '''
    Recursively delete a directory together with everything inside it.

    Parameters
    ----------
    directory : str or pathlib.Path
        Path of the directory to remove. The directory itself and all of the
        files and sub-directories it contains are deleted.
        Example: rmdir(Path("target_path_to_dir"))
    missing_ok : bool, default True
        When True, a path that does not exist is silently ignored, which makes
        the call usable as a repeatable clean-up step before (re)writing
        output. When False, FileNotFoundError is raised for a missing path.

    Returns
    -------
    None

    Raises
    ------
    FileNotFoundError
        If the path does not exist and missing_ok is False.
    '''
    directory = Path(directory)
    if missing_ok and not directory.exists():
        return
    for item in directory.iterdir():
        # Descend only into real directories. A symlink (even one pointing at
        # a directory) is removed as a link, so nothing outside the tree is
        # ever deleted.
        if item.is_dir() and not item.is_symlink():
            rmdir(item)
        else:
            item.unlink()
    directory.rmdir()

In [4]:
# Load the immigration records from the "sas_data" parquet directory.
i94immi_df = spark.read.format("parquet").load("sas_data")

In [6]:
# Preview the first three rows of the frame loaded from the SAS file.
df_spark.show(n=3)

+-----+------+------+------+------+-------+-------+-------+-------+-------+------+-------+-----+--------+--------+-----+-------+-------+-------+-------+-------+--------+------+------+-------+-------------+-----+--------+
|cicid| i94yr|i94mon|i94cit|i94res|i94port|arrdate|i94mode|i94addr|depdate|i94bir|i94visa|count|dtadfile|visapost|occup|entdepa|entdepd|entdepu|matflag|biryear| dtaddto|gender|insnum|airline|       admnum|fltno|visatype|
+-----+------+------+------+------+-------+-------+-------+-------+-------+------+-------+-----+--------+--------+-----+-------+-------+-------+-------+-------+--------+------+------+-------+-------------+-----+--------+
|  6.0|2016.0|   4.0| 692.0| 692.0|    XXX|20573.0|   null|   null|   null|  37.0|    2.0|  1.0|    null|    null| null|      T|   null|      U|   null| 1979.0|10282016|  null|  null|   null|1.897628485E9| null|      B2|
|  7.0|2016.0|   4.0| 254.0| 276.0|    ATL|20551.0|    1.0|     AL|   null|  25.0|    3.0|  1.0|20130811|     SEO| n

In [7]:
# Total number of rows in the frame loaded from the SAS file.
df_spark.count()

3096313

In [8]:
# Expose the DataFrame to Spark SQL under the name used by every query below.
i94immi_df.createOrReplaceTempView(name='i94immi_table')

### Choose Primarykey

If the number of distinct `cicid` values equals the number of records in the dataset, we can use `cicid` as the primary key.

In [9]:
# Primary-key check: count the distinct `cicid` values.
distinct_cicid_sql = """
    SELECT COUNT (DISTINCT cicid)
    FROM i94immi_table
"""
spark.sql(distinct_cicid_sql).show()

+---------------------+
|count(DISTINCT cicid)|
+---------------------+
|              3096313|
+---------------------+



### Cleaning Arrival date and Departure date

We verify the logical consistency of the data: the departure date must be greater than or equal to the arrival date, because:
- Column `arrdate` holds the arrival date in the USA
- Column `depdate` holds the departure date from the USA

We count the records that violate this rule, i.e. where the arrival date falls after the departure date. These records make no sense and will be dropped.

In [10]:
# Count the records that break the rule "departure on or after arrival".
# The comparison is strict: a same-day departure (arrdate = depdate) is valid
# and is kept by the `arrdate <= depdate` filter applied afterwards, so it
# must not be counted as an invalid record here.
spark.sql("""
    SELECT COUNT(*)
    FROM i94immi_table
    WHERE arrdate > depdate
""").show()

+--------+
|count(1)|
+--------+
|     375|
+--------+



In [11]:
# Sample a few of the invalid records (arrival strictly after departure).
# Same-day departures are valid and are therefore excluded from this sample.
spark.sql("""
    SELECT arrdate, depdate
    FROM i94immi_table
    WHERE arrdate > depdate
""").show(5)

+-------+-------+
|arrdate|depdate|
+-------+-------+
|20574.0|20573.0|
|20574.0|20572.0|
|20574.0|20573.0|
|20549.0|19097.0|
|20549.0|20527.0|
+-------+-------+
only showing top 5 rows



In [12]:
# Keep only the rows whose departure is on or after the arrival.
# NOTE(review): a NULL `depdate` makes the comparison evaluate to NULL, so rows
# without a recorded departure are filtered out here as well - confirm that
# dropping them is intended.
valid_dates_sql = """
    SELECT *
    FROM i94immi_table
    WHERE arrdate <= depdate
"""
spark.sql(valid_dates_sql).createOrReplaceTempView("i94immi_table")

Verify cleaned `arrdate` and `depdate`

In [13]:
# Row count after the arrival/departure filter.
row_count_sql = """
    SELECT COUNT(*)
    FROM i94immi_table
"""
spark.sql(row_count_sql).show()

+--------+
|count(1)|
+--------+
| 2953481|
+--------+



Add column `arrival_date = timestone + arrdate_offset_day`, where:
- `timestone` = '1960-01-01', the reference date from which `arrdate` counts days (***date*** datatype)
- `arrdate_offset_day` = `arrdate`, the offset in days (used as an ***integer***)
- `arrival_date` is the resulting calendar date (***date*** datatype)

In [14]:
# Derive `arrival_date` by adding the `arrdate` day offset to 1960-01-01.
# `arrdate` is stored as a DOUBLE (e.g. 20574.0); date_add expects an integer
# number of days, so the offset is cast explicitly instead of relying on an
# implicit conversion that newer Spark versions reject.
spark.sql("""
    SELECT *, date_add(to_date('1960-01-01'), CAST(arrdate AS INT)) AS arrival_date
    FROM i94immi_table
""").createOrReplaceTempView("i94immi_table")

In [15]:
# Number of rows that have a non-NULL `arrival_date`.
arrival_count_sql = """
    SELECT COUNT(arrival_date)
    FROM i94immi_table
"""
spark.sql(arrival_count_sql).show()

+-------------------+
|count(arrival_date)|
+-------------------+
|            2953481|
+-------------------+



In [16]:
# Preview the derived `arrival_date` column.
arrival_preview_sql = """
    SELECT arrival_date
    FROM i94immi_table
"""
spark.sql(arrival_preview_sql).show(n=5)

+------------+
|arrival_date|
+------------+
|  2016-04-30|
|  2016-04-30|
|  2016-04-30|
|  2016-04-30|
|  2016-04-30|
+------------+
only showing top 5 rows



Add column `departure_date = timestone + depdate_offset_day`, where:
- `timestone` = '1960-01-01', the reference date from which `depdate` counts days (***date*** datatype)
- `depdate_offset_day` = `depdate`, the offset in days (used as an ***integer***)
- `departure_date` is the resulting calendar date (***date*** datatype)

In [17]:
# Derive `departure_date` by adding the `depdate` day offset to 1960-01-01.
# - Every branch yields a DATE or NULL. Mixing in the string literal 'NaN'
#   would make Spark coerce the whole column to a string, so later MIN/MAX and
#   range comparisons would no longer be date comparisons.
# - A NULL or out-of-order `depdate` falls through to NULL.
# - `depdate` is a DOUBLE, so it is cast explicitly to the integer day count
#   that date_add expects.
spark.sql("""SELECT *, CASE
                        WHEN depdate >= arrdate THEN date_add(to_date('1960-01-01'), CAST(depdate AS INT))
                        ELSE NULL END AS departure_date
                FROM i94immi_table
            """).createOrReplaceTempView("i94immi_table")

In [18]:
# Preview the derived `departure_date` column.
departure_preview_sql = """
    SELECT departure_date
    FROM i94immi_table
"""
spark.sql(departure_preview_sql).show(n=5)

+--------------+
|departure_date|
+--------------+
|    2016-05-08|
|    2016-05-17|
|    2016-05-08|
|    2016-05-14|
|    2016-05-14|
+--------------+
only showing top 5 rows



In [19]:
# Number of rows that have a non-NULL `departure_date`.
departure_count_sql = """
    SELECT COUNT(departure_date)
    FROM i94immi_table
"""
spark.sql(departure_count_sql).show()

+---------------------+
|count(departure_date)|
+---------------------+
|              2953481|
+---------------------+



Range of time `arrival_date`

In [20]:
# Earliest and latest arrival dates present in the table.
arrival_range_sql = """SELECT MIN(arrival_date) as min_arrival_date, MAX(arrival_date) as max_arrival_date
            FROM i94immi_table
    """
spark.sql(arrival_range_sql).show()

+----------------+----------------+
|min_arrival_date|max_arrival_date|
+----------------+----------------+
|      2016-04-01|      2016-04-30|
+----------------+----------------+



In [21]:
# How many rows ended up without an `arrival_date`?
null_arrival_sql = """
    SELECT COUNT(*) as count_null_arrival_date
    FROM i94immi_table
    WHERE arrival_date is NULL
"""
spark.sql(null_arrival_sql).show()

+-----------------------+
|count_null_arrival_date|
+-----------------------+
|                      0|
+-----------------------+



Range of time `departure_date`

In [22]:
# Earliest and latest departure dates present in the table.
departure_range_sql = """SELECT MIN(departure_date) as min_departure_date, MAX(departure_date) as max_departure_date
            FROM i94immi_table
    """
spark.sql(departure_range_sql).show()

+------------------+------------------+
|min_departure_date|max_departure_date|
+------------------+------------------+
|        2016-04-02|        2084-05-16|
+------------------+------------------+



Count wrong departure date `departure_date < '2016-04-01'` or `departure_date > '2018-12-31'`

In [23]:
# Count departure dates that fall outside the 2016-04-01 .. 2018-12-31 window.
out_of_range_departure_sql = """
    SELECT COUNT(departure_date)
    FROM i94immi_table
    WHERE departure_date < '2016-04-01' OR departure_date > '2018-12-31'
"""
spark.sql(out_of_range_departure_sql).show()

+---------------------+
|count(departure_date)|
+---------------------+
|                    3|
+---------------------+



Only 3 departure dates fall outside the expected range, so no further action is taken for them.

Find any NaN or Null values on `departure_date`

In [24]:
# How many rows have no `departure_date`?
null_departure_sql = """
    SELECT COUNT(*) as count_null_departure_date
    FROM i94immi_table
    WHERE departure_date is NULL
"""
spark.sql(null_departure_sql).show()

+-------------------------+
|count_null_departure_date|
+-------------------------+
|                        0|
+-------------------------+



Verify again

In [25]:
# Row count once the date columns have been cleaned.
rows_after_dates_sql = """
    SELECT COUNT(*)
    FROM i94immi_table
"""
spark.sql(rows_after_dates_sql).show()

+--------+
|count(1)|
+--------+
| 2953481|
+--------+



In [26]:
# Re-register the view under the same name; the query selects every column
# and row unchanged.
passthrough_after_dates_sql = """
    SELECT *
    FROM i94immi_table
"""
spark.sql(passthrough_after_dates_sql).createOrReplaceTempView("i94immi_table")

### Cleaning i94port

Check the column `i94port` and note value length of this column

In [27]:
# Preview the `i94port` column.
i94port_preview_sql = """
    SELECT i94port
    FROM i94immi_table
"""
spark.sql(i94port_preview_sql).show(n=3)

+-------+
|i94port|
+-------+
|    LOS|
|    LOS|
|    LOS|
+-------+
only showing top 3 rows



In [28]:
# Count the rows that have no `i94port` value.
null_i94port_sql = """
    SELECT count(*) as count_null_i94port
    FROM i94immi_table 
    WHERE i94port is NULL
"""
spark.sql(null_i94port_sql).show()

+------------------+
|count_null_i94port|
+------------------+
|                 0|
+------------------+



### Cleaning i94mode

Next, we take a look at the arrival mode, stored in column `i94mode`

In [29]:
# Distribution of records per arrival mode.
i94mode_distribution_sql = """
    SELECT i94mode, count(*) as count_by_i94mode
    FROM i94immi_table
    GROUP BY i94mode
"""
spark.sql(i94mode_distribution_sql).show()

+-------+----------------+
|i94mode|count_by_i94mode|
+-------+----------------+
|   null|             238|
|    1.0|         2871184|
|    3.0|           61572|
|    2.0|           17970|
|    9.0|            2517|
+-------+----------------+



From `I94_SAS_Labels_Descriptions_SAS` we extracted `i94mode_sas_label_validation.csv`, which describes the `i94mode` codes.

We keep air arrivals only, i.e. `i94mode = 1`, and drop every other arrival mode (null, 2, 3, 9).

In [30]:
# Keep only the rows whose arrival mode is 1.0.
air_only_sql = """
    SELECT *
    FROM i94immi_table
    WHERE i94mode == 1.0
"""
spark.sql(air_only_sql).createOrReplaceTempView("i94immi_table")

In [31]:
# Confirm that a single arrival mode remains after the filter.
i94mode_check_sql = """
    SELECT i94mode, count(*) as count_by_i94mode
    FROM i94immi_table
    GROUP BY i94mode
"""
spark.sql(i94mode_check_sql).show()

+-------+----------------+
|i94mode|count_by_i94mode|
+-------+----------------+
|    1.0|         2871184|
+-------+----------------+



Verify our table

In [32]:
# Row count after the arrival-mode filter.
rows_after_mode_sql = """
    SELECT COUNT(*) as number_of_records
    FROM i94immi_table
"""
spark.sql(rows_after_mode_sql).show()

+-----------------+
|number_of_records|
+-----------------+
|          2871184|
+-----------------+



### Cleaning i94visa, visatype

From `I94_SAS_Labels_Descriptions_SAS` we extracted `i94visa_sas_label_validation.csv`, which describes the `i94visa` codes.

In this step, we map the numeric `i94visa` codes to a descriptive label in a new `visa_type` column.

In [33]:
# Add `visa_type`: a text label for the numeric `i94visa` code
# (1 -> Business, 2 -> Pleasure, 3 -> Student, anything else -> 'NaN').
visa_type_sql = """
        SELECT *, CASE 
                    WHEN i94visa = 1.0 THEN 'Business' 
                    WHEN i94visa = 2.0 THEN 'Pleasure'
                    WHEN i94visa = 3.0 THEN 'Student'
                    ELSE 'NaN' END AS visa_type
        FROM i94immi_table
    """
spark.sql(visa_type_sql).createOrReplaceTempView("i94immi_table")

In [35]:
# Preview the table including the new `visa_type` column.
visa_preview_sql = """
    SELECT *
    FROM i94immi_table
    """
spark.sql(visa_preview_sql).show(n=3)

+---------+------+------+------+------+-------+-------+-------+-------+-------+------+-------+-----+--------+--------+-----+-------+-------+-------+-------+-------+--------+------+------+-------+--------------+-----+--------+------------+--------------+---------+
|    cicid| i94yr|i94mon|i94cit|i94res|i94port|arrdate|i94mode|i94addr|depdate|i94bir|i94visa|count|dtadfile|visapost|occup|entdepa|entdepd|entdepu|matflag|biryear| dtaddto|gender|insnum|airline|        admnum|fltno|visatype|arrival_date|departure_date|visa_type|
+---------+------+------+------+------+-------+-------+-------+-------+-------+------+-------+-----+--------+--------+-----+-------+-------+-------+-------+-------+--------+------+------+-------+--------------+-----+--------+------------+--------------+---------+
|5748517.0|2016.0|   4.0| 245.0| 438.0|    LOS|20574.0|    1.0|     CA|20582.0|  40.0|    1.0|  1.0|20160430|     SYD| null|      G|      O|   null|      M| 1976.0|10292016|     F|  null|     QF|9.495387003E1

We categorize the `visatype` codes by grouping them under `visa_type`

In [36]:
# Record count per (visa_type, visatype) combination, ordered by both.
visa_category_sql = """
        SELECT visa_type as visa_type, visatype as visatype_code, count(*) as count_by_visa_category
        FROM i94immi_table
        GROUP BY visa_type, visatype
        ORDER BY visa_type, visatype
"""
spark.sql(visa_category_sql).show(n=5)

+---------+-------------+----------------------+
|visa_type|visatype_code|count_by_visa_category|
+---------+-------------+----------------------+
| Business|           B1|                201741|
| Business|           E1|                  3027|
| Business|           E2|                 15157|
| Business|          GMB|                   132|
| Business|            I|                  2931|
+---------+-------------+----------------------+
only showing top 5 rows



Find any *NaN* or *NULL* values on `visatype`

In [37]:
# Count the rows that have no `visatype` value.
null_visatype_sql = """
    SELECT count(*) as count_null_of_visatype
    FROM i94immi_table 
    WHERE visatype is NULL
"""
spark.sql(null_visatype_sql).show()

+----------------------+
|count_null_of_visatype|
+----------------------+
|                     0|
+----------------------+



In [38]:
# Count rows carrying the 'NaN' placeholder for a visa value.
# The placeholder is produced for the derived `visa_type` column (any `i94visa`
# code other than 1, 2 or 3), so that column is checked together with the raw
# `visatype`; looking at `visatype` alone would miss unmapped codes.
spark.sql("""
    SELECT count(*) as count_missing_values
    FROM i94immi_table
    WHERE visatype = 'NaN' OR visa_type = 'NaN'
""").show()

+--------------------+
|count_missing_values|
+--------------------+
|                   0|
+--------------------+



In [39]:
# Preview the table after the visa checks.
post_visa_preview_sql = """
    SELECT *
    FROM i94immi_table
    """
spark.sql(post_visa_preview_sql).show(n=3)

+---------+------+------+------+------+-------+-------+-------+-------+-------+------+-------+-----+--------+--------+-----+-------+-------+-------+-------+-------+--------+------+------+-------+--------------+-----+--------+------------+--------------+---------+
|    cicid| i94yr|i94mon|i94cit|i94res|i94port|arrdate|i94mode|i94addr|depdate|i94bir|i94visa|count|dtadfile|visapost|occup|entdepa|entdepd|entdepu|matflag|biryear| dtaddto|gender|insnum|airline|        admnum|fltno|visatype|arrival_date|departure_date|visa_type|
+---------+------+------+------+------+-------+-------+-------+-------+-------+------+-------+-----+--------+--------+-----+-------+-------+-------+-------+-------+--------+------+------+-------+--------------+-----+--------+------------+--------------+---------+
|5748517.0|2016.0|   4.0| 245.0| 438.0|    LOS|20574.0|    1.0|     CA|20582.0|  40.0|    1.0|  1.0|20160430|     SYD| null|      G|      O|   null|      M| 1976.0|10292016|     F|  null|     QF|9.495387003E1

In [40]:
# Re-register the view under the same name; the query selects every column
# and row unchanged.
passthrough_after_visa_sql = """
    SELECT *
    FROM i94immi_table
    """
spark.sql(passthrough_after_visa_sql).createOrReplaceTempView("i94immi_table")

### Cleaning i94bir and biryear

Check whether `i94bir` or `biryear` contain *NULL* values

In [41]:
# Count the rows where at least one of `i94bir` / `biryear` is missing.
null_age_sql = """
    SELECT count(*) as count_NULL_values
    FROM i94immi_table 
    WHERE i94bir is NULL OR biryear is NULL
"""
spark.sql(null_age_sql).show()

+-----------------+
|count_NULL_values|
+-----------------+
|               42|
+-----------------+



In [42]:
# Drop the rows where `i94bir` or `biryear` is missing: a row is kept only when
# BOTH fields are present. (Joining the two conditions with OR would keep rows
# that are missing just one of the two values.)
spark.sql("""
    SELECT *
    FROM i94immi_table
    WHERE i94bir IS NOT NULL AND biryear IS NOT NULL
""").createOrReplaceTempView("i94immi_table")

In [43]:
# Latest and earliest birth year present in the table.
biryear_range_sql = """
    SELECT MAX(biryear) as max_biryear, MIN(biryear) as min_biryear
    FROM i94immi_table 
    WHERE biryear IS NOT NULL
"""
spark.sql(biryear_range_sql).show()

+-----------+-----------+
|max_biryear|min_biryear|
+-----------+-----------+
|     2016.0|     1916.0|
+-----------+-----------+



Take a look at the number of trips made by _**travellers aged 90 or older**_

In [44]:
# Record count per birth year for travellers born in 1926 or earlier.
elderly_travellers_sql = """
    SELECT biryear as birth_year, COUNT(*) as count_by_birth_year
    FROM i94immi_table 
    WHERE biryear IS NOT NULL
    AND biryear <= 1926
    GROUP BY biryear
    ORDER BY biryear ASC
"""
spark.sql(elderly_travellers_sql).show()

+----------+-------------------+
|birth_year|count_by_birth_year|
+----------+-------------------+
|    1916.0|                  7|
|    1917.0|                 16|
|    1918.0|                 21|
|    1919.0|                 35|
|    1920.0|                 34|
|    1921.0|                 66|
|    1922.0|                 86|
|    1923.0|                154|
|    1924.0|                199|
|    1925.0|                262|
|    1926.0|                396|
+----------+-------------------+



The total number of trips made by _**travellers aged 90 or older**_ is small, so this age range does not need special handling.

### Cleaning gender

We only keep records with `male = 'M'` and `female = 'F'`

In [45]:
# Keep only the rows whose gender is 'F' or 'M'.
gender_filter_sql = """
    SELECT * 
    FROM i94immi_table 
    WHERE gender IN ('F', 'M')
"""
spark.sql(gender_filter_sql).createOrReplaceTempView("i94immi_table")

In [46]:
# Number of rows with gender 'F' or 'M' after the filter.
gender_count_sql = """
    SELECT COUNT(gender) as gender_count
    FROM i94immi_table 
    WHERE gender IN ('F', 'M')
"""
spark.sql(gender_count_sql).show()

+------------+
|gender_count|
+------------+
|     2465314|
+------------+



In [47]:
# Record count per gender value.
gender_breakdown_sql = """
    SELECT gender as gender, COUNT(*) as count_gender
    FROM i94immi_table 
    WHERE gender IN ('F') OR gender IN ('M')
    GROUP BY gender
    ORDER BY gender ASC
"""
spark.sql(gender_breakdown_sql).show()

+------+------------+
|gender|count_gender|
+------+------------+
|     F|     1190428|
|     M|     1274886|
+------+------------+



### Cleaning `i94cit` - citizenship, `i94res` - residence and `i94addr` - state

Columns `i94cit`, `i94res` and `i94addr` (the first two stored as floats, the last as a state-code string) have the following meaning:
- `i94cit`: Country of citizenship
- `i94res`: Country of residence
- `i94addr`: State code validation

We check these columns for *NULL* values and drop the affected rows where needed

In [48]:
# Count the rows that have no `i94cit` value.
null_i94cit_sql = """
    SELECT count(*) as count_null_i94cit
    FROM i94immi_table
    WHERE i94cit IS NULL
"""
spark.sql(null_i94cit_sql).show()

+-----------------+
|count_null_i94cit|
+-----------------+
|                0|
+-----------------+



In [49]:
# Count the rows that have no `i94res` value.
null_i94res_sql = """
    SELECT count(*) as count_null_i94res
    FROM i94immi_table
    WHERE i94res IS NULL
"""
spark.sql(null_i94res_sql).show()

+-----------------+
|count_null_i94res|
+-----------------+
|                0|
+-----------------+



In [50]:
# Count the rows that have no `i94addr` value.
null_i94addr_sql = """
    SELECT count(*) as count_null_i94addr
    FROM i94immi_table
    WHERE i94addr IS NULL
"""
spark.sql(null_i94addr_sql).show()

+------------------+
|count_null_i94addr|
+------------------+
|             87437|
+------------------+



In [51]:
# Keep only the rows that have an `i94addr` value.
i94addr_filter_sql = """
    SELECT *
    FROM i94immi_table
    WHERE i94addr IS NOT NULL
"""
spark.sql(i94addr_filter_sql).createOrReplaceTempView("i94immi_table")

### Baseline `i94immi_table` of I94 IMMIGRATION dataset

In [52]:
# Preview the cleaned baseline table.
baseline_preview_sql = """
    SELECT *
    FROM i94immi_table
    """
spark.sql(baseline_preview_sql).show(n=5)

+---------+------+------+------+------+-------+-------+-------+-------+-------+------+-------+-----+--------+--------+-----+-------+-------+-------+-------+-------+--------+------+------+-------+--------------+-----+--------+------------+--------------+---------+
|    cicid| i94yr|i94mon|i94cit|i94res|i94port|arrdate|i94mode|i94addr|depdate|i94bir|i94visa|count|dtadfile|visapost|occup|entdepa|entdepd|entdepu|matflag|biryear| dtaddto|gender|insnum|airline|        admnum|fltno|visatype|arrival_date|departure_date|visa_type|
+---------+------+------+------+------+-------+-------+-------+-------+-------+------+-------+-----+--------+--------+-----+-------+-------+-------+-------+-------+--------+------+------+-------+--------------+-----+--------+------------+--------------+---------+
|5748517.0|2016.0|   4.0| 245.0| 438.0|    LOS|20574.0|    1.0|     CA|20582.0|  40.0|    1.0|  1.0|20160430|     SYD| null|      G|      O|   null|      M| 1976.0|10292016|     F|  null|     QF|9.495387003E1

### Select the columns used for the staging dataframe

In [53]:
# Narrow the view down to the columns kept for staging.
staging_columns_sql = """
            SELECT cicid, i94yr, i94mon, arrival_date, i94res, i94port, arrdate, i94addr, departure_date
            FROM i94immi_table
        """
spark.sql(staging_columns_sql).createOrReplaceTempView("i94immi_table")

In [54]:
# Preview the staging view.
staging_preview_sql = """
    SELECT *
    FROM i94immi_table
"""
spark.sql(staging_preview_sql).show(n=5)

+---------+------+------+------------+------+-------+-------+-------+--------------+
|    cicid| i94yr|i94mon|arrival_date|i94res|i94port|arrdate|i94addr|departure_date|
+---------+------+------+------------+------+-------+-------+-------+--------------+
|5748517.0|2016.0|   4.0|  2016-04-30| 438.0|    LOS|20574.0|     CA|    2016-05-08|
|5748518.0|2016.0|   4.0|  2016-04-30| 438.0|    LOS|20574.0|     NV|    2016-05-17|
|5748519.0|2016.0|   4.0|  2016-04-30| 438.0|    LOS|20574.0|     WA|    2016-05-08|
|5748520.0|2016.0|   4.0|  2016-04-30| 438.0|    LOS|20574.0|     WA|    2016-05-14|
|5748521.0|2016.0|   4.0|  2016-04-30| 438.0|    LOS|20574.0|     WA|    2016-05-14|
+---------+------+------+------------+------+-------+-------+-------+--------------+
only showing top 5 rows



In [55]:
# Final number of rows in the staging view.
staging_count_sql = """
    SELECT *
    FROM i94immi_table
"""
spark.sql(staging_count_sql).count()

2377877

In [56]:
# Materialise the staging view back into the `i94immi_df` DataFrame.
staging_select_sql = """
    SELECT *
    FROM i94immi_table
"""
i94immi_df = spark.sql(staging_select_sql)

In [57]:
# Preview the staging DataFrame.
i94immi_df.show(n=5)

+---------+------+------+------------+------+-------+-------+-------+--------------+
|    cicid| i94yr|i94mon|arrival_date|i94res|i94port|arrdate|i94addr|departure_date|
+---------+------+------+------------+------+-------+-------+-------+--------------+
|5748517.0|2016.0|   4.0|  2016-04-30| 438.0|    LOS|20574.0|     CA|    2016-05-08|
|5748518.0|2016.0|   4.0|  2016-04-30| 438.0|    LOS|20574.0|     NV|    2016-05-17|
|5748519.0|2016.0|   4.0|  2016-04-30| 438.0|    LOS|20574.0|     WA|    2016-05-08|
|5748520.0|2016.0|   4.0|  2016-04-30| 438.0|    LOS|20574.0|     WA|    2016-05-14|
|5748521.0|2016.0|   4.0|  2016-04-30| 438.0|    LOS|20574.0|     WA|    2016-05-14|
+---------+------+------+------------+------+-------+-------+-------+--------------+
only showing top 5 rows



In [None]:
# Write the cleaned staging data to parquet.
# "sas_data" is the directory `i94immi_df` is (lazily) read from, and it already
# exists, so writing there fails with the default error-if-exists mode and
# cannot be safely overwritten while it is still being read. The cleaned result
# goes to its own directory instead; overwrite mode keeps the cell re-runnable.
i94immi_df.write.mode("overwrite").parquet("sas_data_clean")

In [71]:
# Remove the previous CSV export, if any, before it is written again.
# The existence check keeps this cell from failing on a first run, when the
# output directory has not been created yet.
csv_output_dir = Path("i94immi_df_clean")
if csv_output_dir.exists():
    rmdir(csv_output_dir)

In [None]:
# Export the staging DataFrame as comma-separated CSV with a header row.
csv_writer = i94immi_df.write.option("header", 'True').option("delimiter", ',')
csv_writer.csv("i94immi_df_clean")

In [None]:
# Performing cleaning tasks here





In [None]:
# Write code here

In [None]:
# Perform quality checks here