In [29]:
# Do all imports and installs here - Done
from pyspark.sql import SparkSession
from pyspark.sql.functions import udf
import pandas as pd
import re
import configparser
import os

In [30]:
config = configparser.ConfigParser()
config.read('etl.cfg')

input_data_source = config.get('DIR','INPUT_DIR')
output_processed_data = config.get('DIR','OUTPUT_DIR')

i94immi_dataset = config.get('DATA','I94_IMMI')
worldtempe_dataset = config.get('DATA','WORLD_TEMPE')
citydemo_dataset = config.get('DATA','CITY_DEMOGRAPHIC')
airport_dataset = config.get('DATA','AIR_PORT')

## I94 Immigration data cleaning and staging

For i94 immigration format, we use spark.sql to cleaning and staging this dataset

In [31]:
# Create Spark session - Using for droduction only
spark = SparkSession.builder\
            .config("spark.jars.repositories", "https://repos.spark-packages.org/")\
            .config("spark.jars.packages", "saurfang:spark-sas7bdat:2.0.0-s_2.11")\
            .enableHiveSupport()\
            .getOrCreate()

In [32]:
spark

In [5]:
i94immi_dataset

'../../data/18-83510-I94-Data-2016/i94_apr16_sub.sas7bdat'

In [33]:
# Using for SAS format production
# i94immi_dataset = '../../data/18-83510-I94-Data-2016/i94_apr16_sub.sas7bdat'
# i94immi_df = spark.read.format('com.github.saurfang.sas.spark').load(i94immi_dataset)

# Using for local development
# i94immi_dataset = 'immigration_data_sample.csv'
i94immi_dataset = 'i94_apr16_sub.sas7bdat'
i94immi_df = spark.read.format('com.github.saurfang.sas.spark').load(i94immi_dataset)
# i94immi_df = spark.read.options(header='True',inferSchema='True',delimiter=',').csv(i94immi_dataset)
# i94immi_df = pd.read_csv(i94immi_dataset,sep=",")

Py4JJavaError: An error occurred while calling o82.load.
: java.lang.NoClassDefFoundError: scala/Product$class
	at com.github.saurfang.sas.spark.SasRelation.<init>(SasRelation.scala:48)
	at com.github.saurfang.sas.spark.SasRelation$.apply(SasRelation.scala:42)
	at com.github.saurfang.sas.spark.DefaultSource.createRelation(DefaultSource.scala:50)
	at com.github.saurfang.sas.spark.DefaultSource.createRelation(DefaultSource.scala:39)
	at com.github.saurfang.sas.spark.DefaultSource.createRelation(DefaultSource.scala:27)
	at org.apache.spark.sql.execution.datasources.DataSource.resolveRelation(DataSource.scala:350)
	at org.apache.spark.sql.DataFrameReader.loadV1Source(DataFrameReader.scala:228)
	at org.apache.spark.sql.DataFrameReader.$anonfun$load$2(DataFrameReader.scala:210)
	at scala.Option.getOrElse(Option.scala:189)
	at org.apache.spark.sql.DataFrameReader.load(DataFrameReader.scala:210)
	at org.apache.spark.sql.DataFrameReader.load(DataFrameReader.scala:185)
	at sun.reflect.NativeMethodAccessorImpl.invoke0(Native Method)
	at sun.reflect.NativeMethodAccessorImpl.invoke(Unknown Source)
	at sun.reflect.DelegatingMethodAccessorImpl.invoke(Unknown Source)
	at java.lang.reflect.Method.invoke(Unknown Source)
	at py4j.reflection.MethodInvoker.invoke(MethodInvoker.java:244)
	at py4j.reflection.ReflectionEngine.invoke(ReflectionEngine.java:357)
	at py4j.Gateway.invoke(Gateway.java:282)
	at py4j.commands.AbstractCommand.invokeMethod(AbstractCommand.java:132)
	at py4j.commands.CallCommand.execute(CallCommand.java:79)
	at py4j.ClientServerConnection.waitForCommands(ClientServerConnection.java:182)
	at py4j.ClientServerConnection.run(ClientServerConnection.java:106)
	at java.lang.Thread.run(Unknown Source)


In [15]:
i94immi_df.count()

1000

In [17]:
i94immi_df.show()

+-------+---------+------+------+------+------+-------+-------+-------+-------+-------+------+-------+-----+--------+--------+-----+-------+-------+-------+-------+-------+--------+------+------+-------+---------------+-----+--------+
|    _c0|    cicid| i94yr|i94mon|i94cit|i94res|i94port|arrdate|i94mode|i94addr|depdate|i94bir|i94visa|count|dtadfile|visapost|occup|entdepa|entdepd|entdepu|matflag|biryear| dtaddto|gender|insnum|airline|         admnum|fltno|visatype|
+-------+---------+------+------+------+------+-------+-------+-------+-------+-------+------+-------+-----+--------+--------+-----+-------+-------+-------+-------+-------+--------+------+------+-------+---------------+-----+--------+
|2027561|4084316.0|2016.0|   4.0| 209.0| 209.0|    HHW|20566.0|    1.0|     HI|20573.0|  61.0|    2.0|  1.0|20160422|    null| null|      G|      O|   null|      M| 1955.0|07202016|     F|  null|     JL|5.6582674633E10|00782|      WT|
|2171295|4422636.0|2016.0|   4.0| 582.0| 582.0|    MCA|20567

Create a sql table view of i94 immigration dataset

In [18]:
i94immi_df.createOrReplaceTempView('i94immi_table')

### Choose Primarykey

If distinct result of '**cicid**' the same to record amount of dataset. We can use **'cicid'** as primarykey.

In [19]:
spark.sql("""
    SELECT COUNT (DISTINCT cicid)
    FROM i94immi_table
""").show()

+---------------------+
|count(DISTINCT cicid)|
+---------------------+
|                 1000|
+---------------------+



### Cleaning Arrival date and Departure date

We verify the logic of data, Departure date must be greater or equal Arrival date because:
- Columns **'arrdate'** displays the arrival date in the USA 
- Column **'depdate'** as departure date from the USA. 

We count amount of records with Departure date <= Arrival date. These are un-makesence data will be droped

In [20]:
spark.sql("""
    SELECT COUNT(*)
    FROM i94immi_table
    WHERE arrdate >= depdate
""").show()

+--------+
|count(1)|
+--------+
|       0|
+--------+



Show samples of un-makesence data

In [21]:
spark.sql("""
    SELECT arrdate, depdate
    FROM i94immi_table
    WHERE arrdate >= depdate
""").show()

+-------+-------+
|arrdate|depdate|
+-------+-------+
+-------+-------+



We drop un-makesence logic values from i94immi dataset

In [22]:
spark.sql("""
    SELECT *
    FROM i94immi_table
    WHERE arrdate <= depdate
""").createOrReplaceTempView("i94immi_table")

Verify cleaned arrdate and depdate

In [23]:
spark.sql("""
    SELECT COUNT(*)
    FROM i94immi_table
""").show()

+--------+
|count(1)|
+--------+
|     951|
+--------+



In [26]:
i94immi_df.show()

+-------+---------+------+------+------+------+-------+-------+-------+-------+-------+------+-------+-----+--------+--------+-----+-------+-------+-------+-------+-------+--------+------+------+-------+---------------+-----+--------+
|    _c0|    cicid| i94yr|i94mon|i94cit|i94res|i94port|arrdate|i94mode|i94addr|depdate|i94bir|i94visa|count|dtadfile|visapost|occup|entdepa|entdepd|entdepu|matflag|biryear| dtaddto|gender|insnum|airline|         admnum|fltno|visatype|
+-------+---------+------+------+------+------+-------+-------+-------+-------+-------+------+-------+-----+--------+--------+-----+-------+-------+-------+-------+-------+--------+------+------+-------+---------------+-----+--------+
|2027561|4084316.0|2016.0|   4.0| 209.0| 209.0|    HHW|20566.0|    1.0|     HI|20573.0|  61.0|    2.0|  1.0|20160422|    null| null|      G|      O|   null|      M| 1955.0|07202016|     F|  null|     JL|5.6582674633E10|00782|      WT|
|2171295|4422636.0|2016.0|   4.0| 582.0| 582.0|    MCA|20567

We add column **'arrival_date'** from **'arrdate'** base on SAS correspond timestone *1960-01-01*

In [28]:
df_date = spark.sql("""
    SELECT *, date_add(to_date('1960-01-01'), arrdate) AS arrival_date 
    FROM i94immi_table
""")

AnalysisException: cannot resolve 'date_add(to_date('1960-01-01'), i94immi_table.arrdate)' due to data type mismatch: argument 2 requires (int or smallint or tinyint) type, however, 'i94immi_table.arrdate' is of double type.; line 2 pos 14;
'Project [_c0#92, cicid#93, i94yr#94, i94mon#95, i94cit#96, i94res#97, i94port#98, arrdate#99, i94mode#100, i94addr#101, depdate#102, i94bir#103, i94visa#104, count#105, dtadfile#106, visapost#107, occup#108, entdepa#109, entdepd#110, entdepu#111, matflag#112, biryear#113, dtaddto#114, gender#115, ... 6 more fields]
+- SubqueryAlias i94immi_table
   +- View (`i94immi_table`, [_c0#92,cicid#93,i94yr#94,i94mon#95,i94cit#96,i94res#97,i94port#98,arrdate#99,i94mode#100,i94addr#101,depdate#102,i94bir#103,i94visa#104,count#105,dtadfile#106,visapost#107,occup#108,entdepa#109,entdepd#110,entdepu#111,matflag#112,biryear#113,dtaddto#114,gender#115,insnum#116,airline#117,admnum#118,fltno#119,visatype#120])
      +- Project [_c0#92, cicid#93, i94yr#94, i94mon#95, i94cit#96, i94res#97, i94port#98, arrdate#99, i94mode#100, i94addr#101, depdate#102, i94bir#103, i94visa#104, count#105, dtadfile#106, visapost#107, occup#108, entdepa#109, entdepd#110, entdepu#111, matflag#112, biryear#113, dtaddto#114, gender#115, ... 5 more fields]
         +- Filter (arrdate#99 <= depdate#102)
            +- SubqueryAlias i94immi_table
               +- View (`i94immi_table`, [_c0#92,cicid#93,i94yr#94,i94mon#95,i94cit#96,i94res#97,i94port#98,arrdate#99,i94mode#100,i94addr#101,depdate#102,i94bir#103,i94visa#104,count#105,dtadfile#106,visapost#107,occup#108,entdepa#109,entdepd#110,entdepu#111,matflag#112,biryear#113,dtaddto#114,gender#115,insnum#116,airline#117,admnum#118,fltno#119,visatype#120])
                  +- Relation [_c0#92,cicid#93,i94yr#94,i94mon#95,i94cit#96,i94res#97,i94port#98,arrdate#99,i94mode#100,i94addr#101,depdate#102,i94bir#103,i94visa#104,count#105,dtadfile#106,visapost#107,occup#108,entdepa#109,entdepd#110,entdepu#111,matflag#112,biryear#113,dtaddto#114,gender#115,... 5 more fields] csv


In [14]:
df_date.createOrReplaceTempView("i94immi_table")

In [15]:
spark.sql("""
    SELECT COUNT(arrival_date)
    FROM i94immi_table
""").show()

+-------------------+
|count(arrival_date)|
+-------------------+
|            2953481|
+-------------------+



And then add column **'departure_date'** column from **'depdate'** base on SAS correspond timestone *1960-01-01*

In [16]:
spark.sql("""SELECT *, CASE 
                        WHEN depdate >= arrdate THEN date_add(to_date('1960-01-01'), depdate)
                        WHEN depdate IS NULL THEN NULL
                        ELSE 'N/A' END AS departure_date 
                FROM i94immi_table
            """).createOrReplaceTempView("i94immi_table")

Verify column **'departure_date'**

In [18]:
spark.sql("""
    SELECT COUNT(departure_date)
    FROM i94immi_table
""").show()

+---------------------+
|count(departure_date)|
+---------------------+
|              2953481|
+---------------------+



Count distinct **'arrival_date'**

In [19]:
spark.sql("""SELECT MIN(arrival_date) as min_arrival_date, MAX(arrival_date) as max_arrival_date
            FROM i94immi_table
    """).show()

+----------------+----------------+
|min_arrival_date|max_arrival_date|
+----------------+----------------+
|      2016-04-01|      2016-04-30|
+----------------+----------------+



In [21]:
spark.sql("""SELECT COUNT (DISTINCT arrival_date) as distinct_arrival_date
            FROM i94immi_table
    """).show()

+---------------------+
|distinct_arrival_date|
+---------------------+
|                   30|
+---------------------+



Count distinct **'departure_date'**

In [20]:
spark.sql("""SELECT MIN(departure_date) as min_departure_date, MAX(departure_date) as max_departure_date
            FROM i94immi_table
    """).show()

+------------------+------------------+
|min_departure_date|max_departure_date|
+------------------+------------------+
|        2016-04-02|        2084-05-16|
+------------------+------------------+



In [22]:
spark.sql("""SELECT COUNT (DISTINCT departure_date) as distinct_departure_date
            FROM i94immi_table
    """).show()

+-----------------------+
|distinct_departure_date|
+-----------------------+
|                    174|
+-----------------------+



Count distinct date between **'arrival_date'** and **'departure_date'**

In [23]:
spark.sql("""SELECT COUNT(DISTINCT departure_date) as distinct_date_between
            FROM i94immi_table 
            WHERE departure_date IN (SELECT DISTINCT arrival_date 
                                    FROM i94immi_table
                                ) 
        """).show()

+---------------------+
|distinct_date_between|
+---------------------+
|                   29|
+---------------------+



Just one missing value (29/30) --> our dim_date tables will include **'arrival_date'** and **'departure_date'**

Find any NaN or Null values on 'departure_date'

In [24]:
spark.sql("SELECT count(*) FROM i94immi_table WHERE departure_date = 'N/A'").show()

+--------+
|count(1)|
+--------+
|       0|
+--------+



In [25]:
spark.sql("SELECT count(*) FROM i94immi_table WHERE departure_date = 'NULL'").show()

+--------+
|count(1)|
+--------+
|       0|
+--------+



Verify again

In [133]:
spark.sql("""
    SELECT COUNT(*)
    FROM i94immi_table
""").show()

+--------+
|count(1)|
+--------+
| 2953481|
+--------+



### Cleaning i94port

We check the column **'i94port'** and note value length of this column

In [134]:
spark.sql("""
    SELECT i94port
    FROM i94immi_table
""").show()

+-------+
|i94port|
+-------+
|    WAS|
|    NYC|
|    NYC|
|    NYC|
|    NYC|
|    NYC|
|    NYC|
|    NYC|
|    NYC|
|    TOR|
|    BOS|
|    ATL|
|    ATL|
|    ATL|
|    ATL|
|    HOU|
|    NYC|
|    NYC|
|    NYC|
|    MIA|
+-------+
only showing top 20 rows



In [28]:
spark.sql("""
    SELECT count(*) 
    FROM i94immi_table 
    WHERE i94port == 'NaN'
""").show()

+--------+
|count(1)|
+--------+
|       0|
+--------+



In [27]:
spark.sql("""
    SELECT count(*) 
    FROM i94immi_table 
    WHERE i94port is NULL
""").show()

+--------+
|count(1)|
+--------+
|       0|
+--------+



Values length of column **'i94port'** is 3 charaters, this length 

### Cleaning i94mode

Next, we take a look on arival mode as column **i94mode**

In [135]:
spark.sql("""
    SELECT i94mode, count(*)
    FROM i94immi_table
    GROUP BY i94mode
""").show()

+-------+--------+
|i94mode|count(1)|
+-------+--------+
|   null|     238|
|    1.0| 2871184|
|    3.0|   61572|
|    2.0|   17970|
|    9.0|    2517|
+-------+--------+



From I94_SAS_Labels_Descriptions_SAS we extracted **i94_mode.csv** includes info:

We keep air arrival only (**i94mode=1**), drop any arrival values else (null,, 2, 3, 9)

In [136]:
spark.sql("""
    SELECT *
    FROM i94immi_table
    WHERE i94mode == 1.0
""").createOrReplaceTempView("i94immi_table")

Verify our table

In [137]:
spark.sql("""
    SELECT COUNT(*) as number_of_records
    FROM i94immi_table
""").show()

+--------+
|count(1)|
+--------+
| 2871184|
+--------+



### Cleaning i94visa, visatype

From **SAS_Labels**, we extracted visatype validation as **i94_visa.csv**. 

This step, we mapping **i94visa** numbers to **visatype** instead.

In [138]:
spark.sql("""
        SELECT *, CASE 
                    WHEN i94visa = 1.0 THEN 'Business' 
                    WHEN i94visa = 2.0 THEN 'Pleasure'
                    WHEN i94visa = 3.0 THEN 'Student'
                    ELSE 'N/A' END AS visa_type
        FROM i94immi_table
    """).createOrReplaceTempView("i94immi_table")

We category **visatype** grouping by **visa_type**

In [141]:
spark.sql("""
        SELECT visa_type as visa_type, visatype as visatype_code, count(*) as count_by_visa_category
        FROM i94immi_table
        GROUP BY visa_type, visatype
        ORDER BY visa_type, visatype
""").show()

+---------+--------+--------+
|visa_type|visatype|count(1)|
+---------+--------+--------+
| Business|      B1|  201741|
| Business|      E1|    3027|
| Business|      E2|   15157|
| Business|     GMB|     132|
| Business|       I|    2931|
| Business|      I1|     211|
| Business|      WB|  277414|
| Pleasure|      B2| 1008434|
| Pleasure|      CP|   11891|
| Pleasure|     CPL|       8|
| Pleasure|     GMT|   79777|
| Pleasure|     SBP|       2|
| Pleasure|      WT| 1243531|
|  Student|      F1|   24599|
|  Student|      F2|    1622|
|  Student|      M1|     679|
|  Student|      M2|      28|
+---------+--------+--------+



Find any NaN or Null values on 'visatype'

In [142]:
spark.sql("""
    SELECT count(*) as count_null_of_visatype
    FROM i94immi_table 
    WHERE visatype is NULL
""").show()

+--------+
|count(1)|
+--------+
|       0|
+--------+



In [143]:
spark.sql("""
    SELECT count(*) as count_missing_values
    FROM i94immi_table 
    WHERE visatype == 'NaN'
""").show()

+--------+
|count(1)|
+--------+
|       0|
+--------+



### Cleaning i94bir and biryear

Check check whether value is NULL or not

In [31]:
spark.sql("""
    SELECT count(*) as count_NULL_values
    FROM i94immi_table 
    WHERE i94bir is NULL
""").show()

+--------+
|count(1)|
+--------+
|      46|
+--------+



In [33]:
spark.sql("""
    SELECT count(*) as count_NULL_values
    FROM i94immi_table 
    WHERE biryear is NULL
""").show()

+--------+
|count(1)|
+--------+
|      46|
+--------+



In [34]:
spark.sql("""
    SELECT MAX(biryear) as max_biryear, MIN(biryear) as min_biryear
    FROM i94immi_table 
    WHERE biryear IS NOT NULL
""").show()

+-----------+-----------+
|max_biryear|min_biryear|
+-----------+-----------+
|     2016.0|     1916.0|
+-----------+-----------+



Take a look on travel velocity for 90 years old and older

In [37]:
spark.sql("""
    SELECT biryear as birth_year, COUNT(*) as count_by_birth_year
    FROM i94immi_table 
    WHERE biryear IS NOT NULL
    AND biryear <= 1926
    GROUP BY biryear
    ORDER BY biryear ASC
""").show()

+-------+--------+
|biryear|count(1)|
+-------+--------+
| 1916.0|       8|
| 1917.0|      16|
| 1918.0|      21|
| 1919.0|      36|
| 1920.0|      34|
| 1921.0|      69|
| 1922.0|      89|
| 1923.0|     155|
| 1924.0|     209|
| 1925.0|     274|
| 1926.0|     414|
+-------+--------+



Total of travel trips for 90 year old and older not so much. Don't worry of this

### Cleaning gender

We just user records of male = 'M' and female = 'F'

In [40]:
spark.sql("""
    SELECT * 
    FROM i94immi_table 
    WHERE gender IN ('F', 'M')
""").createOrReplaceTempView("i94immi_table")

In [44]:
spark.sql("""
    SELECT COUNT(gender) as gender_count
    FROM i94immi_table 
    WHERE gender IN ('F', 'M')
""").show()

+------------+
|gender_count|
+------------+
|     2544951|
+------------+



In [45]:
spark.sql("""
    SELECT gender as gender, COUNT(*) as count_gender
    FROM i94immi_table 
    WHERE gender IN ('F') OR gender IN ('M')
    GROUP BY gender
    ORDER BY gender ASC
""").show()

+------+------------+
|gender|count_gender|
+------+------------+
|     F|     1228646|
|     M|     1316305|
+------+------------+



### Cleaning i94cit (citizenship), i94res(residence) and i94addr (state)

Columns **'i94cit'**, **'i94res'** and **'i94addr'** are in float datatype with meaning:
- i94cit: Country of citizenship
- i94res: Country of residence
- i94addr: State code validation

Just check and drop NULL values if need for these columns

In [50]:
spark.sql("""
    SELECT count(*) as count_null_i94cit
    FROM i94immi_table
    WHERE i94cit IS NULL
""").show()

+-----------------+
|count_null_i94cit|
+-----------------+
|                0|
+-----------------+



In [51]:
spark.sql("""
    SELECT count(*) as count_null_i94res
    FROM i94immi_table
    WHERE i94res IS NULL
""").show()

+-----------------+
|count_null_i94res|
+-----------------+
|                0|
+-----------------+



In [49]:
spark.sql("""
    SELECT count(*) as count_null_i94addr
    FROM i94immi_table
    WHERE i94addr IS NULL
""").show()

+------------------+
|count_null_i94addr|
+------------------+
|            114019|
+------------------+



Do not use i94addr cause of a lot NULL values in this column

### Baseline i94immi_table of data i94 immigration

In [57]:
i94immi_df = spark.sql("""
                        SELECT *
                        FROM i94immi_table
                    """)

In [61]:
i94immi_df.head()

Row(cicid=15.0, i94yr=2016.0, i94mon=4.0, i94cit=101.0, i94res=101.0, i94port='WAS', arrdate=20545.0, i94mode=1.0, i94addr='MI', depdate=20691.0, i94bir=55.0, i94visa=2.0, count=1.0, dtadfile='20160401', visapost=None, occup=None, entdepa='T', entdepd='O', entdepu=None, matflag='M', biryear=1961.0, dtaddto='09302016', gender='M', insnum=None, airline='OS', admnum=666643185.0, fltno='93', visatype='B2', arrival_date=datetime.date(2016, 4, 1), departure_date='2016-08-25')

In [67]:
#i94immi_df.write.csv("i94immi_df_clean.csv")
i94immi_df.write.options(header='True', delimiter=',').csv("i94immi_df_clean")
#i94immi_df.write.mode('overwrite').csv("i94immi_df_clean")