In [2]:
# Do all imports and installs here - Done
from pyspark.sql import SparkSession
from pyspark.sql.functions import udf
import pandas as pd
import re
import configparser
import os

In [3]:
config = configparser.ConfigParser()
config.read('etl.cfg')

input_data_source = config.get('DIR','INPUT_DIR')
output_processed_data = config.get('DIR','OUTPUT_DIR')

i94immi_dataset = config.get('DATA','I94_IMMI')
worldtempe_dataset = config.get('DATA','WORLD_TEMPE')
citydemo_dataset = config.get('DATA','CITY_DEMOGRAPHIC')
airport_dataset = config.get('DATA','AIR_PORT')

## I94 Immigration data cleaning and staging

For i94 immigration format, we use spark.sql to cleaning and staging this dataset

In [4]:
# Create Spark session - Using for droduction only
spark = SparkSession.builder\
            .config("spark.jars.repositories", "https://repos.spark-packages.org/")\
            .config("spark.jars.packages", "saurfang:spark-sas7bdat:2.0.0-s_2.11")\
            .enableHiveSupport()\
            .getOrCreate()

In [5]:
i94immi_dataset

'./i94_apr16_sub.sas7bdat'

In [6]:
# Using for production
# i94immi_dataset = '../../data/18-83510-I94-Data-2016/i94_apr16_sub.sas7bdat'
i94immi_df = spark.read.format('com.github.saurfang.sas.spark').load(i94immi_dataset)

# Using for local development
# i94immi_dataset = 'immigration_data_sample.csv'
# i94immi_df = pd.read_csv(i94immi_dataset,sep=",")

In [7]:
i94immi_df.count()

3096313

In [8]:
i94immi_df.show()

+-----+------+------+------+------+-------+-------+-------+-------+-------+------+-------+-----+--------+--------+-----+-------+-------+-------+-------+-------+--------+------+------+-------+--------------+-----+--------+
|cicid| i94yr|i94mon|i94cit|i94res|i94port|arrdate|i94mode|i94addr|depdate|i94bir|i94visa|count|dtadfile|visapost|occup|entdepa|entdepd|entdepu|matflag|biryear| dtaddto|gender|insnum|airline|        admnum|fltno|visatype|
+-----+------+------+------+------+-------+-------+-------+-------+-------+------+-------+-----+--------+--------+-----+-------+-------+-------+-------+-------+--------+------+------+-------+--------------+-----+--------+
|  6.0|2016.0|   4.0| 692.0| 692.0|    XXX|20573.0|   null|   null|   null|  37.0|    2.0|  1.0|    null|    null| null|      T|   null|      U|   null| 1979.0|10282016|  null|  null|   null| 1.897628485E9| null|      B2|
|  7.0|2016.0|   4.0| 254.0| 276.0|    ATL|20551.0|    1.0|     AL|   null|  25.0|    3.0|  1.0|20130811|     SE

Create a sql table view of i94 immigration dataset

In [9]:
i94immi_df.createOrReplaceTempView('i94immi_table')

### Choose Primarykey

If distinct result of '**cicid**' the same to record amount of dataset. We can use **'cicid'** as primarykey.

In [10]:
spark.sql("""
    SELECT COUNT (DISTINCT cicid)
    FROM i94immi_table
""").show()

+---------------------+
|count(DISTINCT cicid)|
+---------------------+
|              3096313|
+---------------------+



### Cleaning Arrival date and Departure date

We verify the logic of data, Departure date must be greater or equal Arrival date because:
- Columns **'arrdate'** displays the arrival date in the USA 
- Column **'depdate'** as departure date from the USA. 

We count amount of records with Departure date <= Arrival date. These are un-makesence data will be droped

In [84]:
spark.sql("""
    SELECT COUNT(*)
    FROM i94immi_table
    WHERE arrdate >= depdate
""").show()

+--------+
|count(1)|
+--------+
|     375|
+--------+



Show samples of un-makesence data

In [88]:
spark.sql("""
    SELECT arrdate, depdate
    FROM i94immi_table
    WHERE arrdate >= depdate
""").show()

+-------+-------+
|arrdate|depdate|
+-------+-------+
|20545.0|20544.0|
|20546.0|20532.0|
|20546.0|20479.0|
|20546.0|20545.0|
|20546.0|20484.0|
|20546.0|20545.0|
|20547.0|20546.0|
|20548.0|20525.0|
|20549.0|20548.0|
|20549.0|20548.0|
|20550.0|20549.0|
|20550.0|20549.0|
|20550.0|20549.0|
|20550.0|20549.0|
|20550.0|20549.0|
|20551.0|20550.0|
|20551.0|20549.0|
|20552.0|20383.0|
|20552.0|20226.0|
|20552.0|20538.0|
+-------+-------+
only showing top 20 rows



We drop un-makesence logic values from i94immi dataset

In [11]:
spark.sql("""
    SELECT *
    FROM i94immi_table
    WHERE arrdate <= depdate
""").createOrReplaceTempView("i94immi_table")

Verify cleaned arrdate and depdate

In [12]:
spark.sql("""
    SELECT COUNT(*)
    FROM i94immi_table
""").show()

+--------+
|count(1)|
+--------+
| 2953481|
+--------+



We add column **'arrival_date'** from **'arrdate'** base on SAS correspond timestone *1960-01-01*

In [13]:
df_date = spark.sql("""
    SELECT *, date_add(to_date('1960-01-01'), arrdate) AS arrival_date 
    FROM i94immi_table
""")

In [14]:
df_date.createOrReplaceTempView("i94immi_table")

In [15]:
spark.sql("""
    SELECT COUNT(arrival_date)
    FROM i94immi_table
""").show()

+-------------------+
|count(arrival_date)|
+-------------------+
|            2953481|
+-------------------+



And then add column **'departure_date'** column from **'depdate'** base on SAS correspond timestone *1960-01-01*

In [16]:
spark.sql("""SELECT *, CASE 
                        WHEN depdate >= arrdate THEN date_add(to_date('1960-01-01'), depdate)
                        WHEN depdate IS NULL THEN NULL
                        ELSE 'N/A' END AS departure_date 
                FROM i94immi_table
            """).createOrReplaceTempView("i94immi_table")

Verify column **'departure_date'**

In [18]:
spark.sql("""
    SELECT COUNT(departure_date)
    FROM i94immi_table
""").show()

+---------------------+
|count(departure_date)|
+---------------------+
|              2953481|
+---------------------+



Count distinct **'arrival_date'**

In [19]:
spark.sql("""SELECT MIN(arrival_date) as min_arrival_date, MAX(arrival_date) as max_arrival_date
            FROM i94immi_table
    """).show()

+----------------+----------------+
|min_arrival_date|max_arrival_date|
+----------------+----------------+
|      2016-04-01|      2016-04-30|
+----------------+----------------+



In [21]:
spark.sql("""SELECT COUNT (DISTINCT arrival_date) as distinct_arrival_date
            FROM i94immi_table
    """).show()

+---------------------+
|distinct_arrival_date|
+---------------------+
|                   30|
+---------------------+



Count distinct **'departure_date'**

In [20]:
spark.sql("""SELECT MIN(departure_date) as min_departure_date, MAX(departure_date) as max_departure_date
            FROM i94immi_table
    """).show()

+------------------+------------------+
|min_departure_date|max_departure_date|
+------------------+------------------+
|        2016-04-02|        2084-05-16|
+------------------+------------------+



In [22]:
spark.sql("""SELECT COUNT (DISTINCT departure_date) as distinct_departure_date
            FROM i94immi_table
    """).show()

+-----------------------+
|distinct_departure_date|
+-----------------------+
|                    174|
+-----------------------+



Count distinct date between **'arrival_date'** and **'departure_date'**

In [23]:
spark.sql("""SELECT COUNT(DISTINCT departure_date) as distinct_date_between
            FROM i94immi_table 
            WHERE departure_date IN (SELECT DISTINCT arrival_date 
                                    FROM i94immi_table
                                ) 
        """).show()

+---------------------+
|distinct_date_between|
+---------------------+
|                   29|
+---------------------+



Just one missing value (29/30) --> our dim_date tables will include **'arrival_date'** and **'departure_date'**

Find any NaN or Null values on 'departure_date'

In [24]:
spark.sql("SELECT count(*) FROM i94immi_table WHERE departure_date = 'N/A'").show()

+--------+
|count(1)|
+--------+
|       0|
+--------+



In [25]:
spark.sql("SELECT count(*) FROM i94immi_table WHERE departure_date = 'NULL'").show()

+--------+
|count(1)|
+--------+
|       0|
+--------+



Verify again

In [133]:
spark.sql("""
    SELECT COUNT(*)
    FROM i94immi_table
""").show()

+--------+
|count(1)|
+--------+
| 2953481|
+--------+



### Cleaning i94port

We check the column **'i94port'** and note value length of this column

In [134]:
spark.sql("""
    SELECT i94port
    FROM i94immi_table
""").show()

+-------+
|i94port|
+-------+
|    WAS|
|    NYC|
|    NYC|
|    NYC|
|    NYC|
|    NYC|
|    NYC|
|    NYC|
|    NYC|
|    TOR|
|    BOS|
|    ATL|
|    ATL|
|    ATL|
|    ATL|
|    HOU|
|    NYC|
|    NYC|
|    NYC|
|    MIA|
+-------+
only showing top 20 rows



In [28]:
spark.sql("""
    SELECT count(*) 
    FROM i94immi_table 
    WHERE i94port == 'NaN'
""").show()

+--------+
|count(1)|
+--------+
|       0|
+--------+



In [27]:
spark.sql("""
    SELECT count(*) 
    FROM i94immi_table 
    WHERE i94port is NULL
""").show()

+--------+
|count(1)|
+--------+
|       0|
+--------+



Values length of column **'i94port'** is 3 charaters

### Cleaning i94mode

Next, we take a look on arival mode as column **i94mode**

In [135]:
spark.sql("""
    SELECT i94mode, count(*)
    FROM i94immi_table
    GROUP BY i94mode
""").show()

+-------+--------+
|i94mode|count(1)|
+-------+--------+
|   null|     238|
|    1.0| 2871184|
|    3.0|   61572|
|    2.0|   17970|
|    9.0|    2517|
+-------+--------+



From I94_SAS_Labels_Descriptions_SAS we extracted **i94_mode.csv** includes info:

We keep air arrival only (**i94mode=1**), drop any arrival values else (null,, 2, 3, 9)

In [136]:
spark.sql("""
    SELECT *
    FROM i94immi_table
    WHERE i94mode == 1.0
""").createOrReplaceTempView("i94immi_table")

Verify our table

In [137]:
spark.sql("""
    SELECT COUNT(*) as number_of_records
    FROM i94immi_table
""").show()

+--------+
|count(1)|
+--------+
| 2871184|
+--------+



### Cleaning i94visa, visatype

From **SAS_Labels**, we extracted visatype validation as **i94_visa.csv**. 

This step, we mapping **i94visa** numbers to **visatype** instead.

In [138]:
spark.sql("""
        SELECT *, CASE 
                    WHEN i94visa = 1.0 THEN 'Business' 
                    WHEN i94visa = 2.0 THEN 'Pleasure'
                    WHEN i94visa = 3.0 THEN 'Student'
                    ELSE 'N/A' END AS visa_type
        FROM i94immi_table
    """).createOrReplaceTempView("i94immi_table")

We category **visatype** grouping by **visa_type**

In [141]:
spark.sql("""
        SELECT visa_type as visa_type, visatype as visatype_code, count(*) as count_by_visa_category
        FROM i94immi_table
        GROUP BY visa_type, visatype
        ORDER BY visa_type, visatype
""").show()

+---------+--------+--------+
|visa_type|visatype|count(1)|
+---------+--------+--------+
| Business|      B1|  201741|
| Business|      E1|    3027|
| Business|      E2|   15157|
| Business|     GMB|     132|
| Business|       I|    2931|
| Business|      I1|     211|
| Business|      WB|  277414|
| Pleasure|      B2| 1008434|
| Pleasure|      CP|   11891|
| Pleasure|     CPL|       8|
| Pleasure|     GMT|   79777|
| Pleasure|     SBP|       2|
| Pleasure|      WT| 1243531|
|  Student|      F1|   24599|
|  Student|      F2|    1622|
|  Student|      M1|     679|
|  Student|      M2|      28|
+---------+--------+--------+



Find any NaN or Null values on 'visatype'

In [142]:
spark.sql("""
    SELECT count(*) as count_null_of_visatype
    FROM i94immi_table 
    WHERE visatype is NULL
""").show()

+--------+
|count(1)|
+--------+
|       0|
+--------+



In [143]:
spark.sql("""
    SELECT count(*) as count_missing_values
    FROM i94immi_table 
    WHERE visatype == 'NaN'
""").show()

+--------+
|count(1)|
+--------+
|       0|
+--------+



### Cleaning i94bir and biryear

Check check whether value is NULL or not

In [31]:
spark.sql("""
    SELECT count(*) as count_NULL_values
    FROM i94immi_table 
    WHERE i94bir is NULL
""").show()

+--------+
|count(1)|
+--------+
|      46|
+--------+



In [33]:
spark.sql("""
    SELECT count(*) as count_NULL_values
    FROM i94immi_table 
    WHERE biryear is NULL
""").show()

+--------+
|count(1)|
+--------+
|      46|
+--------+



In [34]:
spark.sql("""
    SELECT MAX(biryear) as max_biryear, MIN(biryear) as min_biryear
    FROM i94immi_table 
    WHERE biryear IS NOT NULL
""").show()

+-----------+-----------+
|max_biryear|min_biryear|
+-----------+-----------+
|     2016.0|     1916.0|
+-----------+-----------+



Take a look on travel velocity for 90 years old and older

In [37]:
spark.sql("""
    SELECT biryear as birth_year, COUNT(*) as count_by_birth_year
    FROM i94immi_table 
    WHERE biryear IS NOT NULL
    AND biryear <= 1926
    GROUP BY biryear
    ORDER BY biryear ASC
""").show()

+-------+--------+
|biryear|count(1)|
+-------+--------+
| 1916.0|       8|
| 1917.0|      16|
| 1918.0|      21|
| 1919.0|      36|
| 1920.0|      34|
| 1921.0|      69|
| 1922.0|      89|
| 1923.0|     155|
| 1924.0|     209|
| 1925.0|     274|
| 1926.0|     414|
+-------+--------+



Total of travel trips for 90 year old and older not so much. Don't worry of this

### Cleaning gender

We just user records of male = 'M' and female = 'F'

In [40]:
spark.sql("""
    SELECT * 
    FROM i94immi_table 
    WHERE gender IN ('F', 'M')
""").createOrReplaceTempView("i94immi_table")

In [44]:
spark.sql("""
    SELECT COUNT(gender) as gender_count
    FROM i94immi_table 
    WHERE gender IN ('F', 'M')
""").show()

+------------+
|gender_count|
+------------+
|     2544951|
+------------+



In [45]:
spark.sql("""
    SELECT gender as gender, COUNT(*) as count_gender
    FROM i94immi_table 
    WHERE gender IN ('F') OR gender IN ('M')
    GROUP BY gender
    ORDER BY gender ASC
""").show()

+------+------------+
|gender|count_gender|
+------+------------+
|     F|     1228646|
|     M|     1316305|
+------+------------+



### Cleaning i94cit (citizenship), i94res(residence) and i94addr (state)

Columns **'i94cit'**, **'i94res'** and **'i94addr'** are in float datatype with meaning:
- i94cit: Country of citizenship
- i94res: Country of residence
- i94addr: State code validation

Just check and drop NULL values if need for these columns

In [50]:
spark.sql("""
    SELECT count(*) as count_null_i94cit
    FROM i94immi_table
    WHERE i94cit IS NULL
""").show()

+-----------------+
|count_null_i94cit|
+-----------------+
|                0|
+-----------------+



In [51]:
spark.sql("""
    SELECT count(*) as count_null_i94res
    FROM i94immi_table
    WHERE i94res IS NULL
""").show()

+-----------------+
|count_null_i94res|
+-----------------+
|                0|
+-----------------+



In [49]:
spark.sql("""
    SELECT count(*) as count_null_i94addr
    FROM i94immi_table
    WHERE i94addr IS NULL
""").show()

+------------------+
|count_null_i94addr|
+------------------+
|            114019|
+------------------+



Do not use i94addr cause of a lot NULL values in this column

### Baseline i94immi_table of data i94 immigration

In [57]:
i94immi_df = spark.sql("""
                        SELECT *
                        FROM i94immi_table
                    """)

In [61]:
i94immi_df.head()

Row(cicid=15.0, i94yr=2016.0, i94mon=4.0, i94cit=101.0, i94res=101.0, i94port='WAS', arrdate=20545.0, i94mode=1.0, i94addr='MI', depdate=20691.0, i94bir=55.0, i94visa=2.0, count=1.0, dtadfile='20160401', visapost=None, occup=None, entdepa='T', entdepd='O', entdepu=None, matflag='M', biryear=1961.0, dtaddto='09302016', gender='M', insnum=None, airline='OS', admnum=666643185.0, fltno='93', visatype='B2', arrival_date=datetime.date(2016, 4, 1), departure_date='2016-08-25')

In [67]:
#i94immi_df.write.csv("i94immi_df_clean.csv")
i94immi_df.write.options(header='True', delimiter=',').csv("i94immi_df_clean")
#i94immi_df.write.mode('overwrite').csv("i94immi_df_clean")

In [18]:
pd.set_option('display.max_columns', 50)
i94immi_df.head(10)

Unnamed: 0.1,Unnamed: 0,cicid,i94yr,i94mon,i94cit,i94res,i94port,arrdate,i94mode,i94addr,depdate,i94bir,i94visa,count,dtadfile,visapost,occup,entdepa,entdepd,entdepu,matflag,biryear,dtaddto,gender,insnum,airline,admnum,fltno,visatype,arrive_date,departure_date
0,2027561,4084316.0,2016.0,4.0,209.0,209.0,HHW,20566.0,1.0,HI,20573.0,61.0,2.0,1.0,20160422,,,G,O,,M,1955.0,7202016,F,,JL,56582670000.0,00782,WT,2016-04-22,2016-04-29
1,2171295,4422636.0,2016.0,4.0,582.0,582.0,MCA,20567.0,1.0,TX,20568.0,26.0,2.0,1.0,20160423,MTR,,G,R,,M,1990.0,10222016,M,,*GA,94362000000.0,XBLNG,B2,2016-04-23,2016-04-24
2,589494,1195600.0,2016.0,4.0,148.0,112.0,OGG,20551.0,1.0,FL,20571.0,76.0,2.0,1.0,20160407,,,G,O,,M,1940.0,7052016,M,,LH,55780470000.0,00464,WT,2016-04-07,2016-04-27
3,2631158,5291768.0,2016.0,4.0,297.0,297.0,LOS,20572.0,1.0,CA,20581.0,25.0,2.0,1.0,20160428,DOH,,G,O,,M,1991.0,10272016,M,,QR,94789700000.0,00739,B2,2016-04-28,2016-05-07
4,3032257,985523.0,2016.0,4.0,111.0,111.0,CHM,20550.0,3.0,NY,20553.0,19.0,2.0,1.0,20160406,,,Z,K,,M,1997.0,7042016,F,,,42322570000.0,LAND,WT,2016-04-06,2016-04-09
5,721257,1481650.0,2016.0,4.0,577.0,577.0,ATL,20552.0,1.0,GA,20606.0,51.0,2.0,1.0,20160408,,,T,N,,M,1965.0,10072016,M,,DL,736852600.0,910,B2,2016-04-08,2016-06-01
6,1072780,2197173.0,2016.0,4.0,245.0,245.0,SFR,20556.0,1.0,CA,20635.0,48.0,2.0,1.0,20160412,,,T,O,,M,1968.0,10112016,F,,CX,786312200.0,870,B2,2016-04-12,2016-06-30
7,112205,232708.0,2016.0,4.0,113.0,135.0,NYC,20546.0,1.0,NY,20554.0,33.0,2.0,1.0,20160402,,,G,O,,M,1983.0,6302016,F,,BA,55474490000.0,00117,WT,2016-04-02,2016-04-10
8,2577162,5227851.0,2016.0,4.0,131.0,131.0,CHI,20572.0,1.0,IL,20575.0,39.0,2.0,1.0,20160428,,,O,O,,M,1977.0,7262016,,,LX,59413420000.0,00008,WT,2016-04-28,2016-05-01
9,10930,13213.0,2016.0,4.0,116.0,116.0,LOS,20545.0,1.0,CA,20553.0,35.0,2.0,1.0,20160401,,,O,O,,M,1981.0,6292016,,,AA,55449790000.0,00109,WT,2016-04-01,2016-04-09


Verify record with missing value NA or NULL

In [20]:
i94immi_df.isna().sum()

Unnamed: 0           0
cicid                0
i94yr                0
i94mon               0
i94cit               0
i94res               0
i94port              0
arrdate              0
i94mode              0
i94addr             59
depdate             49
i94bir               0
i94visa              0
count                0
dtadfile             0
visapost           618
occup              996
entdepa              0
entdepd             46
entdepu           1000
matflag             46
biryear              0
dtaddto              0
gender             141
insnum             965
airline             33
admnum               0
fltno                8
visatype             0
arrive_date          0
departure_date      49
dtype: int64

In [21]:
i94immi_df.isnull().sum()

Unnamed: 0           0
cicid                0
i94yr                0
i94mon               0
i94cit               0
i94res               0
i94port              0
arrdate              0
i94mode              0
i94addr             59
depdate             49
i94bir               0
i94visa              0
count                0
dtadfile             0
visapost           618
occup              996
entdepa              0
entdepd             46
entdepu           1000
matflag             46
biryear              0
dtaddto              0
gender             141
insnum             965
airline             33
admnum               0
fltno                8
visatype             0
arrive_date          0
departure_date      49
dtype: int64

Remove records with missing value on column 'depdate'

In [25]:
i94immi_df = i94immi_df.dropna(subset="i94yr",how="all")

In [26]:
i94immi_df = i94immi_df.dropna(subset="i94mon",how="all")

In [None]:
i94immi_df = i94immi_df.dropna(subset="arrdate",how="all")

In [None]:
i94immi_df = i94immi_df.dropna(subset="depdate",how="all")

In [23]:
i94immi_df = i94immi_df.dropna(subset="i94addr",how="all")

In [27]:
i94immi_df = i94immi_df.dropna(subset="i94mode",how="all")

In [28]:
i94immi_df = i94immi_df.dropna(subset="dtaddto",how="all")

In [29]:
i94immi_df.count()

Unnamed: 0        899
cicid             899
i94yr             899
i94mon            899
i94cit            899
i94res            899
i94port           899
arrdate           899
i94mode           899
i94addr           899
depdate           899
i94bir            899
i94visa           899
count             899
dtadfile          899
visapost          345
occup               3
entdepa           899
entdepd           899
entdepu             0
matflag           899
biryear           899
dtaddto           899
gender            775
insnum             18
airline           880
admnum            899
fltno             898
visatype          899
arrive_date       899
departure_date    899
dtype: int64

In [30]:
i94immi_df.isnull().sum()

Unnamed: 0          0
cicid               0
i94yr               0
i94mon              0
i94cit              0
i94res              0
i94port             0
arrdate             0
i94mode             0
i94addr             0
depdate             0
i94bir              0
i94visa             0
count               0
dtadfile            0
visapost          554
occup             896
entdepa             0
entdepd             0
entdepu           899
matflag             0
biryear             0
dtaddto             0
gender            124
insnum            881
airline            19
admnum              0
fltno               1
visatype            0
arrive_date         0
departure_date      0
dtype: int64

In [None]:
i94immi_df = i94immi_df.drop(['count', 'tadfile', 'visapost', 'occup', 'entdepa', 'entdepd', 'entdepu', 'matflag'], axis=1)

=====================================================================================================

=====================================================================================================

In [20]:
# Convert year and month to int64 data type
i94immi_df['cicid'] = i94immi_df['cicid'].astype(int)
i94immi_df['i94yr'] = i94immi_df['i94yr'].astype(int)
i94immi_df['i94mon'] = i94immi_df['i94mon'].astype(int)
i94immi_df['i94cit'] = i94immi_df['i94cit'].astype(int)
i94immi_df['i94mode'] = i94immi_df['i94mode'].astype(int)
i94immi_df['arrdate'] = i94immi_df['arrdate'].astype(int)
i94immi_df['depdate'] = i94immi_df['depdate'].astype(int)
i94immi_df['i94visa'] = i94immi_df['i94visa'].astype(int)

i94immi_df.head()

Unnamed: 0,cicid,i94yr,i94mon,i94cit,arrdate,i94mode,depdate,i94visa,visatype
0,4084316.0,2016,4,209,20566,1,20573,2,WT
1,4422636.0,2016,4,582,20567,1,20568,2,B2
2,1195600.0,2016,4,148,20551,1,20571,2,WT
3,5291768.0,2016,4,297,20572,1,20581,2,B2
4,985523.0,2016,4,111,20550,3,20553,2,WT


In [7]:
i94immi_columns = ["cicid","i94yr","i94mon","i94cit","arrdate","i94mode","depdate","i94visa", 'visatype']
i94immi_df = i94immi_df[i94immi_columns]


In [6]:
# Show dataset sample records
i94immi_df = i94immi_df.set_index('cicid')
i94immi_df.head()

Unnamed: 0,cicid,i94yr,i94mon,i94cit,arrdate,i94mode,depdate,i94visa,visatype
0,4084316.0,2016.0,4.0,209.0,20566.0,1.0,20573.0,2.0,WT
1,4422636.0,2016.0,4.0,582.0,20567.0,1.0,20568.0,2.0,B2
2,1195600.0,2016.0,4.0,148.0,20551.0,1.0,20571.0,2.0,WT
3,5291768.0,2016.0,4.0,297.0,20572.0,1.0,20581.0,2.0,B2
4,985523.0,2016.0,4.0,111.0,20550.0,3.0,20553.0,2.0,WT


In [9]:
i94immi_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 9 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   cicid     1000 non-null   float64
 1   i94yr     1000 non-null   float64
 2   i94mon    1000 non-null   float64
 3   i94cit    1000 non-null   float64
 4   arrdate   1000 non-null   float64
 5   i94mode   1000 non-null   float64
 6   depdate   951 non-null    float64
 7   i94visa   1000 non-null   float64
 8   visatype  1000 non-null   object 
dtypes: float64(8), object(1)
memory usage: 70.4+ KB


In [22]:
# Show dataset sample records
i94immi_df.shape

(951, 9)

In [23]:
i94immi_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 951 entries, 0 to 999
Data columns (total 9 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   cicid     951 non-null    object
 1   i94yr     951 non-null    int32 
 2   i94mon    951 non-null    int32 
 3   i94cit    951 non-null    int32 
 4   arrdate   951 non-null    int32 
 5   i94mode   951 non-null    int32 
 6   depdate   951 non-null    int32 
 7   i94visa   951 non-null    int32 
 8   visatype  951 non-null    object
dtypes: int32(7), object(2)
memory usage: 48.3+ KB


In [31]:
# Check unique
i94immi_df[i94immi_columns].value_counts()

cicid      i94yr  i94mon  i94cit  arrdate  i94mode  depdate  i94visa  visatype
1000074.0  2016   4       129     20550    1        20564    2        WT          1
480428.0   2016   4       148     20547    1        20559    2        WT          1
4718122.0  2016   4       209     20569    1        20573    2        WT          1
4718538.0  2016   4       209     20569    1        20574    2        WT          1
4729596.0  2016   4       245     20569    1        20584    2        B2          1
                                                                                 ..
2863583.0  2016   4       689     20559    1        20569    2        B2          1
2865787.0  2016   4       691     20559    1        20566    1        B1          1
2865828.0  2016   4       691     20559    1        20600    2        B2          1
2867437.0  2016   4       691     20559    1        20590    2        B2          1
999282.0   2016   4       129     20550    1        20553    1        WB         

In [None]:
# Check unique
i94immi_df[i94immi_columns].nunique()

cicid       951
i94yr         1
i94mon        1
i94cit       87
arrdate      30
i94mode       3
depdate     109
i94visa       3
visatype      8
dtype: int64

In [33]:
i94immi_df[i94immi_columns].sort_values('arrdate',na_position="last")

Unnamed: 0,cicid,i94yr,i94mon,i94cit,arrdate,i94mode,depdate,i94visa,visatype
215,25478.0,2016,4,131,20545,1,20633,2,WT
770,67523.0,2016,4,245,20545,1,20560,1,B1
244,86265.0,2016,4,368,20545,1,20553,2,B2
867,18310.0,2016,4,123,20545,1,20548,2,WT
665,32582.0,2016,4,135,20545,1,20552,2,WT
...,...,...,...,...,...,...,...,...,...
115,5883463.0,2016,4,687,20574,1,20589,2,B2
109,5756066.0,2016,4,260,20574,3,20576,2,B2
256,5899181.0,2016,4,696,20574,1,20583,1,B1
371,6057910.0,2016,4,252,20574,1,20578,2,GMT


In [34]:
i94immi_df.sort_values(by=['visatype','cicid'], ascending=True)

Unnamed: 0,cicid,i94yr,i94mon,i94cit,arrdate,i94mode,depdate,i94visa,visatype
892,1215382.0,2016,4,245,20551,1,20569,1,B1
421,1330915.0,2016,4,687,20551,1,20558,1,B1
126,1346007.0,2016,4,746,20551,1,20566,1,B1
627,1346274.0,2016,4,746,20551,1,20554,1,B1
123,1643294.0,2016,4,273,20553,1,20558,1,B1
...,...,...,...,...,...,...,...,...,...
420,982263.0,2016,4,103,20550,1,20592,2,WT
848,982461.0,2016,4,103,20550,1,20551,2,WT
4,985523.0,2016,4,111,20550,3,20553,2,WT
897,991350.0,2016,4,111,20550,1,20555,2,WT


In [None]:
# Check unique
i94immi_df[i94immi_columns].value_counts()

cicid      i94yr  i94mon  i94cit  arrdate  i94mode  depdate  i94visa  visatype
1000074.0  2016   4       129     20550    1        20564    2        WT          1
480428.0   2016   4       148     20547    1        20559    2        WT          1
4718122.0  2016   4       209     20569    1        20573    2        WT          1
4718538.0  2016   4       209     20569    1        20574    2        WT          1
4729596.0  2016   4       245     20569    1        20584    2        B2          1
                                                                                 ..
2863583.0  2016   4       689     20559    1        20569    2        B2          1
2865787.0  2016   4       691     20559    1        20566    1        B1          1
2865828.0  2016   4       691     20559    1        20600    2        B2          1
2867437.0  2016   4       691     20559    1        20590    2        B2          1
999282.0   2016   4       129     20550    1        20553    1        WB         

In [35]:
i94immi_df['visatype'].value_counts(normalize=True)*100

WT     45.215563
B2     34.700315
WB      9.463722
B1      6.414301
GMT     2.523659
F1      0.841220
CP      0.525762
E2      0.315457
Name: visatype, dtype: float64

In [44]:
i94immi_df["visaranking"] = i94immi_df["visatype"].rank(ascending = True).astype("int")
i94immi_df.sort_values(by=['visatype','Visa Ranking'], ascending=True)

Unnamed: 0,cicid,i94yr,i94mon,i94cit,arrdate,i94mode,depdate,i94visa,visatype,Visa Ranking
30,5692439.0,2016,4,133,20574,1,20580,1,B1,31
34,4805034.0,2016,4,582,20569,1,20573,1,B1,31
41,692716.0,2016,4,245,20548,1,20651,1,B1,31
68,3293058.0,2016,4,691,20561,1,20565,1,B1,31
94,95870.0,2016,4,528,20545,1,20549,1,B1,31
...,...,...,...,...,...,...,...,...,...,...
989,1360834.0,2016,4,117,20552,1,20556,2,WT,736
992,3874218.0,2016,4,148,20565,1,20582,2,WT,736
994,5081809.0,2016,4,254,20571,1,20582,2,WT,736
995,4288772.0,2016,4,135,20567,1,20572,2,WT,736


In [45]:
i94immi_df['visatype'].value_counts()

WT     430
B2     330
WB      90
B1      61
GMT     24
F1       8
CP       5
E2       3
Name: visatype, dtype: int64

In [None]:
# Write to parquet partitioned by arrdate - Run on production
i94immi_df.write.partitionBy("arrdate").parquet(os.path.join(output_data, table), mode="overwrite")