In [35]:
# Do all imports and installs here - Done
from pyspark.sql import SparkSession
from pyspark.sql.functions import udf
import pandas as pd
import re
import configparser
import os

### Parse config file for path configurations

In [36]:
# Load path settings from etl.cfg.
# ConfigParser.read() silently skips files it cannot open, so fail fast with a
# clear error instead of a confusing NoSectionError on the first lookup below.
config = configparser.ConfigParser()
if not config.read('etl.cfg'):
    raise FileNotFoundError("Configuration file 'etl.cfg' was not found or could not be read")

# Input and output directories
input_data_source = config.get('DIR','INPUT_DIR')
output_processed_data = config.get('DIR','OUTPUT_DIR')

# Locations of the individual source datasets
i94immi_dataset = config.get('DATA','I94_IMMI')
worldtempe_dataset = config.get('DATA','WORLD_TEMPE')
citydemo_dataset = config.get('DATA','CITY_DEMOGRAPHIC')
airport_dataset = config.get('DATA','AIR_PORT')

### Create Spark session

In [37]:
# Create Spark session - used for production only.
# The spark-sas7bdat package is pulled from the spark-packages repository so
# that .sas7bdat files can be read through Spark.
spark = (
    SparkSession.builder
    .config("spark.jars.repositories", "https://repos.spark-packages.org/")
    .config("spark.jars.packages", "saurfang:spark-sas7bdat:2.0.0-s_2.11")
    .enableHiveSupport()
    .getOrCreate()
)

### Gathering I94 Immigration

In [38]:
# Show the I94 immigration dataset path that was read from etl.cfg
i94immi_dataset

'../../data/18-83510-I94-Data-2016/i94_apr16_sub.sas7bdat'

In [39]:
# Read the I94 immigration SAS file into a Spark DataFrame using the
# spark-sas7bdat data source.
i94_immi_df = (
    spark.read
    .format('com.github.saurfang.sas.spark')
    .load(i94immi_dataset)
)

In [40]:
# Column names paired with their Spark data types
i94_immi_df.dtypes

[('cicid', 'double'),
 ('i94yr', 'double'),
 ('i94mon', 'double'),
 ('i94cit', 'double'),
 ('i94res', 'double'),
 ('i94port', 'string'),
 ('arrdate', 'double'),
 ('i94mode', 'double'),
 ('i94addr', 'string'),
 ('depdate', 'double'),
 ('i94bir', 'double'),
 ('i94visa', 'double'),
 ('count', 'double'),
 ('dtadfile', 'string'),
 ('visapost', 'string'),
 ('occup', 'string'),
 ('entdepa', 'string'),
 ('entdepd', 'string'),
 ('entdepu', 'string'),
 ('matflag', 'string'),
 ('biryear', 'double'),
 ('dtaddto', 'string'),
 ('gender', 'string'),
 ('insnum', 'string'),
 ('airline', 'string'),
 ('admnum', 'double'),
 ('fltno', 'string'),
 ('visatype', 'string')]

In [41]:
# Sample records (first 3 rows, printed as a text table by Spark)
# NOTE: 'display.max_columns' is a pandas option; it applies to pandas objects
# rendered in the notebook, not to the output of Spark's show().
pd.set_option('display.max_columns', 50)
i94_immi_df.show(3)

+-----+------+------+------+------+-------+-------+-------+-------+-------+------+-------+-----+--------+--------+-----+-------+-------+-------+-------+-------+--------+------+------+-------+-------------+-----+--------+
|cicid| i94yr|i94mon|i94cit|i94res|i94port|arrdate|i94mode|i94addr|depdate|i94bir|i94visa|count|dtadfile|visapost|occup|entdepa|entdepd|entdepu|matflag|biryear| dtaddto|gender|insnum|airline|       admnum|fltno|visatype|
+-----+------+------+------+------+-------+-------+-------+-------+-------+------+-------+-----+--------+--------+-----+-------+-------+-------+-------+-------+--------+------+------+-------+-------------+-----+--------+
|  6.0|2016.0|   4.0| 692.0| 692.0|    XXX|20573.0|   null|   null|   null|  37.0|    2.0|  1.0|    null|    null| null|      T|   null|      U|   null| 1979.0|10282016|  null|  null|   null|1.897628485E9| null|      B2|
|  7.0|2016.0|   4.0| 254.0| 276.0|    ATL|20551.0|    1.0|     AL|   null|  25.0|    3.0|  1.0|20130811|     SEO| n

In [44]:
# Total number of records in the I94 immigration DataFrame
i94_immi_df.count()

3096313

### Gathering World Temperature Data

In [47]:
# NOTE(review): this hard-coded path replaces the WORLD_TEMPE value that was read
# from etl.cfg earlier in the notebook - confirm whether the config entry should
# be used here instead.
worldtempe_dataset = '../../data2/GlobalLandTemperaturesByCity.csv'

# Using this dataset for local development
# worldtempe_dataset = 'GlobalLandTemperaturesByCity_part9.csv'

# Load the comma-separated file into a pandas DataFrame
worldtempe_df = pd.read_csv(worldtempe_dataset,sep=",")

In [None]:
# Show schema and datatype
# NOTE(review): the saved output of this cell lists 100000 rows, while the shape
# cell further down reports 8599212 rows. This output looks stale (possibly
# produced from the local-development sample file); re-run the cell to refresh it.
worldtempe_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 100000 entries, 0 to 99999
Data columns (total 7 columns):
 #   Column                         Non-Null Count   Dtype  
---  ------                         --------------   -----  
 0   dt                             100000 non-null  object 
 1   AverageTemperature             96552 non-null   float64
 2   AverageTemperatureUncertainty  96552 non-null   float64
 3   City                           100000 non-null  object 
 4   Country                        100000 non-null  object 
 5   Latitude                       100000 non-null  object 
 6   Longitude                      100000 non-null  object 
dtypes: float64(2), object(5)
memory usage: 5.3+ MB


In [48]:
# Show size of dataset as (rows, columns)
worldtempe_df.shape

(8599212, 7)

In [50]:
# Show the first 10 records of the dataset.
# pandas is told to display up to 50 columns (the same option is also set in the
# earlier I94 sample-records cell).
pd.set_option('display.max_columns', 50)
worldtempe_df.head(10)

Unnamed: 0,dt,AverageTemperature,AverageTemperatureUncertainty,City,Country,Latitude,Longitude
0,1748-09-01,,,Belgorod,Russia,50.63N,36.76E
1,1748-10-01,,,Belgorod,Russia,50.63N,36.76E
2,1748-11-01,,,Belgorod,Russia,50.63N,36.76E
3,1748-12-01,,,Belgorod,Russia,50.63N,36.76E
4,1749-01-01,,,Belgorod,Russia,50.63N,36.76E
5,1749-02-01,,,Belgorod,Russia,50.63N,36.76E
6,1749-03-01,,,Belgorod,Russia,50.63N,36.76E
7,1749-04-01,,,Belgorod,Russia,50.63N,36.76E
8,1749-05-01,,,Belgorod,Russia,50.63N,36.76E
9,1749-06-01,,,Belgorod,Russia,50.63N,36.76E


In [None]:
# Summary statistics for the numeric columns of the temperature dataset
worldtempe_df.describe()

In [51]:
# Count of missing (N/A) values per column
worldtempe_df.isna().sum()

dt                                  0
AverageTemperature               3448
AverageTemperatureUncertainty    3448
City                                0
Country                             0
Latitude                            0
Longitude                           0
dtype: int64

In [52]:
# Count of NULL values per column
# (pandas' isnull() is an alias of isna(), so this repeats the N/A check in the
# previous cell and yields the same counts)
worldtempe_df.isnull().sum()

dt                                  0
AverageTemperature               3448
AverageTemperatureUncertainty    3448
City                                0
Country                             0
Latitude                            0
Longitude                           0
dtype: int64

### Gathering I94_SAS_Labels_Descriptions.SAS

- Gather information from `I94_SAS_Labels_Descriptions.SAS`, which contains the following parts:
    - I94RES&I94CIT
    - **I94PORT** --> **i94_port.csv** (use this only for project)
    - I94MODE
    - I94ADDR
    - I94VISA

We will handle this dataset with a block of code that parses the information in Step 2: Explore and Assess the Data.