In [1]:
import findspark
findspark.init()

from pyspark.sql import SparkSession
import pyspark.sql.types as T
import pyspark.sql.functions as F

import pandas as pd
pd.set_option('display.max_columns', None)

from datetime import datetime, timedelta

from etl import SparkETL

In [2]:
# Instantiate the SparkETL helper imported from the project-local `etl` module.
# NOTE(review): `etl` is not referenced by any later cell visible in this notebook;
# confirm whether the constructor is needed for its side effects before removing it.
etl = SparkETL()

In [3]:
# Spark session for this notebook. The spark-sas7bdat package (needed to read
# .sas7bdat files) is pulled from the spark-packages repository at start-up.
# for sas lib version see https://github.com/saurfang/spark-sas7bdat

spark = (
    SparkSession.builder
    .appName("de-capstone")
    .config('spark.jars.repositories', 'https://repos.spark-packages.org/')
    .config('spark.jars.packages', 'saurfang:spark-sas7bdat:3.0.0-s_2.12')
    .getOrCreate()
)

https://repos.spark-packages.org/ added as a remote repository with the name: repo-1


:: loading settings :: url = jar:file:/usr/local/Cellar/apache-spark/3.1.2/libexec/jars/ivy-2.4.0.jar!/org/apache/ivy/core/settings/ivysettings.xml


Ivy Default Cache set to: /Users/charly/.ivy2/cache
The jars for the packages stored in: /Users/charly/.ivy2/jars
saurfang#spark-sas7bdat added as a dependency
:: resolving dependencies :: org.apache.spark#spark-submit-parent-6e7d8e42-33eb-4341-8961-3d30fb9db4bb;1.0
	confs: [default]
	found saurfang#spark-sas7bdat;3.0.0-s_2.12 in spark-packages
	found com.epam#parso;2.0.11 in central
	found org.slf4j#slf4j-api;1.7.5 in central
	found org.apache.logging.log4j#log4j-api-scala_2.12;12.0 in central
	found org.scala-lang#scala-reflect;2.12.10 in user-list
	found org.apache.logging.log4j#log4j-api;2.13.2 in central
:: resolution report :: resolve 286ms :: artifacts dl 8ms
	:: modules in use:
	com.epam#parso;2.0.11 from central in [default]
	org.apache.logging.log4j#log4j-api;2.13.2 from central in [default]
	org.apache.logging.log4j#log4j-api-scala_2.12;12.0 from central in [default]
	org.scala-lang#scala-reflect;2.12.10 from user-list in [default]
	org.slf4j#slf4j-api;1.7.5 from central in 

In [4]:
# Directory holding the monthly I94 SAS extracts (input).
# NOTE: this name shadows Python's builtin `dir()` for the rest of the notebook.
dir = (
    '/Users/charly/DataEng2022/de-capstone'
    '/data/18-83510-I94-Data-2016'
)

# Root of the partitioned immigration dataset in the data lake (output).
parquet_file = (
    '/Users/charly/DataEng2022/de-capstone'
    '/datalake/raw/immigration'
)

In [5]:
def parse_date(date):
    """
    Convert a 'YYYY-MM-DD' string into a `datetime` (time part is midnight).

    Raises ValueError when the string does not match that format.
    """
    iso_day_format = '%Y-%m-%d'
    parsed = datetime.strptime(date, iso_day_format)
    return parsed

In [6]:
def sas_file_path(date, base_dir=None):
    """
    Build the path of the monthly I94 SAS extract covering `date`.

    Example: 'i94_jan16_sub.sas7bdat'

    Parameters
    ----------
    date : datetime.datetime or datetime.date
        Any day inside the month of interest; only its month and year are used.
    base_dir : str, optional
        Directory that contains the SAS files. When omitted, the notebook-level
        `dir` setting is used (looked up at call time).

    Returns
    -------
    str
        '<base_dir>/i94_<mon><yy>_sub.sas7bdat' with a lower-case three-letter
        English month abbreviation and a two-digit year.
    """
    # Fixed English abbreviations: strftime('%b') follows the process locale
    # (LC_TIME), so on a non-English system it would produce names such as
    # 'i94_mai16_sub.sas7bdat' that do not match the files on disk.
    month_abbreviations = (
        'jan', 'feb', 'mar', 'apr', 'may', 'jun',
        'jul', 'aug', 'sep', 'oct', 'nov', 'dec',
    )
    month = month_abbreviations[date.month - 1]
    year = f"{date.year % 100:02d}"
    folder = dir if base_dir is None else base_dir
    file = f"i94_{month}{year}_sub.sas7bdat"
    return f"{folder}/{file}"

In [7]:
# Explicit schema applied when reading the monthly I94 SAS files (28 columns,
# all nullable). Numeric columns are read as 32-bit floats except `admnum`.
#
# `admnum` must be a DoubleType: a 32-bit float represents integers exactly only
# up to 2**24 (16,777,216), while the admission numbers seen in the data are in
# the billions (e.g. ~2.5e9), so FloatType silently rounds them.
# NOTE(review): partitions already written with `admnum` as float should be
# reloaded; a dataset mixing float and double files for one column is unlikely
# to read back consistently — confirm before appending to an existing lake.
schema = T.StructType([
    T.StructField('cicid', T.FloatType(), True),
    T.StructField('i94yr', T.FloatType(), True),
    T.StructField('i94mon', T.FloatType(), True),
    T.StructField('i94cit', T.FloatType(), True),
    T.StructField('i94res', T.FloatType(), True),
    T.StructField('i94port', T.StringType(), True),
    T.StructField('arrdate', T.FloatType(), True),
    T.StructField('i94mode', T.FloatType(), True),
    T.StructField('i94addr', T.StringType(), True),
    T.StructField('depdate', T.FloatType(), True),
    T.StructField('i94bir', T.FloatType(), True),
    T.StructField('i94visa', T.FloatType(), True),
    T.StructField('count', T.FloatType(), True),
    T.StructField('dtadfile', T.StringType(), True),
    T.StructField('visapost', T.StringType(), True),
    T.StructField('occup', T.StringType(), True),
    T.StructField('entdepa', T.StringType(), True),
    T.StructField('entdepd', T.StringType(), True),
    T.StructField('entdepu', T.StringType(), True),
    T.StructField('matflag', T.StringType(), True),
    T.StructField('biryear', T.FloatType(), True),
    T.StructField('dtaddto', T.StringType(), True),
    T.StructField('gender', T.StringType(), True),
    T.StructField('insnum', T.StringType(), True),
    T.StructField('airline', T.StringType(), True),
    # 64-bit to keep every digit of the admission number (see note above).
    T.StructField('admnum', T.DoubleType(), True),
    T.StructField('fltno', T.StringType(), True),
    T.StructField('visatype', T.StringType(), True)
])

In [8]:
def read_sas_file(path):
    """
    Load the SAS file at `path` as a Spark DataFrame.

    Uses the spark-sas7bdat data source together with the notebook-level
    `schema`, and returns the resulting DataFrame.
    """
    # see https://stackoverflow.com/questions/35684856/import-pyspark-packages-with-a-regular-jupyter-notebook
    sas_reader = spark.read.format('com.github.saurfang.sas.spark')
    return sas_reader.schema(schema).load(path)

In [9]:
# SQL expression turning `arrdate` (a day offset from 1960-01-01) into a date.
date_sql = "DATE_ADD('1960-01-01', CAST(arrdate AS INT))"

def project_partition_cols(df):
    """
    Add `year`, `month_id` and `day` columns derived from the arrival date.

    Each column applies the matching SQL date function to `date_sql`; the
    input DataFrame is returned with the three columns appended in that order.
    """
    derived_columns = (
        ('year', 'YEAR'),
        ('month_id', 'MONTH'),
        ('day', 'DAY'),
    )
    for column_name, sql_function in derived_columns:
        df = df.withColumn(column_name, F.expr(f"{sql_function}({date_sql})"))
    return df

In [10]:
def save_partitioned_immigration(df):
    """
    Append `df` to the dataset at `parquet_file`, partitioned by year and month_id.

    No explicit format is set, so the session's default data source is used.
    The write mode is 'append': saving the same rows twice stores them twice.
    Returns None.
    """
    partitioned_writer = df.write.mode('append').partitionBy('year', 'month_id')
    partitioned_writer.save(parquet_file)

In [11]:
def split_sas(date):
    """
    Load one monthly I94 SAS file and append it to the partitioned dataset.

    Parameters
    ----------
    date : str
        A day in 'YYYY-MM-DD' form; it selects the monthly SAS file to load.

    Returns
    -------
    None
        Whatever `save_partitioned_immigration` returns (it only writes).
    """
    # Plain function calls instead of `DataFrame.pipe`: `pipe` is not part of
    # the pyspark.sql.DataFrame API, so the chained form raises AttributeError
    # unless some other module has monkey-patched it onto the class.
    # (`DataFrame.transform` is not a substitute for the last step, because the
    # save function does not return a DataFrame.)
    path = sas_file_path(parse_date(date))
    immigration = project_partition_cols(read_sas_file(path))
    return save_partitioned_immigration(immigration)

In [16]:
# TODO: open issues to resolve
# - month 6: java.lang.OutOfMemoryError: Java heap space
# - "Provided schema has 28 but SAS file has 34 columns."

In [21]:
# Ingest the December 2016 extract: split_sas parses the date, reads the matching
# SAS file and appends it to the partitioned dataset. The write uses append mode,
# so re-running this cell for the same month adds the same rows again.
split_sas('2016-12-01')

22/05/12 20:37:51 WARN MemoryManager: Total allocation exceeds 95.00% (1,020,054,720 bytes) of heap memory
Scaling row group sizes to 95.00% for 8 writers
22/05/12 20:37:51 WARN MemoryManager: Total allocation exceeds 95.00% (1,020,054,720 bytes) of heap memory
Scaling row group sizes to 84.44% for 9 writers
22/05/12 20:37:51 WARN MemoryManager: Total allocation exceeds 95.00% (1,020,054,720 bytes) of heap memory
Scaling row group sizes to 76.00% for 10 writers
22/05/12 20:37:52 WARN MemoryManager: Total allocation exceeds 95.00% (1,020,054,720 bytes) of heap memory
Scaling row group sizes to 69.09% for 11 writers
22/05/12 20:37:52 WARN MemoryManager: Total allocation exceeds 95.00% (1,020,054,720 bytes) of heap memory
Scaling row group sizes to 63.33% for 12 writers
22/05/12 20:37:55 WARN MemoryManager: Total allocation exceeds 95.00% (1,020,054,720 bytes) of heap memory
Scaling row group sizes to 69.09% for 11 writers
22/05/12 20:37:55 WARN MemoryManager: Total allocation exceeds 95.

In [18]:
# Preview: pull up to 1000 rows of the partitioned dataset into a pandas frame.
spark.read.load(parquet_file).limit(1000).toPandas()

Unnamed: 0,cicid,i94yr,i94mon,i94cit,i94res,i94port,arrdate,i94mode,i94addr,depdate,i94bir,i94visa,count,dtadfile,visapost,occup,entdepa,entdepd,entdepu,matflag,biryear,dtaddto,gender,insnum,airline,admnum,fltno,visatype,day,year,month_id
0,5232413.0,2016.0,7.0,696.0,696.0,HOU,20657.0,1.0,NY,20676.0,26.0,2.0,1.0,20160811,,,G,N,,M,1990.0,01212017,M,103397,UA,2.508675e+09,1031,B2,22,2016,7
1,5540916.0,2016.0,7.0,523.0,523.0,NYC,20658.0,1.0,CT,20687.0,7.0,2.0,1.0,20160821,,,O,O,,M,2009.0,01222017,M,314895,B6,2.788912e+09,882,B2,23,2016,7
2,5232414.0,2016.0,7.0,148.0,112.0,HOU,20657.0,1.0,TX,20666.0,50.0,1.0,1.0,20160801,,,G,Q,,M,1966.0,10192016,M,45161,UA,2.508676e+09,1031,WB,22,2016,7
3,5540919.0,2016.0,7.0,123.0,123.0,BOS,20658.0,1.0,MA,20661.0,16.0,2.0,1.0,20160726,,,G,K,,M,2000.0,10202016,F,647544,BA,2.788914e+09,203,WT,23,2016,7
4,5232415.0,2016.0,7.0,261.0,261.0,SFR,20657.0,1.0,CA,20730.0,21.0,2.0,1.0,20160722,,,T,O,,M,1995.0,01212017,M,,EY,2.508677e+09,183,B2,22,2016,7
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
995,5541595.0,2016.0,7.0,696.0,696.0,MIA,20658.0,1.0,FL,20667.0,32.0,1.0,1.0,20160801,,,G,O,,M,1984.0,01222017,M,302946,AA,2.762305e+09,968,B1,23,2016,7
996,5233076.0,2016.0,7.0,254.0,276.0,LOS,20657.0,1.0,CA,20664.0,44.0,2.0,1.0,20160729,,,G,O,,M,1972.0,10192016,F,309043,OZ,2.483576e+09,202,WT,22,2016,7
997,5541596.0,2016.0,7.0,135.0,135.0,SFR,20658.0,1.0,CA,20674.0,12.0,2.0,1.0,20160808,,,G,O,,M,2004.0,10202016,F,305070,VS,2.762305e+09,19,WT,23,2016,7
998,5233077.0,2016.0,7.0,582.0,582.0,LVG,20657.0,1.0,UT,20670.0,42.0,2.0,1.0,20160805,,,G,O,,M,1974.0,01212017,F,107245,Y4,2.483577e+09,966,B2,22,2016,7


In [19]:
# Number of rows currently stored in the partitioned dataset.
spark.read.load(parquet_file).count()

                                                                                

37215540