In [1]:
# Do all imports and installs here
import pandas as pd
from pyspark.sql import SparkSession
from pyspark.sql.functions import udf
from pathlib import Path

In [2]:
from pyspark.sql import SparkSession

spark = SparkSession.builder.\
config("spark.jars.repositories", "https://repos.spark-packages.org/").\
config("spark.jars.packages", "saurfang:spark-sas7bdat:2.0.0-s_2.11").\
enableHiveSupport().getOrCreate()

df_spark = spark.read.format('com.github.saurfang.sas.spark').load('../../data/18-83510-I94-Data-2016/i94_apr16_sub.sas7bdat')


In [3]:
def rmdir(directory):
    '''
    This procedure perform pure recursive a directory.
    
    Parameters
    ----------
    directory : string_of_path_to_dir
        The input directory is a path to target dir. This dir and all its belong child objects wil be deleted.
        Syntax note: rmdir(Path("target_path_to_dir"))
            with Path("target_path_to_dir") returns path to dir format as 'directory' input
    
    Returns
    -------
    None
    '''
    directory = Path(directory)
    for item in directory.iterdir():
        if item.is_dir():
            rmdir(item)
        else:
            item.unlink()
    directory.rmdir()

In [4]:
# Read in the data here
i94immi_df=spark.read.parquet("sas_data")

In [5]:
df_spark.show(3)

+-----+------+------+------+------+-------+-------+-------+-------+-------+------+-------+-----+--------+--------+-----+-------+-------+-------+-------+-------+--------+------+------+-------+-------------+-----+--------+
|cicid| i94yr|i94mon|i94cit|i94res|i94port|arrdate|i94mode|i94addr|depdate|i94bir|i94visa|count|dtadfile|visapost|occup|entdepa|entdepd|entdepu|matflag|biryear| dtaddto|gender|insnum|airline|       admnum|fltno|visatype|
+-----+------+------+------+------+-------+-------+-------+-------+-------+------+-------+-----+--------+--------+-----+-------+-------+-------+-------+-------+--------+------+------+-------+-------------+-----+--------+
|  6.0|2016.0|   4.0| 692.0| 692.0|    XXX|20573.0|   null|   null|   null|  37.0|    2.0|  1.0|    null|    null| null|      T|   null|      U|   null| 1979.0|10282016|  null|  null|   null|1.897628485E9| null|      B2|
|  7.0|2016.0|   4.0| 254.0| 276.0|    ATL|20551.0|    1.0|     AL|   null|  25.0|    3.0|  1.0|20130811|     SEO| n

In [6]:
df_spark.count()

3096313

In [7]:
i94immi_df.createOrReplaceTempView('i94immi_table')

### Choose Primarykey

If distinct result of `cicid` the same to record amount of dataset. We can use `cicid` as primarykey.

In [8]:
spark.sql("""
    SELECT COUNT (DISTINCT cicid)
    FROM i94immi_table
""").show()

+---------------------+
|count(DISTINCT cicid)|
+---------------------+
|              3096313|
+---------------------+



### Cleaning Arrival date and Departure date

We verify the logic of data, Departure date must be greater or equal Arrival date because:
- Columns `arrdate` displays the arrival date in the USA 
- Column `depdate` as departure date from the USA. 

We count amount of records with `DepartureDate >= ArrivalDate`. These are un-makesence data will be droped

In [9]:
spark.sql("""
    SELECT COUNT(*)
    FROM i94immi_table
    WHERE arrdate >= depdate
""").show()

+--------+
|count(1)|
+--------+
|     375|
+--------+



In [10]:
spark.sql("""
    SELECT arrdate, depdate
    FROM i94immi_table
    WHERE arrdate >= depdate
""").show(5)

+-------+-------+
|arrdate|depdate|
+-------+-------+
|20574.0|20573.0|
|20574.0|20572.0|
|20574.0|20573.0|
|20549.0|19097.0|
|20549.0|20527.0|
+-------+-------+
only showing top 5 rows



In [12]:
spark.sql("""
    SELECT *
    FROM i94immi_table
    WHERE arrdate <= depdate
""").createOrReplaceTempView("i94immi_table")

Verify cleaned `arrdate` and `depdate`

In [13]:
spark.sql("""
    SELECT COUNT(*)
    FROM i94immi_table
""").show()

+--------+
|count(1)|
+--------+
| 2953481|
+--------+



Add column `arrival_date = timestone + arrdate_offset_day`, with:
- timestone = '1960-01-01' (***datetime*** datatype)
- arrdate_offset_day = 'arrdate' (***integer*** datatype)
- arrival_date (***datetime*** datatype)

In [14]:
spark.sql("""
    SELECT *, date_add(to_date('1960-01-01'), arrdate) AS arrival_date 
    FROM i94immi_table
""").createOrReplaceTempView("i94immi_table")

In [15]:
spark.sql("""
    SELECT COUNT(arrival_date)
    FROM i94immi_table
""").show()

+-------------------+
|count(arrival_date)|
+-------------------+
|            2953481|
+-------------------+



In [16]:
spark.sql("""
    SELECT arrival_date
    FROM i94immi_table
""").show(5)

+------------+
|arrival_date|
+------------+
|  2016-04-30|
|  2016-04-30|
|  2016-04-30|
|  2016-04-30|
|  2016-04-30|
+------------+
only showing top 5 rows



Add column `departure_date = timestone + depdate_offset_day`, with:
- `timestone` = '1960-01-01' (***datetime*** datatype)
- `depdate_offset_day` = 'depdate' (***integer*** datatype)
- `departure_date` (***datetime*** datatype)

In [17]:
spark.sql("""SELECT *, CASE 
                        WHEN depdate >= arrdate THEN date_add(to_date('1960-01-01'), depdate)
                        WHEN depdate IS NULL THEN NULL
                        ELSE 'NaN' END AS departure_date 
                FROM i94immi_table
            """).createOrReplaceTempView("i94immi_table")

In [18]:
spark.sql("""
    SELECT departure_date
    FROM i94immi_table
""").show(5)

+--------------+
|departure_date|
+--------------+
|    2016-05-08|
|    2016-05-17|
|    2016-05-08|
|    2016-05-14|
|    2016-05-14|
+--------------+
only showing top 5 rows



In [19]:
spark.sql("""
    SELECT COUNT(departure_date)
    FROM i94immi_table
""").show()

+---------------------+
|count(departure_date)|
+---------------------+
|              2953481|
+---------------------+



Range of time `arrival_date`

In [20]:
spark.sql("""SELECT MIN(arrival_date) as min_arrival_date, MAX(arrival_date) as max_arrival_date
            FROM i94immi_table
    """).show()

+----------------+----------------+
|min_arrival_date|max_arrival_date|
+----------------+----------------+
|      2016-04-01|      2016-04-30|
+----------------+----------------+



In [21]:
spark.sql("""
    SELECT COUNT(*) as count_null_arrival_date
    FROM i94immi_table
    WHERE arrival_date is NULL
""").show()

+-----------------------+
|count_null_arrival_date|
+-----------------------+
|                      0|
+-----------------------+



Range of time `departure_date`

In [22]:
spark.sql("""SELECT MIN(departure_date) as min_departure_date, MAX(departure_date) as max_departure_date
            FROM i94immi_table
    """).show()

+------------------+------------------+
|min_departure_date|max_departure_date|
+------------------+------------------+
|        2016-04-02|        2084-05-16|
+------------------+------------------+



Count wrong departure date `departure_date < '2016-04-01'` or `departure_date > '2018-12-31'`

In [24]:
spark.sql("""
    SELECT COUNT(departure_date)
    FROM i94immi_table
    WHERE departure_date < '2016-04-01' OR departure_date > '2018-12-31'
""").show()

+---------------------+
|count(departure_date)|
+---------------------+
|                    3|
+---------------------+



Just 3 wrong departure date values. No need to do with these.

Find any NaN or Null values on `departure_date`

In [25]:
spark.sql("""
    SELECT COUNT(*) as count_null_departure_date
    FROM i94immi_table
    WHERE departure_date is NULL
""").show()

+-------------------------+
|count_null_departure_date|
+-------------------------+
|                        0|
+-------------------------+



Verify again

In [26]:
spark.sql("""
    SELECT COUNT(*)
    FROM i94immi_table
""").show()

+--------+
|count(1)|
+--------+
| 2953481|
+--------+



In [27]:
spark.sql("""
    SELECT *
    FROM i94immi_table
""").createOrReplaceTempView("i94immi_table")

### Cleaning i94port

Check the column `i94port` and note value length of this column

In [28]:
spark.sql("""
    SELECT i94port
    FROM i94immi_table
""").show(3)

+-------+
|i94port|
+-------+
|    LOS|
|    LOS|
|    LOS|
+-------+
only showing top 3 rows



In [28]:
spark.sql("""
    SELECT count(*) as count_null_i94port
    FROM i94immi_table 
    WHERE i94port is NULL
""").show()

+------------------+
|count_null_i94port|
+------------------+
|                 0|
+------------------+



### Cleaning i94mode

Next, we take a look on arival mode as column `i94mode`

In [29]:
spark.sql("""
    SELECT i94mode, count(*) as count_by_i94mode
    FROM i94immi_table
    GROUP BY i94mode
""").show()

+-------+----------------+
|i94mode|count_by_i94mode|
+-------+----------------+
|   null|             238|
|    1.0|         2871184|
|    3.0|           61572|
|    2.0|           17970|
|    9.0|            2517|
+-------+----------------+



From `I94_SAS_Labels_Descriptions_SAS` we extracted `i94mode_sas_label_validation.csv` includes info:

We keep air arrival only, mean `i94mode=1`, drop any arrival values else (null,, 2, 3, 9)

In [30]:
spark.sql("""
    SELECT *
    FROM i94immi_table
    WHERE i94mode == 1.0
""").createOrReplaceTempView("i94immi_table")

In [31]:
spark.sql("""
    SELECT i94mode, count(*) as count_by_i94mode
    FROM i94immi_table
    GROUP BY i94mode
""").show()

+-------+----------------+
|i94mode|count_by_i94mode|
+-------+----------------+
|    1.0|         2871184|
+-------+----------------+



Verify our table

In [32]:
spark.sql("""
    SELECT COUNT(*) as number_of_records
    FROM i94immi_table
""").show()

+-----------------+
|number_of_records|
+-----------------+
|          2871184|
+-----------------+



### Cleaning i94visa, visatype

From `I94_SAS_Labels_Descriptions_SAS` we extracted `i94visa_sas_label_validation.csv` includes info:

This step, we mapping `i94visa` numbers to `visatype` instead.

In [33]:
spark.sql("""
        SELECT *, CASE 
                    WHEN i94visa = 1.0 THEN 'Business' 
                    WHEN i94visa = 2.0 THEN 'Pleasure'
                    WHEN i94visa = 3.0 THEN 'Student'
                    ELSE 'NaN' END AS visa_type
        FROM i94immi_table
    """).createOrReplaceTempView("i94immi_table")

In [34]:
spark.sql("""
    SELECT *
    FROM i94immi_table
    """).show(3)

+---------+------+------+------+------+-------+-------+-------+-------+-------+------+-------+-----+--------+--------+-----+-------+-------+-------+-------+-------+--------+------+------+-------+--------------+-----+--------+------------+--------------+---------+
|    cicid| i94yr|i94mon|i94cit|i94res|i94port|arrdate|i94mode|i94addr|depdate|i94bir|i94visa|count|dtadfile|visapost|occup|entdepa|entdepd|entdepu|matflag|biryear| dtaddto|gender|insnum|airline|        admnum|fltno|visatype|arrival_date|departure_date|visa_type|
+---------+------+------+------+------+-------+-------+-------+-------+-------+------+-------+-----+--------+--------+-----+-------+-------+-------+-------+-------+--------+------+------+-------+--------------+-----+--------+------------+--------------+---------+
|5748517.0|2016.0|   4.0| 245.0| 438.0|    LOS|20574.0|    1.0|     CA|20582.0|  40.0|    1.0|  1.0|20160430|     SYD| null|      G|      O|   null|      M| 1976.0|10292016|     F|  null|     QF|9.495387003E1

We category `visatype` grouping by `visa_type`

In [35]:
spark.sql("""
        SELECT visa_type as visa_type, visatype as visatype_code, count(*) as count_by_visa_category
        FROM i94immi_table
        GROUP BY visa_type, visatype
        ORDER BY visa_type, visatype
""").show(5)

+---------+-------------+----------------------+
|visa_type|visatype_code|count_by_visa_category|
+---------+-------------+----------------------+
| Business|           B1|                201741|
| Business|           E1|                  3027|
| Business|           E2|                 15157|
| Business|          GMB|                   132|
| Business|            I|                  2931|
+---------+-------------+----------------------+
only showing top 5 rows



Find any *NaN* or *NULL* values on `visatype`

In [36]:
spark.sql("""
    SELECT count(*) as count_null_of_visatype
    FROM i94immi_table 
    WHERE visatype is NULL
""").show()

+----------------------+
|count_null_of_visatype|
+----------------------+
|                     0|
+----------------------+



In [37]:
spark.sql("""
    SELECT count(*) as count_missing_values
    FROM i94immi_table 
    WHERE visatype == 'NaN'
""").show()

+--------------------+
|count_missing_values|
+--------------------+
|                   0|
+--------------------+



In [38]:
spark.sql("""
    SELECT *
    FROM i94immi_table
    """).show(3)

+---------+------+------+------+------+-------+-------+-------+-------+-------+------+-------+-----+--------+--------+-----+-------+-------+-------+-------+-------+--------+------+------+-------+--------------+-----+--------+------------+--------------+---------+
|    cicid| i94yr|i94mon|i94cit|i94res|i94port|arrdate|i94mode|i94addr|depdate|i94bir|i94visa|count|dtadfile|visapost|occup|entdepa|entdepd|entdepu|matflag|biryear| dtaddto|gender|insnum|airline|        admnum|fltno|visatype|arrival_date|departure_date|visa_type|
+---------+------+------+------+------+-------+-------+-------+-------+-------+------+-------+-----+--------+--------+-----+-------+-------+-------+-------+-------+--------+------+------+-------+--------------+-----+--------+------------+--------------+---------+
|5748517.0|2016.0|   4.0| 245.0| 438.0|    LOS|20574.0|    1.0|     CA|20582.0|  40.0|    1.0|  1.0|20160430|     SYD| null|      G|      O|   null|      M| 1976.0|10292016|     F|  null|     QF|9.495387003E1

In [39]:
spark.sql("""
    SELECT *
    FROM i94immi_table
    """).createOrReplaceTempView("i94immi_table")

### Cleaning i94bir and biryear

Check check whether value is *NULL* or not

In [40]:
spark.sql("""
    SELECT count(*) as count_NULL_values
    FROM i94immi_table 
    WHERE i94bir is NULL OR biryear is NULL
""").show()

+-----------------+
|count_NULL_values|
+-----------------+
|               42|
+-----------------+



In [41]:
spark.sql("""
    SELECT *
    FROM i94immi_table
    WHERE i94bir is not NULL OR biryear is not NULL
""").createOrReplaceTempView("i94immi_table")

In [42]:
spark.sql("""
    SELECT MAX(biryear) as max_biryear, MIN(biryear) as min_biryear
    FROM i94immi_table 
    WHERE biryear IS NOT NULL
""").show()

+-----------+-----------+
|max_biryear|min_biryear|
+-----------+-----------+
|     2016.0|     1916.0|
+-----------+-----------+



Take a look on travel velocity for _**>= 90 years old traveller**_

In [43]:
spark.sql("""
    SELECT biryear as birth_year, COUNT(*) as count_by_birth_year
    FROM i94immi_table 
    WHERE biryear IS NOT NULL
    AND biryear <= 1926
    GROUP BY biryear
    ORDER BY biryear ASC
""").show()

+----------+-------------------+
|birth_year|count_by_birth_year|
+----------+-------------------+
|    1916.0|                  7|
|    1917.0|                 16|
|    1918.0|                 21|
|    1919.0|                 35|
|    1920.0|                 34|
|    1921.0|                 66|
|    1922.0|                 86|
|    1923.0|                154|
|    1924.0|                199|
|    1925.0|                262|
|    1926.0|                396|
+----------+-------------------+



Total of travel trips for _**>= 90 year old travaller**_ not much. Don't worry of this kind of travaller age range.

### Cleaning gender

We just user records of `male = 'M'` and `female = 'F'`

In [44]:
spark.sql("""
    SELECT * 
    FROM i94immi_table 
    WHERE gender IN ('F', 'M')
""").createOrReplaceTempView("i94immi_table")

In [45]:
spark.sql("""
    SELECT COUNT(gender) as gender_count
    FROM i94immi_table 
    WHERE gender IN ('F', 'M')
""").show()

+------------+
|gender_count|
+------------+
|     2465314|
+------------+



In [46]:
spark.sql("""
    SELECT gender as gender, COUNT(*) as count_gender
    FROM i94immi_table 
    WHERE gender IN ('F') OR gender IN ('M')
    GROUP BY gender
    ORDER BY gender ASC
""").show()

+------+------------+
|gender|count_gender|
+------+------------+
|     F|     1190428|
|     M|     1274886|
+------+------------+



### Cleaning `i94cit` - citizenship, `i94res` - residence and `i94addr` - state

Columns `i94cit` `i94res` and `i94addr` are in float datatype with meaning:
- `i94cit`: Country of citizenship
- `i94res`: Country of residence
- `i94addr`: State code validation

Just check and drop *NULL* values if need for these columns

In [47]:
spark.sql("""
    SELECT count(*) as count_null_i94cit
    FROM i94immi_table
    WHERE i94cit IS NULL
""").show()

+-----------------+
|count_null_i94cit|
+-----------------+
|                0|
+-----------------+



In [48]:
spark.sql("""
    SELECT count(*) as count_null_i94res
    FROM i94immi_table
    WHERE i94res IS NULL
""").show()

+-----------------+
|count_null_i94res|
+-----------------+
|                0|
+-----------------+



In [49]:
spark.sql("""
    SELECT count(*) as count_null_i94addr
    FROM i94immi_table
    WHERE i94addr IS NULL
""").show()

+------------------+
|count_null_i94addr|
+------------------+
|             87437|
+------------------+



In [50]:
spark.sql("""
    SELECT *
    FROM i94immi_table
    WHERE i94addr IS NOT NULL
""").createOrReplaceTempView("i94immi_table")

### Baseline `i94immi_table` of I94 IMMIGRATION dataset

In [51]:
spark.sql("""
    SELECT *
    FROM i94immi_table
    """).show(5)

+---------+------+------+------+------+-------+-------+-------+-------+-------+------+-------+-----+--------+--------+-----+-------+-------+-------+-------+-------+--------+------+------+-------+--------------+-----+--------+------------+--------------+---------+
|    cicid| i94yr|i94mon|i94cit|i94res|i94port|arrdate|i94mode|i94addr|depdate|i94bir|i94visa|count|dtadfile|visapost|occup|entdepa|entdepd|entdepu|matflag|biryear| dtaddto|gender|insnum|airline|        admnum|fltno|visatype|arrival_date|departure_date|visa_type|
+---------+------+------+------+------+-------+-------+-------+-------+-------+------+-------+-----+--------+--------+-----+-------+-------+-------+-------+-------+--------+------+------+-------+--------------+-----+--------+------------+--------------+---------+
|5748517.0|2016.0|   4.0| 245.0| 438.0|    LOS|20574.0|    1.0|     CA|20582.0|  40.0|    1.0|  1.0|20160430|     SYD| null|      G|      O|   null|      M| 1976.0|10292016|     F|  null|     QF|9.495387003E1

### Select columns that using for staging dataframe

In [52]:
spark.sql("""
            SELECT cicid, i94yr, i94mon, arrival_date, i94res, i94port, arrdate, i94addr, departure_date
            FROM i94immi_table
        """).createOrReplaceTempView("i94immi_table")

In [53]:
spark.sql("""
    SELECT *
    FROM i94immi_table
""").show(5)

+---------+------+------+------------+------+-------+-------+-------+--------------+
|    cicid| i94yr|i94mon|arrival_date|i94res|i94port|arrdate|i94addr|departure_date|
+---------+------+------+------------+------+-------+-------+-------+--------------+
|5748517.0|2016.0|   4.0|  2016-04-30| 438.0|    LOS|20574.0|     CA|    2016-05-08|
|5748518.0|2016.0|   4.0|  2016-04-30| 438.0|    LOS|20574.0|     NV|    2016-05-17|
|5748519.0|2016.0|   4.0|  2016-04-30| 438.0|    LOS|20574.0|     WA|    2016-05-08|
|5748520.0|2016.0|   4.0|  2016-04-30| 438.0|    LOS|20574.0|     WA|    2016-05-14|
|5748521.0|2016.0|   4.0|  2016-04-30| 438.0|    LOS|20574.0|     WA|    2016-05-14|
+---------+------+------+------------+------+-------+-------+-------+--------------+
only showing top 5 rows



In [54]:
spark.sql("""
    SELECT *
    FROM i94immi_table
""").count()

2377877

In [55]:
i94immi_df = spark.sql("""
    SELECT *
    FROM i94immi_table
""")

In [56]:
i94immi_df.show(5)

+---------+------+------+------------+------+-------+-------+-------+--------------+
|    cicid| i94yr|i94mon|arrival_date|i94res|i94port|arrdate|i94addr|departure_date|
+---------+------+------+------------+------+-------+-------+-------+--------------+
|5748517.0|2016.0|   4.0|  2016-04-30| 438.0|    LOS|20574.0|     CA|    2016-05-08|
|5748518.0|2016.0|   4.0|  2016-04-30| 438.0|    LOS|20574.0|     NV|    2016-05-17|
|5748519.0|2016.0|   4.0|  2016-04-30| 438.0|    LOS|20574.0|     WA|    2016-05-08|
|5748520.0|2016.0|   4.0|  2016-04-30| 438.0|    LOS|20574.0|     WA|    2016-05-14|
|5748521.0|2016.0|   4.0|  2016-04-30| 438.0|    LOS|20574.0|     WA|    2016-05-14|
+---------+------+------+------------+------+-------+-------+-------+--------------+
only showing top 5 rows



In [57]:
#write to parquet
i94immi_df.write.parquet("sas_data","overwrite")

Py4JJavaError: An error occurred while calling o100.parquet.
: org.apache.spark.SparkException: Job aborted.
	at org.apache.spark.sql.execution.datasources.FileFormatWriter$.write(FileFormatWriter.scala:198)
	at org.apache.spark.sql.execution.datasources.InsertIntoHadoopFsRelationCommand.run(InsertIntoHadoopFsRelationCommand.scala:159)
	at org.apache.spark.sql.execution.command.DataWritingCommandExec.sideEffectResult$lzycompute(commands.scala:104)
	at org.apache.spark.sql.execution.command.DataWritingCommandExec.sideEffectResult(commands.scala:102)
	at org.apache.spark.sql.execution.command.DataWritingCommandExec.doExecute(commands.scala:122)
	at org.apache.spark.sql.execution.SparkPlan$$anonfun$execute$1.apply(SparkPlan.scala:131)
	at org.apache.spark.sql.execution.SparkPlan$$anonfun$execute$1.apply(SparkPlan.scala:127)
	at org.apache.spark.sql.execution.SparkPlan$$anonfun$executeQuery$1.apply(SparkPlan.scala:155)
	at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:151)
	at org.apache.spark.sql.execution.SparkPlan.executeQuery(SparkPlan.scala:152)
	at org.apache.spark.sql.execution.SparkPlan.execute(SparkPlan.scala:127)
	at org.apache.spark.sql.execution.QueryExecution.toRdd$lzycompute(QueryExecution.scala:80)
	at org.apache.spark.sql.execution.QueryExecution.toRdd(QueryExecution.scala:80)
	at org.apache.spark.sql.DataFrameWriter$$anonfun$runCommand$1.apply(DataFrameWriter.scala:676)
	at org.apache.spark.sql.DataFrameWriter$$anonfun$runCommand$1.apply(DataFrameWriter.scala:676)
	at org.apache.spark.sql.execution.SQLExecution$$anonfun$withNewExecutionId$1.apply(SQLExecution.scala:78)
	at org.apache.spark.sql.execution.SQLExecution$.withSQLConfPropagated(SQLExecution.scala:125)
	at org.apache.spark.sql.execution.SQLExecution$.withNewExecutionId(SQLExecution.scala:73)
	at org.apache.spark.sql.DataFrameWriter.runCommand(DataFrameWriter.scala:676)
	at org.apache.spark.sql.DataFrameWriter.saveToV1Source(DataFrameWriter.scala:285)
	at org.apache.spark.sql.DataFrameWriter.save(DataFrameWriter.scala:271)
	at org.apache.spark.sql.DataFrameWriter.save(DataFrameWriter.scala:229)
	at org.apache.spark.sql.DataFrameWriter.parquet(DataFrameWriter.scala:566)
	at sun.reflect.NativeMethodAccessorImpl.invoke0(Native Method)
	at sun.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:62)
	at sun.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)
	at java.lang.reflect.Method.invoke(Method.java:498)
	at py4j.reflection.MethodInvoker.invoke(MethodInvoker.java:244)
	at py4j.reflection.ReflectionEngine.invoke(ReflectionEngine.java:357)
	at py4j.Gateway.invoke(Gateway.java:282)
	at py4j.commands.AbstractCommand.invokeMethod(AbstractCommand.java:132)
	at py4j.commands.CallCommand.execute(CallCommand.java:79)
	at py4j.GatewayConnection.run(GatewayConnection.java:238)
	at java.lang.Thread.run(Thread.java:748)
Caused by: org.apache.spark.SparkException: Job aborted due to stage failure: Task 0 in stage 84.0 failed 1 times, most recent failure: Lost task 0.0 in stage 84.0 (TID 1275, localhost, executor driver): org.apache.spark.SparkException: Task failed while writing rows.
	at org.apache.spark.sql.execution.datasources.FileFormatWriter$.org$apache$spark$sql$execution$datasources$FileFormatWriter$$executeTask(FileFormatWriter.scala:257)
	at org.apache.spark.sql.execution.datasources.FileFormatWriter$$anonfun$write$1.apply(FileFormatWriter.scala:170)
	at org.apache.spark.sql.execution.datasources.FileFormatWriter$$anonfun$write$1.apply(FileFormatWriter.scala:169)
	at org.apache.spark.scheduler.ResultTask.runTask(ResultTask.scala:90)
	at org.apache.spark.scheduler.Task.run(Task.scala:121)
	at org.apache.spark.executor.Executor$TaskRunner$$anonfun$10.apply(Executor.scala:408)
	at org.apache.spark.util.Utils$.tryWithSafeFinally(Utils.scala:1360)
	at org.apache.spark.executor.Executor$TaskRunner.run(Executor.scala:414)
	at java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1149)
	at java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:624)
	at java.lang.Thread.run(Thread.java:748)
Caused by: java.io.FileNotFoundException: File file:/home/workspace/sas_data/part-00013-b9542815-7a8d-45fc-9c67-c9c5007ad0d4-c000.snappy.parquet does not exist
It is possible the underlying files have been updated. You can explicitly invalidate the cache in Spark by running 'REFRESH TABLE tableName' command in SQL or by recreating the Dataset/DataFrame involved.
	at org.apache.spark.sql.execution.datasources.FileScanRDD$$anon$1.org$apache$spark$sql$execution$datasources$FileScanRDD$$anon$$readCurrentFile(FileScanRDD.scala:127)
	at org.apache.spark.sql.execution.datasources.FileScanRDD$$anon$1.nextIterator(FileScanRDD.scala:177)
	at org.apache.spark.sql.execution.datasources.FileScanRDD$$anon$1.hasNext(FileScanRDD.scala:101)
	at org.apache.spark.sql.catalyst.expressions.GeneratedClass$GeneratedIteratorForCodegenStage1.scan_nextBatch_0$(Unknown Source)
	at org.apache.spark.sql.catalyst.expressions.GeneratedClass$GeneratedIteratorForCodegenStage1.processNext(Unknown Source)
	at org.apache.spark.sql.execution.BufferedRowIterator.hasNext(BufferedRowIterator.java:43)
	at org.apache.spark.sql.execution.WholeStageCodegenExec$$anonfun$13$$anon$1.hasNext(WholeStageCodegenExec.scala:636)
	at org.apache.spark.sql.execution.datasources.FileFormatWriter$$anonfun$org$apache$spark$sql$execution$datasources$FileFormatWriter$$executeTask$3.apply(FileFormatWriter.scala:244)
	at org.apache.spark.sql.execution.datasources.FileFormatWriter$$anonfun$org$apache$spark$sql$execution$datasources$FileFormatWriter$$executeTask$3.apply(FileFormatWriter.scala:242)
	at org.apache.spark.util.Utils$.tryWithSafeFinallyAndFailureCallbacks(Utils.scala:1394)
	at org.apache.spark.sql.execution.datasources.FileFormatWriter$.org$apache$spark$sql$execution$datasources$FileFormatWriter$$executeTask(FileFormatWriter.scala:248)
	... 10 more

Driver stacktrace:
	at org.apache.spark.scheduler.DAGScheduler.org$apache$spark$scheduler$DAGScheduler$$failJobAndIndependentStages(DAGScheduler.scala:1889)
	at org.apache.spark.scheduler.DAGScheduler$$anonfun$abortStage$1.apply(DAGScheduler.scala:1877)
	at org.apache.spark.scheduler.DAGScheduler$$anonfun$abortStage$1.apply(DAGScheduler.scala:1876)
	at scala.collection.mutable.ResizableArray$class.foreach(ResizableArray.scala:59)
	at scala.collection.mutable.ArrayBuffer.foreach(ArrayBuffer.scala:48)
	at org.apache.spark.scheduler.DAGScheduler.abortStage(DAGScheduler.scala:1876)
	at org.apache.spark.scheduler.DAGScheduler$$anonfun$handleTaskSetFailed$1.apply(DAGScheduler.scala:926)
	at org.apache.spark.scheduler.DAGScheduler$$anonfun$handleTaskSetFailed$1.apply(DAGScheduler.scala:926)
	at scala.Option.foreach(Option.scala:257)
	at org.apache.spark.scheduler.DAGScheduler.handleTaskSetFailed(DAGScheduler.scala:926)
	at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.doOnReceive(DAGScheduler.scala:2110)
	at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.onReceive(DAGScheduler.scala:2059)
	at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.onReceive(DAGScheduler.scala:2048)
	at org.apache.spark.util.EventLoop$$anon$1.run(EventLoop.scala:49)
	at org.apache.spark.scheduler.DAGScheduler.runJob(DAGScheduler.scala:737)
	at org.apache.spark.SparkContext.runJob(SparkContext.scala:2061)
	at org.apache.spark.sql.execution.datasources.FileFormatWriter$.write(FileFormatWriter.scala:167)
	... 33 more
Caused by: org.apache.spark.SparkException: Task failed while writing rows.
	at org.apache.spark.sql.execution.datasources.FileFormatWriter$.org$apache$spark$sql$execution$datasources$FileFormatWriter$$executeTask(FileFormatWriter.scala:257)
	at org.apache.spark.sql.execution.datasources.FileFormatWriter$$anonfun$write$1.apply(FileFormatWriter.scala:170)
	at org.apache.spark.sql.execution.datasources.FileFormatWriter$$anonfun$write$1.apply(FileFormatWriter.scala:169)
	at org.apache.spark.scheduler.ResultTask.runTask(ResultTask.scala:90)
	at org.apache.spark.scheduler.Task.run(Task.scala:121)
	at org.apache.spark.executor.Executor$TaskRunner$$anonfun$10.apply(Executor.scala:408)
	at org.apache.spark.util.Utils$.tryWithSafeFinally(Utils.scala:1360)
	at org.apache.spark.executor.Executor$TaskRunner.run(Executor.scala:414)
	at java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1149)
	at java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:624)
	... 1 more
Caused by: java.io.FileNotFoundException: File file:/home/workspace/sas_data/part-00013-b9542815-7a8d-45fc-9c67-c9c5007ad0d4-c000.snappy.parquet does not exist
It is possible the underlying files have been updated. You can explicitly invalidate the cache in Spark by running 'REFRESH TABLE tableName' command in SQL or by recreating the Dataset/DataFrame involved.
	at org.apache.spark.sql.execution.datasources.FileScanRDD$$anon$1.org$apache$spark$sql$execution$datasources$FileScanRDD$$anon$$readCurrentFile(FileScanRDD.scala:127)
	at org.apache.spark.sql.execution.datasources.FileScanRDD$$anon$1.nextIterator(FileScanRDD.scala:177)
	at org.apache.spark.sql.execution.datasources.FileScanRDD$$anon$1.hasNext(FileScanRDD.scala:101)
	at org.apache.spark.sql.catalyst.expressions.GeneratedClass$GeneratedIteratorForCodegenStage1.scan_nextBatch_0$(Unknown Source)
	at org.apache.spark.sql.catalyst.expressions.GeneratedClass$GeneratedIteratorForCodegenStage1.processNext(Unknown Source)
	at org.apache.spark.sql.execution.BufferedRowIterator.hasNext(BufferedRowIterator.java:43)
	at org.apache.spark.sql.execution.WholeStageCodegenExec$$anonfun$13$$anon$1.hasNext(WholeStageCodegenExec.scala:636)
	at org.apache.spark.sql.execution.datasources.FileFormatWriter$$anonfun$org$apache$spark$sql$execution$datasources$FileFormatWriter$$executeTask$3.apply(FileFormatWriter.scala:244)
	at org.apache.spark.sql.execution.datasources.FileFormatWriter$$anonfun$org$apache$spark$sql$execution$datasources$FileFormatWriter$$executeTask$3.apply(FileFormatWriter.scala:242)
	at org.apache.spark.util.Utils$.tryWithSafeFinallyAndFailureCallbacks(Utils.scala:1394)
	at org.apache.spark.sql.execution.datasources.FileFormatWriter$.org$apache$spark$sql$execution$datasources$FileFormatWriter$$executeTask(FileFormatWriter.scala:248)
	... 10 more
