# Develop Apache Spark with EMR Notebook

Prerequisite: complete [Cataloging your data](https://serverless-data-lake-immersionday.workshop.aws/en/lab2-cat-etl-process-data/catalog.html) before running this notebook.

## Develop Apache Spark with EMR

Initialize SparkSession

In [None]:
### Not needed in an EMR notebook: the cells below use an existing `spark` session
# from pyspark.sql import SparkSession
# spark = SparkSession.builder\
#     .enableHiveSupport()\
#     .appName('pyspark-sample').getOrCreate()

In [None]:
# Catalog database and table to read from -- fill these in before running.
source_database = ''
source_table = ''

# Make the source database the session default so the table can be
# referenced by its bare name.
catalog = spark.catalog
catalog.setCurrentDatabase(source_database)

# Load the catalog table as a DataFrame and print its column types.
df = spark.table(source_table)
df.printSchema()

In [None]:
# Print a single record, one field per line, to eyeball the raw values.
df.show(n=1, vertical=True)

In [None]:
from pyspark.sql.functions import col, from_unixtime, unix_timestamp

# Pattern of the incoming datetime text, e.g. '1/31/20 9:05'.
source_pattern = 'M/d/yy H:mm'

# Parse each datetime column to epoch seconds, then render it back with
# from_unixtime's default format. NOTE: from_unixtime returns a *string*
# column ('yyyy-MM-dd HH:mm:ss'), not a timestamp-typed column.
for datetime_column in ('lpep_pickup_datetime', 'lpep_dropoff_datetime'):
    epoch_seconds = unix_timestamp(col(datetime_column), source_pattern)
    df = df.withColumn(datetime_column, from_unixtime(epoch_seconds))

df.printSchema()
df.show(5)

In [None]:
from pyspark.sql.functions import count, lit, when

# Build one aggregate per column. when() without otherwise() yields NULL
# for non-matching rows and count() skips NULLs, so each aggregate is the
# number of rows in which that column is NULL.
null_count_exprs = []
for column_name in df.columns:
    is_missing = col(column_name).isNull()
    null_count_exprs.append(count(when(is_missing, lit(1))).alias(column_name))

# Show the per-column NULL counts, one column per line.
df.select(null_count_exprs).show(vertical=True)

In [None]:
# Discard every row in which either trip datetime is NULL.
required_columns = ['lpep_pickup_datetime', 'lpep_dropoff_datetime']
df = df.dropna(how='any', subset=required_columns)

In [None]:
from pyspark.sql.functions import col, count, isnull, lit, when

# Count the NULLs of every column in a single aggregation instead of
# launching one Spark job per column. The result is a single Row whose
# values are in the same order as df.columns / df.schema.fields.
null_count_row = df.select([
    count(when(isnull(col(c)), lit(1))).alias(c)
    for c in df.columns
]).first()

# Map each column that contains NULLs to a sentinel replacement value:
# '__NULL__' for string columns, -1 for integer/double columns. Columns of
# any other type are left untouched.
# NOTE: the per-column tally is deliberately NOT named `count`, so the
# `count` function imported from pyspark.sql.functions is not overwritten
# and earlier cells that use it can still be re-run.
null_metadata = {}
for field, null_count in zip(df.schema.fields, null_count_row):
    if null_count == 0:
        continue
    type_name = field.dataType.typeName()
    if type_name == 'string':
        null_metadata[field.name] = '__NULL__'
    elif type_name in ('integer', 'double'):
        null_metadata[field.name] = -1

print(null_metadata)

# Replace NULLs column-by-column using the sentinel values chosen above.
df = df.na.fill(null_metadata)

In [None]:
# Print one record vertically to inspect the current column values.
df.show(n=1, vertical=True)

In [None]:
from pyspark.sql.functions import dayofmonth, hour, month, year

# Derive year / month / day / hour columns from the pickup datetime.
pickup_time = col('lpep_pickup_datetime')
date_part_extractors = {
    '_year': year,
    '_month': month,
    '_day': dayofmonth,
    '_hour': hour,
}
for part_name, extract in date_part_extractors.items():
    df = df.withColumn(part_name, extract(pickup_time))

In [None]:
# Output settings -- fill these in before running the write cells below.
# Name of the database the output table is registered in.
target_database = ''
# Name of the output table; also used as the last segment of the output path.
target_table = ''
# Path prefix for the output files; '/parquet/<target_table>' is appended to it.
# Presumably an S3 URI such as 's3://<bucket>' -- confirm for your environment.
target_host = ''

In [None]:
### Alternative: write Parquet directly to S3 without registering the table in the catalog
# df.write.parquet(
#     f'{target_host}/parquet/{target_table}',
#     mode='overwrite',
#     partitionBy=['_year', '_month', '_day', '_hour'],
#     compression='snappy'
# )

In [None]:
# Make sure the target database exists before a table is registered in it.
spark.sql(f"CREATE DATABASE IF NOT EXISTS {target_database}")

# Where the Parquet files are written, and the catalog name of the table.
table_location = f'{target_host}/parquet/{target_table}'
qualified_table = f'{target_database}.{target_table}'

# Configure the writer: Parquet files, partitioned by the derived date
# parts, stored at an explicit path, replacing any existing data.
writer = (
    df.write
    .format('parquet')
    .mode('overwrite')
    .partitionBy('_year', '_month', '_day', '_hour')
    .option('path', table_location)
)

# Write the data and register it as a table in the catalog.
writer.saveAsTable(qualified_table)

In [None]:
# Fetch the tables registered in the target database to confirm the write.
catalog_tables = spark.catalog.listTables(target_database)

print('Show tables')
# Print each table entry followed by a blank line.
for table_info in catalog_tables:
    print(table_info, '\n')