# Work with EMR

Agenda:

1. Cataloging your data
1. Execute Apache Spark jobs with
    1. spark-submit
    1. EMR Steps: Console and CLI
1. AWS SDK for Python (Boto3)
    1. Interact with S3
    1. Interact with EMR Steps

## Cataloging your data

Follow the cataloging lab from the Serverless Data Lake Immersion Day workshop:

<https://serverless-data-lake-immersionday.workshop.aws/en/lab2-cat-etl-process-data/catalog.html>



## Execute Apache Spark jobs

Initialize SparkSession

In [None]:
from pyspark.sql import SparkSession

# Get the already-running session if there is one, otherwise create a new
# session for an application named 'pyspark-sample'.
spark = (
    SparkSession.builder
    .appName('pyspark-sample')
    .getOrCreate()
)

In [None]:
# Make 'default' the current database so the unqualified table name below
# is looked up there.
spark.catalog.setCurrentDatabase('default')

# Load the catalog table 'tripdata' as a DataFrame.
df = spark.read.table('tripdata')

# Print the column names and types of the loaded table.
df.printSchema()

In [None]:
# Display a single row in vertical layout (one column value per line).
df.show(n=1, vertical=True)

# The same preview through Spark SQL:
# spark.sql('select * from tripdata').show(1, vertical=True)

In [None]:
from pyspark.sql.functions import unix_timestamp, col, from_unixtime

# Both timestamp columns are parsed with the 'M/d/yy H:mm' pattern into epoch
# seconds and then rendered back through from_unixtime, replacing the original
# column in place. from_unixtime produces a formatted string column.
for ts_column in ('lpep_pickup_datetime', 'lpep_dropoff_datetime'):
    epoch_seconds = unix_timestamp(col(ts_column), 'M/d/yy H:mm')
    df = df.withColumn(ts_column, from_unixtime(epoch_seconds))

# Confirm the resulting schema and preview a few converted rows.
df.printSchema()
df.show(5)

In [None]:
from pyspark.sql.functions import lit, count, when

# Build one aggregate per column: `when` yields 1 for null cells and null
# otherwise, and `count` only counts non-null values, so each aggregate is the
# number of nulls in that column.
null_count_exprs = []
for column_name in df.columns:
    null_marker = when(col(column_name).isNull(), lit(1))
    null_count_exprs.append(count(null_marker).alias(column_name))

# Show the per-column null counts vertically.
df.select(null_count_exprs).show(vertical=True)

In [None]:
from pyspark.sql.functions import count, isnull, lit, when

# Replacement value for nulls, keyed by the Spark type name of the column.
# Columns of any other type are left untouched.
fill_value_by_type = {
    'string': '__NULL__',
    'integer': -1,
    'double': -1,
}

# Count the nulls of every column in a single aggregation rather than running
# one filter().count() job per column. The result is one Row whose values are
# in the same order as df.columns / df.schema.fields.
null_counts = df.select([
    count(when(isnull(col(c)), lit(1))).alias(c)
    for c in df.columns
]).first()

# Map each column that actually contains nulls to its replacement value.
# The tally is named `null_count` (not `count`) so that the `count` function
# from pyspark.sql.functions is not overwritten by an integer.
null_metadata = {}
for field, null_count in zip(df.schema.fields, null_counts):
    type_name = field.dataType.typeName()
    if null_count > 0 and type_name in fill_value_by_type:
        null_metadata[field.name] = fill_value_by_type[type_name]

print(null_metadata)

# Apply the per-column replacements and preview one row of the result.
df = df.na.fill(null_metadata)
df.show(1, vertical=True)

In [None]:
from pyspark.sql.functions import year, month, dayofmonth, hour

# Derive the date parts of the pickup time as separate columns; they are used
# as partition keys when the data is written out.
pickup_time = col('lpep_pickup_datetime')

df = (
    df
    .withColumn('_year', year(pickup_time))
    .withColumn('_month', month(pickup_time))
    .withColumn('_day', dayofmonth(pickup_time))
    .withColumn('_hour', hour(pickup_time))
)

In [None]:
# Write the DataFrame as snappy-compressed Parquet under 'parquet/tripdata',
# replacing any existing output and partitioning by the derived date parts.
(
    df.write
    .mode('overwrite')
    .partitionBy(['_year', '_month', '_day', '_hour'])
    .option('compression', 'snappy')
    .parquet('parquet/tripdata')
)

In [None]:
# Make sure the target database exists before registering the table in it.
spark.sql('CREATE DATABASE IF NOT EXISTS curated')

# Save the DataFrame as the partitioned Parquet table 'curated.tripdata',
# stored at 'parquet/tripdata' and overwriting any existing data.
df.write.saveAsTable(
    'curated.tripdata',
    format='parquet',
    mode='overwrite',
    partitionBy=['_year', '_month', '_day', '_hour'],
    path='parquet/tripdata',
)

In [None]:
def _print_entries(title, entries):
    """Print a title line, then each entry followed by a blank line."""
    print(title)
    for entry in entries:
        print(entry, '\n')


# List every database known to the catalog.
databases = spark.catalog.listDatabases()
_print_entries('Show databases', databases)

print('\n\n')

# List the tables registered in the 'curated' database.
tables = spark.catalog.listTables('curated')
_print_entries('Show tables', tables)