In [None]:
# Helpful magics for AWS Glue interactive sessions (uncomment a line to run it)
#%help
#%stop_session
#%status

In [None]:
# Configure the AWS Glue interactive session using session magics.
# Magics reference:
# https://docs.aws.amazon.com/glue/latest/dg/interactive-sessions-magics.html
# Settings applied below, in order:
#   iam_role          - ARN of the IAM role the session uses
#   session_id_prefix - prefix for the session id
#   glue_version      - Glue version requested for the session
#   number_of_workers - number of workers requested
#   worker_type       - worker type requested
#   idle_timeout      - idle timeout (presumably minutes; confirm in the reference above)
%iam_role "arn:aws:iam::123456789012:role/glue_etl_role_dev"
%session_id_prefix "my-app"
%glue_version 4.0
%number_of_workers 2
%worker_type "G.1X"
%idle_timeout 60

In [None]:
%%tags
{
    "Application": "my-app",
    "Environment": "dev"
}

In [None]:
# Import dependencies and initialize the Glue job
import sys
from awsglue.transforms import *
from awsglue.utils import getResolvedOptions
from pyspark.context import SparkContext
from awsglue.context import GlueContext
from awsglue.job import Job
from pyspark.sql import SparkSession
from pyspark.sql.types import StructType, StructField, StringType, ArrayType
from pyspark.sql.functions import year, month, dayofmonth, current_date, udf
from awsglue.dynamicframe import DynamicFrame
from pyspark.sql import SparkSession

# Job arguments; only the job name is supplied for this session.
args = dict(JOB_NAME="my-glue-example-job-dev")

# Reuse the running SparkContext (or start one) and wrap it in a GlueContext.
sc = SparkContext.getOrCreate()
glueContext = GlueContext(sc)

# Spark session that belongs to the GlueContext.
spark = glueContext.spark_session

# Create the Glue job handle and initialize it under the configured name.
job = Job(glueContext)
job.init(args["JOB_NAME"], args)

In [None]:
# Load the users CSV from S3 into a Glue DynamicFrame.
# withHeader=True: the first line of the file supplies the column names;
# fields are separated by commas.
datasource0 = glueContext.create_dynamic_frame.from_options(
    format="csv",
    format_options=dict(withHeader=True, separator=","),
    connection_type="s3",
    connection_options=dict(
        paths=["s3://elnaterator-aws-glue-example-app-dev/users.csv"],
    ),
)

# Print a preview of the records that were read.
datasource0.show()

In [None]:
# Create a fullname column
def create_fullname(firstname, lastname):
    """Join a first and last name into a single display name.

    Args:
        firstname: Given name as a string, or None when the value is missing.
        lastname: Family name as a string, or None when the value is missing.

    Returns:
        "<firstname> <lastname>" when both parts are present, the single
        available part when only one is present, or None when both are
        missing.
    """
    # Spark passes NULL column values to a Python UDF as None. Concatenating
    # None with a str raises TypeError and fails the whole stage, so missing
    # parts are skipped instead. Empty strings are kept on purpose so that
    # results for non-None inputs are unchanged.
    parts = [part for part in (firstname, lastname) if part is not None]
    return " ".join(parts) if parts else None

# Wrap the Python helper as a Spark UDF that yields a string column.
create_fullname_udf = udf(create_fullname, returnType=StringType())

# Switch from the Glue DynamicFrame to a Spark DataFrame, then derive the
# "fullname" column from the "firstname" and "lastname" columns.
df = datasource0.toDF()
df = df.withColumn(
    "fullname",
    create_fullname_udf(df["firstname"], df["lastname"]),
)

# Preview the DataFrame, including the new column.
df.show()

In [None]:
# Wrap the Spark DataFrame in a Glue DynamicFrame again so it can be written
# with the Glue writer.
datasource1 = DynamicFrame.fromDF(df, glueContext, "datasource1")

# Write the DynamicFrame to S3 as comma-separated CSV with a header row.
# "writeHeader" is the writer-side CSV option; "withHeader" only applies when
# reading, so it had no effect here (the header was written only because
# writeHeader defaults to true).
# NOTE(review): the S3 writer presumably treats "path" as an output prefix and
# creates part files beneath it, despite the ".csv" suffix -- confirm before
# consumers rely on a single file at this key.
glueContext.write_dynamic_frame.from_options(
    frame=datasource1,
    connection_type="s3",
    connection_options={
        "path": "s3://elnaterator-aws-glue-example-app-dev/users-with-fullname.csv"
    },
    format="csv",
    format_options={
        "writeHeader": True,
        "separator": ","
    }
)