In [None]:
%pyspark
# Display the Spark version of the session bound to `spark`.
spark.version

### Import necessary libraries

In [None]:
import re

from pyspark.sql.functions import regexp_replace, regexp_extract
from pyspark.sql.types import StringType, FloatType, IntegerType

### Get data from S3 and Glue Data Catalog

In [None]:
# Read the CSV file from S3: the first row is treated as the header, column
# types are inferred, and fields are comma-separated.
df = (
    spark.read.format("csv")
    .option("header", True)
    .option("inferSchema", True)
    .option("sep", ",")
    .load("s3://dirty-transactions-from-csv-to-parquet/dirty_transactions/dirty_transactions.csv")
)

In [None]:
# Preview the first five rows read from the CSV file.
df.show(n=5)

In [None]:
# Load the same data through the catalog table. The database name contains
# hyphens, so it is back-quoted: Spark's identifier parser rejects an unquoted
# name containing '-' as a "possibly unquoted identifier".
df_glue_table = spark.table("`dirty-transactions-from-csv-to-parquet`.dirty_transactions")

In [None]:
# Preview the first five rows of the catalog table.
df_glue_table.show(n=5)

### Define UDFs

In [None]:
def extract_city_name(string):
    """Return the text with punctuation removed and outer whitespace stripped.

    This function is registered as a Python UDF, so it is called once per row
    with a plain Python value. It therefore uses Python's ``re`` module rather
    than ``pyspark.sql.functions`` helpers, which build Column expressions and
    cannot operate on Python strings.

    Args:
        string: Raw location text, or None for a null cell.

    Returns:
        The input with every character that is neither a word character nor
        whitespace removed, then stripped of leading/trailing whitespace.
        None when the input is None.
    """
    if string is None:
        # Propagate SQL NULL instead of failing on a missing value.
        return None
    cleaned_string = re.sub(r'[^\w\s]', '', string)
    return cleaned_string.strip()

def extract_only_numbers(string):
    """Return the digits contained in the text as a single integer.

    This function is registered as a Python UDF with an integer return type,
    so it is called once per row with a plain Python value and must return an
    ``int`` (a returned ``str`` would not match the declared type). It uses
    Python's ``re`` module rather than ``pyspark.sql.functions`` helpers, which
    build Column expressions and cannot operate on Python strings.

    Args:
        string: Raw identifier text, or None for a null cell.

    Returns:
        All digit runs found in the input, concatenated in order and converted
        to ``int`` (leading zeros are therefore dropped). None when the input
        is None or contains no digits.
    """
    if string is None:
        # Propagate SQL NULL instead of failing on a missing value.
        return None
    digits = ''.join(re.findall(r'\d+', string))
    # No digits at all: there is no number to return, so yield NULL.
    return int(digits) if digits else None

def extract_floats_without_sign(string):
    """Parse a number from text after removing any dollar signs.

    This function is registered as a Python UDF, so it is called once per row
    with a plain Python value. It uses ordinary string methods rather than
    ``pyspark.sql.functions`` helpers, which build Column expressions and
    cannot be passed to ``float()``.

    Args:
        string: Raw amount text such as ``"$12.50"``, or None for a null cell.

    Returns:
        The amount as a ``float``. None when the input is None or is empty
        once dollar signs and surrounding whitespace are removed.

    Raises:
        ValueError: If the remaining text is not a valid float literal.
    """
    if string is None:
        # Propagate SQL NULL instead of failing on a missing value.
        return None
    string_without_dollar = string.replace('$', '').strip()
    if not string_without_dollar:
        # Nothing left to parse (e.g. the cell held only "$").
        return None
    return float(string_without_dollar)

### Register UDFs

In [None]:
# Expose each cleaning function to Spark SQL under its own name, together with
# the SQL type of the value it returns.
spark.udf.register(name="extract_city_name", f=extract_city_name, returnType=StringType())
spark.udf.register(name="extract_only_numbers", f=extract_only_numbers, returnType=IntegerType())
spark.udf.register(
    name="extract_floats_without_sign",
    f=extract_floats_without_sign,
    returnType=FloatType(),
)

### Apply functions and create final clean dataframe

In [None]:
# Build the cleaned projection of the catalog table: some columns pass through
# unchanged, the rest are rewritten in place by the registered SQL UDFs.
float_exprs = [
    f"extract_floats_without_sign({column}) as {column}"
    for column in ("mrp", "cp", "discount", "sp")
]
df_final = df_glue_table.selectExpr(
    "store_id",
    "extract_city_name(store_location) as store_location",
    "product_category",
    "extract_only_numbers(product_id) as product_id",
    *float_exprs,
    "date",
)

In [None]:
# Preview the first five rows of the cleaned DataFrame.
df_final.show(n=5)

### Write the final DataFrame to S3 as Parquet and register it as a table in the Data Catalog

In [None]:
# Save the cleaned DataFrame as Parquet files at the given S3 path and register
# it as a catalog table, overwriting any existing table data. The database name
# contains hyphens, so it is back-quoted: Spark's identifier parser rejects an
# unquoted name containing '-' as a "possibly unquoted identifier".
df_final.write.saveAsTable(
    '`dirty-transactions-from-csv-to-parquet`.clean_transactions',
    format='parquet',
    mode='overwrite',
    path='s3://aws-glue-emr-from-csv-to-parquet/clean_transactions_parquet',
)