In [1]:
%%configure -f
{
    "conf": {
        "spark.sql.legacy.parquet.datetimeRebaseModeInRead": "LEGACY",
        "spark.sql.legacy.parquet.datetimeRebaseModeInWrite": "LEGACY"
    }
}

In [2]:
from awsglue.context import GlueContext
from awsglue.job import Job
from awsglue.utils import getResolvedOptions
from pyspark.sql import DataFrame
from pyspark.sql.functions import *
from pyspark.sql.types import *

from typing import Iterable

# `spark` is not created anywhere in this notebook; presumably the notebook
# kernel injects it before this cell runs -- TODO confirm.
sc = spark.sparkContext

# Wrap the Spark context in a Glue context and, from here on, use the
# session object owned by that Glue context.
glueContext = GlueContext(sc)
spark = glueContext.spark_session

# Job handle bound to the same Glue context.
job = Job(glueContext)

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

In [3]:
def _read_parquet_tree(s3_path):
    """Load the parquet files located under ``s3_path`` into a DataFrame.

    The reader is configured with ``recursiveFileLookup=true`` so that files
    in nested folders below ``s3_path`` are picked up as well.

    Args:
        s3_path: Root location to read from.

    Returns:
        The DataFrame returned by the ``spark`` session's reader.
    """
    recursive_reader = spark.read.option("recursiveFileLookup", "true")
    return recursive_reader.load(path=s3_path, format='parquet')


# One demographics extract per source folder (crf / share / gcr).
crf_demographics_df = _read_parquet_tree(
    's3://cvm-develop/data_extracts/dev/sample_V2/demographics/crf'
)

share_demographics_df = _read_parquet_tree(
    's3://cvm-develop/data_extracts/dev/sample_V2/demographics/share'
)

gcr_demographics_df = _read_parquet_tree(
    's3://cvm-develop/data_extracts/dev/sample_V2/demographics/gcr'
)

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

In [4]:
# Stamp every source frame with a constant 'bu' column naming the extract it
# came from, so the origin of each row is still known after the frames are
# combined. The right-hand side is built from the current frames before any
# of the three names is rebound.
crf_demographics_df, share_demographics_df, gcr_demographics_df = (
    source_df.withColumn('bu', lit(source_label))
    for source_df, source_label in (
        (crf_demographics_df, 'crf'),
        (share_demographics_df, 'share'),
        (gcr_demographics_df, 'gcr'),
    )
)

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

In [5]:
# Stack the three source frames on top of each other.
# NOTE(review): DataFrame.union matches columns by position, not by name --
# this assumes all three extracts share the same column order; confirm, or
# consider unionByName.
stacked_df = crf_demographics_df.union(share_demographics_df)
stacked_df = stacked_df.union(gcr_demographics_df)

# Record the run date as a string column on every row.
stamped_df = stacked_df.withColumn(
    'ingestion_date',
    current_date().cast(StringType()),
)

# Normalise every column name to lower case in a single pass.
demographics_df = stamped_df.toDF(*(name.lower() for name in stamped_df.columns))

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

In [6]:
# Columns that are forced to string type regardless of how they were read.
sor_code_columns = ('cod_sor_counterparty', 'cod_sor_owner', 'cod_sor_owner_2')

for code_column in sor_code_columns:
    as_string = col(code_column).cast(StringType())
    # withColumn with an existing name replaces that column in place.
    demographics_df = demographics_df.withColumn(code_column, as_string)

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

In [7]:
# Convert the date-like columns to DateType.
# to_date is called without an explicit format, so Spark's default date
# parsing is what applies here.
date_columns = (
    'dat_snapshot',
    'dat_person_optin',
    'dat_person_telephonenumber_optin',
    'dat_person_telephonenumber_optin_2',
    'dat_person_email_optin',
    'dat_person_sms_optin',
    'dat_of_birth',
)

for date_column in date_columns:
    parsed = to_date(date_column)
    demographics_df = demographics_df.withColumn(date_column, parsed)

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

In [8]:
# Print the column names, types and nullability of demographics_df for a
# visual check.
demographics_df.printSchema()

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

root
 |-- dat_snapshot: date (nullable = true)
 |-- idi_counterparty: string (nullable = true)
 |-- cod_sor_counterparty: string (nullable = true)
 |-- cdi_counterparty_role: string (nullable = true)
 |-- des_counterparty_role: string (nullable = true)
 |-- cdi_counterparty_status: string (nullable = true)
 |-- des_counterparty_status: string (nullable = true)
 |-- cde_country_residence: string (nullable = true)
 |-- nam_person_fullname: string (nullable = true)
 |-- nam_person_firstname: string (nullable = true)
 |-- nam_person_initials: string (nullable = true)
 |-- nam_person_lastname: string (nullable = true)
 |-- ind_person_gender: string (nullable = true)
 |-- nam_person_titles: string (nullable = true)
 |-- nam_person_profession: string (nullable = true)
 |-- ind_person_optin: string (nullable = true)
 |-- dat_person_optin: date (nullable = true)
 |-- ide_person_telephonenumber: string (nullable = true)
 |-- ind_person_telephonenumber_optin: string (nullable = true)
 |-- dat_per

In [11]:
# Land the combined frame as parquet, laid out in one folder per
# (bu, ingestion_date) pair.
#
# With mode('overwrite') alone, Spark's default (static) behaviour replaces
# the whole target path, which would delete partitions written on other
# ingestion dates. Setting partitionOverwriteMode=dynamic for this write
# limits the replacement to the partitions present in demographics_df, so a
# re-run on the same day replaces only that day's data.
# NOTE(review): this assumes earlier ingestion_date partitions are meant to
# be kept -- confirm with the owners of the landing bucket.
(
    demographics_df.write
    .mode('overwrite')
    .option('partitionOverwriteMode', 'dynamic')
    .partitionBy('bu', 'ingestion_date')
    .parquet('s3://cvm-landing-a6623c3/demographics')
)

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…