In [2]:
import os
import shutil
from typing import List

from pyspark.sql import DataFrame, SparkSession, SQLContext
from pyspark.sql.functions import to_timestamp

In [3]:
# Transformation function from trex-dags

def transform_columns_to_timestamp(
    df: DataFrame, timestamp_fields: List[str]
) -> DataFrame:
    """Convert the named columns of ``df`` to timestamps.

    Every column listed in ``timestamp_fields`` is replaced, under the same
    name, by ``to_timestamp`` applied to it. All other columns are left
    untouched.

    Args:
        df: DataFrame holding the columns to convert.
        timestamp_fields: Names of the columns to convert. An empty list
            returns ``df`` unchanged.

    Returns:
        A DataFrame in which each listed column has been passed through
        ``to_timestamp``.
    """
    for timestamp_field in timestamp_fields:
        # Overwrite the column under its own name instead of going through a
        # temporary "<name>_ts" column: that temporary name could collide
        # with (and silently replace) a real column, and the add/drop/rename
        # sequence moved every converted column to the end of the schema.
        df = df.withColumn(timestamp_field, to_timestamp(timestamp_field))

    return df


In [11]:
# Memory setting applied to both the driver and the executors.
MAX_MEMORY = "5g"

spark = (
    SparkSession.builder
    .appName("Pagar.me Data Cluster")
    .config("spark.executor.memory", MAX_MEMORY)
    .config("spark.driver.memory", MAX_MEMORY)
    .getOrCreate()
)


21/08/31 11:17:03 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
Using Spark's default log4j profile: org/apache/spark/log4j-defaults.properties
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).


In [9]:
# Location of the parquet input and of the processed output.
input_path = os.path.join(".", "teste")
output_path = os.path.join(".", "output_spark", "processed_files")

# Start from a clean output directory. os.rmdir only removes *empty*
# directories, so it raises OSError as soon as a previous run has left files
# in the output directory; shutil.rmtree removes the directory with its contents.
if os.path.isdir(output_path):
    shutil.rmtree(output_path)

# Columns that must be converted to timestamps.
timestamp_fields = [
    "created_at",
    "updated_at",
    "boleto_expiration_date",
    "local_time",
    "pix_expiration_date",
]


In [13]:
# Load the raw parquet input.
df = spark.read.parquet(input_path)

transformed_df = transform_columns_to_timestamp(df, timestamp_fields)

# The writer's default save mode raises when the target path already exists,
# which makes re-running this cell on its own fail. "overwrite" keeps the
# cell idempotent.
(
    transformed_df
    .repartition(100)
    .write
    .mode("overwrite")
    .option("compression", "snappy")
    .parquet(output_path)
)

21/08/31 11:18:22 WARN package: Truncated the string representation of a plan since it was too large. This behavior can be adjusted by setting 'spark.sql.debug.maxToStringFields'.
                                                                                

In [19]:
# Read the written files back so the persisted schema can be inspected.
df_processed = (
    spark.read
    .parquet(output_path)
)

In [20]:
# Column types after the conversion to timestamp
[
    (column_name, column_type)
    for column_name, column_type in df_processed.dtypes
    if column_name in timestamp_fields
]

[('created_at', 'timestamp'),
 ('updated_at', 'timestamp'),
 ('boleto_expiration_date', 'timestamp'),
 ('local_time', 'timestamp'),
 ('pix_expiration_date', 'timestamp')]

In [21]:
# Column types before the conversion to timestamp
[
    (column_name, column_type)
    for column_name, column_type in df.dtypes
    if column_name in timestamp_fields
]

[('boleto_expiration_date', 'string'),
 ('created_at', 'string'),
 ('updated_at', 'string'),
 ('local_time', 'string'),
 ('pix_expiration_date', 'string')]