In [2]:
import os
from dotenv import load_dotenv
from pyspark.sql import SparkSession
from pyspark.sql.types import StructType, StructField, IntegerType, StringType, DateType
from pyspark.storagelevel import StorageLevel

# Load environment variables from a local .env file (if one exists) so the
# AWS credentials below do not have to be hard-coded in the notebook.
load_dotenv()

# Get AWS credentials from environment variables
aws_access_key_id = os.getenv("AWS_ACCESS_KEY_ID")
aws_secret_access_key = os.getenv("AWS_SECRET_ACCESS_KEY")

# Fail fast when a credential is missing: os.getenv() returns None for an
# unset variable, and handing that to the S3A settings below would most
# likely only surface later as a hard-to-diagnose error on the first S3 access.
_missing_credentials = [
    name
    for name, value in (
        ("AWS_ACCESS_KEY_ID", aws_access_key_id),
        ("AWS_SECRET_ACCESS_KEY", aws_secret_access_key),
    )
    if not value
]
if _missing_credentials:
    raise RuntimeError(
        "Missing required environment variable(s): "
        + ", ".join(_missing_credentials)
    )

# Initialize the Spark session against the standalone master, passing the
# credentials and endpoint to the Hadoop S3A connector used for s3a:// paths.
spark = (
    SparkSession.builder
    .appName("CSV to Parquet spark master 3")
    .config("spark.hadoop.fs.s3a.access.key", aws_access_key_id)
    .config("spark.hadoop.fs.s3a.secret.key", aws_secret_access_key)
    .config("spark.hadoop.fs.s3a.endpoint", "s3.amazonaws.com")
    .master("spark://spark-master-3:7077")
    .getOrCreate()
)

# Explicit schema for card.csv, built from (column name, Spark type) pairs in
# file order. Every column is declared nullable. 'issued' is read as a string
# here and converted to a date after loading.
schema = StructType([
    StructField(column_name, column_type, True)
    for column_name, column_type in (
        ("card_id", IntegerType()),
        ("disp_id", IntegerType()),
        ("type", StringType()),
        ("issued", StringType()),
    )
])

# Load card.csv from the Bronze area of the S3 bucket. The file is
# semicolon-delimited and its first row is a header; the explicit schema
# is applied instead of inferring column types.
df = (
    spark.read
    .schema(schema)
    .option("header", "true")
    .option("delimiter", ";")
    .csv("s3a://nmourmx-scigility/Bronze/card/card.csv")  # card.csv inside the card folder
)

# Cache the raw DataFrame: two actions follow in this script (the preview
# below and the Parquet write), so rows read for the first can be reused.
df = df.cache()

# Show the first few rows to verify the data
df.show(20)

# Convert the 'issued' column from string to a proper date type.
# No further persist() is applied here: in this script the converted
# DataFrame is consumed by a single action (the write), so storing a second
# copy on top of the cached parent would add cost without any reuse.
df = df.withColumn("issued", df["issued"].cast(DateType()))

# Save the DataFrame to S3 in the Silver folder as Parquet, overwriting any
# existing output at that location.
try:
    (
        df.write
        .mode("overwrite")
        .parquet("s3a://nmourmx-scigility/Silver/card_parquet/")
    )
finally:
    # Stop the Spark session even when the write fails, so a failed run does
    # not leave the application holding resources on the cluster.
    spark.stop()


+-------+-------+-------+----------+
|card_id|disp_id|   type|    issued|
+-------+-------+-------+----------+
|      1|      9|   gold|1998-10-16|
|      2|     19|classic|1998-03-13|
|      3|     41|   gold|1995-09-03|
|      4|     42|classic|1998-11-26|
|      5|     51| junior|1995-04-24|
|      7|     56|classic|1998-06-11|
|      8|     60| junior|1998-05-20|
|      9|     76|classic|1997-10-25|
|     10|     77|classic|1996-12-07|
|     11|     79|   gold|1997-10-25|
|     12|     83| junior|1996-09-11|
|     13|     87|classic|1994-06-29|
|     14|    112|classic|1996-02-17|
|     15|    114|classic|1995-03-05|
|     16|    116|classic|1998-06-23|
|     17|    127|classic|1998-06-07|
|     18|    128|classic|1995-08-25|
|     19|    130|classic|1997-09-09|
|     20|    131|classic|1998-12-02|
|     21|    132|classic|1998-02-26|
+-------+-------+-------+----------+
only showing top 20 rows



                                                                                