In [1]:
import os
from dotenv import load_dotenv
from pyspark.sql import SparkSession
from pyspark.sql.types import StructType, StructField, IntegerType, StringType, FloatType
from pyspark.storagelevel import StorageLevel
from pyspark.sql.functions import when


# Load environment variables from .env file
load_dotenv()

# Get AWS credentials from environment variables.
# os.getenv returns None for an unset variable, and these values are passed
# straight into the Spark/S3A configuration further down, so validate them
# here and fail with a clear message rather than with a late S3 error.
aws_access_key_id = os.getenv("AWS_ACCESS_KEY_ID")
aws_secret_access_key = os.getenv("AWS_SECRET_ACCESS_KEY")

missing_aws_vars = [
    var_name
    for var_name, var_value in (
        ("AWS_ACCESS_KEY_ID", aws_access_key_id),
        ("AWS_SECRET_ACCESS_KEY", aws_secret_access_key),
    )
    if not var_value  # treats both unset and empty values as missing
]
if missing_aws_vars:
    raise EnvironmentError(
        "Missing required environment variable(s): " + ", ".join(missing_aws_vars)
    )

# Build (or reuse) the Spark session for the trans-table cleansing job.
spark = (
    SparkSession.builder
    .appName("trans table data cleansing")
    # S3A access: credentials read from the environment, plus the endpoint
    .config("spark.hadoop.fs.s3a.access.key", aws_access_key_id)
    .config("spark.hadoop.fs.s3a.secret.key", aws_secret_access_key)
    .config("spark.hadoop.fs.s3a.endpoint", "s3.amazonaws.com")
    # Cluster master URL
    .config("spark.master", "spark://spark-master-2:7077")
    # S3A upload, shuffle partition count and serializer settings
    .config("spark.hadoop.fs.s3a.fast.upload", "true")
    .config("spark.sql.shuffle.partitions", "62")
    .config("spark.serializer", "org.apache.spark.serializer.KryoSerializer")
    .getOrCreate()
)

# Explicit schema for trans.csv (must match the CSV column order).
# Every column is declared nullable.
# NOTE(review): amount and balance use FloatType (32-bit single precision);
# if these are monetary values, confirm that precision is sufficient or
# consider a wider type.
schema = (
    StructType()
    .add("trans_id", IntegerType(), True)
    .add("account_id", IntegerType(), True)
    .add("date", StringType(), True)
    .add("type", StringType(), True)
    .add("operation", StringType(), True)
    .add("amount", FloatType(), True)
    .add("balance", FloatType(), True)
    .add("k_symbol", StringType(), True)
    .add("branch", StringType(), True)
    .add("bank", StringType(), True)
    .add("account", IntegerType(), True)
)

# Load trans.csv from the Bronze folder of the S3 bucket.
# The file is semicolon-delimited and its first row is used as the header;
# the explicit `schema` is applied to the columns.
df = (
    spark.read
    .schema(schema)
    .options(delimiter=";", header="true")
    .csv("s3a://nmourmx-scigility/Bronze/trans/trans.csv")
)

# Persist the DataFrame once, before the first action, so the preview below
# and the later steps reuse the same cached data.
# Only one of cache()/persist() is needed: asking Spark to cache data that is
# already cached just logs "Asked to cache already cached data" and the
# second storage level is not applied.
df = df.persist(StorageLevel.MEMORY_AND_DISK)

# Show the first few rows to verify the data
df.show(20)

# Normalise the misspelled value 'PRJIEM' in the 'type' column to 'PRIJEM';
# every other value (including NULL) is passed through unchanged.
type_col = df["type"]
corrected_type = when(type_col == "PRJIEM", "PRIJEM").otherwise(type_col)
df_fixed = df.withColumn("type", corrected_type)


# Preview the first 20 rows of the corrected DataFrame
df_fixed.show(n=20)

# Count the rows of the corrected DataFrame and report the total
trans_count = df_fixed.count()
print(f"Number of rows in 'trans_df': {trans_count}")

# List the distinct values present in the 'type' column after the fix
distinct_types = df_fixed.select("type").distinct()
distinct_types.show()

# Save the fixed DataFrame to S3 in the Silver folder as Parquet,
# overwriting any previous output at that path.
# The session is stopped in a `finally` clause so that a failed write does
# not leave the Spark application running on the cluster.
try:
    (
        df_fixed.write
        .mode("overwrite")
        .parquet("s3a://nmourmx-scigility/Silver/trans_fixed_parquet/")
    )

    # Optional (currently disabled): also save the distinct 'type' values
    # distinct_types.write \
    #     .mode("overwrite") \
    #     .parquet("s3a://nmourmx-scigility/Silver/trans_distinct_fixed_types/")
finally:
    # Stop the Spark session
    spark.stop()


25/08/01 20:43:33 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
25/08/01 20:43:36 WARN Utils: Service 'SparkUI' could not bind on port 4040. Attempting port 4041.
25/08/01 20:43:38 WARN MetricsConfig: Cannot locate configuration: tried hadoop-metrics2-s3a-file-system.properties,hadoop-metrics2.properties
                                                                                

+--------+----------+----------+------+--------------+-------+-------+--------+------+----+--------+
|trans_id|account_id|      date|  type|     operation| amount|balance|k_symbol|branch|bank| account|
+--------+----------+----------+------+--------------+-------+-------+--------+------+----+--------+
|  967842|      1532|1998-05-31|PRIJEM|          NULL|  253.0|62661.0|    UROK|    AR|NULL|       0|
|  271012|      1499|1998-01-09|PRIJEM|         VKLAD| 5500.0|49790.0|    NULL|    AR|NULL|       0|
|  971490|      1645|1996-11-30|PRIJEM|          NULL|   43.0|19460.0|    UROK|    AR|NULL|       0|
|  605699|      3366|1996-10-28| VYDAJ|         VYBER| 9300.0|38433.0|    NULL|    AR|NULL|       0|
|  443107|      2471|1998-07-30| VYDAJ|         VYBER| 7800.0|56237.0|    NULL|    AR|NULL|       0|
|  627433|      3503|1995-06-07| VYDAJ|         VYBER|28640.0|41949.0|    NULL|    AR|NULL|       0|
|  390740|      2162|1997-12-03|PRIJEM|         VKLAD|11111.0|38640.0|    NULL|    AR|NULL|

25/08/01 20:44:17 WARN CacheManager: Asked to cache already cached data.


+--------+----------+----------+------+--------------+-------+-------+--------+------+----+--------+
|trans_id|account_id|      date|  type|     operation| amount|balance|k_symbol|branch|bank| account|
+--------+----------+----------+------+--------------+-------+-------+--------+------+----+--------+
|  967842|      1532|1998-05-31|PRIJEM|          NULL|  253.0|62661.0|    UROK|    AR|NULL|       0|
|  271012|      1499|1998-01-09|PRIJEM|         VKLAD| 5500.0|49790.0|    NULL|    AR|NULL|       0|
|  971490|      1645|1996-11-30|PRIJEM|          NULL|   43.0|19460.0|    UROK|    AR|NULL|       0|
|  605699|      3366|1996-10-28| VYDAJ|         VYBER| 9300.0|38433.0|    NULL|    AR|NULL|       0|
|  443107|      2471|1998-07-30| VYDAJ|         VYBER| 7800.0|56237.0|    NULL|    AR|NULL|       0|
|  627433|      3503|1995-06-07| VYDAJ|         VYBER|28640.0|41949.0|    NULL|    AR|NULL|       0|
|  390740|      2162|1997-12-03|PRIJEM|         VKLAD|11111.0|38640.0|    NULL|    AR|NULL|

                                                                                

+------+
|  type|
+------+
|PRIJEM|
| VYBER|
| VYDAJ|
+------+



                                                                                