In [1]:
import os
from dotenv import load_dotenv
from pyspark.sql import SparkSession, DataFrame
from pyspark.sql.functions import max, min

class ParquetPipeline:
    """Read Parquet datasets through Spark and print a short report for each.

    The constructor builds (or reuses) a Spark session configured for S3A
    access. ``process_all`` then reads every given path, prints the first
    rows and the row count, and releases the cached data before moving on.
    """

    # Directory suffix stripped from the last path component to get a name.
    _NAME_SUFFIX = "_parquet"

    def __init__(
        self,
        app_name: str = "MultiParquetPipeline",
        master: str = "spark://spark-master-2:7077",
    ):
        """Create the Spark session used by the pipeline.

        Args:
            app_name: Application name passed to the Spark session builder.
            master: Spark master URL; defaults to the previously hard-coded
                cluster address.
        """
        load_dotenv()
        aws_access_key_id = os.getenv("AWS_ACCESS_KEY_ID")
        aws_secret_access_key = os.getenv("AWS_SECRET_ACCESS_KEY")

        builder = (
            SparkSession.builder
            .appName(app_name)
            .config("spark.hadoop.fs.s3a.endpoint", "s3.amazonaws.com")
            .config("spark.master", master)
            .config("spark.hadoop.fs.s3a.fast.upload", "true")
            .config("spark.sql.shuffle.partitions", "62")
            .config("spark.serializer", "org.apache.spark.serializer.KryoSerializer")
        )

        # Only pass explicit keys when both are present. Handing a missing
        # (None) value to ``config`` either fails inside Spark or installs a
        # bogus literal credential, depending on the PySpark version.
        if aws_access_key_id and aws_secret_access_key:
            builder = (
                builder
                .config("spark.hadoop.fs.s3a.access.key", aws_access_key_id)
                .config("spark.hadoop.fs.s3a.secret.key", aws_secret_access_key)
            )
        else:
            print(
                "[WARN] AWS_ACCESS_KEY_ID / AWS_SECRET_ACCESS_KEY not set; "
                "not configuring explicit S3A credentials"
            )

        self.spark = builder.getOrCreate()
        print(f"[INFO] Spark session started: {app_name}")

    @staticmethod
    def extract_name(path: str) -> str:
        """Return the dataset name encoded in the last component of ``path``.

        Surrounding slashes are ignored and a trailing ``_parquet`` suffix is
        removed, e.g. ``".../Silver/trans_fixed_parquet/"`` -> ``"trans_fixed"``.
        Only a trailing suffix is stripped; ``_parquet`` occurring elsewhere
        in the name is kept.
        """
        name = path.strip('/').split('/')[-1]
        suffix = ParquetPipeline._NAME_SUFFIX
        if name.endswith(suffix):
            name = name[:-len(suffix)]
        return name

    def read_parquet(self, path: str) -> "DataFrame":
        """Read the Parquet data at ``path`` and return it marked for caching.

        The caller is responsible for calling ``unpersist()`` on the result
        once it is no longer needed.
        """
        print(f"[INFO] Reading Parquet data from: {path}")
        df = self.spark.read.parquet(path)
        return df.cache()

    def report(self, df: "DataFrame", name: str):
        """Print the first five rows of ``df`` and its total row count."""
        print(f"\n--- Dataset: {name} ---")
        df.show(5)
        total_rows = df.count()
        print(f"[INFO] Total rows in {name}: {total_rows}")

    def process_all(self, parquet_paths: list):
        """Read and report on every path in ``parquet_paths``, in order."""
        for path in parquet_paths:
            name = self.extract_name(path)
            df = self.read_parquet(path)
            try:
                self.report(df, name)
            finally:
                # Release the cached data even if reporting fails, so cached
                # datasets do not accumulate across iterations.
                df.unpersist()

    def stop(self):
        """Stop the underlying Spark session."""
        print("[INFO] Stopping Spark session")
        self.spark.stop()


if __name__ == "__main__":
    # Silver-layer datasets to report on, one S3A directory per table.
    parquet_paths = [
        "s3a://nmourmx-scigility/Silver/account_parquet/",
        "s3a://nmourmx-scigility/Silver/card_parquet/",
        "s3a://nmourmx-scigility/Silver/client_parquet/",
        "s3a://nmourmx-scigility/Silver/disp_parquet/",
        "s3a://nmourmx-scigility/Silver/district_parquet/",
        "s3a://nmourmx-scigility/Silver/loan_parquet/",
        "s3a://nmourmx-scigility/Silver/order_parquet/",
        "s3a://nmourmx-scigility/Silver/trans_fixed_parquet/"
    ]

    pipeline = ParquetPipeline()
    try:
        pipeline.process_all(parquet_paths)
    finally:
        # Always stop the session, even when a dataset fails to load, so the
        # Spark session is not left running after an error.
        pipeline.stop()


25/08/01 20:48:12 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).


[INFO] Spark session started: MultiParquetPipeline
[INFO] Reading Parquet data from: s3a://nmourmx-scigility/Silver/account_parquet/


25/08/01 20:48:23 WARN MetricsConfig: Cannot locate configuration: tried hadoop-metrics2-s3a-file-system.properties,hadoop-metrics2.properties
                                                                                


--- Dataset: account ---


                                                                                

+----------+-----------+----------------+----------+
|account_id|district_id|       frequency|      date|
+----------+-----------+----------------+----------+
|         1|         18|POPLATEK MESICNE|1995-03-24|
|         2|          1|POPLATEK MESICNE|1993-02-26|
|         3|          5|POPLATEK MESICNE|1997-07-07|
|         4|         12|POPLATEK MESICNE|1996-02-21|
|         5|         15|POPLATEK MESICNE|1997-05-30|
+----------+-----------+----------------+----------+
only showing top 5 rows

[INFO] Total rows in account: 4500
[INFO] Reading Parquet data from: s3a://nmourmx-scigility/Silver/card_parquet/


                                                                                


--- Dataset: card ---
+-------+-------+-------+----------+
|card_id|disp_id|   type|    issued|
+-------+-------+-------+----------+
|      1|      9|   gold|1998-10-16|
|      2|     19|classic|1998-03-13|
|      3|     41|   gold|1995-09-03|
|      4|     42|classic|1998-11-26|
|      5|     51| junior|1995-04-24|
+-------+-------+-------+----------+
only showing top 5 rows

[INFO] Total rows in card: 892
[INFO] Reading Parquet data from: s3a://nmourmx-scigility/Silver/client_parquet/


                                                                                


--- Dataset: client ---


                                                                                

+---------+------+----------+-----------+
|client_id|gender|birth_date|district_id|
+---------+------+----------+-----------+
|        1|     F|1970-12-13|         18|
|        2|     M|1945-02-04|          1|
|        3|     F|1940-10-09|          1|
|        4|     M|1956-12-01|          5|
|        5|     F|1960-07-03|          5|
+---------+------+----------+-----------+
only showing top 5 rows

[INFO] Total rows in client: 5369
[INFO] Reading Parquet data from: s3a://nmourmx-scigility/Silver/disp_parquet/

--- Dataset: disp ---


                                                                                

+-------+---------+----------+---------+
|disp_id|client_id|account_id|     type|
+-------+---------+----------+---------+
|      1|        1|         1|    OWNER|
|      2|        2|         2|    OWNER|
|      3|        3|         2|DISPONENT|
|      4|        4|         3|    OWNER|
|      5|        5|         3|DISPONENT|
+-------+---------+----------+---------+
only showing top 5 rows

[INFO] Total rows in disp: 5369
[INFO] Reading Parquet data from: s3a://nmourmx-scigility/Silver/district_parquet/

--- Dataset: district ---
+-----------+-------------+---------------+----------+---+---+---+---+---+----+-----+----+----+-----+-----+-----+----+----+----+
|district_id|district_name|         region|population| A2| A3| A4| A5| A6|  A7|   A8|  A9| A10|  A11|  A12|  A13| A14| A15| A16|
+-----------+-------------+---------------+----------+---+---+---+---+---+----+-----+----+----+-----+-----+-----+----+----+----+
|          1|  Hl.m. Praha|         Prague|   1204953|  0|  0|  0|  1|  1|NUL

                                                                                

+--------+----------+----------+------+---------+------+-------+--------+------+----+-------+
|trans_id|account_id|      date|  type|operation|amount|balance|k_symbol|branch|bank|account|
+--------+----------+----------+------+---------+------+-------+--------+------+----+-------+
|  967842|      1532|1998-05-31|PRIJEM|     NULL| 253.0|62661.0|    UROK|    AR|NULL|      0|
|  271012|      1499|1998-01-09|PRIJEM|    VKLAD|5500.0|49790.0|    NULL|    AR|NULL|      0|
|  971490|      1645|1996-11-30|PRIJEM|     NULL|  43.0|19460.0|    UROK|    AR|NULL|      0|
|  605699|      3366|1996-10-28| VYDAJ|    VYBER|9300.0|38433.0|    NULL|    AR|NULL|      0|
|  443107|      2471|1998-07-30| VYDAJ|    VYBER|7800.0|56237.0|    NULL|    AR|NULL|      0|
+--------+----------+----------+------+---------+------+-------+--------+------+----+-------+
only showing top 5 rows



                                                                                

[INFO] Total rows in trans_fixed: 1056410
[INFO] Stopping Spark session
