In [1]:
import os
from dotenv import load_dotenv
from pyspark.sql import SparkSession
from pyspark.sql.types import StructType, StructField, IntegerType, StringType, FloatType
from pyspark.storagelevel import StorageLevel
from pyspark.sql.functions import avg, desc, broadcast, col
# Load environment variables from a local .env file into the process environment
load_dotenv()

# AWS credentials handed to the S3A connector below
aws_access_key_id = os.getenv("AWS_ACCESS_KEY_ID")
aws_secret_access_key = os.getenv("AWS_SECRET_ACCESS_KEY")

# Fail fast with a clear message: an unset variable would otherwise be passed
# into the Spark configuration unchecked and only surface later as an opaque
# S3 authentication failure on the first read.
_missing_credentials = [
    name
    for name, value in (
        ("AWS_ACCESS_KEY_ID", aws_access_key_id),
        ("AWS_SECRET_ACCESS_KEY", aws_secret_access_key),
    )
    if not value
]
if _missing_credentials:
    raise RuntimeError(
        "Missing required environment variable(s): "
        + ", ".join(_missing_credentials)
    )

# Initialize the Spark session: S3A credentials/endpoint, the standalone
# cluster master URL, shuffle parallelism, and Kryo serialization.
spark = (
    SparkSession.builder
    .appName("average loan amount per district")
    .config("spark.hadoop.fs.s3a.access.key", aws_access_key_id)
    .config("spark.hadoop.fs.s3a.secret.key", aws_secret_access_key)
    .config("spark.hadoop.fs.s3a.endpoint", "s3.amazonaws.com")
    .config("spark.master", "spark://spark-master-2:7077")
    .config("spark.hadoop.fs.s3a.fast.upload", "true")
    .config("spark.sql.shuffle.partitions", "62")
    .config("spark.serializer", "org.apache.spark.serializer.KryoSerializer")
    .getOrCreate()
)

# Explicit schemas for the three CSV inputs. Every column is nullable.

# loan.csv
loan_schema = (
    StructType()
    .add("loan_id", IntegerType(), True)
    .add("account_id", IntegerType(), True)
    .add("date", StringType(), True)
    .add("amount", IntegerType(), True)
    .add("duration", IntegerType(), True)
    .add("payments", FloatType(), True)
    .add("status", StringType(), True)
)

# account.csv
account_schema = (
    StructType()
    .add("account_id", IntegerType(), True)
    .add("district_id", IntegerType(), True)
    .add("frequency", StringType(), True)
    .add("date", StringType(), True)
)

# district.csv -- only the two leading columns are declared here;
# append further .add(...) calls if more fields are needed.
district_schema = (
    StructType()
    .add("district_id", IntegerType(), True)
    .add("name", StringType(), True)
)

def _read_bronze_csv(path, schema):
    """Read one semicolon-delimited CSV with a header row, applying *schema*."""
    return (
        spark.read
        .options(delimiter=";", header="true")
        .schema(schema)
        .csv(path)
    )


# Bronze-layer inputs: loans, accounts, and districts
loan_df = _read_bronze_csv(
    "s3a://nmourmx-scigility/Bronze/loan/loan.csv", loan_schema
)
account_df = _read_bronze_csv(
    "s3a://nmourmx-scigility/Bronze/account/account.csv", account_schema
)
district_df = _read_bronze_csv(
    "s3a://nmourmx-scigility/Bronze/district/district.csv", district_schema
)



# Join loan -> account -> district, compute the average loan amount per
# district, preview it, and write it to the Silver layer.
# The whole section runs under try/finally so the Spark session is stopped
# (and cluster resources released) even when a read, join, or write fails.
try:
    loan_account_df = loan_df.join(account_df, on="account_id", how="inner")
    # district_df is explicitly broadcast to the join
    loan_account_district_df = loan_account_df.join(
        broadcast(district_df), on="district_id", how="inner"
    )

    # Average loan amount per district. The result feeds two actions below
    # (show and write), so cache it to avoid re-reading the inputs from S3 and
    # recomputing both joins for the second action.
    avg_df = (
        loan_account_district_df
        .groupBy("district_id", "name")
        .agg(avg("amount").alias("average_loan_amount"))
        .cache()
    )

    # Preview ordering: highest district_id first
    sorted_avg_df = avg_df.orderBy(col("district_id").desc())

    # Show the first 20 districts in descending district_id order
    sorted_avg_df.show(20, truncate=False)

    # Hash-repartition by district_id and sort each partition by district_id.
    # This starts from avg_df rather than sorted_avg_df: repartition() reshuffles
    # the rows, so a preceding global sort would be discarded work and the
    # written result is the same either way.
    final_df = (
        avg_df
        .repartition(10, "district_id")
        .sortWithinPartitions("district_id")
    )

    # Write the partition-sorted data to S3 in Parquet format
    final_df.write \
        .mode("overwrite") \
        .parquet("s3a://nmourmx-scigility/Silver/loan_avg_by_district_named_sorted/")
finally:
    # Stop Spark session
    spark.stop()


25/08/01 20:46:49 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
25/08/01 20:46:59 WARN MetricsConfig: Cannot locate configuration: tried hadoop-metrics2-s3a-file-system.properties,hadoop-metrics2.properties
                                                                                

+-----------+----------------+-------------------+
|district_id|name            |average_loan_amount|
+-----------+----------------+-------------------+
|77         |Vsetin          |81830.0            |
|76         |Sumperk         |165928.8           |
|75         |Prerov          |173907.0           |
|74         |Ostrava - mesto |163011.0           |
|73         |Opava           |186634.5           |
|72         |Olomouc         |163399.7142857143  |
|71         |Novy Jicin      |115728.0           |
|70         |Karvina         |127492.5           |
|69         |Jesenik         |242304.0           |
|68         |Frydek - Mistek |128359.5           |
|67         |Bruntal         |212966.0           |
|66         |Zdar nad Sazavou|189505.7142857143  |
|65         |Znojmo          |157396.0           |
|64         |Zlin            |177221.64705882352 |
|63         |Vyskov          |111027.42857142857 |
|62         |Uherske Hradiste|168204.0           |
|61         |Trebic          |1

                                                                                