In [0]:
from pyspark.sql.functions import col, year, month, dayofmonth, date_format, sum, avg, count, desc, round, when, expr, lit
from pyspark.sql.window import Window

# ---------------------------------------------
# 1. Storage account configuration
# ---------------------------------------------
import os

# Single source of truth for the account name; the paths and the Spark
# configuration key below are all derived from it.
storage_account_name = "kivastorageacc2"

# The account key is a secret and must never be committed inside the notebook.
# It is read from the KIVA_STORAGE_ACCOUNT_KEY environment variable instead.
# NOTE(review): if this runs on a platform with a secrets manager (e.g. a
# Databricks secret scope), prefer that over an environment variable.
storage_account_key = os.environ.get("KIVA_STORAGE_ACCOUNT_KEY")
if not storage_account_key:
    # Fail here with a clear message rather than later with an opaque
    # authentication error on the first read.
    raise RuntimeError(
        "Environment variable KIVA_STORAGE_ACCOUNT_KEY is not set; "
        f"cannot authenticate against storage account '{storage_account_name}'."
    )

# Register the key with the Spark session so abfss:// paths on this account
# can be accessed.
spark.conf.set(
    f"fs.azure.account.key.{storage_account_name}.dfs.core.windows.net",
    storage_account_key
)

# ---------------------------------------------
# 2. Azure storage path definitions
# ---------------------------------------------
# Input (silver) and output (gold) locations on the storage account.
silver_base_path = f"abfss://kiva-silver@{storage_account_name}.dfs.core.windows.net/kiva-data-clean"
golden_base_path = f"abfss://kiva-gold@{storage_account_name}.dfs.core.windows.net/kiva-analytics"


In [0]:
# ---------------------------------------------
# 3. Read silver data
# ---------------------------------------------
# Load every Parquet file found beneath the silver base path. With
# "recursiveFileLookup" enabled the reader descends into nested sub-folders
# (per the Spark docs this also disables partition inference).
df_silver = (
    spark.read
    .format("parquet")
    .option("recursiveFileLookup", "true")
    .load(silver_base_path)
)

In [0]:
# ---------------------------------------------
# 4. Data transformations for gold layer
# ---------------------------------------------

# 4.1. Country-based loan summary
# One row per country: number of loans (rows with a non-null id), total loan
# amount and mean loan amount, both rounded to 2 decimals.
# NOTE: `sum` and `round` here are the pyspark.sql.functions versions imported
# at the top of the notebook, not the Python built-ins.
country_metrics = [
    count("id").alias("total_loans"),
    round(sum("loan_amount"), 2).alias("total_loan_amount"),
    round(avg("loan_amount"), 2).alias("avg_loan_amount"),
]

# Countries with the most loans come first.
country_summary = (
    df_silver
    .groupBy("country")
    .agg(*country_metrics)
    .sort(col("total_loans").desc())
)


In [0]:
# 4.2. Date-based trend analysis
# posted_time is rendered as a "yyyy-MM-dd" string, so loan_date is a string
# column; rows are then aggregated per calendar day.
loan_day = date_format(col("posted_time"), "yyyy-MM-dd").alias("loan_date")

daily_metrics = [
    count("id").alias("daily_loan_count"),
    round(sum("loan_amount"), 2).alias("daily_loan_amount"),
]

# Chronological order (the ISO-style string sorts the same way as the date).
date_summary = (
    df_silver
    .groupBy(loan_day)
    .agg(*daily_metrics)
    .sort("loan_date")
)


In [0]:
# 4.3. Sector-based loan summary
# One row per sector: loan count, total amount and mean amount (2 decimals),
# ordered from the sector with the most loans to the one with the fewest.
loans_by_sector = df_silver.groupBy("sector")

sector_summary = loans_by_sector.agg(
    count("id").alias("sector_loan_count"),
    round(sum("loan_amount"), 2).alias("sector_loan_amount"),
    round(avg("loan_amount"), 2).alias("sector_avg_loan"),
).sort(col("sector_loan_count").desc())

In [0]:
# 4.4. Loan distribution by gender
# Groups on the raw "borrower_genders" value as stored in the silver data and
# reports loan count, total amount and mean amount (2 decimals) per value.
loans_by_gender = df_silver.groupBy("borrower_genders")

gender_summary = loans_by_gender.agg(
    count("id").alias("gender_loan_count"),
    round(sum("loan_amount"), 2).alias("gender_loan_amount"),
    round(avg("loan_amount"), 2).alias("gender_avg_loan"),
)

In [0]:
# 4.5. Risk evaluation
# Loans are bucketed by amount:
#   amount < 500            -> "Low"
#   500 <= amount <= 2000   -> "Medium"
#   anything else           -> "High"
# Rows whose loan_amount is null match neither condition and therefore also
# land in "High".
amount = col("loan_amount")
risk_band = (
    when(amount < 500, "Low")
    .when((amount >= 500) & (amount <= 2000), "Medium")
    .otherwise("High")
)

# Loan count and total amount (2 decimals) per risk bucket.
risk_analysis = (
    df_silver
    .groupBy(risk_band.alias("risk_score"))
    .agg(
        count("id").alias("risk_loan_count"),
        round(sum("loan_amount"), 2).alias("risk_loan_amount"),
    )
)


In [0]:
# ---------------------------------------------
# 5. Writing data to the gold layer
# ---------------------------------------------

# 5.1 - 5.3. Country, date-trend and sector summaries.
# Each summary is written as Parquet to its own folder under the gold base
# path; any existing output in that folder is replaced ("overwrite").
gold_outputs = (
    (country_summary, f"{golden_base_path}/country_analysis"),
    (date_summary, f"{golden_base_path}/date_analysis"),
    (sector_summary, f"{golden_base_path}/sector_analysis"),
)

for summary_df, target_path in gold_outputs:
    summary_df.write.mode("overwrite").parquet(target_path)

In [0]:
# 5.4. Gender analysis
# gender_summary is grouped by the "borrower_genders" column, so the guard
# must test for that exact name. The previous check looked for
# "borrower_gender" (singular), which does not match the grouped column and
# caused this output to be skipped silently.
if "borrower_genders" in df_silver.columns:
    gender_summary.write.mode("overwrite").parquet(f"{golden_base_path}/gender_analysis")

# 5.5. Risk analysis
# Written unconditionally; replaces any existing output in the target folder.
risk_analysis.write.mode("overwrite").parquet(f"{golden_base_path}/risk_analysis")

In [0]:
# ---------------------------------------------
# 6. Creating a Summary Table
# ---------------------------------------------
# Combined view: loan count, total amount and mean amount (2 decimals) per
# country / sector / posting year / posting month / risk category.
posted = col("posted_time")
amount = col("loan_amount")

# Same amount thresholds as the risk evaluation: below 500 is "Low",
# 500 to 2000 inclusive is "Medium", everything else (including null
# amounts) is "High".
risk_category = (
    when(amount < 500, "Low")
    .when((amount >= 500) & (amount <= 2000), "Medium")
    .otherwise("High")
)

loans_enriched = (
    df_silver
    .withColumn("year", year(posted))
    .withColumn("month", month(posted))
    .withColumn("risk_category", risk_category)
)

summary_view = (
    loans_enriched
    .groupBy("country", "sector", "year", "month", "risk_category")
    .agg(
        count("id").alias("num_loans"),
        round(sum("loan_amount"), 2).alias("total_amount"),
        round(avg("loan_amount"), 2).alias("avg_amount"),
    )
)

In [0]:
# Persist the summary table as Parquet, partitioned on disk by year and month;
# any existing output at the target path is replaced.
summary_target = f"{golden_base_path}/summary_view"
(
    summary_view.write
    .partitionBy("year", "month")
    .mode("overwrite")
    .parquet(summary_target)
)

# Completion message (Turkish for "Gold layer created successfully!").
print("Gold katman başarıyla oluşturuldu!")

# Optional alternative: write in Delta format instead (requires Delta Lake to
# be available; the format name is "delta").
# summary_view.write.format("delta").mode("overwrite").partitionBy("year", "month").save(f"{golden_base_path}/summary_view_delta")

Gold katman başarıyla oluşturuldu!
