In [7]:
# This script computes the premium allocation rates (earned rate and UPR rate) for each day
# between the policy issue date and the expiry date, then writes the result to a Delta table.

from pyspark.sql import SparkSession
from pyspark.sql.functions import col, lit, datediff, expr, current_timestamp, explode, sequence, to_date, when, year, coalesce
from pyspark.sql.window import Window
import pyspark.sql.functions as F
from pyspark.sql.functions import broadcast

def get_spark_session():
    """Return the SparkSession used by this job.

    The session is obtained through ``getOrCreate()`` under the application
    name "MotorDataEarnedCalculate", requesting 200 shuffle partitions and a
    default parallelism of 200.
    """
    settings = {
        "spark.sql.shuffle.partitions": "200",
        "spark.default.parallelism": "200",
    }
    builder = SparkSession.builder.appName("MotorDataEarnedCalculate")
    for key, value in settings.items():
        builder = builder.config(key, value)
    return builder.getOrCreate()

# Module-level session shared by every step below.
spark = get_spark_session()

# Load the source table of policy issue/expiry dates and mark it for caching.
try:
    raw_data = spark.read.format("delta").table("raw_motor_data").cache()
except Exception as e:
    print(f"Error reading raw_motor_data: {str(e)}")
    # Chain the original failure explicitly so it is reported as the direct
    # cause of the user-facing error instead of an incidental context.
    raise Exception("Please ensure raw_motor_data exists in the Lakehouse.") from e

# Derive the working period columns P_DATE_ISSUE / P_DATE_EXPIRY.
_issue = col("std_v_date_issue")
_expiry = col("std_v_date_expiry")

# The expiry date is replaced when it is not after the issue date, or when the
# period spans more than 15 years (day difference divided by 365).
_expiry_not_after_issue = _expiry <= _issue
_period_over_15_years = (datediff(_expiry, _issue) / 365) > 15

std_data = raw_data.withColumn("P_DATE_ISSUE", _issue)
std_data = std_data.withColumn(
    "P_DATE_EXPIRY",
    # Replaced periods collapse to a single day after the issue date.
    when(
        _expiry_not_after_issue | _period_over_15_years,
        F.date_add(_issue, 1),
    ).otherwise(_expiry),
)

# Expand every policy period into one row per calendar day, from P_DATE_ISSUE
# through P_DATE_EXPIRY inclusive; the day is stored in DAY_SPLIT.
#
# Duplicate (issue, expiry) pairs are removed first. The running sums computed
# afterwards are windowed per policy period and ordered by DAY_SPLIT with
# row-based frames, so a repeated pair would place several rows for the same
# day in one window and inflate the cumulative rates.
std_days = std_data.dropDuplicates(
    ["std_v_date_issue", "std_v_date_expiry"]
).withColumn(
    "DAY_SPLIT",
    explode(
        sequence(
            to_date(col("P_DATE_ISSUE")),
            to_date(col("P_DATE_EXPIRY")),
            expr("interval 1 day"),
        )
    ),
)

# Flat per-day share of the premium: 1 / (days between issue and expiry).
# NOTE(review): the exploded day sequence includes both end dates, so there is
# one more DAY_SPLIT row than the divisor — confirm that the expiry day is
# meant to receive only the remainder.
_period_days = datediff(col("P_DATE_EXPIRY"), col("P_DATE_ISSUE"))
std_final = std_days.withColumn("PERCENT_PREMIUM", 1.0 / _period_days)

# One window per policy period, with its days in chronological order.
window_spec = Window.partitionBy("P_DATE_ISSUE", "P_DATE_EXPIRY").orderBy("DAY_SPLIT")

# Row frames: everything up to and including the current day, and everything
# strictly before it.
_through_current_day = window_spec.rowsBetween(Window.unboundedPreceding, 0)
_before_current_day = window_spec.rowsBetween(Window.unboundedPreceding, -1)
_on_expiry_day = col("DAY_SPLIT") == col("P_DATE_EXPIRY")

# Running total of the flat daily share.
std_final = std_final.withColumn(
    "CUMULATIVE_PERCENT_PREMIUM",
    F.sum("PERCENT_PREMIUM").over(_through_current_day),
)

# On the expiry day the share is whatever is left after all earlier days, so
# the shares of a period add up to 1; every other day keeps the flat share.
_remaining_share = 1.0 - F.sum("PERCENT_PREMIUM").over(_before_current_day)
std_final = std_final.withColumn(
    "PERCENT_PREMIUM_ADJ",
    when(_on_expiry_day, _remaining_share).otherwise(col("PERCENT_PREMIUM")),
)

# A negative remainder is floored at zero.
std_final = std_final.withColumn(
    "PERCENT_PREMIUM_ADJ",
    when(col("PERCENT_PREMIUM_ADJ") < 0, lit(0.0)).otherwise(col("PERCENT_PREMIUM_ADJ")),
)

# UPR_RATE: the share of the premium not yet earned at the end of each day.
# It is 1 minus the adjusted shares accumulated up to and including the
# current day, floored at 0, and forced to exactly 0 on the expiry day.
_is_last_day = col("DAY_SPLIT") == col("P_DATE_EXPIRY")
_earned_to_date = F.sum("PERCENT_PREMIUM_ADJ").over(
    window_spec.rowsBetween(Window.unboundedPreceding, 0)
)
_unearned = F.greatest(lit(0.0), 1.0 - _earned_to_date)

# A single expression covers both cases: the expiry-day override to 0.0 is
# already the first branch, so it does not need to be re-applied afterwards.
std_final = std_final.withColumn(
    "UPR_RATE",
    when(_is_last_day, lit(0.0)).otherwise(_unearned),
)

# Shape the output rows: run date, the original policy dates, the day being
# earned, and the two rates as doubles.
_output_columns = [
    current_timestamp().cast("date").alias("date_data"),
    col("std_v_date_issue"),
    col("std_v_date_expiry"),
    col("DAY_SPLIT").cast("date").alias("earn_date"),
    col("PERCENT_PREMIUM_ADJ").cast("double").alias("EARNED_RATE"),
    col("UPR_RATE").cast("double").alias("UPR_RATE"),
]
final_data = std_final.select(*_output_columns)

# Overwrite the Delta table A6_MOTOR_DATE_EARNED_RATE with the computed rates,
# written from 200 partitions.
try:
    (
        final_data.repartition(200)
        .write.format("delta")
        .mode("overwrite")
        .saveAsTable("A6_MOTOR_DATE_EARNED_RATE")
    )
finally:
    # Release the cached source frame even when the write fails, so a failed
    # run does not leave the cache pinned in the session.
    raw_data.unpersist()

StatementMeta(, 760acdad-592c-47d2-81a6-920f0c22a58b, 9, Finished, Available, Finished)

DataFrame[std_v_date_issue: date, std_v_date_expiry: date]