In [38]:
# 1. Initialization & Environment Setup

import os
import sys

# Talentum/Hadoop sandbox: Spark install location and the Python interpreters
# PySpark should use for workers and the driver.
spark_home = "/home/talentum/spark"
py_lib = spark_home + "/python/lib"
os.environ.update({
    "SPARK_HOME": spark_home,
    "PYLIB": py_lib,
    "PYSPARK_PYTHON": "/usr/bin/python3.6",
    "PYSPARK_DRIVER_PYTHON": "/usr/bin/python3",
})

# Make Spark's bundled Python archives importable. Each insert goes to the
# front of sys.path, so pyspark.zip (inserted last) ends up ahead of py4j.
for archive in ("/py4j-0.10.7-src.zip", "/pyspark.zip"):
    sys.path.insert(0, py_lib + archive)

# Extra JVM packages requested at launch: spark-xml and spark-avro.
# This must be set before the SparkSession (and its JVM) is created.
os.environ['PYSPARK_SUBMIT_ARGS'] = '--packages com.databricks:spark-xml_2.11:0.6.0,org.apache.spark:spark-avro_2.11:2.4.3 pyspark-shell'

from pyspark.sql import SparkSession
import pyspark.sql.functions as F
from pyspark.sql.types import DecimalType, LongType, IntegerType, TimestampType
from pyspark.sql.window import Window

In [39]:
# Build (or reuse) the Hive-enabled Spark session for this job.
spark = (
    SparkSession.builder
    .appName("Create_Financial_Spend_Analysis_Gold")
    .enableHiveSupport()
    .getOrCreate()
)

# SparkContext backing the session, kept for RDD-level work.
sc = spark.sparkContext

In [40]:
# --- Stability Configuration (CRITICAL for ORC/Hadoop Env) ---
# All values are passed as strings so every key is set the same way.
stability_conf = {
    # 1. ORC reads: disable the vectorized reader and keep Hive metastore ORC
    #    tables on the Hive SerDe path instead of Spark's native ORC reader.
    #    (`convertMetastoreOrc` is the real "force Hive SerDe" switch for ORC;
    #    the previously set `spark.sql.hive.convertMetastore` is not a Spark
    #    option and was silently ignored, so it has been dropped.)
    "spark.sql.orc.enableVectorizedReader": "false",
    "spark.sql.hive.convertMetastoreOrc": "false",

    # 2. Disable automatic broadcast joins (-1 turns the threshold off).
    "spark.sql.autoBroadcastJoinThreshold": "-1",

    # 3. Disable cost-based optimization and whole-stage code generation.
    "spark.sql.cbo.enabled": "false",
    "spark.sql.codegen.wholeStage": "false",
}

for conf_key, conf_value in stability_conf.items():
    spark.conf.set(conf_key, conf_value)

print("Spark Config Updated for maximum stability.")

Spark Config Updated for maximum stability.


In [41]:
# 2. Load Silver Data
print("Loading Silver Layer tables from Hive...")

# Silver-layer inputs, all read from the financial_db Hive database.
df_trans_silver = spark.table("financial_db.transactions_silver")  # A. Transactions
df_users_silver = spark.table("financial_db.users_silver")         # B. Users
df_cards_silver = spark.table("financial_db.cards_silver")         # C. Cards

# 3. Verification: row counts (each count() runs a Spark job over its table).
trans_rows = df_trans_silver.count()
print(f"Transaction Count:{trans_rows}")
user_rows = df_users_silver.count()
print(f"Users Count: {user_rows}")
card_rows = df_cards_silver.count()
print(f"Cards Count: {card_rows}")

# 4. Print each schema so column names and types can be checked before joining.
for silver_df in (df_trans_silver, df_users_silver, df_cards_silver):
    silver_df.printSchema()

Loading Silver Layer tables from Hive...
Transaction Count:24386900
Users Count: 2000
Cards Count: 6146
root
 |-- user_id: integer (nullable = true)
 |-- card_id: integer (nullable = true)
 |-- amount: decimal(10,2) (nullable = true)
 |-- use_chip: string (nullable = true)
 |-- merchant_name: string (nullable = true)
 |-- merchant_city: string (nullable = true)
 |-- merchant_state: string (nullable = true)
 |-- zip: string (nullable = true)
 |-- mcc: integer (nullable = true)
 |-- errors: string (nullable = true)
 |-- is_fraud: string (nullable = true)
 |-- transaction_timestamp: timestamp (nullable = true)
 |-- year: integer (nullable = true)
 |-- month: integer (nullable = true)

root
 |-- person_id: string (nullable = true)
 |-- current_age: integer (nullable = true)
 |-- retirement_age: integer (nullable = true)
 |-- birth_year: integer (nullable = true)
 |-- birth_month: integer (nullable = true)
 |-- gender: string (nullable = true)
 |-- address: string (nullable = true)
 |-- apa

In [47]:
# Peek at the first 20 transaction rows (the defaults of show(), spelled out).
df_trans_silver.show(n=20, truncate=True)

+-------+-------+------+--------+--------------------+-------------+--------------+-----+----+--------------------+--------+---------------------+----+-----+
|user_id|card_id|amount|use_chip|       merchant_name|merchant_city|merchant_state|  zip| mcc|              errors|is_fraud|transaction_timestamp|year|month|
+-------+-------+------+--------+--------------------+-------------+--------------+-----+----+--------------------+--------+---------------------+----+-----+
|    669|      3| 15.05|   Swipe|-8012891446218939321|    Arlington|            TX|76002|5812|                 N/A|      No|  2002-09-01 11:23:00|2002|    9|
|    884|      1| 36.00|   Swipe|-1288082279022882052|   Rock Falls|            IL|61071|5499|                 N/A|      No|  2002-09-01 07:01:00|2002|    9|
|    669|      3|-52.00|   Swipe|-1288082279022882052|    Arlington|            TX|76002|5499|                 N/A|      No|  2002-09-01 13:05:00|2002|    9|
|    884|      1| 76.21|   Swipe| 255224066395210705

In [48]:
# 3. Data Prep & Renaming (Silver -> Gold Prep)
print("--- Step 3: Preparing DataFrames for Join ---")

# --- A. Transactions Prep ---
# Rename `zip` to `merchant_zip`, then derive the calendar date and the hour
# of day from the transaction timestamp.
txn_ts = F.col("transaction_timestamp")
df_trans_prep = (
    df_trans_silver
    .withColumnRenamed("zip", "merchant_zip")
    .withColumn("txn_date", F.to_date(txn_ts))
    .withColumn("hour", F.hour(txn_ts))
)



--- Step 3: Preparing DataFrames for Join ---


In [49]:
# --- B. Users Prep ---
# Local import: these two types are only needed to build the explicit schema below.
from pyspark.sql.types import StructField, StructType

# The users table has no numeric user_id, so one is generated from the row
# position (0-based) via zipWithIndex.
# NOTE(review): the index follows the order in which Spark returns the rows of
# users_silver; this assumes that order matches the user numbering used in the
# transactions table -- confirm against the Silver job.
rdd_with_id = df_users_silver.rdd.zipWithIndex().map(lambda x: (x[1],) + tuple(x[0]))

# Reuse the Silver schema instead of letting createDataFrame re-infer types
# from sampled rows: inference can fail on columns that are NULL in the sample
# and does not keep declared types such as decimal precision/scale.
users_indexed_schema = StructType(
    [StructField("user_id", LongType(), True)] + df_users_silver.schema.fields
)
df_users_indexed = spark.createDataFrame(rdd_with_id, schema=users_indexed_schema)

# Give the user attributes unambiguous names for the joined (Gold) dataset.
df_users_prep = (
    df_users_indexed
    .withColumnRenamed("current_age", "age")
    .withColumnRenamed("city", "user_city")
    .withColumnRenamed("state", "user_state")
    .withColumnRenamed("yearly_income_person", "yearly_income")
    .withColumnRenamed("gender", "user_gender")
)

In [50]:
# --- C. Cards Prep ---
# Align the card join keys with the transactions table: `user` -> `user_id`
# and `card_index` -> `card_id`. `card_brand` and `card_type` already carry
# their final names, so the former self-renames (which did nothing) are gone.
df_cards_prep = (
    df_cards_silver
    .withColumnRenamed("user", "user_id")
    .withColumnRenamed("card_index", "card_id")
)

In [51]:
# 4. Joining Data
print("--- Step 4: Joining Transactions + Users + Cards ---")

# Join 1: attach user attributes to every transaction.
# Left join, so transactions without a matching user are kept (user columns NULL).
df_join_1 = df_trans_prep.join(df_users_prep, "user_id", "left")

--- Step 4: Joining Transactions + Users + Cards ---


In [52]:
# Join 2: attach card attributes, matching on the (user_id, card_id) pair.
# Left join again, so transactions without a matching card are kept.
card_join_keys = ["user_id", "card_id"]
df_full_join = df_join_1.join(df_cards_prep, card_join_keys, "left")

In [53]:
# 5. Spend Analysis Feature Engineering
print("--- Step 5: Feature Engineering for Spend Analysis ---")

# Column handles reused by the segment rules below.
amount_col = F.col("amount")
age_col = F.col("age")
income_col = F.col("yearly_income")
merchant_state_col = F.col("merchant_state")
user_state_col = F.col("user_state")

# Every rule starts with (or includes) an explicit NULL guard. The upstream
# joins are LEFT joins, so user attributes can be NULL; without the guard a
# NULL comparison is never true and the row would fall through to the
# `otherwise` bucket ("High_Value", "Boomer_Plus", "High_Income", ...).
# `when` branches are evaluated in order, so each later branch only needs its
# upper bound.

# A. Spending Brackets (Categorizing Transaction Size)
# NOTE(review): negative amounts (the transactions preview shows some) are
# still labelled "Micro_Spend" -- confirm whether they need their own tier.
spend_tier_col = (
    F.when(amount_col.isNull(), "Unknown")
     .when(amount_col < 20, "Micro_Spend")
     .when(amount_col < 100, "Low_Value")     # 20 <= amount < 100
     .when(amount_col < 500, "Mid_Value")     # 100 <= amount < 500
     .otherwise("High_Value")                 # amount >= 500
)

# B. Demographics Segments (For Slicing by User Type)
age_group_col = (
    F.when(age_col.isNull(), "Unknown")
     .when(age_col < 25, "Gen_Z")
     .when(age_col < 40, "Millennial")        # 25 <= age < 40
     .when(age_col < 60, "Gen_X")             # 40 <= age < 60
     .otherwise("Boomer_Plus")                # age >= 60
)

# C. Income Segments
income_tier_col = (
    F.when(income_col.isNull(), "Unknown")
     .when(income_col < 30000, "Low_Income")
     .when(income_col < 80000, "Middle_Income")   # 30000 <= income < 80000
     .otherwise("High_Income")                    # income >= 80000
)

# D. Location Analysis (Domestic vs Travel Spend)
# Branch order matches the original rule; the only addition is the guard for
# a missing user_state, which previously ended up as "Travel_Domestic".
spend_location_col = (
    F.when(merchant_state_col == user_state_col, "Local_Spend")
     .when(merchant_state_col.isNull(), "Online/Unknown")
     .when(user_state_col.isNull(), "Unknown")
     .otherwise("Travel_Domestic")
)

# E. Time Analysis (Weekend vs Weekday); dayofweek: 1=Sun, 7=Sat
is_weekend_col = (
    F.when(F.dayofweek(F.col("txn_date")).isin([1, 7]), "Weekend")
     .otherwise("Weekday")
)

df_enriched = (
    df_full_join
    .withColumn("spend_tier", spend_tier_col)
    .withColumn("age_group", age_group_col)
    .withColumn("income_tier", income_tier_col)
    .withColumn("spend_location_type", spend_location_col)
    .withColumn("is_weekend", is_weekend_col)
)

--- Step 5: Feature Engineering for Spend Analysis ---


In [55]:
# 6. Aggregation (Creating the Gold Fact Table)
# Spend is summarised per day and per customer/card/merchant segment, giving
# a "Daily Spend Fact" table.
print("--- Step 6: Aggregating to Daily Spend Fact Table ---")

# Grain of the fact table: one row per combination of these columns.
segment_keys = [
    "txn_date",
    "year",
    "month",
    "age_group",
    "income_tier",
    "user_gender",
    "card_brand",
    "card_type",
    "spend_location_type",
    "mcc",  # Merchant Category Code is vital for Spend Analysis
]

# Only fraudulent transactions contribute their amount; all others add 0.
fraud_amount = F.when(F.col("is_fraud") == "Yes", F.col("amount")).otherwise(0)

spend_metrics = [
    # 1. Volume Metrics
    F.sum("amount").alias("total_spend_amount"),
    F.count("amount").alias("transaction_count"),
    # 2. Average Metrics (ATV - Average Transaction Value)
    F.avg("amount").alias("avg_ticket_size"),
    # 3. Risk/Quality Metrics
    F.sum(fraud_amount).alias("fraud_spend_amount"),
    # 4. Max Spend (to identify big ticket items in that segment)
    F.max("amount").alias("max_transaction_val"),
]

df_gold_spend = df_enriched.groupBy(*segment_keys).agg(*spend_metrics)

--- Step 6: Aggregating to Daily Spend Fact Table ---


In [56]:
# 7. Final Formatting & Saving
print("--- Step 7: Saving Gold Table ---")

# Storage location and Hive name of the Gold table.
gold_spend_path = "/user/talentum/projectMaster/warehouseDir/gold/financial_spend_analysis"
gold_table = "financial_db.spend_analysis_gold"

# Remove any previous table definition so the write below starts clean.
spark.sql(f"DROP TABLE IF EXISTS {gold_table}")

# Write to Hive as snappy-compressed ORC at the explicit path above.
gold_writer = (
    df_gold_spend.write
    .format("orc")
    .mode("overwrite")
    .options(path=gold_spend_path, compression="snappy")
)
gold_writer.saveAsTable(gold_table)

print("SUCCESS: Spend Analysis Gold Table Saved!")

--- Step 7: Saving Gold Table ---
SUCCESS: Spend Analysis Gold Table Saved!


In [57]:
# --- Preview ---
# Read the rows back from the table that was just saved instead of ordering
# `df_gold_spend`: the DataFrame is lazy and uncached, so previewing it would
# re-run the full join + aggregation. Reading the table also confirms what
# was actually persisted.
print("Previewing Data:")
spark.table("financial_db.spend_analysis_gold") \
    .orderBy(F.col("txn_date").desc()) \
    .show(10, truncate=False)

Previewing Data:
+----------+----+-----+-----------+-------------+-----------+----------+---------------+-------------------+----+------------------+-----------------+---------------+------------------+-------------------+
|txn_date  |year|month|age_group  |income_tier  |user_gender|card_brand|card_type      |spend_location_type|mcc |total_spend_amount|transaction_count|avg_ticket_size|fraud_spend_amount|max_transaction_val|
+----------+----+-----+-----------+-------------+-----------+----------+---------------+-------------------+----+------------------+-----------------+---------------+------------------+-------------------+
|2020-02-28|2020|2    |Gen_Z      |Middle_Income|Male       |Visa      |Debit (Prepaid)|Online/Unknown     |5311|25.47             |1                |25.470000      |0.00              |25.47              |
|2020-02-28|2020|2    |Gen_Z      |Middle_Income|Male       |Mastercard|Debit          |Online/Unknown     |4121|29.16             |1                |29.160000