In [26]:



# Initialization: wire this kernel to the local Spark install before pyspark is imported.
import os
import sys

spark_home = "/home/talentum/spark"
spark_pylib = spark_home + "/python/lib"

# Interpreters for the executors and the driver.
# Point both at /usr/bin/python2.7 instead if you want to run on Python 2.
os.environ.update({
    "SPARK_HOME": spark_home,
    "PYLIB": spark_pylib,
    "PYSPARK_PYTHON": "/usr/bin/python3.6",
    "PYSPARK_DRIVER_PYTHON": "/usr/bin/python3",
})

# Put the bundled pyspark and py4j archives at the front of sys.path
# (pyspark.zip first, then the py4j zip).
sys.path[:0] = [
    spark_pylib + "/pyspark.zip",
    spark_pylib + "/py4j-0.10.7-src.zip",
]

# Extra packages to pull in at startup: spark-xml 0.6.0 and spark-avro 2.4.3.
# To use only one of them, or a different version, edit the comma-separated list
# after --packages and keep the trailing 'pyspark-shell'.
os.environ['PYSPARK_SUBMIT_ARGS'] = '--packages com.databricks:spark-xml_2.11:0.6.0,org.apache.spark:spark-avro_2.11:2.4.3 pyspark-shell'

In [27]:
# Entry point (Spark 2.x): one Hive-enabled SparkSession used by the cells below.
from pyspark.sql import SparkSession

session_builder = SparkSession.builder.appName("create_merchant_recommendation_dataMart")

# To run on YARN, add .master("yarn") to the builder before getOrCreate().
spark = session_builder.enableHiveSupport().getOrCreate()

# SparkContext behind the session, kept for RDD-level APIs.
sc = spark.sparkContext

In [28]:
# Session-level settings, applied in the order listed; values are passed through unchanged.
stability_settings = [
    # 1. ORC: turn off the vectorized reader and the metastore-ORC conversion.
    ("spark.sql.orc.enableVectorizedReader", "false"),
    ("spark.sql.hive.convertMetastoreOrc", "false"),
    # 2. Joins: -1 turns off automatic broadcast joins.
    ("spark.sql.autoBroadcastJoinThreshold", -1),
    # 3. Optimizer: no cost-based optimization, no whole-stage code generation.
    ("spark.sql.cbo.enabled", "false"),
    ("spark.sql.codegen.wholeStage", "false"),
    # 4. Intended to force the Hive SerDe path.
    # NOTE(review): 'spark.sql.hive.convertMetastore' does not look like a documented Spark
    # key (the documented ones end in ...Orc / ...Parquet), so this line may have no effect —
    # confirm against the Spark version in use.
    ("spark.sql.hive.convertMetastore", "false"),
]

for conf_key, conf_value in stability_settings:
    spark.conf.set(conf_key, conf_value)

print("Spark Config Updated for maximum stability.")

Spark Config Updated for maximum stability.


In [29]:
import numpy as np
import pandas as pd
import pyspark.sql.functions as F
from pyspark.sql import SparkSession
from pyspark.sql.types import (
    DecimalType,
    IntegerType,
    LongType,
    StructField,
    StructType,
    TimestampType,
)

In [30]:
# Bind the three Silver-layer Hive tables, addressed as 'database_name.table_name'.
print("Loading Silver Layer tables from Hive...")

# Order matters for the unpacking: transactions, users, cards.
df_trans_silver, df_users_silver, df_cards_silver = (
    spark.table("financial_db." + table_name)
    for table_name in ("transactions_silver", "users_silver", "cards_silver")
)

print("Loading command executed")

Loading Silver Layer tables from Hive...
Loading command executed


In [31]:
# Optional sanity check — run it only when unsure the tables loaded.
# Each count() launches a Spark job over the corresponding table.
for label, frame in (
    ("Transactions", df_trans_silver),
    ("Users", df_users_silver),
    ("Cards", df_cards_silver),
):
    print(f"{label} Count: {frame.count()}")

Transactions Count: 24386900
Users Count: 2000
Cards Count: 6146


In [32]:
# Print each Silver schema (transactions, users, cards) to check column names and types.
for silver_frame in (df_trans_silver, df_users_silver, df_cards_silver):
    silver_frame.printSchema()

root
 |-- user_id: integer (nullable = true)
 |-- card_id: integer (nullable = true)
 |-- amount: decimal(10,2) (nullable = true)
 |-- use_chip: string (nullable = true)
 |-- merchant_name: string (nullable = true)
 |-- merchant_city: string (nullable = true)
 |-- merchant_state: string (nullable = true)
 |-- zip: string (nullable = true)
 |-- mcc: integer (nullable = true)
 |-- errors: string (nullable = true)
 |-- is_fraud: string (nullable = true)
 |-- transaction_timestamp: timestamp (nullable = true)
 |-- year: integer (nullable = true)
 |-- month: integer (nullable = true)

root
 |-- person_id: string (nullable = true)
 |-- current_age: integer (nullable = true)
 |-- retirement_age: integer (nullable = true)
 |-- birth_year: integer (nullable = true)
 |-- birth_month: integer (nullable = true)
 |-- gender: string (nullable = true)
 |-- address: string (nullable = true)
 |-- apartment: string (nullable = true)
 |-- city: string (nullable = true)
 |-- state: string (nullable = true

In [33]:
# ==========================================
# 1. Renaming & Preparation (Golden Layer Prep)
# ==========================================
print("--- Step 1: Renaming and Preparing DataFrames ---")

# --- A. Transactions Prep ---
# Give the two ambiguous columns explicit names and derive the day-of-month and
# the HH:mm:ss time string from the transaction timestamp.
df_trans_renamed = (
    df_trans_silver
    .withColumnRenamed("zip", "merchant_zip")
    .withColumnRenamed("errors", "transaction_errors")
    .withColumn("day", F.dayofmonth(F.col("transaction_timestamp")))
    .withColumn("time", F.date_format(F.col("transaction_timestamp"), "HH:mm:ss"))
)

# --- B. Users Prep ---
# 1. Generate 'user_id' as the 0-based row position (needed for the join with transactions).
# NOTE(review): this only lines up with transactions.user_id if the users table is read
# back in the same row order as the source data — confirm.
rdd_with_id = df_users_silver.rdd.zipWithIndex().map(lambda x: (x[1],) + tuple(x[0]))
user_cols = ["user_id"] + df_users_silver.columns

# Reuse the Silver schema rather than letting createDataFrame infer one from the Python
# values: inference would re-type the columns and fails when a sampled column is all null.
users_indexed_schema = StructType(
    [StructField("user_id", LongType(), False)] + df_users_silver.schema.fields
)
df_users_indexed = spark.createDataFrame(rdd_with_id, users_indexed_schema)

# 2. Rename columns to the names used downstream.
#    (total_debt already has its final name, so it is not renamed.)
df_users_renamed = (
    df_users_indexed
    .withColumnRenamed("current_age", "age")
    .withColumnRenamed("city", "user_city")
    .withColumnRenamed("state", "user_state")
    .withColumnRenamed("per_capita_income_zipcode", "per_capita_income")
    .withColumnRenamed("yearly_income_person", "yearly_income")
)

# --- C. Cards Prep ---
# Align the join keys with transactions (user_id, card_id) and shorten the account-open columns.
df_cards_renamed = (
    df_cards_silver
    .withColumnRenamed("user", "user_id")
    .withColumnRenamed("card_index", "card_id")
    .withColumnRenamed("card_on_dark_web", "dark_web_exposure")
    .withColumnRenamed("acct_opened_year", "acct_year")
    .withColumnRenamed("acct_opened_month", "acct_month")
)

--- Step 1: Renaming and Preparing DataFrames ---


In [43]:
# ==========================================
# 2. Join Transactions with Users
# ==========================================
print("--- Step 2: Joining Transactions with Users ---")

# Left join on user_id: every transaction row is kept, with user columns null when unmatched.
join_key = "user_id"
df_join_users = df_trans_renamed.join(df_users_renamed, on=join_key, how="left")

--- Step 2: Joining Transactions with Users ---


In [44]:
# ==========================================
# 3. First Selection (Intermediate DataFrame)
# ==========================================
print("--- Step 3: Selecting All Required Columns ---")

# Join keys.
key_columns = ["user_id", "card_id"]

# Transaction fields; all of them are used by the feature-engineering or aggregation steps.
transaction_columns = [
    "amount",
    "transaction_timestamp",
    "year",
    "month",
    "day",
    "time",
    "merchant_state",        # traveller flag
    "transaction_errors",    # health metrics
    "is_fraud",              # health metrics
    "use_chip",              # tech type
]

# User demographics, under the names assigned during the users prep.
demographic_columns = [
    "age",
    "gender",
    "user_city",
    "user_state",
    "per_capita_income",
    "yearly_income",
    "fico_score",
    "num_credit_cards",
    "total_debt",
]

df_step3 = df_join_users.select(*(key_columns + transaction_columns + demographic_columns))

--- Step 3: Selecting All Required Columns ---


In [45]:
# ==========================================
# 4. Join with Cards
# ==========================================
print("--- Step 4: Joining with Cards ---")

# A card is identified by the (user_id, card_id) pair; the left join keeps all transactions.
card_join_keys = ["user_id", "card_id"]
df_join_cards = df_step3.join(df_cards_renamed, on=card_join_keys, how="left")

--- Step 4: Joining with Cards ---


In [46]:
# ==========================================
# 5. Feature Engineering
# ==========================================
print("--- Step 5: Advanced Feature Engineering ---")

# All column functions go through the `F` namespace imported with the other imports,
# so this cell does not depend on a bare `col` / `when` import made further down.
has_income = F.col("yearly_income") > 0
user_age = F.col("age")

df_enriched = (
    df_join_cards
    # A. Age of card at transaction time, in whole calendar years.
    .withColumn("age_of_card_years", F.col("year") - F.col("acct_year"))
    # B. Financial ratios; 0.0 when income is zero, negative or null.
    .withColumn(
        "debt_to_income_ratio",
        F.when(has_income, F.col("total_debt") / F.col("yearly_income")).otherwise(0.0),
    )
    .withColumn(
        "credit_power",
        F.when(has_income, F.col("credit_limit") / F.col("yearly_income")).otherwise(0.0),
    )
    # C. Life stage buckets: <30, 30-59, everything else.
    # NOTE(review): a null age falls through to "Retired" — confirm that is intended.
    .withColumn(
        "life_stage",
        F.when(user_age < 30, "Young_Adult")
        .when((user_age >= 30) & (user_age < 60), "Working_Professional")
        .otherwise("Retired"),
    )
    # D. Traveller flag: 1 only when both states are known and differ.
    .withColumn(
        "is_travelling",
        F.when(
            F.col("merchant_state").isNotNull()
            & F.col("user_state").isNotNull()
            & (F.col("merchant_state") != F.col("user_state")),
            1,
        ).otherwise(0),
    )
    # E. Tech type from the use_chip text; anything that is neither Chip nor Online is "Swipe".
    .withColumn(
        "tech_type",
        F.when(F.col("use_chip").like("%Chip%"), "Chip")
        .when(F.col("use_chip").like("%Online%"), "Online")
        .otherwise("Swipe"),
    )
)

print("Features Created Successfully.")

--- Step 5: Advanced Feature Engineering ---
Features Created Successfully.


In [56]:
# ==========================================
# 6. Aggregation (Gold Layer Data Mart)
# ==========================================
print("--- Step 6: Aggregating into 15-Minute Windows ---")


def _count_if(condition):
    """Count the rows of a group for which `condition` is true.

    `F.when` without `otherwise` yields null for non-matching rows and
    `F.count` skips nulls, so false and null conditions are not counted.
    """
    return F.count(F.when(condition, 1))


# Every aggregate goes through the `F` namespace: this keeps Python's builtin `sum`
# intact and removes the dependency on a bare-name import made in a later cell.
df_gold_mart = df_enriched.groupBy(
    F.window(F.col("transaction_timestamp"), "15 minutes")
).agg(
    # --- Targets ---
    F.count("*").alias("transaction_count"),
    F.sum("amount").alias("total_volume"),
    F.avg("amount").alias("avg_transaction_size"),

    # --- Health ---
    _count_if(F.col("transaction_errors") != "N/A").alias("error_count"),
    _count_if(F.col("is_fraud") == "Yes").alias("fraud_count"),

    # --- Features (averaged / summed over the time window) ---
    F.avg("debt_to_income_ratio").alias("avg_debt_ratio"),
    F.avg("credit_power").alias("avg_credit_power"),
    F.sum("is_travelling").alias("traveller_count"),

    # --- Demographics breakdown ---
    _count_if(F.col("life_stage") == "Young_Adult").alias("cnt_young_adult"),
    _count_if(F.col("life_stage") == "Working_Professional").alias("cnt_professional"),
    _count_if(F.col("life_stage") == "Retired").alias("cnt_retired"),

    # --- Tech breakdown ---
    _count_if(F.col("tech_type") == "Chip").alias("cnt_chip"),
    _count_if(F.col("tech_type") == "Online").alias("cnt_online"),
)

--- Step 6: Aggregating into 15-Minute Windows ---


In [57]:
from pyspark.sql.functions import col, when, lit, window, avg, sum, count, countDistinct, expr

In [58]:
# ==========================================
# 7. Final Formatting & Preview
# ==========================================
print("--- Step 7: Final Preview ---")

# Expose the start of each 15-minute window as the timestamp column 'ds',
# then discard the window struct itself.
window_start = F.col("window.start")
df_load_forecasting_gold = df_gold_mart.withColumn("ds", window_start).drop("window")

--- Step 7: Final Preview ---


In [59]:
# ==========================================
# Preview of the final data mart
# ==========================================
print("--- Step 6: Final Data Mart Preview ---")

df_load_forecasting_gold.printSchema()

# First five windows in chronological order, with untruncated cell values.
earliest_windows = df_load_forecasting_gold.orderBy("ds")
earliest_windows.show(5, truncate=False)

--- Step 6: Final Data Mart Preview ---
root
 |-- transaction_count: long (nullable = false)
 |-- total_volume: decimal(20,2) (nullable = true)
 |-- avg_transaction_size: decimal(14,6) (nullable = true)
 |-- error_count: long (nullable = false)
 |-- fraud_count: long (nullable = false)
 |-- avg_debt_ratio: double (nullable = true)
 |-- avg_credit_power: double (nullable = true)
 |-- traveller_count: long (nullable = true)
 |-- cnt_young_adult: long (nullable = false)
 |-- cnt_professional: long (nullable = false)
 |-- cnt_retired: long (nullable = false)
 |-- cnt_chip: long (nullable = false)
 |-- cnt_online: long (nullable = false)
 |-- ds: timestamp (nullable = true)

+-----------------+------------+--------------------+-----------+-----------+--------------+----------------+---------------+---------------+----------------+-----------+--------+----------+-------------------+
|transaction_count|total_volume|avg_transaction_size|error_count|fraud_count|avg_debt_ratio|avg_credit_power|t

In [60]:
# Storage directory intended for the external Gold table.
gold_path = "/user/talentum/projectMaster/warehouseDir/gold/financial_load_forecasting"

print("--- Step 6: Saving as External Hive Table ---")

--- Step 6: Saving as External Hive Table ---


In [61]:

# Set the ORC-reader and broadcast-join options once more before the write;
# the keys and values are the same ones used in the session configuration cell.
for write_conf_key, write_conf_value in (
    ("spark.sql.orc.enableVectorizedReader", "false"),
    ("spark.sql.hive.convertMetastoreOrc", "false"),
    ("spark.sql.autoBroadcastJoinThreshold", -1),
):
    spark.conf.set(write_conf_key, write_conf_value)

In [62]:
# Remove any previous table definition so saveAsTable recreates it from scratch.
spark.sql("DROP TABLE IF EXISTS financial_db.load_forecasting_gold")

# Write snappy-compressed ORC to the external location held in `gold_path`
# (defined in the cell above). The path used to be repeated here as a literal
# and had drifted to the misspelled '.../financial_laod_forecasting'.
# No partitioning is applied to the output.
(
    df_load_forecasting_gold.write
    .mode("overwrite")
    .format("orc")
    .option("path", gold_path)
    .option("compression", "snappy")
    .saveAsTable("financial_db.load_forecasting_gold")
)

print("SUCCESS: Table saved!")

SUCCESS: Table saved!


In [None]:
# Show five rows of the final frame (default truncation of long values applies).
df_load_forecasting_gold.show(n=5)