In [1]:



# Intialization
import os
import sys

os.environ["SPARK_HOME"] = "/home/talentum/spark"
os.environ["PYLIB"] = os.environ["SPARK_HOME"] + "/python/lib"
# In below two lines, use /usr/bin/python2.7 if you want to use Python 2
os.environ["PYSPARK_PYTHON"] = "/usr/bin/python3.6" 
os.environ["PYSPARK_DRIVER_PYTHON"] = "/usr/bin/python3"
sys.path.insert(0, os.environ["PYLIB"] +"/py4j-0.10.7-src.zip")
sys.path.insert(0, os.environ["PYLIB"] +"/pyspark.zip")

# NOTE: Whichever package you want mention here.
# os.environ['PYSPARK_SUBMIT_ARGS'] = '--packages com.databricks:spark-xml_2.11:0.6.0 pyspark-shell' 
# os.environ['PYSPARK_SUBMIT_ARGS'] = '--packages org.apache.spark:spark-avro_2.11:2.4.0 pyspark-shell'
os.environ['PYSPARK_SUBMIT_ARGS'] = '--packages com.databricks:spark-xml_2.11:0.6.0,org.apache.spark:spark-avro_2.11:2.4.3 pyspark-shell'
# os.environ['PYSPARK_SUBMIT_ARGS'] = '--packages com.databricks:spark-xml_2.11:0.6.0,org.apache.spark:spark-avro_2.11:2.4.0 pyspark-shell'

In [2]:
#Entrypoint 2.x
from pyspark.sql import SparkSession
spark = SparkSession.builder.appName("create_merchant_recommendation_dataMart").enableHiveSupport().getOrCreate()

# On yarn:
# spark = SparkSession.builder.appName("Spark SQL basic example").enableHiveSupport().master("yarn").getOrCreate()
# specify .master("yarn")

sc = spark.sparkContext

In [3]:
# 1. Disable Vectorized Reader (Avoids low-level ORC data reading crash)
spark.conf.set("spark.sql.orc.enableVectorizedReader", "false")
spark.conf.set("spark.sql.hive.convertMetastoreOrc", "false")
# 2. Disable Broadcast Join (Avoids memory/shuffle crash on join)
spark.conf.set("spark.sql.autoBroadcastJoinThreshold", -1) 
# 3. Disable Spark's optimizing components (Forces safer execution path)
spark.conf.set("spark.sql.cbo.enabled", "false") 
spark.conf.set("spark.sql.codegen.wholeStage", "false")
# 4. Force Hive SerDe (Ultimate attempt to bypass native Spark reader)
spark.conf.set("spark.sql.hive.convertMetastore", "false") 

print("Spark Config Updated for maximum stability.")

Spark Config Updated for maximum stability.


In [4]:
import pandas as pd
import numpy as np
from pyspark.sql import SparkSession
import pyspark.sql.functions as F
from pyspark.sql.types import DecimalType, LongType, IntegerType, TimestampType

In [5]:
# Load the Tables directly from the Database
# We use the standard 'database_name.table_name' format

print("Loading Silver Layer tables from Hive...")

# A. Transactions Table
df_trans_silver = spark.table("financial_db.transactions_silver")

# B. Users Table
df_users_silver = spark.table("financial_db.users_silver")

# C. Cards Table
df_cards_silver = spark.table("financial_db.cards_silver")

print("Loading command executed")

Loading Silver Layer tables from Hive...
Loading command executed


In [6]:
# run only if you are not sure the files are loaded
# Quick Verification
print(f"Transactions Count: {df_trans_silver.count()}")
print(f"Users Count: {df_users_silver.count()}")
print(f"Cards Count: {df_cards_silver.count()}")

Transactions Count: 24386900
Users Count: 2000
Cards Count: 6146


In [7]:
# Previewing Schema to ensure types are correct
df_trans_silver.printSchema()
df_users_silver.printSchema()
df_cards_silver.printSchema()

root
 |-- user_id: integer (nullable = true)
 |-- card_id: integer (nullable = true)
 |-- amount: decimal(10,2) (nullable = true)
 |-- use_chip: string (nullable = true)
 |-- merchant_name: string (nullable = true)
 |-- merchant_city: string (nullable = true)
 |-- merchant_state: string (nullable = true)
 |-- zip: string (nullable = true)
 |-- mcc: integer (nullable = true)
 |-- errors: string (nullable = true)
 |-- is_fraud: string (nullable = true)
 |-- transaction_timestamp: timestamp (nullable = true)
 |-- year: integer (nullable = true)
 |-- month: integer (nullable = true)

root
 |-- person_id: string (nullable = true)
 |-- current_age: integer (nullable = true)
 |-- retirement_age: integer (nullable = true)
 |-- birth_year: integer (nullable = true)
 |-- birth_month: integer (nullable = true)
 |-- gender: string (nullable = true)
 |-- address: string (nullable = true)
 |-- apartment: string (nullable = true)
 |-- city: string (nullable = true)
 |-- state: string (nullable = true

In [12]:
# ==========================================
# 1. Renaming & Preparation (Golden Layer Prep)
# ==========================================
print("--- Step 1: Renaming and Preparing DataFrames ---")

# --- A. Transactions Prep ---
# Already has snake_case, but ensuring we have the time components needed
df_trans_renamed = df_trans_silver.withColumnRenamed("zip", "merchant_zip") \
    .withColumnRenamed("errors", "transaction_errors") \
    .withColumn("day", F.dayofmonth(F.col("transaction_timestamp"))) \
    .withColumn("time", F.date_format(F.col("transaction_timestamp"), "HH:mm:ss"))

# --- B. Users Prep ---
# 1. Generate 'user_id' via Indexing (Crucial for joining)
rdd_with_id = df_users_silver.rdd.zipWithIndex().map(lambda x: (x[1],) + tuple(x[0]))
user_cols = ["user_id"] + df_users_silver.columns
df_users_indexed = spark.createDataFrame(rdd_with_id, user_cols)

# 2. Rename columns to your convenience
df_users_renamed = df_users_indexed \
    .withColumnRenamed("current_age", "age") \
    .withColumnRenamed("city", "user_city") \
    .withColumnRenamed("state", "user_state") \
    .withColumnRenamed("per_capita_income_zipcode", "per_capita_income") \
    .withColumnRenamed("yearly_income_person", "yearly_income") \
    .withColumnRenamed("total_debt", "total_debt")

# --- C. Cards Prep ---
# Rename keys to match Transactions and create Date for Age Calculation
df_cards_renamed = df_cards_silver \
    .withColumnRenamed("user", "user_id") \
    .withColumnRenamed("card_index", "card_id") \
    .withColumnRenamed("card_on_dark_web", "dark_web_exposure") \
    .withColumnRenamed("acct_opened_year", "acct_year") \
    .withColumnRenamed("acct_opened_month", "acct_month")

--- Step 1: Renaming and Preparing DataFrames ---


In [16]:
# ==========================================
# 2. Join Transactions with Users
# ==========================================
print("--- Step 2: Joining Transactions with Users ---")

df_join_users = df_trans_renamed.join(
    df_users_renamed,
    on="user_id",
    how="left"
)

--- Step 2: Joining Transactions with Users ---


In [17]:
# ==========================================
# 3. First Selection (Intermediate DataFrame)
# ==========================================
print("--- Step 3: Selecting Specific User & Transaction Columns ---")

df_resultant = df_join_users.select(
    # Keys
    F.col("user_id"),
    F.col("card_id"),
    
    # Time
    F.col("year"),
    F.col("month"),
    F.col("day"),
    F.col("time"),
    F.col("transaction_timestamp"), # Keeping for calculation
    
    # User Demographics
    F.col("age"),
    F.col("gender"),
    F.col("user_city").alias("city"),
    F.col("user_state").alias("state"),
    F.col("per_capita_income"),
    F.col("yearly_income"),
    F.col("fico_score"),
    F.col("num_credit_cards"),
    F.col("total_debt")
)

--- Step 3: Selecting Specific User & Transaction Columns ---


In [18]:
# ==========================================
# 4. Join with Cards
# ==========================================
print("--- Step 4: Joining with Cards (on user_id AND card_id) ---")
# Note: We must join on User ID too, because Card ID 0 belongs to User 0, User 1, etc.

df_join_cards = df_resultant.join(
    df_cards_renamed,
    on=["user_id", "card_id"],
    how="left"
)

--- Step 4: Joining with Cards (on user_id AND card_id) ---


In [23]:
# ==========================================
# 5. Final Selection & Feature Engineering
# ==========================================
print("--- Step 5: Final Selection & Creating 'Age of Card' ---")

# Calculate Age of Card in Years (Transaction Year - Account Open Year)
# We use the columns joined from the Cards table
df_final = df_join_cards.withColumn(
    "age_of_card_years",
    F.col("year") - F.col("acct_year")
).select(
    # Previous Columns
    F.col("user_id"),
    F.col("card_id"),
    F.col("age").alias("user_age"),
    F.col("gender"),
    F.col("city"),
    F.col("state"),
    F.col("per_capita_income"),
    F.col("yearly_income"),
    F.col("fico_score"),
    F.col("num_credit_cards"),
    F.col("total_debt"),
    F.col("year"),
    F.col("month"),
    F.col("day"),
    F.col("time"),
    
    # Card Specific Columns
    F.col("card_type"),
    F.col("credit_limit"),
    F.col("has_chip"),
    F.col("dark_web_exposure"),
    
    # Derived Feature
    F.col("age_of_card_years").alias("age_of_card")
)

--- Step 5: Final Selection & Creating 'Age of Card' ---


In [24]:
from pyspark.sql.functions import col, when, lit, window, avg, sum, count, countDistinct, expr

In [25]:
# ==========================================
# 6. Advanced Feature Engineering
# ==========================================
print("--- Step 6: Creating Composite Features (Enrichment) ---")

# We work on df_join_cards because we need the raw columns (amount, timestamp, etc.)
# which might have been dropped in the previous selection step.

df_enriched = df_join_cards.withColumn(
    # A. Financial Ratios
    "debt_to_income_ratio",
    when(col("yearly_income") > 0, 
         col("total_debt") / col("yearly_income")
    ).otherwise(0.0)
).withColumn(
    "credit_power",
    when(col("yearly_income") > 0, 
         col("credit_limit") / col("yearly_income")
    ).otherwise(0.0)
).withColumn(
    # B. Demographic Segmentation (Life Stage)
    "life_stage",
    when(col("age") < 30, "Young_Adult")
    .when((col("age") >= 30) & (col("age") < 60), "Working_Professional")
    .otherwise("Retired")
).withColumn(
    # C. Traveller Flag (User State != Merchant State)
    # Important: Handle nulls for Online transactions
    "is_travelling",
    when(
        (col("merchant_state").isNotNull()) & 
        (col("user_state").isNotNull()) & 
        (col("merchant_state") != col("user_state")), 
        1
    ).otherwise(0)
).withColumn(
    # D. Tech Complexity (Chip vs Online vs Swipe)
    "tech_type",
    when(col("use_chip").like("%Chip%"), "Chip")
    .when(col("use_chip").like("%Online%"), "Online")
    .otherwise("Swipe")
)

--- Step 6: Creating Composite Features (Enrichment) ---


AnalysisException: "cannot resolve '`merchant_state`' given input columns: [yearly_income, num_credit_cards, financial_db.transactions_silver.transaction_timestamp, financial_db.cards_silver.year_pin_last_changed, financial_db.cards_silver.cards_issued, financial_db.cards_silver.credit_limit, financial_db.transactions_silver.year, financial_db.transactions_silver.user_id, acct_year, financial_db.cards_silver.card_brand, financial_db.cards_silver.card_number, financial_db.cards_silver.cvv, acct_month, dark_web_exposure, total_debt, financial_db.cards_silver.expires, per_capita_income, life_stage, financial_db.transactions_silver.card_id, state, time, city, financial_db.cards_silver.expires_year, day, credit_power, financial_db.transactions_silver.month, financial_db.cards_silver.card_type, debt_to_income_ratio, financial_db.cards_silver.expires_month, gender, age, financial_db.cards_silver.has_chip, fico_score];;\n'Project [user_id#0, card_id#1, year#12, month#13, day#306, time#322, transaction_timestamp#11, age#377L, gender#345, city#650, state#651, per_capita_income#437, yearly_income#457, fico_score#356L, num_credit_cards#357L, total_debt#477, card_brand#66, card_type#67, card_number#68, expires#69, cvv#70, has_chip#71, cards_issued#72, credit_limit#73, ... 10 more fields]\n+- Project [user_id#0, card_id#1, year#12, month#13, day#306, time#322, transaction_timestamp#11, age#377L, gender#345, city#650, state#651, per_capita_income#437, yearly_income#457, fico_score#356L, num_credit_cards#357L, total_debt#477, card_brand#66, card_type#67, card_number#68, expires#69, cvv#70, has_chip#71, cards_issued#72, credit_limit#73, ... 9 more fields]\n   +- Project [user_id#0, card_id#1, year#12, month#13, day#306, time#322, transaction_timestamp#11, age#377L, gender#345, city#650, state#651, per_capita_income#437, yearly_income#457, fico_score#356L, num_credit_cards#357L, total_debt#477, card_brand#66, card_type#67, card_number#68, expires#69, cvv#70, has_chip#71, cards_issued#72, credit_limit#73, ... 8 more fields]\n      +- Project [user_id#0, card_id#1, year#12, month#13, day#306, time#322, transaction_timestamp#11, age#377L, gender#345, city#650, state#651, per_capita_income#437, yearly_income#457, fico_score#356L, num_credit_cards#357L, total_debt#477, card_brand#66, card_type#67, card_number#68, expires#69, cvv#70, has_chip#71, cards_issued#72, credit_limit#73, ... 7 more fields]\n         +- Project [user_id#0, card_id#1, year#12, month#13, day#306, time#322, transaction_timestamp#11, age#377L, gender#345, city#650, state#651, per_capita_income#437, yearly_income#457, fico_score#356L, num_credit_cards#357L, total_debt#477, card_brand#66, card_type#67, card_number#68, expires#69, cvv#70, has_chip#71, cards_issued#72, credit_limit#73, ... 6 more fields]\n            +- Join LeftOuter, ((user_id#0 = user_id#497) && (card_id#1 = card_id#514))\n               :- Project [user_id#0, card_id#1, year#12, month#13, day#306, time#322, transaction_timestamp#11, age#377L, gender#345, user_city#397 AS city#650, user_state#417 AS state#651, per_capita_income#437, yearly_income#457, fico_score#356L, num_credit_cards#357L, total_debt#477]\n               :  +- Project [user_id#0, card_id#1, amount#2, use_chip#3, merchant_name#4, merchant_city#5, merchant_state#6, merchant_zip#276, mcc#8, transaction_errors#291, is_fraud#10, transaction_timestamp#11, year#12, month#13, day#306, time#322, person_id#340, age#377L, retirement_age#342L, birth_year#343L, birth_month#344L, gender#345, address#346, apartment#347, ... 10 more fields]\n               :     +- Join LeftOuter, (cast(user_id#0 as bigint) = user_id#339L)\n               :        :- Project [user_id#0, card_id#1, amount#2, use_chip#3, merchant_name#4, merchant_city#5, merchant_state#6, merchant_zip#276, mcc#8, transaction_errors#291, is_fraud#10, transaction_timestamp#11, year#12, month#13, day#306, date_format(transaction_timestamp#11, HH:mm:ss, Some(Asia/Kolkata)) AS time#322]\n               :        :  +- Project [user_id#0, card_id#1, amount#2, use_chip#3, merchant_name#4, merchant_city#5, merchant_state#6, merchant_zip#276, mcc#8, transaction_errors#291, is_fraud#10, transaction_timestamp#11, year#12, month#13, dayofmonth(cast(transaction_timestamp#11 as date)) AS day#306]\n               :        :     +- Project [user_id#0, card_id#1, amount#2, use_chip#3, merchant_name#4, merchant_city#5, merchant_state#6, merchant_zip#276, mcc#8, errors#9 AS transaction_errors#291, is_fraud#10, transaction_timestamp#11, year#12, month#13]\n               :        :        +- Project [user_id#0, card_id#1, amount#2, use_chip#3, merchant_name#4, merchant_city#5, merchant_state#6, zip#7 AS merchant_zip#276, mcc#8, errors#9, is_fraud#10, transaction_timestamp#11, year#12, month#13]\n               :        :           +- SubqueryAlias `financial_db`.`transactions_silver`\n               :        :              +- HiveTableRelation `financial_db`.`transactions_silver`, org.apache.hadoop.hive.ql.io.orc.OrcSerde, [user_id#0, card_id#1, amount#2, use_chip#3, merchant_name#4, merchant_city#5, merchant_state#6, zip#7, mcc#8, errors#9, is_fraud#10, transaction_timestamp#11], [year#12, month#13]\n               :        +- Project [user_id#339L, person_id#340, age#377L, retirement_age#342L, birth_year#343L, birth_month#344L, gender#345, address#346, apartment#347, user_city#397, user_state#417, zipcode#350, latitude#351, longitude#352, per_capita_income#437, yearly_income#457, total_debt#355 AS total_debt#477, fico_score#356L, num_credit_cards#357L]\n               :           +- Project [user_id#339L, person_id#340, age#377L, retirement_age#342L, birth_year#343L, birth_month#344L, gender#345, address#346, apartment#347, user_city#397, user_state#417, zipcode#350, latitude#351, longitude#352, per_capita_income#437, yearly_income_person#354 AS yearly_income#457, total_debt#355, fico_score#356L, num_credit_cards#357L]\n               :              +- Project [user_id#339L, person_id#340, age#377L, retirement_age#342L, birth_year#343L, birth_month#344L, gender#345, address#346, apartment#347, user_city#397, user_state#417, zipcode#350, latitude#351, longitude#352, per_capita_income_zipcode#353 AS per_capita_income#437, yearly_income_person#354, total_debt#355, fico_score#356L, num_credit_cards#357L]\n               :                 +- Project [user_id#339L, person_id#340, age#377L, retirement_age#342L, birth_year#343L, birth_month#344L, gender#345, address#346, apartment#347, user_city#397, state#349 AS user_state#417, zipcode#350, latitude#351, longitude#352, per_capita_income_zipcode#353, yearly_income_person#354, total_debt#355, fico_score#356L, num_credit_cards#357L]\n               :                    +- Project [user_id#339L, person_id#340, age#377L, retirement_age#342L, birth_year#343L, birth_month#344L, gender#345, address#346, apartment#347, city#348 AS user_city#397, state#349, zipcode#350, latitude#351, longitude#352, per_capita_income_zipcode#353, yearly_income_person#354, total_debt#355, fico_score#356L, num_credit_cards#357L]\n               :                       +- Project [user_id#339L, person_id#340, current_age#341L AS age#377L, retirement_age#342L, birth_year#343L, birth_month#344L, gender#345, address#346, apartment#347, city#348, state#349, zipcode#350, latitude#351, longitude#352, per_capita_income_zipcode#353, yearly_income_person#354, total_debt#355, fico_score#356L, num_credit_cards#357L]\n               :                          +- LogicalRDD [user_id#339L, person_id#340, current_age#341L, retirement_age#342L, birth_year#343L, birth_month#344L, gender#345, address#346, apartment#347, city#348, state#349, zipcode#350, latitude#351, longitude#352, per_capita_income_zipcode#353, yearly_income_person#354, total_debt#355, fico_score#356L, num_credit_cards#357L], false\n               +- Project [user_id#497, card_id#514, card_brand#66, card_type#67, card_number#68, expires#69, cvv#70, has_chip#71, cards_issued#72, credit_limit#73, year_pin_last_changed#74, dark_web_exposure#531, acct_opened_month#76 AS acct_month#565, acct_year#548, expires_month#78, expires_year#79]\n                  +- Project [user_id#497, card_id#514, card_brand#66, card_type#67, card_number#68, expires#69, cvv#70, has_chip#71, cards_issued#72, credit_limit#73, year_pin_last_changed#74, dark_web_exposure#531, acct_opened_month#76, acct_opened_year#77 AS acct_year#548, expires_month#78, expires_year#79]\n                     +- Project [user_id#497, card_id#514, card_brand#66, card_type#67, card_number#68, expires#69, cvv#70, has_chip#71, cards_issued#72, credit_limit#73, year_pin_last_changed#74, card_on_dark_web#75 AS dark_web_exposure#531, acct_opened_month#76, acct_opened_year#77, expires_month#78, expires_year#79]\n                        +- Project [user_id#497, card_index#65 AS card_id#514, card_brand#66, card_type#67, card_number#68, expires#69, cvv#70, has_chip#71, cards_issued#72, credit_limit#73, year_pin_last_changed#74, card_on_dark_web#75, acct_opened_month#76, acct_opened_year#77, expires_month#78, expires_year#79]\n                           +- Project [user#64 AS user_id#497, card_index#65, card_brand#66, card_type#67, card_number#68, expires#69, cvv#70, has_chip#71, cards_issued#72, credit_limit#73, year_pin_last_changed#74, card_on_dark_web#75, acct_opened_month#76, acct_opened_year#77, expires_month#78, expires_year#79]\n                              +- SubqueryAlias `financial_db`.`cards_silver`\n                                 +- HiveTableRelation `financial_db`.`cards_silver`, org.apache.hadoop.hive.ql.io.orc.OrcSerde, [user#64, card_index#65, card_brand#66, card_type#67, card_number#68, expires#69, cvv#70, has_chip#71, cards_issued#72, credit_limit#73, year_pin_last_changed#74, card_on_dark_web#75, acct_opened_month#76, acct_opened_year#77, expires_month#78, expires_year#79]\n"

In [None]:
#==========================================
# 7. Aggregation (The Golden Layer Logic)
# ==========================================
print("--- Step 7: Aggregating into 15-Minute Windows ---")

df_gold_mart = df_enriched.groupBy(
    window(col("transaction_timestamp"), "15 minutes")
).agg(
    # --- 1. Core Forecasting Metrics (Target Variables) ---
    count("*").alias("transaction_count"),      # Load (TPS)
    sum("amount").alias("total_volume"),        # Liability ($)
    avg("amount").alias("avg_transaction_size"),
    
    # --- 2. System Health Metrics ---
    # Adjust 'errors' column name if it was renamed to 'transaction_errors' earlier
    count(when(col("transaction_errors") != "N/A", 1)).alias("error_count"),
    count(when(col("is_fraud") == "Yes", 1)).alias("fraud_count"),
    
    # --- 3. Enriched Behavioral Metrics (NEW) ---
    # These help the model understand WHY load is happening
    
    # Financial Health of current traffic
    avg("debt_to_income_ratio").alias("avg_debt_ratio"),
    avg("credit_power").alias("avg_credit_power"),
    
    # Traveller Traffic (Spikes during holidays)
    sum("is_travelling").alias("traveller_count"),
    
    # Life Stage Breakdown (Pivot Logic)
    # Count how many of each group are active in this window
    count(when(col("life_stage") == "Young_Adult", 1)).alias("cnt_young_adult"),
    count(when(col("life_stage") == "Working_Professional", 1)).alias("cnt_professional"),
    count(when(col("life_stage") == "Retired", 1)).alias("cnt_retired"),
    
    # Tech Type Breakdown
    count(when(col("tech_type") == "Chip", 1)).alias("cnt_chip"),
    count(when(col("tech_type") == "Online", 1)).alias("cnt_online"),
    count(when(col("tech_type") == "Swipe", 1)).alias("cnt_swipe")
)

In [None]:
# ==========================================
# 8. Final Formatting & Save
# ==========================================
print("--- Step 8: Formatting & Saving Data Mart ---")

# Extract 'ds' timestamp for Prophet and clean up schema
df_final_gold = df_gold_mart.withColumn(
    "ds", 
    col("window.start")
).select(
    "ds",
    # Targets
    col("transaction_count").cast("long"),
    col("total_volume").cast("decimal(18,2)"),
    col("avg_transaction_size").cast("decimal(10,2)"),
    
    # Health
    col("error_count").cast("long"),
    col("fraud_count").cast("long"),
    
    # Features
    col("avg_debt_ratio").cast("decimal(10,4)"),
    col("traveller_count").cast("long"),
    col("cnt_young_adult").cast("long"),
    col("cnt_professional").cast("long"),
    col("cnt_retired").cast("long"),
    col("cnt_online").cast("long"),
    col("cnt_chip").cast("long")
).fillna(0) # Impute Nulls with 0 for clean Time Series

# Preview
df_final_gold.orderBy("ds").show(10, truncate=False)
df_final_gold.printSchema()

In [20]:
# ==========================================
# 6. Preview Output
# ==========================================
print("--- Step 6: Final Data Mart Preview ---")
df_final.printSchema()
df_final.show(10)

--- Step 6: Final Data Mart Preview ---
root
 |-- user_id: integer (nullable = true)
 |-- card_id: integer (nullable = true)
 |-- user_age: long (nullable = true)
 |-- gender: string (nullable = true)
 |-- city: string (nullable = true)
 |-- state: string (nullable = true)
 |-- per_capita_income: decimal(38,18) (nullable = true)
 |-- yearly_income: decimal(38,18) (nullable = true)
 |-- fico_score: long (nullable = true)
 |-- num_credit_cards: long (nullable = true)
 |-- total_debt: decimal(38,18) (nullable = true)
 |-- year: integer (nullable = true)
 |-- month: integer (nullable = true)
 |-- day: integer (nullable = true)
 |-- time: string (nullable = true)
 |-- card_type: string (nullable = true)
 |-- credit_limit: decimal(10,2) (nullable = true)
 |-- has_chip: string (nullable = true)
 |-- dark_web_exposure: string (nullable = true)
 |-- age_of_card: integer (nullable = true)

+-------+-------+--------+------+-----------+-----+--------------------+--------------------+----------+---

In [21]:
print("--- Step 6: Saving as External Hive Table ---")
# Path for the External Table
gold_path = "/user/talentum/projectMaster/warehouseDir/gold/financial_load_forecasting"

--- Step 6: Saving as External Hive Table ---


In [None]:

spark.conf.set("spark.sql.orc.enableVectorizedReader", "false")
spark.conf.set("spark.sql.hive.convertMetastoreOrc", "false")

spark.conf.set("spark.sql.autoBroadcastJoinThreshold", -1)