In [23]:
import os
import sys

# Point this kernel at the local Spark installation and make the Python
# libraries bundled with it (PySpark and Py4J) importable.
SPARK_HOME_DIR = "/home/talentum/spark"
SPARK_PYLIB_DIR = SPARK_HOME_DIR + "/python/lib"

os.environ.update({
    "SPARK_HOME": SPARK_HOME_DIR,
    "PYLIB": SPARK_PYLIB_DIR,
    # For Python 2, point both interpreter entries at /usr/bin/python2.7.
    # NOTE(review): the two paths below name different executables
    # (python3.6 vs python3) -- confirm they resolve to the same version.
    "PYSPARK_PYTHON": "/usr/bin/python3.6",
    "PYSPARK_DRIVER_PYTHON": "/usr/bin/python3",
})

# Each archive is pushed to the front of sys.path in turn, so pyspark.zip
# ends up ahead of the Py4J archive.
for lib_archive in ("/py4j-0.10.7-src.zip", "/pyspark.zip"):
    sys.path.insert(0, SPARK_PYLIB_DIR + lib_archive)

# Extra packages handed to spark-submit when the JVM is launched.
# Alternatives that were used previously:
#   '--packages com.databricks:spark-xml_2.11:0.6.0 pyspark-shell'
#   '--packages org.apache.spark:spark-avro_2.11:2.4.0 pyspark-shell'
#   '--packages com.databricks:spark-xml_2.11:0.6.0,org.apache.spark:spark-avro_2.11:2.4.0 pyspark-shell'
os.environ['PYSPARK_SUBMIT_ARGS'] = '--packages com.databricks:spark-xml_2.11:0.6.0,org.apache.spark:spark-avro_2.11:2.4.3 pyspark-shell'

In [24]:
# Entry point (Spark 2.x): build -- or reuse, if one already exists -- the
# Hive-enabled SparkSession used by every later cell.
from pyspark.sql import SparkSession

spark = (
    SparkSession.builder
    .appName("create_fraud_detection_dataMart")
    .config("spark.driver.memory", "8g")
    .enableHiveSupport()
    .getOrCreate()
)

# To run on YARN instead, add .master("yarn") to the builder chain, e.g.:
# spark = SparkSession.builder.appName("Spark SQL basic example").enableHiveSupport().master("yarn").getOrCreate()

# Handle to the underlying SparkContext.
sc = spark.sparkContext

In [25]:
# CRITICAL FIXES: run this cell BEFORE any table is loaded. The settings were
# introduced to work around a ClassCastException while reading the Hive
# tables; they trade speed for a more conservative execution path.
STABILITY_SETTINGS = [
    # 1. Disable the vectorized ORC reader and read ORC tables through the
    #    Hive SerDe instead of Spark's native reader.
    ("spark.sql.orc.enableVectorizedReader", "false"),
    ("spark.sql.hive.convertMetastoreOrc", "false"),
    # 2. Never plan a broadcast join (-1 turns the size threshold off).
    ("spark.sql.autoBroadcastJoinThreshold", "-1"),
    # 3. Disable the cost-based optimizer and whole-stage code generation.
    ("spark.sql.cbo.enabled", "false"),
    ("spark.sql.codegen.wholeStage", "false"),
    # 4. Force the Hive SerDe for Parquet tables as well. The key used here
    #    before, "spark.sql.hive.convertMetastore", is not a Spark option:
    #    it was stored without error but had no effect.
    ("spark.sql.hive.convertMetastoreParquet", "false"),
]

for setting_key, setting_value in STABILITY_SETTINGS:
    spark.conf.set(setting_key, setting_value)

print("Spark Config Updated for maximum stability.")

Spark Config Updated for maximum stability.


In [26]:
import pandas as pd
import numpy as np
import pyspark.sql.functions as F
from pyspark.sql.types import StructType, StructField,DoubleType , IntegerType, StringType, DecimalType, TimestampType, BooleanType

In [28]:
# 2. Load the Silver-layer tables straight from the Hive metastore, addressing
#    each one as '<database_name>.<table_name>'.

print("Loading Silver Layer tables from Hive...")

# A. Transactions: expose the 'user' / 'card' columns as 'user_id' / 'card_id'.
df_trans_silver = (
    spark.table("financial_db.transactions_silver")
    .withColumnRenamed("user", "user_id")
    .withColumnRenamed("card", "card_id")
)

# B. Users and C. Cards are taken as-is.
df_users_silver = spark.table("financial_db.users_silver")
df_cards_silver = spark.table("financial_db.cards_silver")

# 3. Quick verification: one row count per table (each count() runs a job).
print(f"Transactions Count: {df_trans_silver.count()}")
print(f"Users Count: {df_users_silver.count()}")
print(f"Cards Count: {df_cards_silver.count()}")

# 4. Dump the schemas, in the same order, to check the column types.
for silver_df in (df_trans_silver, df_users_silver, df_cards_silver):
    silver_df.printSchema()

Loading Silver Layer tables from Hive...
Transactions Count: 24386900
Users Count: 2000
Cards Count: 6146
root
 |-- user_id: integer (nullable = true)
 |-- card_id: integer (nullable = true)
 |-- amount: decimal(10,2) (nullable = true)
 |-- use_chip: string (nullable = true)
 |-- merchant_name: string (nullable = true)
 |-- merchant_city: string (nullable = true)
 |-- merchant_state: string (nullable = true)
 |-- zip: string (nullable = true)
 |-- mcc: integer (nullable = true)
 |-- errors: string (nullable = true)
 |-- is_fraud: string (nullable = true)
 |-- transaction_timestamp: timestamp (nullable = true)
 |-- year: integer (nullable = true)
 |-- month: integer (nullable = true)

root
 |-- person_id: string (nullable = true)
 |-- current_age: integer (nullable = true)
 |-- retirement_age: integer (nullable = true)
 |-- birth_year: integer (nullable = true)
 |-- birth_month: integer (nullable = true)
 |-- gender: string (nullable = true)
 |-- address: string (nullable = true)
 |-- a

In [29]:
print("--- 2. Fixing Users Table ---")

# LongType is needed only for the generated id column below.
from pyspark.sql.types import LongType

# 1. Prefix every user row with its 0-based position in the RDD; that position
#    becomes 'user_id'.
# NOTE(review): this assumes the order in which the users table is scanned
# matches the 'user' ids referenced by the transactions and cards tables --
# confirm, since the visible code does nothing to enforce a row order.
rdd_with_id = df_users_silver.rdd.zipWithIndex().map(lambda x: (x[1],) + tuple(x[0]))

# 2. Rebuild the DataFrame with 'user_id' as the first column.
#    An explicit schema (the id column + the table's own fields) is passed
#    instead of a bare list of names, so Spark does not have to re-infer the
#    column types from the first rows. Inference fails on an empty table or on
#    a column that is null in every sampled row, and it does not keep the
#    declared types (e.g. integer columns come back as bigint).
new_column_names = ["user_id"] + df_users_silver.columns  # kept in case later cells reference it
users_indexed_schema = StructType(
    [StructField("user_id", LongType(), True)] + df_users_silver.schema.fields
)
df_users_indexed = spark.createDataFrame(rdd_with_id, users_indexed_schema)

# 3. Keep and rename only the columns needed downstream.
#    Income: cast to string, strip every character that is not a digit, '.'
#    or '-', cast to double, and fall back to 0.0 when the result is null.
yearly_income_clean = F.coalesce(
    F.regexp_replace(F.col("yearly_income_person").cast("string"), "[^0-9\\.\\-]", "").cast("double"),
    F.lit(0.0)
)

df_users_prep = df_users_indexed.select(
    F.col("user_id"),
    F.col("person_id").alias("person_name"),
    F.col("current_age"),
    yearly_income_clean.alias("yearly_income"),
    F.col("fico_score"),
    F.col("state").alias("user_home_state")
)

print("Users Table Prepped (Income Strictly Cleaned).")

--- 2. Fixing Users Table ---
Users Table Prepped (Income Strictly Cleaned).


In [30]:
# A. Prepare Transactions
# 'user_id' and 'card_id' already exist on df_trans_silver, so they are
# selected directly rather than aliased from 'user' / 'card'.

# Defensive cleaning: amount as double, with null replaced by 0.0.
trans_amount = F.coalesce(F.col("amount").cast("double"), F.lit(0.0))

# Binary label: 1 when is_fraud is exactly "Yes", 0 for anything else.
trans_fraud_label = F.when(F.col("is_fraud") == "Yes", 1).otherwise(0)

df_trans_prep = df_trans_silver.select(
    F.col("user_id"),
    F.col("card_id"),
    F.col("transaction_timestamp"),
    F.col("merchant_name"),
    F.col("merchant_state"),
    F.col("zip"),  # already a string in the Silver schema, taken unchanged
    F.col("mcc").alias("merchant_category"),
    F.col("errors").alias("error_code"),
    F.col("use_chip"),
    trans_amount.alias("amount"),
    trans_fraud_label.alias("label_is_fraud"),
    # Derived feature: hour component of the transaction timestamp (last column).
    F.hour(F.col("transaction_timestamp")).alias("hour_of_day"),
)

# B. Prepare Cards
# The cards table still exposes 'user' and 'card_index'; they are renamed here
# to 'user_id' / 'card_id' to line up with the transactions columns.

# Defensive cleaning: credit limit as double, with null replaced by 0.0.
cards_credit_limit = F.coalesce(F.col("credit_limit").cast("double"), F.lit(0.0))

df_cards_prep = df_cards_silver.select(
    F.col("user").alias("user_id"),
    F.col("card_index").alias("card_id"),
    F.col("card_brand"),
    F.col("card_type"),
    cards_credit_limit.alias("credit_limit"),
)

print("Transactions & Cards Prepped (Schema Updated).")

Transactions & Cards Prepped (Schema Updated).
