In [1]:
import os
import sys

os.environ["SPARK_HOME"] = "/home/talentum/spark"
os.environ["PYLIB"] = os.environ["SPARK_HOME"] + "/python/lib"
# In below two lines, use /usr/bin/python2.7 if you want to use Python 2
os.environ["PYSPARK_PYTHON"] = "/usr/bin/python3.6" 
os.environ["PYSPARK_DRIVER_PYTHON"] = "/usr/bin/python3"
sys.path.insert(0, os.environ["PYLIB"] +"/py4j-0.10.7-src.zip")
sys.path.insert(0, os.environ["PYLIB"] +"/pyspark.zip")

# NOTE: Whichever package you want mention here.
# os.environ['PYSPARK_SUBMIT_ARGS'] = '--packages com.databricks:spark-xml_2.11:0.6.0 pyspark-shell' 
# os.environ['PYSPARK_SUBMIT_ARGS'] = '--packages org.apache.spark:spark-avro_2.11:2.4.0 pyspark-shell'
os.environ['PYSPARK_SUBMIT_ARGS'] = '--packages com.databricks:spark-xml_2.11:0.6.0,org.apache.spark:spark-avro_2.11:2.4.3 pyspark-shell'
# os.environ['PYSPARK_SUBMIT_ARGS'] = '--packages com.databricks:spark-xml_2.11:0.6.0,org.apache.spark:spark-avro_2.11:2.4.0 pyspark-shell'

In [2]:
#Entrypoint 2.x
#1. Create spark session obj
from pyspark.sql import SparkSession
spark = SparkSession.builder.appName("create_fraud_detection_dataMart").config("spark.driver.memory", "8g").enableHiveSupport().getOrCreate()

# On yarn:
# spark = SparkSession.builder.appName("Spark SQL basic example").enableHiveSupport().master("yarn").getOrCreate()
# specify .master("yarn")

sc = spark.sparkContext

In [3]:
# CRITICAL FIXES: Must be run BEFORE loading data to prevent ClassCastException
# 1. Disable Vectorized Reader (Avoids low-level ORC data reading crash)
spark.conf.set("spark.sql.orc.enableVectorizedReader", "false")
spark.conf.set("spark.sql.hive.convertMetastoreOrc", "false")
# 2. Disable Broadcast Join (Avoids memory/shuffle crash on join)
spark.conf.set("spark.sql.autoBroadcastJoinThreshold", -1) 
# 3. Disable Spark's optimizing components (Forces safer execution path)
spark.conf.set("spark.sql.cbo.enabled", "false") 
spark.conf.set("spark.sql.codegen.wholeStage", "false")
# 4. Force Hive SerDe (Ultimate attempt to bypass native Spark reader)
spark.conf.set("spark.sql.hive.convertMetastore", "false") 

print("Spark Config Updated for maximum stability.")

Spark Config Updated for maximum stability.


In [4]:
import pandas as pd
import numpy as np
import pyspark.sql.functions as F
from pyspark.sql.types import StructType, StructField,DoubleType , IntegerType, StringType, DecimalType, TimestampType, BooleanType

In [8]:
# --- 2. Load the Tables directly from the Database ---
print("Loading Silver Layer tables from Hive...")

df_trans_silver = spark.table("financial_db.transactions_silver")
df_users_silver = spark.table("financial_db.users_silver")
df_cards_silver = spark.table("financial_db.cards_silver")

print("--- 2. Fixing Users Table ---")

# 1. Generate 'user_id' using Row Number
rdd_with_id = df_users_silver.rdd.zipWithIndex().map(lambda x: (x[1],) + tuple(x[0]))

# 2. Create DataFrame with 'user_id'
new_column_names = ["user_id"] + df_users_silver.columns
df_users_indexed = spark.createDataFrame(rdd_with_id, new_column_names)

# 3. Select and Rename Users
df_users_prep = df_users_indexed.select(
    F.col("user_id"),
    F.col("person_id").alias("person_name"),
    F.col("current_age"),
    F.coalesce(
        F.regexp_replace(F.col("yearly_income_person").cast("string"), "[^0-9\\.\\-]", "").cast("double"), 
        F.lit(0.0)
    ).alias("yearly_income"),
    F.col("fico_score"),
    F.col("state").alias("user_home_state")
)

print("--- 3. Prep Transactions & Cards ---")

# A. Prepare Transactions
# FIX: Map 'user' -> 'user_id' and 'card' -> 'card_id' during the select
df_trans_prep = df_trans_silver.select(
    F.col("user").alias("user_id"),      # Changed from user_id to user.alias
    F.col("card").alias("card_id"),      # Changed from card_id to card.alias
    F.col("transaction_timestamp"),
    F.col("merchant_name"),
    F.col("merchant_state"),
    F.col("zip"),
    F.col("mcc").alias("merchant_category"), 
    F.col("errors").alias("error_code"), 
    F.col("use_chip"),
    F.coalesce(F.col("amount").cast("double"), F.lit(0.0)).alias("amount"),
    F.when(F.col("is_fraud") == "Yes", 1).otherwise(0).alias("label_is_fraud")
).withColumn("hour_of_day", F.hour(F.col("transaction_timestamp")))

# B. Prepare Cards
df_cards_prep = df_cards_silver.select(
    F.col("user").alias("user_id"),
    F.col("card_index").alias("card_id"),
    F.col("card_brand"),
    F.col("card_type"),
    F.coalesce(F.col("credit_limit").cast("double"), F.lit(0.0)).alias("credit_limit")
)

print("Transactions & Cards Prepped successfully.")

Loading Silver Layer tables from Hive...
--- 2. Fixing Users Table ---
--- 3. Prep Transactions & Cards ---
Transactions & Cards Prepped successfully.


In [12]:
from pyspark.sql import functions as F
from pyspark.sql.window import Window


In [11]:
# --- 4. Transactions -> customer-level Features ---

# Use df_trans_prep since we already renamed 'user' to 'user_id' 
# and 'card' to 'card_id' there.

# 1. Reference date (latest transaction date in data)
ref_date = df_trans_prep.agg(
    F.max("transaction_timestamp").alias("max_ts")
).collect()[0]["max_ts"]

# 2. Aggregate transaction features per user
txn_features = (
    df_trans_prep  # Using the prepped dataframe
    .groupBy("user_id") # This column name now exists
    .agg(
        F.count("*").alias("total_transactions"),
        F.sum("amount").alias("total_spent"),
        F.avg("amount").alias("avg_transaction_amount"),
        F.max("transaction_timestamp").alias("last_transaction_date"),
        F.countDistinct("merchant_name").alias("distinct_merchants"),
        F.countDistinct("merchant_category").alias("distinct_mcc"), # Use the alias we created earlier
        F.sum(F.col("label_is_fraud")).alias("fraud_txn_count"), # Use the integer column we created
        F.sum(F.when(F.col("error_code").isNotNull(), 1).otherwise(0)).alias("error_txn_count"), # Use the alias
        F.sum(
            F.when(
                F.col("transaction_timestamp") >= F.date_sub(F.lit(ref_date), 90), 1
            ).otherwise(0)
        ).alias("transactions_last_90_days")
    )
)

print("Customer-level features aggregated successfully.")
txn_features.show(5)

Customer-level features aggregated successfully.
+-------+------------------+------------------+----------------------+---------------------+------------------+------------+---------------+---------------+-------------------------+
|user_id|total_transactions|       total_spent|avg_transaction_amount|last_transaction_date|distinct_merchants|distinct_mcc|fraud_txn_count|error_txn_count|transactions_last_90_days|
+-------+------------------+------------------+----------------------+---------------------+------------------+------------+---------------+---------------+-------------------------+
|   1238|             14520| 415272.9400000002|    28.600064738292023|  2020-02-28 17:43:00|               475|          84|              3|          14520|                      245|
|   1088|             24068|2007528.7300000004|     83.41070009971749|  2020-02-28 19:06:00|               615|          83|              8|          24068|                      326|
|    148|             22290|1048048.

In [13]:
#Churn Label Creation

txn_features = txn_features.withColumn(
    "days_since_last_transaction",
    F.datediff(F.lit(ref_date), F.col("last_transaction_date"))
)

# Churn = 1 if no transaction in last 90 days
txn_features = txn_features.withColumn(
    "churn",
    F.when(F.col("days_since_last_transaction") > 90, 1).otherwise(0)
)


In [14]:
#Users Table -> Select Relevant Columns

user_features = (
    df_users_silver
    .select(
        F.col("person_id").alias("user_id"),
        "current_age",
        "gender",
        "state",
        "yearly_income_person",
        "total_debt",
        "fico_score",
        "num_credit_cards"
    )
)


In [15]:
#Cards Table -> Aggregate Per User

card_features = (
    df_cards_silver
    .groupBy(F.col("user").alias("user_id"))
    .agg(
        F.count("*").alias("total_cards"),
        F.avg("credit_limit").alias("avg_credit_limit"),
        F.max("credit_limit").alias("max_credit_limit"),
        F.sum(F.when(F.col("card_on_dark_web") == "Yes", 1).otherwise(0))
            .alias("cards_on_dark_web"),
        F.avg(F.when(F.col("has_chip") == "Yes", 1).otherwise(0))
            .alias("chip_enabled_ratio")
    )
)


In [16]:
#Join All Features

final_df = (
    txn_features
    .join(user_features, on="user_id", how="left")
    .join(card_features, on="user_id", how="left")
)


In [17]:
from pyspark.sql.functions import col

final_df = (
    final_df
    .withColumn("total_spent", col("total_spent").cast("double"))
    .withColumn("avg_transaction_amount", col("avg_transaction_amount").cast("double"))
    .withColumn("yearly_income_person", col("yearly_income_person").cast("double"))
    .withColumn("total_debt", col("total_debt").cast("double"))
    .withColumn("avg_credit_limit", col("avg_credit_limit").cast("double"))
    .withColumn("max_credit_limit", col("max_credit_limit").cast("double"))
)


In [18]:
final_df = final_df.fillna({
    "fraud_txn_count": 0,
    "error_txn_count": 0,
    "transactions_last_90_days": 0,
    "total_cards": 0,
    "cards_on_dark_web": 0,
    "chip_enabled_ratio": 0,
    "avg_credit_limit": 0,
    "max_credit_limit": 0
})


In [19]:
final_df.printSchema()
# final_df.sample(fraction=0.0001, seed=42).show(5)


# print("Final dataset count:", final_df.count())


root
 |-- user_id: integer (nullable = true)
 |-- total_transactions: long (nullable = false)
 |-- total_spent: double (nullable = true)
 |-- avg_transaction_amount: double (nullable = true)
 |-- last_transaction_date: timestamp (nullable = true)
 |-- distinct_merchants: long (nullable = false)
 |-- distinct_mcc: long (nullable = false)
 |-- fraud_txn_count: long (nullable = false)
 |-- error_txn_count: long (nullable = false)
 |-- transactions_last_90_days: long (nullable = false)
 |-- days_since_last_transaction: integer (nullable = true)
 |-- churn: integer (nullable = false)
 |-- current_age: integer (nullable = true)
 |-- gender: string (nullable = true)
 |-- state: string (nullable = true)
 |-- yearly_income_person: double (nullable = true)
 |-- total_debt: double (nullable = true)
 |-- fico_score: integer (nullable = true)
 |-- num_credit_cards: integer (nullable = true)
 |-- total_cards: long (nullable = false)
 |-- avg_credit_limit: double (nullable = false)
 |-- max_credit_li

In [20]:
final_df.show(10)

Py4JJavaError: An error occurred while calling o634.showString.
: org.apache.spark.SparkException: Job aborted due to stage failure: Task 0 in stage 19.0 failed 1 times, most recent failure: Lost task 0.0 in stage 19.0 (TID 5303, localhost, executor driver): java.lang.ClassCastException: org.apache.hadoop.io.IntWritable cannot be cast to org.apache.hadoop.io.Text
	at org.apache.hadoop.hive.serde2.objectinspector.primitive.WritableStringObjectInspector.getPrimitiveWritableObject(WritableStringObjectInspector.java:41)
	at org.apache.spark.sql.hive.HiveInspectors$$anonfun$unwrapperFor$23.apply(HiveInspectors.scala:547)
	at org.apache.spark.sql.hive.HadoopTableReader$$anonfun$14$$anonfun$apply$15.apply(TableReader.scala:444)
	at org.apache.spark.sql.hive.HadoopTableReader$$anonfun$14$$anonfun$apply$15.apply(TableReader.scala:444)
	at org.apache.spark.sql.hive.HadoopTableReader$$anonfun$fillObject$2.apply(TableReader.scala:460)
	at org.apache.spark.sql.hive.HadoopTableReader$$anonfun$fillObject$2.apply(TableReader.scala:451)
	at scala.collection.Iterator$$anon$11.next(Iterator.scala:410)
	at scala.collection.Iterator$$anon$11.next(Iterator.scala:410)
	at scala.collection.Iterator$$anon$13.hasNext(Iterator.scala:463)
	at org.apache.spark.sql.execution.aggregate.HashAggregateExec$$anonfun$doExecute$1$$anonfun$4.apply(HashAggregateExec.scala:105)
	at org.apache.spark.sql.execution.aggregate.HashAggregateExec$$anonfun$doExecute$1$$anonfun$4.apply(HashAggregateExec.scala:102)
	at org.apache.spark.rdd.RDD$$anonfun$mapPartitionsWithIndex$1$$anonfun$apply$25.apply(RDD.scala:875)
	at org.apache.spark.rdd.RDD$$anonfun$mapPartitionsWithIndex$1$$anonfun$apply$25.apply(RDD.scala:875)
	at org.apache.spark.rdd.MapPartitionsRDD.compute(MapPartitionsRDD.scala:52)
	at org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:346)
	at org.apache.spark.rdd.RDD.iterator(RDD.scala:310)
	at org.apache.spark.rdd.MapPartitionsRDD.compute(MapPartitionsRDD.scala:52)
	at org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:346)
	at org.apache.spark.rdd.RDD.iterator(RDD.scala:310)
	at org.apache.spark.scheduler.ShuffleMapTask.runTask(ShuffleMapTask.scala:99)
	at org.apache.spark.scheduler.ShuffleMapTask.runTask(ShuffleMapTask.scala:55)
	at org.apache.spark.scheduler.Task.run(Task.scala:123)
	at org.apache.spark.executor.Executor$TaskRunner$$anonfun$10.apply(Executor.scala:408)
	at org.apache.spark.util.Utils$.tryWithSafeFinally(Utils.scala:1360)
	at org.apache.spark.executor.Executor$TaskRunner.run(Executor.scala:414)
	at java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1149)
	at java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:624)
	at java.lang.Thread.run(Thread.java:750)

Driver stacktrace:
	at org.apache.spark.scheduler.DAGScheduler.org$apache$spark$scheduler$DAGScheduler$$failJobAndIndependentStages(DAGScheduler.scala:1891)
	at org.apache.spark.scheduler.DAGScheduler$$anonfun$abortStage$1.apply(DAGScheduler.scala:1879)
	at org.apache.spark.scheduler.DAGScheduler$$anonfun$abortStage$1.apply(DAGScheduler.scala:1878)
	at scala.collection.mutable.ResizableArray$class.foreach(ResizableArray.scala:59)
	at scala.collection.mutable.ArrayBuffer.foreach(ArrayBuffer.scala:48)
	at org.apache.spark.scheduler.DAGScheduler.abortStage(DAGScheduler.scala:1878)
	at org.apache.spark.scheduler.DAGScheduler$$anonfun$handleTaskSetFailed$1.apply(DAGScheduler.scala:927)
	at org.apache.spark.scheduler.DAGScheduler$$anonfun$handleTaskSetFailed$1.apply(DAGScheduler.scala:927)
	at scala.Option.foreach(Option.scala:257)
	at org.apache.spark.scheduler.DAGScheduler.handleTaskSetFailed(DAGScheduler.scala:927)
	at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.doOnReceive(DAGScheduler.scala:2112)
	at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.onReceive(DAGScheduler.scala:2061)
	at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.onReceive(DAGScheduler.scala:2050)
	at org.apache.spark.util.EventLoop$$anon$1.run(EventLoop.scala:49)
	at org.apache.spark.scheduler.DAGScheduler.runJob(DAGScheduler.scala:738)
	at org.apache.spark.SparkContext.runJob(SparkContext.scala:2061)
	at org.apache.spark.SparkContext.runJob(SparkContext.scala:2082)
	at org.apache.spark.SparkContext.runJob(SparkContext.scala:2101)
	at org.apache.spark.sql.execution.SparkPlan.executeTake(SparkPlan.scala:365)
	at org.apache.spark.sql.execution.CollectLimitExec.executeCollect(limit.scala:38)
	at org.apache.spark.sql.Dataset.org$apache$spark$sql$Dataset$$collectFromPlan(Dataset.scala:3389)
	at org.apache.spark.sql.Dataset$$anonfun$head$1.apply(Dataset.scala:2550)
	at org.apache.spark.sql.Dataset$$anonfun$head$1.apply(Dataset.scala:2550)
	at org.apache.spark.sql.Dataset$$anonfun$52.apply(Dataset.scala:3370)
	at org.apache.spark.sql.execution.SQLExecution$$anonfun$withNewExecutionId$1.apply(SQLExecution.scala:80)
	at org.apache.spark.sql.execution.SQLExecution$.withSQLConfPropagated(SQLExecution.scala:127)
	at org.apache.spark.sql.execution.SQLExecution$.withNewExecutionId(SQLExecution.scala:75)
	at org.apache.spark.sql.Dataset.withAction(Dataset.scala:3369)
	at org.apache.spark.sql.Dataset.head(Dataset.scala:2550)
	at org.apache.spark.sql.Dataset.take(Dataset.scala:2764)
	at org.apache.spark.sql.Dataset.getRows(Dataset.scala:254)
	at org.apache.spark.sql.Dataset.showString(Dataset.scala:291)
	at sun.reflect.NativeMethodAccessorImpl.invoke0(Native Method)
	at sun.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:62)
	at sun.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)
	at java.lang.reflect.Method.invoke(Method.java:498)
	at py4j.reflection.MethodInvoker.invoke(MethodInvoker.java:244)
	at py4j.reflection.ReflectionEngine.invoke(ReflectionEngine.java:357)
	at py4j.Gateway.invoke(Gateway.java:282)
	at py4j.commands.AbstractCommand.invokeMethod(AbstractCommand.java:132)
	at py4j.commands.CallCommand.execute(CallCommand.java:79)
	at py4j.GatewayConnection.run(GatewayConnection.java:238)
	at java.lang.Thread.run(Thread.java:750)
Caused by: java.lang.ClassCastException: org.apache.hadoop.io.IntWritable cannot be cast to org.apache.hadoop.io.Text
	at org.apache.hadoop.hive.serde2.objectinspector.primitive.WritableStringObjectInspector.getPrimitiveWritableObject(WritableStringObjectInspector.java:41)
	at org.apache.spark.sql.hive.HiveInspectors$$anonfun$unwrapperFor$23.apply(HiveInspectors.scala:547)
	at org.apache.spark.sql.hive.HadoopTableReader$$anonfun$14$$anonfun$apply$15.apply(TableReader.scala:444)
	at org.apache.spark.sql.hive.HadoopTableReader$$anonfun$14$$anonfun$apply$15.apply(TableReader.scala:444)
	at org.apache.spark.sql.hive.HadoopTableReader$$anonfun$fillObject$2.apply(TableReader.scala:460)
	at org.apache.spark.sql.hive.HadoopTableReader$$anonfun$fillObject$2.apply(TableReader.scala:451)
	at scala.collection.Iterator$$anon$11.next(Iterator.scala:410)
	at scala.collection.Iterator$$anon$11.next(Iterator.scala:410)
	at scala.collection.Iterator$$anon$13.hasNext(Iterator.scala:463)
	at org.apache.spark.sql.execution.aggregate.HashAggregateExec$$anonfun$doExecute$1$$anonfun$4.apply(HashAggregateExec.scala:105)
	at org.apache.spark.sql.execution.aggregate.HashAggregateExec$$anonfun$doExecute$1$$anonfun$4.apply(HashAggregateExec.scala:102)
	at org.apache.spark.rdd.RDD$$anonfun$mapPartitionsWithIndex$1$$anonfun$apply$25.apply(RDD.scala:875)
	at org.apache.spark.rdd.RDD$$anonfun$mapPartitionsWithIndex$1$$anonfun$apply$25.apply(RDD.scala:875)
	at org.apache.spark.rdd.MapPartitionsRDD.compute(MapPartitionsRDD.scala:52)
	at org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:346)
	at org.apache.spark.rdd.RDD.iterator(RDD.scala:310)
	at org.apache.spark.rdd.MapPartitionsRDD.compute(MapPartitionsRDD.scala:52)
	at org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:346)
	at org.apache.spark.rdd.RDD.iterator(RDD.scala:310)
	at org.apache.spark.scheduler.ShuffleMapTask.runTask(ShuffleMapTask.scala:99)
	at org.apache.spark.scheduler.ShuffleMapTask.runTask(ShuffleMapTask.scala:55)
	at org.apache.spark.scheduler.Task.run(Task.scala:123)
	at org.apache.spark.executor.Executor$TaskRunner$$anonfun$10.apply(Executor.scala:408)
	at org.apache.spark.util.Utils$.tryWithSafeFinally(Utils.scala:1360)
	at org.apache.spark.executor.Executor$TaskRunner.run(Executor.scala:414)
	at java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1149)
	at java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:624)
	... 1 more


In [None]:
df_cards_silver.select("expires").show(truncate=False)