In [15]:
# Intialization
import os
import sys

os.environ["SPARK_HOME"] = "/home/talentum/spark"
os.environ["PYLIB"] = os.environ["SPARK_HOME"] + "/python/lib"
# In below two lines, use /usr/bin/python2.7 if you want to use Python 2
os.environ["PYSPARK_PYTHON"] = "/usr/bin/python3.6" 
os.environ["PYSPARK_DRIVER_PYTHON"] = "/usr/bin/python3"
sys.path.insert(0, os.environ["PYLIB"] +"/py4j-0.10.7-src.zip")
sys.path.insert(0, os.environ["PYLIB"] +"/pyspark.zip")

# NOTE: Whichever package you want mention here.
# os.environ['PYSPARK_SUBMIT_ARGS'] = '--packages com.databricks:spark-xml_2.11:0.6.0 pyspark-shell' 
# os.environ['PYSPARK_SUBMIT_ARGS'] = '--packages org.apache.spark:spark-avro_2.11:2.4.0 pyspark-shell'
os.environ['PYSPARK_SUBMIT_ARGS'] = '--packages com.databricks:spark-xml_2.11:0.6.0,org.apache.spark:spark-avro_2.11:2.4.3 pyspark-shell'
# os.environ['PYSPARK_SUBMIT_ARGS'] = '--packages com.databricks:spark-xml_2.11:0.6.0,org.apache.spark:spark-avro_2.11:2.4.0 pyspark-shell'

In [16]:
#Entrypoint 2.x
from pyspark.sql import SparkSession
spark = SparkSession.builder.appName("create_merchant_recommendation_dataMart").enableHiveSupport().getOrCreate()

# On yarn:
# spark = SparkSession.builder.appName("Spark SQL basic example").enableHiveSupport().master("yarn").getOrCreate()
# specify .master("yarn")

sc = spark.sparkContext

In [17]:
# 1. Disable Vectorized Reader (Avoids low-level ORC data reading crash)
spark.conf.set("spark.sql.orc.enableVectorizedReader", "false")
spark.conf.set("spark.sql.hive.convertMetastoreOrc", "false")
# 2. Disable Broadcast Join (Avoids memory/shuffle crash on join)
spark.conf.set("spark.sql.autoBroadcastJoinThreshold", -1) 
# 3. Disable Spark's optimizing components (Forces safer execution path)
spark.conf.set("spark.sql.cbo.enabled", "false") 
spark.conf.set("spark.sql.codegen.wholeStage", "false")
# 4. Force Hive SerDe (Ultimate attempt to bypass native Spark reader)
spark.conf.set("spark.sql.hive.convertMetastore", "false") 

print("Spark Config Updated for maximum stability.")

Spark Config Updated for maximum stability.


In [18]:
import pandas as pd
import numpy as np
from pyspark.sql import SparkSession
import pyspark.sql.functions as F
from pyspark.sql.types import DecimalType, LongType, IntegerType, TimestampType

In [22]:
# Load the Tables directly from the Database
# We use the standard 'database_name.table_name' format

print("Loading Silver Layer tables from Hive...")

# A. Transactions Table
df_trans_silver = spark.table("financial_db.transactions_silver")

# B. Users Table
df_users_silver = spark.table("financial_db.users_silver")

# C. Cards Table
df_cards_silver = spark.table("financial_db.cards_silver")

print("Loading command executed")

Loading Silver Layer tables from Hive...
Loading command executed


In [23]:
# run only if you are not sure the files are loaded
# Quick Verification
print(f"Transactions Count: {df_trans_silver.count()}")
print(f"Users Count: {df_users_silver.count()}")
print(f"Cards Count: {df_cards_silver.count()}")

Transactions Count: 24386900
Users Count: 2000
Cards Count: 6146


In [24]:
# Previewing Schema to ensure types are correct
df_trans_silver.printSchema()
df_users_silver.printSchema()
df_cards_silver.printSchema()

root
 |-- user_id: integer (nullable = true)
 |-- card_id: integer (nullable = true)
 |-- amount: decimal(10,2) (nullable = true)
 |-- use_chip: string (nullable = true)
 |-- merchant_name: string (nullable = true)
 |-- merchant_city: string (nullable = true)
 |-- merchant_state: string (nullable = true)
 |-- zip: string (nullable = true)
 |-- mcc: integer (nullable = true)
 |-- errors: string (nullable = true)
 |-- is_fraud: string (nullable = true)
 |-- transaction_timestamp: timestamp (nullable = true)
 |-- year: integer (nullable = true)
 |-- month: integer (nullable = true)

root
 |-- person_id: string (nullable = true)
 |-- current_age: integer (nullable = true)
 |-- retirement_age: integer (nullable = true)
 |-- birth_year: integer (nullable = true)
 |-- birth_month: integer (nullable = true)
 |-- gender: string (nullable = true)
 |-- address: string (nullable = true)
 |-- apartment: string (nullable = true)
 |-- city: string (nullable = true)
 |-- state: string (nullable = true

In [25]:
print("--- 2. Fixing Users Table ---")

# 1. Generate 'user_id' using Row Number (0, 1, 2...)
rdd_with_id = df_users_silver.rdd.zipWithIndex().map(lambda x: (x[1],) + tuple(x[0]))

# 2. Create DataFrame with 'user_id' as the first column
new_column_names = ["user_id"] + df_users_silver.columns
df_users_indexed = spark.createDataFrame(rdd_with_id, new_column_names)

# 3. Select and Rename (Strict Cleaning of Income)
df_users_prep = df_users_indexed.select(
    F.col("user_id"),
    F.col("person_id").alias("person_name"),
    F.col("current_age"),
    F.coalesce(
        F.regexp_replace(F.col("yearly_income_person").cast("string"), "[^0-9\\.\\-]", "").cast("double"), 
        F.lit(0.0)
    ).alias("yearly_income"),
    F.col("fico_score"),
    F.col("state").alias("user_home_state")
)

print("Users Table Prepped (Income Strictly Cleaned).")

--- 2. Fixing Users Table ---
Users Table Prepped (Income Strictly Cleaned).


In [31]:
#cheking the table updation
# df_users_prep.head(5)

In [32]:
print("--- 3. Prep Transactions & Cards ---")

# A. Prepare Transactions
df_trans_prep = df_trans_silver.select(
    F.col("user_id"),     
    F.col("card_id"),     
    F.col("transaction_timestamp"),
    F.col("merchant_name"),
    F.col("merchant_state"),
    F.col("zip"),         # Zip is already string in Silver, so strictly taking it is fine
    F.col("mcc").alias("merchant_category"), 
    F.col("errors").alias("error_code"), 
    F.col("use_chip"),
    F.coalesce(F.col("amount").cast("double"), F.lit(0.0)).alias("amount"),
    F.when(F.col("is_fraud") == "Yes", 1).otherwise(0).alias("label_is_fraud")
).withColumn("hour_of_day", F.hour(F.col("transaction_timestamp")))

# B. Prepare Cards
df_cards_prep = df_cards_silver.select(
    F.col("user").alias("user_id"),
    F.col("card_index").alias("card_id"),
    F.col("card_brand"),
    F.col("card_type"),
    F.coalesce(F.col("credit_limit").cast("double"), F.lit(0.0)).alias("credit_limit")
)

print("Transactions & Cards Prepped (Schema Updated).")

--- 3. Prep Transactions & Cards ---
Transactions & Cards Prepped (Schema Updated).


In [33]:
print("--- 4. Performing Joins (Transactions + Users + Cards) ---")

# 1. Join Transactions with Users (on 'user_id')
# We use "left" join to keep all transactions even if user data is missing (rare)
df_trans_users = df_trans_prep.join(
    df_users_prep,
    on="user_id",
    how="left"
)

# filtering features
df_trans_users = df_trans_users['user_id', 'card_id', ]

# 2. Join the Result with Cards (on 'user_id' AND 'card_id')
# This ensures we match the specific card used in the transaction
df_full_join = df_trans_users.join(
    df_cards_prep,
    on=["user_id", "card_id"],
    how="left"
)

print(f"Total Joined Rows: {df_full_join.count()}")

# Validation: Check schema to ensure no duplicate columns and keys are correct
print("--- Schema of Joined Data ---")
df_full_join.printSchema()

print("Preview of the Wide Table:")
df_full_join.select(
    "user_id", 
    "card_id", 
    "amount", 
    "person_name", 
    "card_brand", 
    "merchant_category"
).show(5)

--- 4. Performing Joins (Transactions + Users + Cards) ---
Total Joined Rows: 24386900
--- Schema of Joined Data ---
root
 |-- user_id: integer (nullable = true)
 |-- card_id: integer (nullable = true)
 |-- transaction_timestamp: timestamp (nullable = true)
 |-- merchant_name: string (nullable = true)
 |-- merchant_state: string (nullable = true)
 |-- zip: string (nullable = true)
 |-- merchant_category: integer (nullable = true)
 |-- error_code: string (nullable = true)
 |-- use_chip: string (nullable = true)
 |-- amount: double (nullable = false)
 |-- label_is_fraud: integer (nullable = false)
 |-- hour_of_day: integer (nullable = true)
 |-- person_name: string (nullable = true)
 |-- current_age: long (nullable = true)
 |-- yearly_income: double (nullable = true)
 |-- fico_score: long (nullable = true)
 |-- user_home_state: string (nullable = true)
 |-- card_brand: string (nullable = true)
 |-- card_type: string (nullable = true)
 |-- credit_limit: double (nullable = true)

Preview o