In [1]:
# Initialization: point this kernel at the local Spark install and make the
# bundled PySpark / Py4J archives importable. This must run before anything
# imports pyspark.
import os
import sys

os.environ["SPARK_HOME"] = "/home/talentum/spark"
os.environ["PYLIB"] = os.environ["SPARK_HOME"] + "/python/lib"

# Interpreters for the workers (PYSPARK_PYTHON) and the driver
# (PYSPARK_DRIVER_PYTHON). Use /usr/bin/python2.7 in both if you want Python 2.
# The worker key previously contained a stray space ("PYSPARK_ PYTHON"), so the
# variable Spark actually reads was never set.
# NOTE(review): the two paths differ (python3.6 vs python3); confirm they
# resolve to the same Python version on this machine.
os.environ["PYSPARK_PYTHON"] = "/usr/bin/python3.6"
os.environ["PYSPARK_DRIVER_PYTHON"] = "/usr/bin/python3"

# Put the archives shipped under $SPARK_HOME/python/lib at the front of sys.path.
sys.path.insert(0, os.environ["PYLIB"] + "/py4j-0.10.7-src.zip")
sys.path.insert(0, os.environ["PYLIB"] + "/pyspark.zip")

# Extra packages handed to spark-submit when the session starts.
# Active set: spark-xml 0.6.0 + spark-avro 2.4.3.
os.environ['PYSPARK_SUBMIT_ARGS'] = '--packages com.databricks:spark-xml_2.11:0.6.0,org.apache.spark:spark-avro_2.11:2.4.3 pyspark-shell'

# Alternative package sets (use one of these instead of the active line above):
# os.environ['PYSPARK_SUBMIT_ARGS'] = '--packages com.databricks:spark-xml_2.11:0.6.0 pyspark-shell'
# os.environ['PYSPARK_SUBMIT_ARGS'] = '--packages org.apache.spark:spark-avro_2.11:2.4.0 pyspark-shell'
# os.environ['PYSPARK_SUBMIT_ARGS'] = '--packages com.databricks:spark-xml_2.11:0.6.0,org.apache.spark:spark-avro_2.11:2.4.0 pyspark-shell'

In [2]:
# Spark 2.x entry point: build (or reuse) a Hive-enabled SparkSession.
from pyspark.sql import SparkSession

spark = (
    SparkSession.builder
    .appName("create_merchant_recommendation_dataMart")
    .enableHiveSupport()
    .getOrCreate()
)

# To run on YARN instead, add .master("yarn") to the builder chain, e.g.:
# spark = SparkSession.builder.appName("Spark SQL basic example").enableHiveSupport().master("yarn").getOrCreate()

# Handle to the SparkContext behind the session.
sc = spark.sparkContext

In [3]:
# CRITICAL FIXES: must be run BEFORE loading data to prevent ClassCastException.

# 1. ORC: turn off the vectorized reader and stop Spark from converting
#    metastore ORC tables to its native data source, so they are read through
#    the Hive SerDe.
spark.conf.set("spark.sql.orc.enableVectorizedReader", "false")
spark.conf.set("spark.sql.hive.convertMetastoreOrc", "false")

# 2. Parquet: use the Hive SerDe for metastore Parquet tables as well.
#    Spark has no generic "spark.sql.hive.convertMetastore" option; the switch
#    is per format (...Orc above, ...Parquet here), so the generic key that was
#    set here before had no effect.
spark.conf.set("spark.sql.hive.convertMetastoreParquet", "false")

# Optional fallbacks, currently DISABLED. Uncomment only if crashes persist.
# - Disable broadcast joins:
# spark.conf.set("spark.sql.autoBroadcastJoinThreshold", -1)
# - Disable cost-based optimization and whole-stage code generation:
# spark.conf.set("spark.sql.cbo.enabled", "false")
# spark.conf.set("spark.sql.codegen.wholeStage", "false")

print("Spark Config Updated for maximum stability.")

Spark Config Updated for maximum stability.


In [4]:
import pandas as pd
import numpy as np
import pyspark.sql.functions as F 
from pyspark.sql.types import StructType, StructField, DoubleType, IntegerType, StringType, DecimalType, TimestampType

In [5]:
# Read the three silver-layer tables from the metastore, addressing each one
# as "database_name.table_name".
print("Loading Silver Layer table from Hive..")

# 1. Table handles
df_trans = spark.table("financial_db.transactions_silver")  # transactions
df_users = spark.table("financial_db.users_silver")         # users
df_cards = spark.table("financial_db.cards_silver")         # cards

# 2. Verification: row count per table (each count() runs a Spark job)
print("Transaction Count:{}".format(df_trans.count()))
print("Users Count: {}".format(df_users.count()))
print("Cards Count: {}".format(df_cards.count()))

# 3. Print each schema so the column types can be checked by eye
df_trans.printSchema()
df_users.printSchema()
df_cards.printSchema()

Loading Silver Layer table from Hive..
Transaction Count:24386900
Users Count: 2000
Cards Count: 6146
root
 |-- user_id: integer (nullable = true)
 |-- card_id: integer (nullable = true)
 |-- amount: decimal(10,2) (nullable = true)
 |-- use_chip: string (nullable = true)
 |-- merchant_name: string (nullable = true)
 |-- merchant_city: string (nullable = true)
 |-- merchant_state: string (nullable = true)
 |-- zip: string (nullable = true)
 |-- mcc: integer (nullable = true)
 |-- errors: string (nullable = true)
 |-- is_fraud: string (nullable = true)
 |-- transaction_timestamp: timestamp (nullable = true)
 |-- year: integer (nullable = true)
 |-- month: integer (nullable = true)

root
 |-- person_id: string (nullable = true)
 |-- current_age: integer (nullable = true)
 |-- retirement_age: integer (nullable = true)
 |-- birth_year: integer (nullable = true)
 |-- birth_month: integer (nullable = true)
 |-- gender: string (nullable = true)
 |-- address: string (nullable = true)
 |-- apart

In [6]:
# Preview the transactions table: first 20 rows, long cell values truncated
# (these are show()'s defaults, written out explicitly).
df_trans.show(n=20, truncate=True)

+-------+-------+------+--------+--------------------+--------------+--------------+-----+----+------+--------+---------------------+----+-----+
|user_id|card_id|amount|use_chip|       merchant_name| merchant_city|merchant_state|  zip| mcc|errors|is_fraud|transaction_timestamp|year|month|
+-------+-------+------+--------+--------------------+--------------+--------------+-----+----+------+--------+---------------------+----+-----+
|    831|      1| 39.98|  Online|-2088492411650162548|        ONLINE|          null|00000|4784|   N/A|      No|  1997-07-01 11:15:00|1997|    7|
|   1773|      1| 94.60|   Swipe|-3439821459006504987|      O Fallon|            MO|63366|5411|   N/A|      No|  1997-07-01 10:38:00|1997|    7|
|    831|      1|  5.87|   Swipe|   97032797689821735|   Long Valley|            NJ|07853|5411|   N/A|      No|  1997-07-01 13:13:00|1997|    7|
|   1773|      1|  8.55|   Swipe|-4462026225108433970|      O Fallon|            MO|63366|5912|   N/A|      No|  1997-07-02 14:41: