In [1]:
# Initialization
import os
import sys
import glob
import shutil

# Location of the local Spark install and of the Python libraries bundled with it.
SPARK_HOME = "/home/talentum/spark"
PYLIB = SPARK_HOME + "/python/lib"

# Export the Spark locations and the Python interpreters PySpark should use.
# Switch both interpreter entries to /usr/bin/python2.7 to run on Python 2.
os.environ.update({
    "SPARK_HOME": SPARK_HOME,
    "PYLIB": PYLIB,
    "PYSPARK_PYTHON": "/usr/bin/python3.6",
    "PYSPARK_DRIVER_PYTHON": "/usr/bin/python3",
})

# Prepend the bundled archives to the import path so this kernel can import
# pyspark: pyspark.zip ends up first, immediately followed by the py4j archive.
sys.path[:0] = [PYLIB + "/pyspark.zip", PYLIB + "/py4j-0.10.7-src.zip"]

# Extra packages handed to spark-submit through --packages.
# Edit this list to change what gets loaded (e.g. keep only spark-xml, keep
# only spark-avro, or use spark-avro 2.4.0 instead of 2.4.3).
SPARK_PACKAGES = [
    "com.databricks:spark-xml_2.11:0.6.0",
    "org.apache.spark:spark-avro_2.11:2.4.3",
]
os.environ["PYSPARK_SUBMIT_ARGS"] = "--packages " + ",".join(SPARK_PACKAGES) + " pyspark-shell"

In [2]:
# Spark 2.x entry point: build (or reuse) a Hive-enabled SparkSession.
from pyspark.sql import SparkSession

# To run on YARN, add .master("yarn") to the chain before getOrCreate(), e.g.
#   SparkSession.builder.appName("Spark SQL basic example")
#       .enableHiveSupport().master("yarn").getOrCreate()
spark = (
    SparkSession.builder
    .appName("create_spend_analysis_dataMart")
    .enableHiveSupport()
    .getOrCreate()
)

# SparkContext that backs the session.
sc = spark.sparkContext

In [3]:
# 1. Load the Gold-layer data mart table from Hive.
print("Loading Gold layer data mart tables from Hive...")

# spark.table takes the standard 'database_name.table_name' form.
df = spark.table("financial_db.spend_analysis_gold")

# 2. Quick verification: row count first, then the schema.
row_total = df.count()
print(f"datamart rows Count: {row_total}")
df.printSchema()



Loading Gold layer data mart tables from Hive...
datamart rows Count: 10026790
root
 |-- txn_date: date (nullable = true)
 |-- year: integer (nullable = true)
 |-- month: integer (nullable = true)
 |-- age_group: string (nullable = true)
 |-- income_tier: string (nullable = true)
 |-- user_gender: string (nullable = true)
 |-- card_brand: string (nullable = true)
 |-- card_type: string (nullable = true)
 |-- spend_location_type: string (nullable = true)
 |-- mcc: integer (nullable = true)
 |-- total_spend_amount: decimal(20,2) (nullable = true)
 |-- transaction_count: long (nullable = true)
 |-- avg_ticket_size: decimal(14,6) (nullable = true)
 |-- fraud_spend_amount: decimal(22,2) (nullable = true)
 |-- max_transaction_val: decimal(10,2) (nullable = true)



In [4]:
# Export the data mart as ONE ORC file on the local Linux file system.
#
# Steps: write a single-partition ORC dataset into a temporary folder, then
# move the one part file Spark produced to its final name and delete the
# temporary folder.
#
# NOTE(review): the part file is looked up on this machine's disk, so this
# presumably assumes Spark runs with a local master; on a cluster the writing
# executor may be a different host -- confirm before running on YARN.

# 1. Define output paths.
# Spark gets a 'file://' URI so it writes to local disk instead of HDFS; the
# plain path is what os/glob/shutil use. The URI is derived from the plain
# path so the two cannot drift apart.
local_folder_path = '/home/talentum/projectMaster/dataStaging/temp_orc_folder'
temp_output_folder = 'file://' + local_folder_path
final_output_file = '/home/talentum/projectMaster/dataStaging/spend_analysis_final.orc'

# Make sure the staging directory tree exists.
# The previous final file is deliberately NOT deleted here: it is only
# replaced once a new file has been written successfully, so a failed run
# does not destroy the last good export.
os.makedirs(local_folder_path, exist_ok=True)

# 2. Coalesce and write.
# coalesce(1) forces all data into a single partition, i.e. one part file.
# mode("overwrite") clears whatever an earlier run left in the temp folder.
print("writing the orc file...")
df.coalesce(1)\
    .write\
    .mode("overwrite")\
    .orc(temp_output_folder)

# 3. Locate the generated part file.
list_of_files = glob.glob(f"{local_folder_path}/part-*.orc")

# Fail loudly instead of just printing: a printed message lets "Run All" (or
# an automated run) finish as if the export had succeeded. The temp folder is
# left in place in both failure cases so it can be inspected.
if not list_of_files:
    raise FileNotFoundError('Error: Could not find the generated ORC file.')
if len(list_of_files) > 1:
    # Moving only the first file and deleting the rest would silently drop data.
    raise RuntimeError(
        f"Expected exactly one ORC part file in {local_folder_path}, "
        f"found {len(list_of_files)}"
    )

# 4. Move the part file to its final name, replacing any file from a previous
# run, then remove the temp folder (it still holds Spark's marker/checksum files).
actual_file = list_of_files[0]
shutil.move(actual_file, final_output_file)
shutil.rmtree(local_folder_path)
print(f"Success! Single file is ready at: {final_output_file}")

writing the orc file...
Success! Single file is ready at: /home/talentum/projectMaster/dataStaging/spend_analysis_final.orc
