In [1]:
# Intialization
import os
import sys

os.environ["SPARK_HOME"] = "/home/talentum/spark"
os.environ["PYLIB"] = os.environ["SPARK_HOME"] + "/python/lib"
# In below two lines, use /usr/bin/python2.7 if you want to use Python 2
os.environ["PYSPARK_PYTHON"] = "/usr/bin/python3.6" 
os.environ["PYSPARK_DRIVER_PYTHON"] = "/usr/bin/python3"
sys.path.insert(0, os.environ["PYLIB"] +"/py4j-0.10.7-src.zip")
sys.path.insert(0, os.environ["PYLIB"] +"/pyspark.zip")

# NOTE: Whichever package you want mention here.
# os.environ['PYSPARK_SUBMIT_ARGS'] = '--packages com.databricks:spark-xml_2.11:0.6.0 pyspark-shell' 
# os.environ['PYSPARK_SUBMIT_ARGS'] = '--packages org.apache.spark:spark-avro_2.11:2.4.0 pyspark-shell'
os.environ['PYSPARK_SUBMIT_ARGS'] = '--packages com.databricks:spark-xml_2.11:0.6.0,org.apache.spark:spark-avro_2.11:2.4.3 pyspark-shell'
# os.environ['PYSPARK_SUBMIT_ARGS'] = '--packages com.databricks:spark-xml_2.11:0.6.0,org.apache.spark:spark-avro_2.11:2.4.0 pyspark-shell'

In [2]:
#Entrypoint 2.x
from pyspark.sql import SparkSession
spark = SparkSession.builder.appName("Transaction Data Cleaning").enableHiveSupport().getOrCreate()
spark.conf.set("spark.sql.parquet.outputTimestampType", "TIMESTAMP_MILLIS")
# On yarn:
# spark = SparkSession.builder.appName("Spark SQL basic example").enableHiveSupport().master("yarn").getOrCreate()
# specify .master("yarn")

sc = spark.sparkContext

In [3]:
import pyspark.sql.functions as F
from pyspark.sql.types import DecimalType

In [4]:
#Loading the parquet file from hdfs to perform data cleaning
df = spark.read.orc('/user/talentum/projectMaster/dataStaging/credit_card_transactions-ibm_v2.parquet')

In [5]:
#Removing '$' from the amount column and converting it to float data type
df_cleaning = df.withColumn('Amount', F.regexp_replace(F.col('Amount'), '[$,]', ''))

In [6]:
#Casting the Amount cloumn to Float(10,2)
df_cleaned = df_cleaning.withColumn(
    'Amount',
    F.col('Amount').cast(DecimalType(10, 2))
)

In [7]:
#Removing word Transaction from the field Use_Chip
df_cleaning = df_cleaning.withColumn('Use_Chip', F.regexp_replace(F.col('Use_Chip'), ' Transaction', ''))

In [8]:
#Replacing null with a placeholder 'N/A'
df_cleaning = df_cleaning.fillna('N/A', subset=['Errors?'])

In [9]:
#Casting Zip to string type
df_cleaning = df_cleaning.withColumn("Zip", 
    F.coalesce(
        # Robust conversion: Cast float to int to safely remove .0, then pad to 5 digits
        F.lpad(F.col("Zip").cast("float").cast("int").cast("string"), 5, "0"), 
        F.lit("00000") # Fill Null/Missing Zip codes with '00000'
    )
)

In [10]:
#creating a Date_str column to also have consolidated dates
df_cleaning = df_cleaning.withColumn(
    # Create the date string: YYYY-MM-DD
    "Date_Str",
    F.concat_ws("-",
        F.col("Year"),
        # Pad Month and Day to ensure 2 digits (e.g., 8 -> 08) for standard format
        F.lpad(F.col("Month").cast("string"), 2, "0"),
        F.lpad(F.col("Day").cast("string"), 2, "0")
    )
).withColumn(
    # Combine Date_Str and Time into a single TIMESTAMP object
    "Transaction_Timestamp",
    F.to_timestamp(F.concat(F.col("Date_Str"), F.lit(" "), F.col("Time")), "yyyy-MM-dd HH:mm")
).drop("Day", "Time", "Date_Str") # Drop original raw columns

#Converting merchant name to string from long
df_cleaning = df_cleaning.withColumn(
    "Merchant_Name", 
    F.col("Merchant_Name").cast("string")
)

# Iterate through every column in the DataFrame
for old_name in df_cleaning.columns:
    
    # 1. Clean the name (removes the tricky '?' from Errors? and Is_Fraud?)
    cleaned_name = old_name.replace('?', '')
    
    # 2. Lowercase the entire name (e.g., 'User' -> 'user', 'Merchant_Name' -> 'merchant_name')
    new_name = cleaned_name.lower()
    
    # 3. Apply the rename
    df_cleaning = df_cleaning.withColumnRenamed(old_name, new_name)

In [11]:
df_cleaning.show(5)

+----+----+----+-----+-------+--------+--------------------+--------------+--------------+-----+----+------+--------+---------------------+
|user|card|year|month| amount|use_chip|       merchant_name| merchant_city|merchant_state|  zip| mcc|errors|is_fraud|transaction_timestamp|
+----+----+----+-----+-------+--------+--------------------+--------------+--------------+-----+----+------+--------+---------------------+
| 591|   3|2016|    8|-415.00|    Chip| 4552887027432897467|       Oakland|            CA|94606|3596|   N/A|      No|  2016-08-13 10:54:00|
| 591|   3|2016|    8|  22.37|    Chip|-8964802287130046767|        Tucker|            GA|30084|7230|   N/A|      No|  2016-08-16 13:33:00|
| 591|   3|2016|    8|  10.87|    Chip|   97032797689821735|Southern Pines|            NC|28387|5411|   N/A|      No|  2016-08-19 14:38:00|
| 591|   3|2016|    8|  73.84|    Chip|-5401953891366957779|       Shannon|            NC|28386|5651|   N/A|      No|  2016-08-20 10:11:00|
| 591|   3|2016|    

In [12]:
df_cleaning.printSchema()

root
 |-- user: integer (nullable = true)
 |-- card: integer (nullable = true)
 |-- year: integer (nullable = true)
 |-- month: integer (nullable = true)
 |-- amount: string (nullable = true)
 |-- use_chip: string (nullable = true)
 |-- merchant_name: string (nullable = true)
 |-- merchant_city: string (nullable = true)
 |-- merchant_state: string (nullable = true)
 |-- zip: string (nullable = false)
 |-- mcc: integer (nullable = true)
 |-- errors: string (nullable = false)
 |-- is_fraud: string (nullable = true)
 |-- transaction_timestamp: timestamp (nullable = true)



In [13]:
df_final_prep = df_cleaning.withColumn(
    "amount",
    F.regexp_replace(F.col("amount").cast("string"), "[^0-9\\.\\-]", "").cast(DecimalType(10, 2))
)

# --- FIX 2: Explicit Column Selection & Renaming ---
# We map 'user' -> 'user_id' and 'card' -> 'card_id' to match the Hive Table Schema.
df_export = df_final_prep.select(
    F.col("user").alias("user_id"),       # <--- CRITICAL FIX
    F.col("card").alias("card_id"),       # <--- CRITICAL FIX
    F.col("amount"),
    F.col("use_chip"),
    F.col("merchant_name"),
    F.col("merchant_city"),
    F.col("merchant_state"),
    F.col("zip"),
    F.col("mcc"),
    F.col("errors"),
    F.col("is_fraud"),
    F.col("transaction_timestamp"),
    F.col("year"),   # Required for partitioning
    F.col("month")   # Required for partitioning
)

# Debug: Verify the schema matches Hive (user_id, card_id, etc.)
print("Final Schema for Export:")
df_export.printSchema()

# --- Save to Warehouse ---
df_export.write.mode("overwrite") \
    .partitionBy("year", "month") \
    .format("orc") \
    .save("/user/talentum/projectMaster/warehouseDir/transactions")

print('Job Done !!!! Transactions table saved with correct column names.')



Final Schema for Export:
root
 |-- user_id: integer (nullable = true)
 |-- card_id: integer (nullable = true)
 |-- amount: decimal(10,2) (nullable = true)
 |-- use_chip: string (nullable = true)
 |-- merchant_name: string (nullable = true)
 |-- merchant_city: string (nullable = true)
 |-- merchant_state: string (nullable = true)
 |-- zip: string (nullable = false)
 |-- mcc: integer (nullable = true)
 |-- errors: string (nullable = false)
 |-- is_fraud: string (nullable = true)
 |-- transaction_timestamp: timestamp (nullable = true)
 |-- year: integer (nullable = true)
 |-- month: integer (nullable = true)

Job Done !!!! Transactions table saved with correct column names.


In [14]:
check_df = spark.read.orc("/user/talentum/projectMaster/warehouseDir/transactions")
check_df.printSchema()

root
 |-- user_id: integer (nullable = true)
 |-- card_id: integer (nullable = true)
 |-- amount: decimal(10,2) (nullable = true)
 |-- use_chip: string (nullable = true)
 |-- merchant_name: string (nullable = true)
 |-- merchant_city: string (nullable = true)
 |-- merchant_state: string (nullable = true)
 |-- zip: string (nullable = true)
 |-- mcc: integer (nullable = true)
 |-- errors: string (nullable = true)
 |-- is_fraud: string (nullable = true)
 |-- transaction_timestamp: timestamp (nullable = true)
 |-- year: integer (nullable = true)
 |-- month: integer (nullable = true)

