In [1]:
# Intialization
import os
import sys

os.environ["SPARK_HOME"] = "/home/talentum/spark"
os.environ["PYLIB"] = os.environ["SPARK_HOME"] + "/python/lib"
# In below two lines, use /usr/bin/python2.7 if you want to use Python 2
os.environ["PYSPARK_PYTHON"] = "/usr/bin/python3.6" 
os.environ["PYSPARK_DRIVER_PYTHON"] = "/usr/bin/python3"
sys.path.insert(0, os.environ["PYLIB"] +"/py4j-0.10.7-src.zip")
sys.path.insert(0, os.environ["PYLIB"] +"/pyspark.zip")

# NOTE: Whichever package you want mention here.
# os.environ['PYSPARK_SUBMIT_ARGS'] = '--packages com.databricks:spark-xml_2.11:0.6.0 pyspark-shell' 
# os.environ['PYSPARK_SUBMIT_ARGS'] = '--packages org.apache.spark:spark-avro_2.11:2.4.0 pyspark-shell'
os.environ['PYSPARK_SUBMIT_ARGS'] = '--packages com.databricks:spark-xml_2.11:0.6.0,org.apache.spark:spark-avro_2.11:2.4.3 pyspark-shell'
# os.environ['PYSPARK_SUBMIT_ARGS'] = '--packages com.databricks:spark-xml_2.11:0.6.0,org.apache.spark:spark-avro_2.11:2.4.0 pyspark-shell'

In [2]:
#Entrypoint 2.x
from pyspark.sql import SparkSession
spark = SparkSession.builder.appName("Cards Data Cleaning").enableHiveSupport().getOrCreate()

# On yarn:
# spark = SparkSession.builder.appName("Spark SQL basic example").enableHiveSupport().master("yarn").getOrCreate()
# specify .master("yarn")

sc = spark.sparkContext

In [3]:
#Loading the parquet file from the hdfs file system
df=spark.read.orc("/user/talentum/projectMaster/dataStaging/sd254_cards.parquet")

In [4]:
#Viewing the first few records from the file
df.head(5)

[Row(User=0, CARD_INDEX=0, Card_Brand='Visa', Card_Type='Debit', Card_Number='4344676511950444', Expires='12/2022', CVV='623', Has_Chip='YES', Cards_Issued=2, Credit_Limit='$24295', Acct_Open_Date='09/2002', Year_PIN_Last_Changed=2008, Card_on_Dark_Web='No'),
 Row(User=0, CARD_INDEX=1, Card_Brand='Visa', Card_Type='Debit', Card_Number='4956965974959986', Expires='12/2020', CVV='393', Has_Chip='YES', Cards_Issued=2, Credit_Limit='$21968', Acct_Open_Date='04/2014', Year_PIN_Last_Changed=2014, Card_on_Dark_Web='No'),
 Row(User=0, CARD_INDEX=2, Card_Brand='Visa', Card_Type='Debit', Card_Number='4582313478255491', Expires='02/2024', CVV='719', Has_Chip='YES', Cards_Issued=2, Credit_Limit='$46414', Acct_Open_Date='07/2003', Year_PIN_Last_Changed=2004, Card_on_Dark_Web='No'),
 Row(User=0, CARD_INDEX=3, Card_Brand='Visa', Card_Type='Credit', Card_Number='4879494103069057', Expires='08/2024', CVV='693', Has_Chip='NO', Cards_Issued=1, Credit_Limit='$12400', Acct_Open_Date='01/2003', Year_PIN_Las

In [5]:
import pyspark.sql.functions as F
from pyspark.sql.types import DecimalType, IntegerType

In [6]:
from pyspark.sql.types import DecimalType, IntegerType, StringType, DoubleType,LongType
from pyspark.sql import functions as F

# Define the strict cleaning logic 
def clean_and_cast_decimal(col_name, scale=2):
    return F.regexp_replace(
        F.col(col_name).cast(StringType()), 
        "[^0-9\\.]", "" 
    ).cast(DecimalType(10, scale))

# FIX: Use column names exactly as they appear in the error message (Underscores!)
df_cleaning_fixed = df.select(
    F.col("User").cast(IntegerType()).alias("user"),
    # Error said 'CARD_INDEX', so we use that
    F.col("CARD_INDEX").cast(IntegerType()).alias("card_index"), 
    F.col("Card_Brand").alias("card_brand"),
    F.col("Card_Type").alias("card_type"),
    F.col("Card_Number").cast(LongType()).alias("card_number"),
    F.col("Expires").alias("expires"),
    F.col("CVV").cast(IntegerType()).alias("cvv"),
    F.col("Has_Chip").alias("has_chip"),
    F.col("Cards_Issued").cast(IntegerType()).alias("cards_issued"),
    
    # FIX: Use "Credit_Limit" (underscore)
    clean_and_cast_decimal("Credit_Limit").alias("credit_limit"),
    
    F.col("Year_PIN_Last_Changed").cast(IntegerType()).alias("year_pin_last_changed"),
    F.col("Card_on_Dark_Web").alias("card_on_dark_web"),
    
    # Fix dates using Underscores: Acct_Open_Date
    F.month(F.to_date(F.col("Acct_Open_Date"), "MM/yyyy")).cast(IntegerType()).alias("acct_opened_month"),
    F.year(F.to_date(F.col("Acct_Open_Date"), "MM/yyyy")).cast(IntegerType()).alias("acct_opened_year"),
    
    F.month(F.to_date(F.col("Expires"), "MM/yyyy")).cast(IntegerType()).alias("expires_month"),
    F.year(F.to_date(F.col("Expires"), "MM/yyyy")).cast(IntegerType()).alias("expires_year")
)

# Save logic (Cell 15)
# df_cleaning_fixed.write.mode('overwrite').orc('/user/talentum/projectMaster/warehouseDir/cards')

In [7]:
#saving the cleaned data to warehouseDir to be used hive for OLAP
df_cleaning_fixed.write.mode('overwrite').orc('/user/talentum/projectMaster/warehouseDir/cards')
print('Job Done!!!')

Job Done!!!
