In [1]:
# Initialization: expose the local Spark distribution to this kernel.
import os
import sys

spark_home = "/home/talentum/spark"
py_lib = spark_home + "/python/lib"

os.environ.update({
    "SPARK_HOME": spark_home,
    "PYLIB": py_lib,
    # Interpreter paths for Python 3; point both at /usr/bin/python2.7 to use Python 2.
    "PYSPARK_PYTHON": "/usr/bin/python3.6",
    "PYSPARK_DRIVER_PYTHON": "/usr/bin/python3",
    # NOTE: list whichever Spark packages you need, comma-separated, after --packages.
    # Currently spark-xml 0.6.0 plus spark-avro 2.4.3; earlier variants of this
    # setting used spark-xml alone, spark-avro 2.4.0 alone, or both with avro 2.4.0.
    "PYSPARK_SUBMIT_ARGS": "--packages com.databricks:spark-xml_2.11:0.6.0,org.apache.spark:spark-avro_2.11:2.4.3 pyspark-shell",
})

# Prepend Spark's bundled archives to the import path: pyspark.zip ends up
# first, immediately followed by the py4j source archive.
sys.path[:0] = [py_lib + "/pyspark.zip", py_lib + "/py4j-0.10.7-src.zip"]

In [2]:
# Spark 2.x entry point: create (or reuse) a Hive-enabled SparkSession.
from pyspark.sql import SparkSession

# To run on YARN, add .master("yarn") to this chain before getOrCreate().
spark = (
    SparkSession.builder
    .appName("Cards Data Cleaning")
    .enableHiveSupport()
    .getOrCreate()
)

# Handle to the SparkContext that backs the session.
sc = spark.sparkContext

In [3]:
# Read the staged cards dataset (Parquet) from the HDFS staging directory.
df = spark.read.format("parquet").load(
    "/user/talentum/projectMaster/dataStaging/sd254_cards.parquet"
)

In [4]:
# Peek at the first five records; they come back as a list of Row objects.
df.take(5)

[Row(User=0, CARD_INDEX=0, Card_Brand='Visa', Card_Type='Debit', Card_Number='4344676511950444', Expires='12/2022', CVV='623', Has_Chip='YES', Cards_Issued=2, Credit_Limit='$24295', Acct_Open_Date='09/2002', Year_PIN_Last_Changed=2008, Card_on_Dark_Web='No'),
 Row(User=0, CARD_INDEX=1, Card_Brand='Visa', Card_Type='Debit', Card_Number='4956965974959986', Expires='12/2020', CVV='393', Has_Chip='YES', Cards_Issued=2, Credit_Limit='$21968', Acct_Open_Date='04/2014', Year_PIN_Last_Changed=2014, Card_on_Dark_Web='No'),
 Row(User=0, CARD_INDEX=2, Card_Brand='Visa', Card_Type='Debit', Card_Number='4582313478255491', Expires='02/2024', CVV='719', Has_Chip='YES', Cards_Issued=2, Credit_Limit='$46414', Acct_Open_Date='07/2003', Year_PIN_Last_Changed=2004, Card_on_Dark_Web='No'),
 Row(User=0, CARD_INDEX=3, Card_Brand='Visa', Card_Type='Credit', Card_Number='4879494103069057', Expires='08/2024', CVV='693', Has_Chip='NO', Cards_Issued=1, Credit_Limit='$12400', Acct_Open_Date='01/2003', Year_PIN_Las

In [5]:
import pyspark.sql.functions as F

In [6]:
# Strip every literal '$' from Credit_Limit.
# The column is still a string afterwards; no numeric cast is applied here.
df_cleaning = df.withColumn(
    "Credit_Limit",
    F.regexp_replace("Credit_Limit", r"\$", ""),
)

In [7]:
# Preview the frame after the Credit_Limit clean-up (first 20 rows, long values truncated).
df_cleaning.show(n=20, truncate=True)

+----+----------+----------+---------------+----------------+-------+---+--------+------------+------------+--------------+---------------------+----------------+
|User|CARD_INDEX|Card_Brand|      Card_Type|     Card_Number|Expires|CVV|Has_Chip|Cards_Issued|Credit_Limit|Acct_Open_Date|Year_PIN_Last_Changed|Card_on_Dark_Web|
+----+----------+----------+---------------+----------------+-------+---+--------+------------+------------+--------------+---------------------+----------------+
|   0|         0|      Visa|          Debit|4344676511950444|12/2022|623|     YES|           2|       24295|       09/2002|                 2008|              No|
|   0|         1|      Visa|          Debit|4956965974959986|12/2020|393|     YES|           2|       21968|       04/2014|                 2014|              No|
|   0|         2|      Visa|          Debit|4582313478255491|02/2024|719|     YES|           2|       46414|       07/2003|                 2004|              No|
|   0|         3|     

In [8]:
# Break Acct_Open_Date (values such as "09/2002") on "/" into separate month
# and year columns, then discard the original column.
# Both new columns hold the split text as-is (e.g. "09"), not integers.
acct_open_parts = F.split(F.col("Acct_Open_Date"), "/")
df_cleaning = (
    df_cleaning
    .withColumn("Acct_Opened_month", acct_open_parts.getItem(0))
    .withColumn("Acct_Opened_year", acct_open_parts.getItem(1))
    .drop("Acct_Open_Date")
)

In [9]:
# Preview the frame now that Acct_Open_Date has been replaced by
# Acct_Opened_month / Acct_Opened_year (first 20 rows, long values truncated).
df_cleaning.show(n=20, truncate=True)

+----+----------+----------+---------------+----------------+-------+---+--------+------------+------------+---------------------+----------------+-----------------+----------------+
|User|CARD_INDEX|Card_Brand|      Card_Type|     Card_Number|Expires|CVV|Has_Chip|Cards_Issued|Credit_Limit|Year_PIN_Last_Changed|Card_on_Dark_Web|Acct_Opened_month|Acct_Opened_year|
+----+----------+----------+---------------+----------------+-------+---+--------+------------+------------+---------------------+----------------+-----------------+----------------+
|   0|         0|      Visa|          Debit|4344676511950444|12/2022|623|     YES|           2|       24295|                 2008|              No|               09|            2002|
|   0|         1|      Visa|          Debit|4956965974959986|12/2020|393|     YES|           2|       21968|                 2014|              No|               04|            2014|
|   0|         2|      Visa|          Debit|4582313478255491|02/2024|719|     YES|   

In [10]:
#splitting the Expires field in month and year col
df_cleaning = df_cleaning.withColumn(
    "Expires", 
    F.split(F.col("Expires"), "/") 
).withColumn(
    "Expires_month", 
    F.col("Expires").getItem(0) 
).withColumn(
    "Expires_year", 
    F.col("Expires").getItem(1)
).drop("Expires")

In [11]:
# Preview the frame now that Expires has been replaced by
# Expires_month / Expires_year (first 20 rows, long values truncated).
df_cleaning.show(n=20, truncate=True)

+----+----------+----------+---------------+----------------+---+--------+------------+------------+---------------------+----------------+-----------------+----------------+-------------+------------+
|User|CARD_INDEX|Card_Brand|      Card_Type|     Card_Number|CVV|Has_Chip|Cards_Issued|Credit_Limit|Year_PIN_Last_Changed|Card_on_Dark_Web|Acct_Opened_month|Acct_Opened_year|Expires_month|Expires_year|
+----+----------+----------+---------------+----------------+---+--------+------------+------------+---------------------+----------------+-----------------+----------------+-------------+------------+
|   0|         0|      Visa|          Debit|4344676511950444|623|     YES|           2|       24295|                 2008|              No|               09|            2002|           12|        2022|
|   0|         1|      Visa|          Debit|4956965974959986|393|     YES|           2|       21968|                 2014|              No|               04|            2014|           12|    

In [12]:
# Persist the cleaned cards data to warehouseDir so Hive can use it for OLAP.
# Anything already at the target path is overwritten.
df_cleaning.write.parquet(
    '/user/talentum/projectMaster/warehouseDir/cards',
    mode='overwrite',
)
print('Job Done!!!')

Job Done!!!
