In [1]:
# Intialization
import os
import sys

os.environ["SPARK_HOME"] = "/home/talentum/spark"
os.environ["PYLIB"] = os.environ["SPARK_HOME"] + "/python/lib"
# In below two lines, use /usr/bin/python2.7 if you want to use Python 2
os.environ["PYSPARK_PYTHON"] = "/usr/bin/python3.6" 
os.environ["PYSPARK_DRIVER_PYTHON"] = "/usr/bin/python3"
sys.path.insert(0, os.environ["PYLIB"] +"/py4j-0.10.7-src.zip")
sys.path.insert(0, os.environ["PYLIB"] +"/pyspark.zip")

# NOTE: Whichever package you want mention here.
# os.environ['PYSPARK_SUBMIT_ARGS'] = '--packages com.databricks:spark-xml_2.11:0.6.0 pyspark-shell' 
# os.environ['PYSPARK_SUBMIT_ARGS'] = '--packages org.apache.spark:spark-avro_2.11:2.4.0 pyspark-shell'
os.environ['PYSPARK_SUBMIT_ARGS'] = '--packages com.databricks:spark-xml_2.11:0.6.0,org.apache.spark:spark-avro_2.11:2.4.3 pyspark-shell'
# os.environ['PYSPARK_SUBMIT_ARGS'] = '--packages com.databricks:spark-xml_2.11:0.6.0,org.apache.spark:spark-avro_2.11:2.4.0 pyspark-shell'

In [2]:
#Entrypoint 2.x
from pyspark.sql import SparkSession
spark = SparkSession.builder.appName("Transaction Data Cleaning").enableHiveSupport().getOrCreate()

# On yarn:
# spark = SparkSession.builder.appName("Spark SQL basic example").enableHiveSupport().master("yarn").getOrCreate()
# specify .master("yarn")

sc = spark.sparkContext

In [3]:
import pyspark.sql.functions as F

In [4]:
#Loading the parquet file from hdfs to perform data cleaning
df = spark.read.parquet('/user/talentum/projectMaster/dataStaging/credit_card_transactions-ibm_v2.parquet')

In [5]:
#Removing '$' from the amount column and converting it to float data type
df_cleaning = df.withColumn('Amount', F.regexp_replace(F.col('Amount'), '\\$', ''))

In [6]:
#Removing word Transaction from the field Use_Chip
df_cleaning = df_cleaning.withColumn('Use_Chip', F.regexp_replace(F.col('Use_Chip'), ' Transaction', ''))

In [7]:
#Replacing null with a placeholder 'N/A'
df_cleaning = df_cleaning.fillna('N/A', subset=['Errors?'])

In [8]:
df_cleaning.show(5)

+----+----+----+-----+---+-----+-------+--------+--------------------+--------------+--------------+-------+----+-------+---------+
|User|Card|Year|Month|Day| Time| Amount|Use_Chip|       Merchant_Name| Merchant_City|Merchant_State|    Zip| MCC|Errors?|Is_Fraud?|
+----+----+----+-----+---+-----+-------+--------+--------------------+--------------+--------------+-------+----+-------+---------+
| 591|   3|2016|    8| 13|10:54|-415.00|    Chip| 4552887027432897467|       Oakland|            CA|94606.0|3596|    N/A|       No|
| 591|   3|2016|    8| 16|13:33|  22.37|    Chip|-8964802287130046767|        Tucker|            GA|30084.0|7230|    N/A|       No|
| 591|   3|2016|    8| 19|14:38|  10.87|    Chip|   97032797689821735|Southern Pines|            NC|28387.0|5411|    N/A|       No|
| 591|   3|2016|    8| 20|10:11|  73.84|    Chip|-5401953891366957779|       Shannon|            NC|28386.0|5651|    N/A|       No|
| 591|   3|2016|    8| 20|15:32|  38.50|    Chip|-2472481739355111587|   Sai

In [11]:
#saving this cleaned parquet to a warehouseDir to be used by hive warehouse for OLAP purpose
df_cleaning.write.mode("overwrite").parquet('/user/talentum/projectMaster/warehouseDir/transactions')
print('Job Done !!!!')

Job Done !!!!
