In [1]:
# Initialization
import os
import sys

# Point this kernel at the local Spark install and name the Python interpreters
# PySpark should use. For Python 2, use /usr/bin/python2.7 for both PYSPARK_* entries.
# NOTE(review): the worker entry names python3.6 while the driver entry names python3;
# presumably both resolve to the same minor version on this machine -- confirm.
os.environ.update({
    "SPARK_HOME": "/home/talentum/spark",
    "PYSPARK_PYTHON": "/usr/bin/python3.6",
    "PYSPARK_DRIVER_PYTHON": "/usr/bin/python3",
})
os.environ["PYLIB"] = os.environ["SPARK_HOME"] + "/python/lib"

# Put the bundled archives at the front of the import path:
# pyspark.zip ends up first, the Py4J archive right after it.
sys.path[:0] = [
    os.environ["PYLIB"] + "/pyspark.zip",
    os.environ["PYLIB"] + "/py4j-0.10.7-src.zip",
]

# NOTE: List every extra package you need, comma-separated, after --packages.
# Other combinations that were tried:
#   '--packages com.databricks:spark-xml_2.11:0.6.0 pyspark-shell'
#   '--packages org.apache.spark:spark-avro_2.11:2.4.0 pyspark-shell'
#   '--packages com.databricks:spark-xml_2.11:0.6.0,org.apache.spark:spark-avro_2.11:2.4.0 pyspark-shell'
os.environ['PYSPARK_SUBMIT_ARGS'] = '--packages com.databricks:spark-xml_2.11:0.6.0,org.apache.spark:spark-avro_2.11:2.4.3 pyspark-shell'

In [None]:
# Spark 2.x entry point: build (or reuse) a Hive-enabled session for this job.
from pyspark.sql import SparkSession

spark = (
    SparkSession.builder
    .appName("Cards Data Cleaning")
    .enableHiveSupport()
    .getOrCreate()
)

# To run on YARN, add .master("yarn") to the builder chain, e.g.:
# spark = SparkSession.builder.appName("Spark SQL basic example").enableHiveSupport().master("yarn").getOrCreate()

# SparkContext backing the session.
sc = spark.sparkContext

In [None]:
# Read the staged cards dataset (Parquet, stored on HDFS) into a DataFrame.
df = spark.read.format("parquet").load("/user/talentum/projectMaster/dataStaging/sd254_cards.parquet")

In [None]:
# Peek at the first five records of the loaded file.
df.take(5)

In [None]:
import pyspark.sql.functions as F
from pyspark.sql.types import DecimalType, IntegerType

In [None]:
# Strip the '$' and ',' characters from Credit_Limit, then store it as DECIMAL(10, 2).
df_cleaning = df.withColumn(
    "Credit_Limit",
    F.regexp_replace("Credit_Limit", "[$,]", "").cast(DecimalType(precision=10, scale=2)),
)

In [None]:
# Preview the frame after the Credit_Limit cleanup (default 20 rows, long values truncated).
df_cleaning.show(n=20, truncate=True)

In [None]:
# Split Acct_Open_Date on "/": the first piece becomes the integer month column,
# the second the integer year column; the source column is then dropped.
df_cleaning = (
    df_cleaning
    .withColumn("Acct_Open_Date", F.split("Acct_Open_Date", "/"))
    .withColumn("Acct_Opened_month", F.col("Acct_Open_Date")[0].cast(IntegerType()))
    .withColumn("Acct_Opened_year", F.col("Acct_Open_Date")[1].cast(IntegerType()))
    .drop("Acct_Open_Date")
)

In [None]:
# Preview the frame after splitting Acct_Open_Date (default 20 rows, long values truncated).
df_cleaning.show(n=20, truncate=True)

In [None]:
# Split Expires on "/": the first piece becomes the integer month column,
# the second the integer year column; the source column is then dropped.
df_cleaning = (
    df_cleaning
    .withColumn("Expires", F.split("Expires", "/"))
    .withColumn("Expires_month", F.col("Expires")[0].cast(IntegerType()))
    .withColumn("Expires_year", F.col("Expires")[1].cast(IntegerType()))
    .drop("Expires")
)

In [None]:
# Replace missing Credit_Limit values with 0.
df_cleaning = df_cleaning.na.fill({'Credit_Limit': 0.0})

In [None]:
# Preview the frame after the Expires split and null fill (default 20 rows, long values truncated).
df_cleaning.show(n=20, truncate=True)

In [None]:
# Convert every column name to lower case: creating a partitioned Hive table from
# this data was failing because Hive expects lower-case keys.
current_columns = df_cleaning.columns
new_lowercase_columns = [name.lower() for name in current_columns]

# toDF() takes the new names as separate positional arguments, hence the * unpacking.
df_cleaning_fixed = df_cleaning.toDF(*new_lowercase_columns)

# Print the schema to confirm the renamed columns.
df_cleaning_fixed.printSchema()

In [None]:
# Preview the frame with lower-cased column names (default 20 rows, long values truncated).
df_cleaning_fixed.show(n=20, truncate=True)

In [None]:
# Save the cleaned data as Parquet under warehouseDir so Hive can use it for OLAP;
# any existing output at that path is overwritten.
df_cleaning_fixed.write.parquet(
    '/user/talentum/projectMaster/warehouseDir/cards',
    mode='overwrite',
)
print('Job Done!!!')