In [1]:
# Initialization
import os
import sys

# Spark installation directory and the Python libraries shipped with it.
spark_home = "/home/talentum/spark"
py_lib = spark_home + "/python/lib"
os.environ["SPARK_HOME"] = spark_home
os.environ["PYLIB"] = py_lib

# Python interpreters for PySpark; switch both to /usr/bin/python2.7 to use Python 2.
os.environ["PYSPARK_PYTHON"] = "/usr/bin/python3.6"
os.environ["PYSPARK_DRIVER_PYTHON"] = "/usr/bin/python3"

# Prepend the bundled pyspark and py4j archives to the import path
# (pyspark.zip ends up first, py4j second).
sys.path[:0] = [
    py_lib + "/pyspark.zip",
    py_lib + "/py4j-0.10.7-src.zip",
]

# NOTE: List whichever extra packages you need here (comma-separated after --packages).
# Currently requested: spark-xml 0.6.0 and spark-avro 2.4.3.
os.environ['PYSPARK_SUBMIT_ARGS'] = '--packages com.databricks:spark-xml_2.11:0.6.0,org.apache.spark:spark-avro_2.11:2.4.3 pyspark-shell'

In [None]:
# Spark 2.x entry point: build (or reuse) a Hive-enabled SparkSession.
from pyspark.sql import SparkSession

spark = (
    SparkSession.builder
    .appName("User Data Cleaning")
    .enableHiveSupport()
    .getOrCreate()
)

# To run on YARN instead, add .master("yarn") to the builder chain, e.g.:
# spark = SparkSession.builder.appName("Spark SQL basic example").enableHiveSupport().master("yarn").getOrCreate()

# Handle to the underlying SparkContext.
sc = spark.sparkContext

In [None]:
import pyspark.sql.functions as F
from pyspark.sql.types import DecimalType

In [None]:
# Read the staged users parquet file from HDFS; this is the input to the cleaning steps.
df = spark.read.parquet(
    '/user/talentum/projectMaster/dataStaging/sd254_users.parquet'
)

In [None]:
# Print the column names and types of the freshly loaded DataFrame.
df.printSchema()

In [None]:
# Strip '$' and ',' characters from Per_Capita_Income_Zipcode.
# No type conversion happens here; the decimal cast is done in a later cell.
df_cleaned = df.withColumn(
    'Per_Capita_Income_Zipcode',
    F.regexp_replace(F.col('Per_Capita_Income_Zipcode'), '[$,]', ''),
)

In [None]:
# Strip '$' and ',' characters from Yearly_Income_Person.
# No type conversion happens here; the decimal cast is done in a later cell.
df_cleaned = df_cleaned.withColumn(
    'Yearly_Income_Person',
    F.regexp_replace(F.col('Yearly_Income_Person'), '[$,]', ''),
)

In [None]:
# Strip '$' and ',' characters from Total_Debt.
# No type conversion happens here; the decimal cast is done in a later cell.
df_cleaned = df_cleaned.withColumn(
    'Total_Debt',
    F.regexp_replace(F.col('Total_Debt'), '[$,]', ''),
)

In [None]:
# Cast the Per_Capita_Income_Zipcode column to Decimal(10, 2).
per_capita_income_decimal = F.col('Per_Capita_Income_Zipcode').cast(DecimalType(10, 2))
df_cleaned = df_cleaned.withColumn('Per_Capita_Income_Zipcode', per_capita_income_decimal)

In [None]:
# Cast the Yearly_Income_Person column to Decimal(10, 2).
yearly_income_decimal = F.col('Yearly_Income_Person').cast(DecimalType(10, 2))
df_cleaned = df_cleaned.withColumn('Yearly_Income_Person', yearly_income_decimal)

In [None]:
# Cast the Total_Debt column to Decimal(10, 2).
total_debt_decimal = F.col('Total_Debt').cast(DecimalType(10, 2))
df_cleaned = df_cleaned.withColumn('Total_Debt', total_debt_decimal)

In [None]:
# Store Zipcode as a string column.
df_cleaned = df_cleaned.withColumn(
    "Zipcode",
    F.col("Zipcode").cast("string"),
)

In [None]:
# Convert all column names to lower case. Mixed-case names caused problems when
# creating a partitioned Hive table, because Hive expects lower-case keys.

current_columns = df_cleaned.columns

new_lowercase_columns = [name.lower() for name in current_columns]

# toDF() takes the new names positionally and returns a DataFrame with the
# columns renamed. It is called on df_cleaned, the frame produced by the
# cleaning cells above.
df_cleaning_fixed = df_cleaned.toDF(*new_lowercase_columns)

# Show the resulting schema to confirm the renamed columns.
df_cleaning_fixed.printSchema()

In [None]:
# Display the first 5 rows of the lowercase-renamed DataFrame.
df_cleaning_fixed.show(5)

In [None]:
# Persist the cleaned data as parquet under warehouseDir, replacing any previous
# output, so the Hive warehouse can use it for OLAP queries.
(
    df_cleaning_fixed.write
    .mode("overwrite")
    .parquet('/user/talentum/projectMaster/warehouseDir/users')
)
print('Job Done !!!!')