In [1]:
# Initialization: point this kernel at the local Spark install before pyspark is imported.
import os
import sys

spark_home = "/home/talentum/spark"
pyspark_lib = spark_home + "/python/lib"

# NOTE: list whichever extra packages you want here; they are passed to --packages below.
# Alternatives used previously: only spark-xml_2.11:0.6.0, only spark-avro_2.11:2.4.0,
# or spark-xml together with spark-avro_2.11:2.4.0 instead of 2.4.3.
spark_packages = [
    "com.databricks:spark-xml_2.11:0.6.0",
    "org.apache.spark:spark-avro_2.11:2.4.3",
]

os.environ.update({
    "SPARK_HOME": spark_home,
    "PYLIB": pyspark_lib,
    # Use /usr/bin/python2.7 for the two interpreter entries below if you want Python 2.
    # NOTE(review): the two entries name different interpreter paths (python3.6 vs python3);
    # presumably both resolve to the same version on this machine - confirm.
    "PYSPARK_PYTHON": "/usr/bin/python3.6",
    "PYSPARK_DRIVER_PYTHON": "/usr/bin/python3",
    "PYSPARK_SUBMIT_ARGS": "--packages " + ",".join(spark_packages) + " pyspark-shell",
})

# Prepend the bundled archives to the import path: pyspark.zip ends up first,
# immediately followed by the py4j sources.
sys.path[:0] = [
    pyspark_lib + "/pyspark.zip",
    pyspark_lib + "/py4j-0.10.7-src.zip",
]

In [2]:
# Entry point (Spark 2.x API): build, or reuse, a Hive-enabled SparkSession.
from pyspark.sql import SparkSession

# To run on YARN, add .master("yarn") to the builder chain, e.g.
# SparkSession.builder.appName("Spark SQL basic example").enableHiveSupport().master("yarn").getOrCreate()
spark = (
    SparkSession.builder
    .appName("User Data Cleaning")
    .enableHiveSupport()
    .getOrCreate()
)

# SparkContext belonging to the session above.
sc = spark.sparkContext

In [3]:
import pyspark.sql.functions as F
from pyspark.sql.types import DecimalType

In [4]:
# Read the staged users Parquet dataset from HDFS; the cleaning cells below start from `df`.
users_staging_path = '/user/talentum/projectMaster/dataStaging/sd254_users.parquet'
df = spark.read.parquet(users_staging_path)

In [5]:
# Print the column names and types of the loaded frame before any cleaning is applied.
df.printSchema()

root
 |-- Person_ID: string (nullable = true)
 |-- Current_Age: integer (nullable = true)
 |-- Retirement_Age: integer (nullable = true)
 |-- Birth_Year: integer (nullable = true)
 |-- Birth_Month: integer (nullable = true)
 |-- Gender: string (nullable = true)
 |-- Address: string (nullable = true)
 |-- Apartment: string (nullable = true)
 |-- City: string (nullable = true)
 |-- State: string (nullable = true)
 |-- Zipcode: string (nullable = true)
 |-- Latitude: double (nullable = true)
 |-- Longitude: double (nullable = true)
 |-- Per_Capita_Income_Zipcode: string (nullable = true)
 |-- Yearly_Income_Person: string (nullable = true)
 |-- Total_Debt: string (nullable = true)
 |-- FICO_Score: integer (nullable = true)
 |-- Num_Credit_Cards: integer (nullable = true)



In [6]:
# Clean the three currency columns in one place instead of six copy-pasted cells.
# Each column arrives as a string (see the schema printed above); the '$' and ','
# characters are stripped and the remainder is cast to DecimalType(10, 2).
# The columns are decimals after this cell, not floats.
MONEY_COLUMNS = (
    'Per_Capita_Income_Zipcode',
    'Yearly_Income_Person',
    'Total_Debt',
)


def strip_and_cast_money(frame, column_name):
    """Return `frame` with `column_name` converted from currency text to a decimal.

    Args:
        frame: Spark DataFrame containing `column_name`.
        column_name: name of the column to clean; it is replaced in place, so the
            column keeps its name.

    Returns:
        A new DataFrame in which every '$' and ',' character has been removed from
        `column_name` and the result has been cast to DecimalType(10, 2).
    """
    # '[$,]' is a character class: it matches a literal '$' or ',' anywhere in the value.
    stripped = F.regexp_replace(F.col(column_name), '[$,]', '')
    # NOTE(review): values that still fail to parse after stripping presumably become
    # null under Spark's default cast behaviour - confirm if that matters downstream.
    return frame.withColumn(column_name, stripped.cast(DecimalType(10, 2)))


# Start from the untouched input frame so re-running this cell gives the same result.
df_cleaned = df
for money_column in MONEY_COLUMNS:
    df_cleaned = strip_and_cast_money(df_cleaned, money_column)

In [12]:
# Keep Zipcode as text. The schema printed above already reports it as string,
# so this cast only guards against the input type changing.
df_cleaned = df_cleaned.withColumn(
    "Zipcode",
    F.col("Zipcode").cast("string"),
)

In [13]:
# Preview the first five rows of the cleaned frame.
df_cleaned.show(5)

+--------------+-----------+--------------+----------+-----------+------+--------------------+---------+-------------+-----+-------+--------+---------+-------------------------+--------------------+----------+----------+----------------+
|     Person_ID|Current_Age|Retirement_Age|Birth_Year|Birth_Month|Gender|             Address|Apartment|         City|State|Zipcode|Latitude|Longitude|Per_Capita_Income_Zipcode|Yearly_Income_Person|Total_Debt|FICO_Score|Num_Credit_Cards|
+--------------+-----------+--------------+----------+-----------+------+--------------------+---------+-------------+-----+-------+--------+---------+-------------------------+--------------------+----------+----------+----------------+
|Hazel Robinson|         53|            66|      1966|         11|Female|       462 Rose Lane|     null|     La Verne|   CA|  91750|   34.15|  -117.76|                 29278.00|            59696.00| 127613.00|       787|               5|
|    Sasha Sadr|         53|            68|     

In [14]:
# Save the cleaned frame as Parquet under the warehouse directory, to be used by the
# Hive warehouse for OLAP; "overwrite" mode replaces whatever is already at that path.
warehouse_users_path = '/user/talentum/projectMaster/warehouseDir/users'
users_writer = df_cleaned.write.mode("overwrite")
users_writer.parquet(warehouse_users_path)
print('Job Done !!!!')

Job Done !!!!
