In [1]:
# Initialization: point this kernel at the local Spark install and make the
# bundled pyspark / py4j archives importable before pyspark is first imported.
import os
import sys

# Interpreter paths: swap both for /usr/bin/python2.7 to run on Python 2.
# NOTE(review): worker uses python3.6 while the driver uses python3 -- confirm
# both resolve to the same minor version on this machine.
os.environ.update(
    SPARK_HOME="/home/talentum/spark",
    PYSPARK_PYTHON="/usr/bin/python3.6",
    PYSPARK_DRIVER_PYTHON="/usr/bin/python3",
)
os.environ["PYLIB"] = "{}/python/lib".format(os.environ["SPARK_HOME"])

# Prepend both archives in one step; pyspark.zip ends up ahead of py4j.
sys.path[:0] = [
    os.environ["PYLIB"] + "/pyspark.zip",
    os.environ["PYLIB"] + "/py4j-0.10.7-src.zip",
]

# Extra JVM packages fetched when the session starts (comma-separated list).
# Currently: spark-xml 0.6.0 and spark-avro 2.4.3, both built for Scala 2.11.
# Edit the list below to add or drop packages; keep the trailing "pyspark-shell".
os.environ["PYSPARK_SUBMIT_ARGS"] = (
    "--packages "
    "com.databricks:spark-xml_2.11:0.6.0,"
    "org.apache.spark:spark-avro_2.11:2.4.3 "
    "pyspark-shell"
)

In [2]:
# Entry point (Spark 2.x API): one Hive-enabled session for the whole notebook.
from pyspark.sql import SparkSession

# To run on YARN instead of the default master, add `.master("yarn")`
# to the builder chain before `.getOrCreate()`.
spark = (
    SparkSession.builder
    .appName("User Data Cleaning")
    .enableHiveSupport()
    .getOrCreate()
)

# Underlying SparkContext, kept under the conventional short name.
sc = spark.sparkContext

In [3]:
import pyspark.sql.functions as F
from pyspark.sql.types import DecimalType

In [4]:
# Load the staged users dataset from HDFS for cleaning.
# NOTE(review): the path ends in `.parquet` but the data is read with the ORC
# reader; the read succeeds, so the files are presumably ORC-encoded -- confirm
# and consider renaming the staging directory.
df = spark.read.format("orc").load(
    '/user/talentum/projectMaster/dataStaging/sd254_users.parquet'
)

In [5]:
# Inspect the schema as loaded from staging. The three monetary columns
# (Per_Capita_Income_Zipcode, Yearly_Income_Person, Total_Debt) arrive as
# strings, which is what the cleaning step below converts to decimals.
df.printSchema()

root
 |-- Person_ID: string (nullable = true)
 |-- Current_Age: integer (nullable = true)
 |-- Retirement_Age: integer (nullable = true)
 |-- Birth_Year: integer (nullable = true)
 |-- Birth_Month: integer (nullable = true)
 |-- Gender: string (nullable = true)
 |-- Address: string (nullable = true)
 |-- Apartment: string (nullable = true)
 |-- City: string (nullable = true)
 |-- State: string (nullable = true)
 |-- Zipcode: string (nullable = true)
 |-- Latitude: double (nullable = true)
 |-- Longitude: double (nullable = true)
 |-- Per_Capita_Income_Zipcode: string (nullable = true)
 |-- Yearly_Income_Person: string (nullable = true)
 |-- Total_Debt: string (nullable = true)
 |-- FICO_Score: integer (nullable = true)
 |-- Num_Credit_Cards: integer (nullable = true)



In [6]:
from pyspark.sql.types import DecimalType, IntegerType, StringType, DoubleType
from pyspark.sql import functions as F

# Define the strict cleaning logic
def clean_and_cast_decimal(col_name, precision=10, scale=2):
    """
    Strip formatting characters from a column and cast the result to Decimal.

    The column is first cast to string, then every character that is not a
    digit, '.' or '-' is removed (currency symbols, thousands separators,
    whitespace, ...), and the remaining text is cast to
    DecimalType(precision, scale).

    Args:
        col_name: Name of the source column.
        precision: Total number of digits in the target decimal (default 10).
        scale: Number of digits after the decimal point (default 2).

    Returns:
        An un-aliased Column expression; callers are expected to alias it.

    Note:
        Only characters are stripped; the remaining text is not validated.
        Text that is still not a number (e.g. "1-2", or an empty string), or
        that does not fit the requested precision, presumably becomes null
        under Spark's default cast behaviour -- verify before relying on it.
    """
    as_text = F.col(col_name).cast(StringType())
    # Raw string keeps the regex identical to the original pattern bytes.
    numeric_text = F.regexp_replace(as_text, r"[^0-9\.\-]", "")
    return numeric_text.cast(DecimalType(precision, scale))

# Project the raw frame into its cleaned form. Every output column is the
# source column's name lower-cased; columns are grouped below by the cast they
# need, in the same order as the source schema.
df_fixed = df.select(
    F.col("Person_ID").alias("person_id"),

    # Age / birth fields -> integer
    *[
        F.col(src).cast(IntegerType()).alias(src.lower())
        for src in ("Current_Age", "Retirement_Age", "Birth_Year", "Birth_Month")
    ],

    # Free-text / categorical fields are passed through unchanged
    *[
        F.col(src).alias(src.lower())
        for src in ("Gender", "Address", "Apartment", "City", "State", "Zipcode")
    ],

    # Coordinates -> double
    *[
        F.col(src).cast(DoubleType()).alias(src.lower())
        for src in ("Latitude", "Longitude")
    ],

    # Monetary strings -> decimal(10,2) via the cleaning helper
    *[
        clean_and_cast_decimal(src).alias(src.lower())
        for src in ("Per_Capita_Income_Zipcode", "Yearly_Income_Person", "Total_Debt")
    ],

    # Credit fields -> integer
    *[
        F.col(src).cast(IntegerType()).alias(src.lower())
        for src in ("FICO_Score", "Num_Credit_Cards")
    ]
)

# Save logic (Cell 17)
# df_fixed.write.mode("overwrite").orc('/user/talentum/projectMaster/warehouseDir/users')





In [7]:
# Confirm the cleaned schema: lower-case column names, with the three monetary
# columns now typed decimal(10,2) instead of string.
df_fixed.printSchema()

root
 |-- person_id: string (nullable = true)
 |-- current_age: integer (nullable = true)
 |-- retirement_age: integer (nullable = true)
 |-- birth_year: integer (nullable = true)
 |-- birth_month: integer (nullable = true)
 |-- gender: string (nullable = true)
 |-- address: string (nullable = true)
 |-- apartment: string (nullable = true)
 |-- city: string (nullable = true)
 |-- state: string (nullable = true)
 |-- zipcode: string (nullable = true)
 |-- latitude: double (nullable = true)
 |-- longitude: double (nullable = true)
 |-- per_capita_income_zipcode: decimal(10,2) (nullable = true)
 |-- yearly_income_person: decimal(10,2) (nullable = true)
 |-- total_debt: decimal(10,2) (nullable = true)
 |-- fico_score: integer (nullable = true)
 |-- num_credit_cards: integer (nullable = true)



In [8]:
# Persist the cleaned users frame as ORC under the warehouse directory so the
# Hive warehouse can use it for OLAP queries. Any existing output at this path
# is overwritten.
(
    df_fixed.write
    .format("orc")
    .mode("overwrite")
    .save('/user/talentum/projectMaster/warehouseDir/users')
)
print('Job Done !!!!')

Job Done !!!!
