In [1]:
from pyspark.sql import SparkSession
from pyspark.sql.types import *
from pyspark.sql.functions import *
import getpass

# Login name of the user running this kernel; it is interpolated into the
# Spark warehouse directory path when the session is configured.
username = getpass.getuser()

In [2]:
# Create (or reuse) a Hive-enabled Spark session that runs on YARN.
# Settings applied: UI port '0', a per-user SQL warehouse directory built
# from `username`, and the old shuffle fetch protocol flag.
spark = (
    SparkSession.builder
    .config('spark.ui.port', '0')
    .config('spark.sql.warehouse.dir', f'/user/{username}/warehouse')
    .config('spark.shuffle.useOldFetchProtocol', 'true')
    .enableHiveSupport()
    .master('yarn')
    .getOrCreate()
)


In [3]:
# DDL-style schema for the raw customers CSV, one column per line.
# The pieces are joined by implicit string concatenation, so the resulting
# text (including its spacing around commas) is exactly the single-line form.
customer_schema = (
    "member_id string , "
    "emp_title string , "
    "emp_length string , "
    "home_ownership string , "
    "annual_inc float, "
    "addr_state string , "
    "zip_code string , "
    "country string , "
    "grade string , "
    "sub_grade string , "
    "verification_status string , "
    "total_high_credit_limit float , "
    "application_type string , "
    "annual_inc_joint float , "
    "verification_status_joint string"
)

In [4]:
# Load the raw customers CSV with the explicit schema declared above;
# the first line of each file is treated as a header row.
customer_raw_df = spark.read.csv(
    path="/user/itv015278/lendingclubproject/raw/customers_data_csv",
    schema=customer_schema,
    header=True,
)

In [5]:
# Give the raw columns more descriptive names.
# No rename is needed for total_high_credit_limit: the read schema already
# uses that name, so a rename from "tot_hi_cred_limit" would never match a
# column (withColumnRenamed silently does nothing for a missing column).
# NOTE(review): "join_annual_income" looks like a typo for
# "joint_annual_income"; it is kept as-is because the name is part of the
# schema written out at the end of the notebook — confirm with consumers
# before changing it.
renamed_customer_raw_df = (
    customer_raw_df
    .withColumnRenamed("annual_inc", "annual_income")
    .withColumnRenamed("addr_state", "address_state")
    .withColumnRenamed("zip_code", "address_zipcode")
    .withColumnRenamed("country", "address_country")
    .withColumnRenamed("annual_inc_joint", "join_annual_income")
)

In [6]:
# Append an ingest_date column holding the timestamp at which the query runs.
transformed_customer_df = renamed_customer_raw_df.withColumn(
    "ingest_date",
    current_timestamp(),
)

In [7]:
# Drop rows that are exact duplicates across all columns. ingest_date is
# part of the comparison because it was added before this step.
distinct_customers = transformed_customer_df.distinct()

In [8]:
# Expose the de-duplicated frame to Spark SQL under the view name "customers".
distinct_customers.createOrReplaceTempView("customers")

In [9]:
# Keep only the rows of the "customers" view that have an annual income value.
customer_income_filtered = spark.sql(
    "select * from customers where annual_income is NOT NULL"
)

In [10]:
# Re-point the "customers" view at the income-filtered frame; this replaces
# the earlier registration under the same view name.
customer_income_filtered.createOrReplaceTempView("customers")

In [11]:
# Strip every non-digit character from emp_length so only the digits remain
# (a value with no digits at all becomes an empty string).
# The pattern is written as a raw string: in a normal string literal "\D" is
# an invalid Python escape sequence, which raises a warning on current
# interpreters. The regex passed to Spark is unchanged.
customer_emp_length_cleaned = customer_income_filtered.withColumn(
    "emp_length",
    regexp_replace(col("emp_length"), r"(\D)", ""),
)

In [12]:
# Convert the digits-only emp_length text into an integer column.
customer_emplength_casted = customer_emp_length_cleaned.withColumn(
    "emp_length",
    col("emp_length").cast("int"),
)

In [13]:
# Re-point the "customers" view at the frame whose emp_length is now an
# integer, so the SQL below aggregates the numeric column.
customer_emplength_casted.createOrReplaceTempView("customers")

In [14]:
# Preview the average emp_length, rounded down, as computed through the
# "customers" view. The result is only displayed, not stored.
spark.sql("select floor(avg(emp_length)) as average_emp_length from customers")

average_emp_length
6


In [15]:
# Compute the floored mean of emp_length and bring it back to the driver
# as a plain Python value (first column of the single result row).
avg_emp_length = (
    customer_emplength_casted
    .select(floor(avg("emp_length")))
    .first()[0]
)

In [16]:
# Fill missing emp_length values with the floored average computed earlier;
# rows that already have a value keep it.
customer_emplength_filled = customer_emplength_casted.withColumn(
    "emp_length",
    when(col("emp_length").isNull(), avg_emp_length)
    .otherwise(col("emp_length")),
)

In [17]:
# Keep address_state only when it is shorter than three characters;
# every other value is replaced with the placeholder "NA".
# NOTE(review): a null address_state presumably also ends up as "NA", since
# the length comparison is not true for it — confirm this is intended.
customer_clean_address = customer_emplength_filled.withColumn(
    "address_state",
    when(length(col("address_state")) < 3, col("address_state"))
    .otherwise("NA"),
)

In [18]:
# Persist the cleaned customers dataset, replacing any previous output at
# the target path.
# The format is set explicitly so the result does not depend on the
# session's default data source setting; the target directory name
# indicates Parquet is the intended format.
(
    customer_clean_address.write
    .format("parquet")
    .mode("overwrite")
    .option("path", "/user/itv015278/lendingclubproject/cleaned/customers_parquet")
    .save()
)