In [50]:
from pyspark.sql import SparkSession
from pyspark.sql.types import *
from pyspark.sql.functions import *
import getpass

# OS login name of whoever runs the notebook; it is interpolated into the
# Hive warehouse directory when the Spark session is configured.
username: str = getpass.getuser()

In [51]:
# Build (or reuse) the Hive-enabled Spark session that runs on YARN.
# The warehouse directory is placed under the current user's /user/<name> folder.
# NOTE(review): port '0' presumably lets Spark pick any free UI port - confirm.
spark = (
    SparkSession.builder
    .config("spark.ui.port", "0")
    .config("spark.sql.warehouse.dir", f"/user/{username}/warehouse")
    .config("spark.shuffle.useOldFetchProtocol", "true")
    .enableHiveSupport()
    .master("yarn")
    .getOrCreate()
)


In [52]:
# DDL-style schema handed to the CSV reader: the id and the two payment dates
# are kept as strings, every monetary amount is a float.
# Adjacent literals are concatenated by Python into one single schema string.
loans_repay_schema = (
    "loan_id string ,"
    "total_principal_received float,"
    "total_interest_received float,"
    "total_late_fee_received float,"
    "total_payment_received float,"
    "last_payment_amount float,"
    "last_payment_date string ,"
    "next_payment_date string"
)

In [53]:
# Load the raw repayments CSV. The first line of each file is a header row and
# the explicit schema is applied instead of letting Spark infer column types.
loans_repay_raw_df = (
    spark.read
    .option("header", True)
    .schema(loans_repay_schema)
    .csv("/user/itv015278/lendingclubproject/raw/loans_repayments_csv")
)

In [54]:
# Tag every row with the timestamp of this ingestion run.
loans_repay_date_ingested = loans_repay_raw_df.withColumn(
    "ingest_date",
    current_timestamp(),
)

In [55]:
# Amount columns that must all be populated for a repayment row to be kept.
columnsToBeConsidered = [
    "total_principal_received",
    "total_interest_received",
    "total_late_fee_received",
    "total_payment_received",
    "last_payment_amount",
]

In [56]:
# Discard rows where any of the required amount columns is null.
filtered_loans_repay_df = loans_repay_date_ingested.dropna(
    how="any",
    subset=columnsToBeConsidered,
)

In [57]:
# Repair rows whose total payment is recorded as 0 although principal was
# received: rebuild the total as principal + interest + late fees.
# All other rows keep their recorded total unchanged.
loan_repay_amount_sumed_up = filtered_loans_repay_df.withColumn(
    "total_payment_received",
    when(
        (col("total_payment_received") == 0) & (col("total_principal_received") > 0),
        col("total_principal_received")
        + col("total_interest_received")
        + col("total_late_fee_received"),
    ).otherwise(col("total_payment_received")),
)


In [58]:
# Keep only repayments with a strictly positive total after the repair step.
cleaned_loan_repay = loan_repay_amount_sumed_up.where(
    "total_payment_received > 0.0"
)

In [59]:
# Replace zero placeholders in both payment-date columns with null; any other
# value is passed through untouched. The same rule is applied to each column.
# NOTE(review): these columns are declared as strings in the schema, so the
# comparison with 0.0 presumably relies on Spark's implicit string-to-number
# cast - confirm it behaves as intended if ANSI mode is enabled.
cleaned_loan_repay_for_dates_df = cleaned_loan_repay
for date_column in ("last_payment_date", "next_payment_date"):
    cleaned_loan_repay_for_dates_df = cleaned_loan_repay_for_dates_df.withColumn(
        date_column,
        when(col(date_column) == 0.0, None).otherwise(col(date_column)),
    )


In [60]:
# Persist the cleaned repayments, replacing any previous output at this path.
# The format is stated explicitly so the files are Parquet (as the directory
# name promises) regardless of the session's spark.sql.sources.default setting.
(
    cleaned_loan_repay_for_dates_df.write
    .format("parquet")
    .mode("overwrite")
    .option("path", "/user/itv015278/lendingclubproject/cleaned/loans_repayments_parquet")
    .save()
)