In [2]:
from pyspark.sql import SparkSession
import getpass 
username=getpass.getuser()
spark=SparkSession. \
    builder. \
    config('spark.ui.port','0'). \
    config("spark.sql.warehouse.dir", f"/user/{username}/warehouse"). \
    config('spark.shuffle.useOldFetchProtocol', 'true'). \
    enableHiveSupport(). \
    master('yarn'). \
    getOrCreate()

In [3]:
loans_repay_raw_df = spark.read \
.format("csv") \
.option("header",True) \
.option("inferSchema", True) \
.load("/user/itv014325/lendingclubproject/raw/loans_repayments_csv")

In [4]:
loans_repay_raw_df

loan_id,total_rec_prncp,total_rec_int,total_rec_late_fee,total_pymnt,last_pymnt_amnt,last_pymnt_d,next_pymnt_d
56633077,3000.0,376.21,0.0,3376.205975527,93.74,Aug-2018,
55927518,15600.0,1956.32,0.0,17556.320693408998,487.9,Aug-2018,
56473345,20000.0,2408.94,0.0,22408.9398096902,9677.72,May-2017,
56463188,11200.0,5231.01,0.0,16431.0146429476,7475.86,Feb-2018,
56473316,5215.47,6513.51,0.0,13237.07,432.77,Nov-2017,
56663266,6477.26,4221.77,0.0,12544.05,458.45,Jun-2017,
56483027,10000.0,2062.03,0.0,12062.026276051,335.38,Aug-2018,
56613385,13932.74,11678.23,0.0,25610.97,609.46,Feb-2019,Apr-2019
56643620,16000.0,1031.67,0.0,17031.673055266598,8363.28,Mar-2017,
56533114,3478.66,3815.07,0.0,12460.86,1111.52,Mar-2016,


In [5]:
loans_repay_raw_df.printSchema()

root
 |-- loan_id: string (nullable = true)
 |-- total_rec_prncp: string (nullable = true)
 |-- total_rec_int: string (nullable = true)
 |-- total_rec_late_fee: string (nullable = true)
 |-- total_pymnt: string (nullable = true)
 |-- last_pymnt_amnt: string (nullable = true)
 |-- last_pymnt_d: string (nullable = true)
 |-- next_pymnt_d: string (nullable = true)



In [6]:
loans_repay_schema = 'loan_id string, total_principal_received float, total_interest_received float, total_late_fee_received float, total_payment_received float, last_payment_amount float, last_payment_date string, next_payment_date string'

In [12]:
loans_repay_raw_df = spark.read \
.format("csv") \
.option("header",True) \
.schema(loans_repay_schema) \
.load("/user/itv014325/lendingclubproject/raw/loans_repayments_csv")

In [13]:
loans_repay_raw_df.printSchema()

root
 |-- loan_id: string (nullable = true)
 |-- total_principal_received: float (nullable = true)
 |-- total_interest_received: float (nullable = true)
 |-- total_late_fee_received: float (nullable = true)
 |-- total_payment_received: float (nullable = true)
 |-- last_payment_amount: float (nullable = true)
 |-- last_payment_date: string (nullable = true)
 |-- next_payment_date: string (nullable = true)



In [14]:
from pyspark.sql.functions import current_timestamp

In [15]:
loans_repay_df_ingestd = loans_repay_raw_df.withColumn("ingest_date", current_timestamp())

In [16]:
loans_repay_df_ingestd

loan_id,total_principal_received,total_interest_received,total_late_fee_received,total_payment_received,last_payment_amount,last_payment_date,next_payment_date,ingest_date
56633077,3000.0,376.21,0.0,3376.206,93.74,Aug-2018,,2024-11-28 07:05:...
55927518,15600.0,1956.32,0.0,17556.32,487.9,Aug-2018,,2024-11-28 07:05:...
56473345,20000.0,2408.94,0.0,22408.94,9677.72,May-2017,,2024-11-28 07:05:...
56463188,11200.0,5231.01,0.0,16431.014,7475.86,Feb-2018,,2024-11-28 07:05:...
56473316,5215.47,6513.51,0.0,13237.07,432.77,Nov-2017,,2024-11-28 07:05:...
56663266,6477.26,4221.77,0.0,12544.05,458.45,Jun-2017,,2024-11-28 07:05:...
56483027,10000.0,2062.03,0.0,12062.026,335.38,Aug-2018,,2024-11-28 07:05:...
56613385,13932.74,11678.23,0.0,25610.97,609.46,Feb-2019,Apr-2019,2024-11-28 07:05:...
56643620,16000.0,1031.67,0.0,17031.674,8363.28,Mar-2017,,2024-11-28 07:05:...
56533114,3478.66,3815.07,0.0,12460.86,1111.52,Mar-2016,,2024-11-28 07:05:...


In [18]:
loans_repay_df_ingestd.printSchema()

root
 |-- loan_id: string (nullable = true)
 |-- total_principal_received: float (nullable = true)
 |-- total_interest_received: float (nullable = true)
 |-- total_late_fee_received: float (nullable = true)
 |-- total_payment_received: float (nullable = true)
 |-- last_payment_amount: float (nullable = true)
 |-- last_payment_date: string (nullable = true)
 |-- next_payment_date: string (nullable = true)
 |-- ingest_date: timestamp (nullable = false)



In [19]:
loans_repay_df_ingestd.count()

2260701

In [20]:
loans_repay_df_ingestd.createOrReplaceTempView("loan_repayments")

In [21]:
spark.sql("select count(*) from loan_repayments where total_principal_received is null")

count(1)
69


In [22]:
columns_to_check = ["total_principal_received", "total_interest_received", "total_late_fee_received", "total_payment_received", "last_payment_amount"]

In [23]:
loans_repay_filtered_df = loans_repay_df_ingestd.na.drop(subset=columns_to_check)

In [24]:
loans_repay_filtered_df.count()

2260498

In [25]:
loans_repay_filtered_df.createOrReplaceTempView("loan_repayments")

In [26]:
spark.sql("select count(*) from loan_repayments where total_payment_received = 0.0")

count(1)
995


In [27]:
spark.sql("select count(*) from loan_repayments where total_payment_received = 0.0 and total_principal_received != 0.0")

count(1)
46


In [28]:
spark.sql("select * from loan_repayments where total_payment_received = 0.0 and total_principal_received != 0.0")

loan_id,total_principal_received,total_interest_received,total_late_fee_received,total_payment_received,last_payment_amount,last_payment_date,next_payment_date,ingest_date
485818,14640.096,13388.84,13000.0,0.0,0.0,0.0,Mar-2013,2024-11-28 07:12:...
485471,29620.818,29134.64,25000.0,0.0,0.0,0.0,Mar-2013,2024-11-28 07:12:...
482256,8735.611,7479.87,8000.0,0.0,0.0,0.0,Feb-2011,2024-11-28 07:12:...
478160,410.0,407.36,0.0,0.0,143.1,410.0,,2024-11-28 07:12:...
476557,28865.18,24164.67,5692.31,0.0,6972.59,19916.78,Dec-2010,2024-11-28 07:12:...
472516,25951.482,24731.76,25000.0,0.0,0.0,0.0,May-2010,2024-11-28 07:12:...
472197,12048.13,12018.01,10000.0,0.0,0.0,0.0,Jan-2013,2024-11-28 07:12:...
467364,29216.791,29066.19,24250.0,0.0,0.0,0.0,Dec-2012,2024-11-28 07:12:...
399499,26557.729,26336.41,24000.0,0.0,0.0,0.0,Dec-2010,2024-11-28 07:12:...
451482,7587.5513,7587.55,7000.0,0.0,0.0,0.0,Jan-2011,2024-11-28 07:12:...


In [29]:
from pyspark.sql.functions import when, col

In [30]:
loans_payments_fixed_df = loans_repay_filtered_df.withColumn(
   "total_payment_received",
    when(
        (col("total_principal_received") != 0.0) &
        (col("total_payment_received") == 0.0),
        col("total_principal_received") + col("total_interest_received") + col("total_late_fee_received")
    ).otherwise(col("total_payment_received"))
)

In [31]:
loans_payments_fixed_df

loan_id,total_principal_received,total_interest_received,total_late_fee_received,total_payment_received,last_payment_amount,last_payment_date,next_payment_date,ingest_date
56633077,3000.0,376.21,0.0,3376.206,93.74,Aug-2018,,2024-11-28 07:15:...
55927518,15600.0,1956.32,0.0,17556.32,487.9,Aug-2018,,2024-11-28 07:15:...
56473345,20000.0,2408.94,0.0,22408.94,9677.72,May-2017,,2024-11-28 07:15:...
56463188,11200.0,5231.01,0.0,16431.014,7475.86,Feb-2018,,2024-11-28 07:15:...
56473316,5215.47,6513.51,0.0,13237.07,432.77,Nov-2017,,2024-11-28 07:15:...
56663266,6477.26,4221.77,0.0,12544.05,458.45,Jun-2017,,2024-11-28 07:15:...
56483027,10000.0,2062.03,0.0,12062.026,335.38,Aug-2018,,2024-11-28 07:15:...
56613385,13932.74,11678.23,0.0,25610.97,609.46,Feb-2019,Apr-2019,2024-11-28 07:15:...
56643620,16000.0,1031.67,0.0,17031.674,8363.28,Mar-2017,,2024-11-28 07:15:...
56533114,3478.66,3815.07,0.0,12460.86,1111.52,Mar-2016,,2024-11-28 07:15:...


In [33]:
loans_payments_fixed_df.createOrReplaceTempView("loans_payments_fixed")

In [34]:
spark.sql("select count(*) from loans_payments_fixed where total_payment_received = 0.0 and total_principal_received != 0.0")

count(1)
0


In [25]:
loans_payments_fixed_df.filter("loan_id == '1064185'")

loan_id,total_principal_received,total_interest_received,total_late_fee_received,total_payment_received,last_payment_amount,last_payment_date,next_payment_date,ingest_date
1064185,11600.98,11600.98,10000.0,33201.96,0.0,0.0,Dec-2014,2023-10-04 05:40:...


In [37]:
loans_payments_fixed2_df = loans_payments_fixed_df.filter("total_payment_received != 0.0")

In [38]:
loans_payments_fixed2_df.filter("last_payment_date = 0.0").count()

48

In [39]:
loans_payments_fixed2_df.filter("next_payment_date = 0.0").count()

24

In [40]:
loans_payments_fixed2_df.filter("last_payment_date is null").count()

1477

In [41]:
loans_payments_fixed2_df.filter("next_payment_date is null").count()

1344240

In [42]:
loans_payments_ldate_fixed_df = loans_payments_fixed2_df.withColumn(
  "last_payment_date",
   when(
       (col("last_payment_date") == 0.0),
       None
       ).otherwise(col("last_payment_date"))
)

In [47]:
loans_payments_ndate_fixed_df = loans_payments_ldate_fixed_df.withColumn(
  "next_payment_date",
   when(
       (col("next_payment_date") == 0.0),
       None
       ).otherwise(col("next_payment_date"))
)

In [48]:
loans_payments_ndate_fixed_df.filter("last_payment_date = 0.0").count()

0

In [49]:
loans_payments_ndate_fixed_df.filter("next_payment_date = 0.0").count()

0

In [50]:
loans_payments_ndate_fixed_df.write \
.format("parquet") \
.mode("overwrite") \
.option("path", "/user/itv014325/lendingclubproject/raw/cleaned/loans_repayments_parquet") \
.save()

In [51]:
loans_payments_ndate_fixed_df.write \
.option("header", True) \
.format("csv") \
.mode("overwrite") \
.option("path", "/user/itv014325/lendingclubproject/raw/cleaned/loans_repayments_csv") \
.save()