In [46]:
from pyspark.sql import SparkSession
from pyspark.sql.types import *
from pyspark.sql.functions import *
import getpass

# Login name of the user running this kernel; interpolated into the
# per-user Hive warehouse path when the Spark session is built.
username = getpass.getuser()

In [47]:
# Create (or reuse) the Hive-enabled Spark session on YARN.
# Settings applied: UI port, a per-user warehouse directory derived from
# `username`, and the legacy shuffle fetch protocol flag.
spark = (
    SparkSession.builder
    .config("spark.ui.port", "0")
    .config("spark.sql.warehouse.dir", f"/user/{username}/warehouse")
    .config("spark.shuffle.useOldFetchProtocol", "true")
    .enableHiveSupport()
    .master("yarn")
    .getOrCreate()
)


In [48]:
# DDL-style schema for the raw loans CSV, one "<column> <type>" entry per line.
# The trailing spaces inside some entries are kept verbatim so the joined
# string is identical to the original one-line literal.
loans_schema = ",".join(
    [
        "loan_id string",
        "member_id string ",
        "loan_amount float",
        "funded_amount float ",
        "loan_term_months string ",
        "interest_rate float ",
        "monthly_installment float ",
        "issue_date string ",
        "loan_status string ",
        "loan_purpose string ",
        "loan_title string",
    ]
)

In [49]:
# Load the raw loans CSV with the explicit schema; the first line of each
# file is treated as a header row.
loans_raw_df = spark.read.csv(
    path="/user/itv015278/lendingclubproject/raw/loans_data_csv",
    schema=loans_schema,
    header=True,
)

In [50]:
# Stamp every row with the time of ingestion.
loans_ingested_date = loans_raw_df.withColumn(
    colName="ingest_date",
    col=current_timestamp(),
)

In [51]:
# Columns that must be non-null for a loan row to be kept.
columns_to_check = [
    "loan_amount",
    "funded_amount",
    "loan_term_months",
    "interest_rate",
    "monthly_installment",
    "issue_date",
    "loan_status",
    "loan_purpose",
]

In [52]:
# Discard rows that have a null in any of the required columns.
loans_filterd_df = loans_ingested_date.dropna(subset=columns_to_check)

In [53]:
# Expose the null-filtered loans DataFrame to Spark SQL as the temp view "loans".
loans_filterd_df.createOrReplaceTempView("loans")

In [54]:
# Preview the rows behind the "loans" temp view; as the cell's last
# expression the resulting DataFrame is rendered below.
spark.sql("select * from loans")

loan_id,member_id,loan_amount,funded_amount,loan_term_months,interest_rate,monthly_installment,issue_date,loan_status,loan_purpose,loan_title,ingest_date
76003861,a4ec00ba67fadf2fe...,24000.0,24000.0,60 months,15.31,574.88,Apr-2016,Charged Off,debt_consolidation,Debt consolidation,2025-01-01 06:32:...
76263914,4f7a9e6ffaacd5da2...,2400.0,2400.0,36 months,11.47,79.11,Apr-2016,Fully Paid,debt_consolidation,Debt consolidation,2025-01-01 06:32:...
75537401,e935a4c27fc78ae61...,12600.0,12600.0,36 months,7.39,391.31,Apr-2016,Fully Paid,other,Other,2025-01-01 06:32:...
75038986,2d32004bd5e1dc3c3...,16800.0,16800.0,60 months,19.53,440.72,Apr-2016,Current,credit_card,Credit card refin...,2025-01-01 06:32:...
76301424,f7116b7f7546a7952...,4300.0,4300.0,36 months,17.27,153.89,Apr-2016,Charged Off,debt_consolidation,Debt consolidation,2025-01-01 06:32:...
75333198,d3aa3a3c95eca5631...,8950.0,8950.0,36 months,22.45,343.9,Apr-2016,Current,credit_card,Credit card refin...,2025-01-01 06:32:...
76391453,fc8a2e046eaaba02d...,35000.0,35000.0,60 months,12.99,796.18,Apr-2016,Fully Paid,debt_consolidation,Debt consolidation,2025-01-01 06:32:...
76363364,577ae670ac2ec7ed3...,15000.0,15000.0,36 months,9.16,478.12,Apr-2016,Fully Paid,house,Home buying,2025-01-01 06:32:...
76272510,d3792868819649ba9...,30000.0,30000.0,60 months,16.29,734.18,Apr-2016,Current,debt_consolidation,Debt consolidation,2025-01-01 06:32:...
76304116,6d3a5a422261348b3...,4800.0,4800.0,36 months,19.99,178.37,Apr-2016,Fully Paid,credit_card,Credit card refin...,2025-01-01 06:32:...


In [55]:
# Convert the textual loan term (e.g. "36 months") into whole years.
#   1. strip every non-digit character from the value,
#   2. cast the remaining digits to int and divide by 12,
#   3. cast the quotient back to int.
# The column is overwritten under its original name and renamed afterwards so
# that it keeps its position in the schema.
# The regex is a raw string: "\D" inside a regular literal is an invalid
# escape sequence, which recent Python versions warn about.
loans_months_transformed = (
    loans_filterd_df
    .withColumn(
        "loan_term_months",
        (
            regexp_replace(col("loan_term_months"), r"(\D)", "").cast("int") / 12
        ).cast("int"),
    )
    .withColumnRenamed("loan_term_months", "loan_term_years")
)

In [56]:
# Display the result of the term conversion (loan_term_years replaces loan_term_months).
loans_months_transformed

loan_id,member_id,loan_amount,funded_amount,loan_term_years,interest_rate,monthly_installment,issue_date,loan_status,loan_purpose,loan_title,ingest_date
76003861,a4ec00ba67fadf2fe...,24000.0,24000.0,5,15.31,574.88,Apr-2016,Charged Off,debt_consolidation,Debt consolidation,2025-01-01 06:32:...
76263914,4f7a9e6ffaacd5da2...,2400.0,2400.0,3,11.47,79.11,Apr-2016,Fully Paid,debt_consolidation,Debt consolidation,2025-01-01 06:32:...
75537401,e935a4c27fc78ae61...,12600.0,12600.0,3,7.39,391.31,Apr-2016,Fully Paid,other,Other,2025-01-01 06:32:...
75038986,2d32004bd5e1dc3c3...,16800.0,16800.0,5,19.53,440.72,Apr-2016,Current,credit_card,Credit card refin...,2025-01-01 06:32:...
76301424,f7116b7f7546a7952...,4300.0,4300.0,3,17.27,153.89,Apr-2016,Charged Off,debt_consolidation,Debt consolidation,2025-01-01 06:32:...
75333198,d3aa3a3c95eca5631...,8950.0,8950.0,3,22.45,343.9,Apr-2016,Current,credit_card,Credit card refin...,2025-01-01 06:32:...
76391453,fc8a2e046eaaba02d...,35000.0,35000.0,5,12.99,796.18,Apr-2016,Fully Paid,debt_consolidation,Debt consolidation,2025-01-01 06:32:...
76363364,577ae670ac2ec7ed3...,15000.0,15000.0,3,9.16,478.12,Apr-2016,Fully Paid,house,Home buying,2025-01-01 06:32:...
76272510,d3792868819649ba9...,30000.0,30000.0,5,16.29,734.18,Apr-2016,Current,debt_consolidation,Debt consolidation,2025-01-01 06:32:...
76304116,6d3a5a422261348b3...,4800.0,4800.0,3,19.99,178.37,Apr-2016,Fully Paid,credit_card,Credit card refin...,2025-01-01 06:32:...


In [57]:
# Loan purposes kept as-is; any other value is later mapped to "other".
loan_purpose_to_be_considered = [
    "debt_consolidation",
    "credit_card",
    "home_improvement",
    "other",
    "major_purchase",
    "medical",
    "small_business",
    "car",
    "vacation",
    "moving",
    "house",
    "wedding",
    "renewable_energy",
    "educational",
]

In [59]:
# Normalise loan_purpose: values found in the allow-list are kept, everything
# else is replaced with "other".
# NOTE(review): despite its name, this DataFrame changes loan_purpose, not
# loan_status; the name is kept because later cells reference it.
loans_loan_status_transformed = loans_months_transformed.withColumn(
    "loan_purpose",
    when(
        col("loan_purpose").isin(loan_purpose_to_be_considered),
        col("loan_purpose"),
    ).otherwise("other"),
)

In [None]:
#loans_loan_status_transformed.groupBy("loan_purpose").agg( count("loan_purpose").alias("Total_count") , avg("loan_amount").alias("AverageLoanAmount") ).orderBy(col("AverageLoanAmount"))

In [66]:
# Persist the cleaned loans data, replacing the output of any previous run.
# The format is named explicitly so the files written under "loans_parquet"
# do not depend on the session's default data source setting.
(
    loans_loan_status_transformed.write
    .format("parquet")
    .mode("overwrite")
    .option("path", "/user/itv015278/lendingclubproject/cleaned/loans_parquet")
    .save()
)

In [67]:
 # Report how many partitions back the final DataFrame's underlying RDD.
 loans_loan_status_transformed.rdd.getNumPartitions()

3