In [1]:
import findspark
findspark.init() 

In [18]:
#Importing the libraries
from pyspark.sql import SparkSession, Row
from pyspark.ml import Pipeline
from pyspark.ml.classification import NaiveBayes
from pyspark.ml.feature import VectorAssembler
from pyspark.ml.evaluation import MulticlassClassificationEvaluator
from pyspark.mllib.util import MLUtils
from pyspark.mllib.regression import LabeledPoint
from pyspark.mllib.tree import RandomForest
from pyspark.ml.classification import RandomForestClassifier

In [3]:
spark = SparkSession.builder.getOrCreate()
spark.sparkContext.setLogLevel("Error")
spark.version

'3.3.0'

In [4]:
#Read the csv file
data = spark.read.csv("./Desktop/cleanWithHeader.csv", inferSchema=True, header=True)

In [7]:
data.printSchema()

root
 |-- id: integer (nullable = true)
 |-- member_id: integer (nullable = true)
 |-- loan_amnt: integer (nullable = true)
 |-- funded_amnt: integer (nullable = true)
 |-- funded_amnt_inv: double (nullable = true)
 |-- int_rate: double (nullable = true)
 |-- installment: double (nullable = true)
 |-- annual_inc: integer (nullable = true)
 |-- dti: double (nullable = true)
 |-- delinq_2yrs: integer (nullable = true)
 |-- inq_last_6mths: integer (nullable = true)
 |-- open_acc: integer (nullable = true)
 |-- pub_rec: integer (nullable = true)
 |-- revol_bal: integer (nullable = true)
 |-- total_acc: integer (nullable = true)
 |-- out_prncp: double (nullable = true)
 |-- out_prncp_inv: double (nullable = true)
 |-- total_pymnt: double (nullable = true)
 |-- total_pymnt_inv: double (nullable = true)
 |-- total_rec_prncp: double (nullable = true)
 |-- total_rec_int: double (nullable = true)
 |-- total_rec_late_fee: double (nullable = true)
 |-- recoveries: double (nullable = true)
 |-- col

In [8]:
data_reordered = data.select("id","member_id","loan_amnt","funded_amnt","funded_amnt_inv","int_rate","installment","annual_inc",
                             "dti","delinq_2yrs","inq_last_6mths","open_acc","pub_rec","revol_bal","total_acc","out_prncp","out_prncp_inv",
                             "total_pymnt","total_pymnt_inv","total_rec_prncp","total_rec_int","total_rec_late_fee","recoveries",
                             "collection_recovery_fee","revol_util_imputed","last_pymnt_amnt_imputed","collections_12_mths_ex_med_imputed",
                             "policy_code_imputed","tot_coll_amt_imputed","tot_cur_bal_imputed","total_rev_hi_lim_imputed","pymnt_plan_index",
                             "application_type_index","acc_now_delinq_index","grade_index","purpose_index","home_ownership_index",
                             "emp_length_index","verification_status_index","sub_grade_index","term_index","initial_list_status_index", "default_ind")

In [9]:
features = ["id","member_id","loan_amnt","funded_amnt","funded_amnt_inv","int_rate","installment","annual_inc",
                             "dti","delinq_2yrs","inq_last_6mths","open_acc","pub_rec","revol_bal","total_acc","out_prncp","out_prncp_inv",
                             "total_pymnt","total_pymnt_inv","total_rec_prncp","total_rec_int","total_rec_late_fee","recoveries",
                             "collection_recovery_fee","revol_util_imputed","last_pymnt_amnt_imputed","collections_12_mths_ex_med_imputed",
                             "policy_code_imputed","tot_coll_amt_imputed","tot_cur_bal_imputed","total_rev_hi_lim_imputed","pymnt_plan_index",
                             "application_type_index","acc_now_delinq_index","grade_index","purpose_index","home_ownership_index",
                             "emp_length_index","verification_status_index","sub_grade_index","term_index","initial_list_status_index"]


In [10]:
# Index labels, adding metadata to the label column.
# Fit on whole dataset to include all labels in index.
va = VectorAssembler(inputCols =features, outputCol="features")

In [11]:
va_df = va.transform(data)


In [12]:
va_df = va_df.select(['features', 'default_ind'])
va_df.show()

+--------------------+-----------+
|            features|default_ind|
+--------------------+-----------+
|(42,[0,1,2,3,4,5,...|          0|
|[1077430.0,131416...|          1|
|[1077175.0,131352...|          0|
|[1076863.0,127717...|          0|
|[1075358.0,131174...|          0|
|[1075269.0,131144...|          0|
|[1069639.0,130474...|          0|
|[1072053.0,128868...|          0|
|[1071795.0,130695...|          1|
|[1071570.0,130672...|          1|
|[1070078.0,130520...|          0|
|(42,[0,1,2,3,4,5,...|          0|
|[1064687.0,129871...|          1|
|(42,[0,1,2,3,4,5,...|          0|
|[1069057.0,130350...|          1|
|[1069759.0,130487...|          0|
|[1065775.0,129969...|          0|
|(42,[0,1,2,3,4,5,...|          0|
|(42,[0,1,2,3,4,5,...|          0|
|(42,[0,1,2,3,4,5,...|          0|
+--------------------+-----------+
only showing top 20 rows



In [13]:
seed = 142
train_df, test_df = va_df.randomSplit([0.7,0.3], seed)

In [19]:
model = RandomForestClassifier(featuresCol = 'features', labelCol = 'default_ind' )

In [20]:
nbmodel = model.fit(train_df)

In [21]:
pred  = nbmodel.transform(test_df)
pred.show(3)

+--------------------+-----------+--------------------+--------------------+----------+
|            features|default_ind|       rawPrediction|         probability|prediction|
+--------------------+-----------+--------------------+--------------------+----------+
|(42,[0,1,2,3,4,5,...|          0|[18.8776458244775...|[0.94388229122387...|       0.0|
|(42,[0,1,2,3,4,5,...|          1|[14.1653291900552...|[0.70826645950276...|       0.0|
|(42,[0,1,2,3,4,5,...|          0|[18.7805254825791...|[0.93902627412895...|       0.0|
+--------------------+-----------+--------------------+--------------------+----------+
only showing top 3 rows



In [22]:
evaluator= MulticlassClassificationEvaluator(predictionCol="prediction", labelCol="default_ind")
accuracy = evaluator.evaluate(pred)
 
print("Prediction Accuracy: ", accuracy)
 
y_pred=pred.select("prediction").collect()
y_orig=pred.select("default_ind").collect()

Prediction Accuracy:  0.9777768322301527


In [23]:
print ("Test Error = ",  (1.0 - accuracy))


Test Error =  0.02222316776984734
