In [1]:
import pyspark
import pandas as pd
from pyspark.sql import functions as F
from pyspark.sql import SQLContext
from pyspark import SparkContext
# Reuse the active SparkContext when this cell is executed again; a bare
# SparkContext() raises if a context is already running in the kernel.
sc = SparkContext.getOrCreate()
sqlContext = SQLContext(sc)

# Read the CSV with a header row and let Spark infer the column types.
# NOTE(review): the file name contains a space before ".csv" - confirm it
# matches the file on disk.
df = sqlContext.read.format('com.databricks.spark.csv').options(header='true', inferschema='true').load("train_data .csv")

# Keep only the four predictor columns and the is_default target used by the model below.
df = df.select('total_loan','year_of_loan','interest','monthly_payment','is_default')

In [2]:
# 70/30 split with a fixed seed so the partition is the same on every run
# (without a seed, randomSplit draws a new random split each time).
train, test = df.randomSplit([0.7, 0.3], seed=42)
# Habitual quick look at the first rows of a small dataset (the author is
# unsure whether this habit is a good one).
train.show(5)


+----------+------------+--------+---------------+----------+
|total_loan|year_of_loan|interest|monthly_payment|is_default|
+----------+------------+--------+---------------+----------+
|    1000.0|           3|    6.62|          30.71|         1|
|    1000.0|           3|    6.99|          30.88|         0|
|    1000.0|           3|    7.35|          31.04|         0|
|    1000.0|           3|     7.4|          31.06|         0|
|    1000.0|           3|    7.69|           31.2|         0|
+----------+------------+--------+---------------+----------+
only showing top 5 rows



In [6]:
from pyspark.ml.classification import LogisticRegression
from pyspark.ml.feature import RFormula
# RFormula assembles the four predictors into a `features` vector column and
# exposes is_default as a numeric `label` column.
supervised = RFormula(formula="is_default ~ total_loan +year_of_loan+ interest +monthly_payment")
fittedRF = supervised.fit(df)
preparedDF = fittedRF.transform(df)

# Rebinds train/test to a split of the prepared frame. The fixed seed keeps
# the partition, and every metric computed from it, reproducible across runs.
train, test = preparedDF.randomSplit([0.7, 0.3], seed=42)
# Habitual quick look at the first rows of a small dataset (the author is
# unsure whether this habit is a good one).
train.show(5)



+----------+------------+--------+---------------+----------+--------------------+-----+
|total_loan|year_of_loan|interest|monthly_payment|is_default|            features|label|
+----------+------------+--------+---------------+----------+--------------------+-----+
|    1000.0|           3|    6.99|          30.88|         0|[1000.0,3.0,6.99,...|  0.0|
|    1000.0|           3|    7.35|          31.04|         0|[1000.0,3.0,7.35,...|  0.0|
|    1000.0|           3|     7.4|          31.06|         0|[1000.0,3.0,7.4,3...|  0.0|
|    1000.0|           3|    7.88|          31.29|         0|[1000.0,3.0,7.88,...|  0.0|
|    1000.0|           3|    7.89|          31.29|         0|[1000.0,3.0,7.89,...|  0.0|
+----------+------------+--------+---------------+----------+--------------------+-----+
only showing top 5 rows



In [7]:
# Fit a logistic regression on the training split, using the `features` and
# `label` columns.
lr = LogisticRegression(featuresCol="features", labelCol="label")
fittedLR = lr.fit(train)

# Score the held-out split and preview 30 label/prediction pairs.
test_predictions = fittedLR.transform(test)
test_predictions.select("label", "prediction").show(30)

+-----+----------+
|label|prediction|
+-----+----------+
|  1.0|       0.0|
|  0.0|       0.0|
|  0.0|       0.0|
|  0.0|       0.0|
|  0.0|       0.0|
|  0.0|       0.0|
|  0.0|       0.0|
|  0.0|       0.0|
|  1.0|       0.0|
|  0.0|       0.0|
|  0.0|       0.0|
|  0.0|       0.0|
|  0.0|       0.0|
|  0.0|       0.0|
|  1.0|       0.0|
|  0.0|       0.0|
|  0.0|       0.0|
|  1.0|       0.0|
|  0.0|       0.0|
|  0.0|       0.0|
|  0.0|       0.0|
|  0.0|       0.0|
|  0.0|       0.0|
|  1.0|       0.0|
|  0.0|       0.0|
|  0.0|       0.0|
|  0.0|       0.0|
|  0.0|       0.0|
|  0.0|       0.0|
|  1.0|       0.0|
+-----+----------+
only showing top 30 rows



In [10]:
# Summary object attached to the fitted model.
# NOTE(review): these metrics appear to describe the training fit, not the
# held-out test split - confirm before reporting them as test performance.
trainingSummary = fittedLR.summary

# Show the first rows of the ROC curve (FPR/TPR pairs), then the area under it.
roc_curve = trainingSummary.roc
roc_curve.show()
print("areaUnderROC: {}".format(trainingSummary.areaUnderROC))


+--------------------+--------------------+
|                 FPR|                 TPR|
+--------------------+--------------------+
|                 0.0|                 0.0|
|0.002973217258931545|0.011567601156760116|
|0.005298273155416013|0.020410602041060204|
| 0.00812282955140098| 0.03111780311178031|
|0.011238761238761238|0.041920604192060416|
| 0.01480067551496123| 0.05389450538945054|
|0.017904714333285762| 0.06496020649602065|
|0.020842252985110128| 0.07471140747114074|
|0.024410113695827983| 0.08568150856815086|
|  0.0278709385852243| 0.09524150952415095|
|0.028340706912135482| 0.09672330967233096|
| 0.03218210361067504| 0.10845821084582108|
| 0.03608296465439323|  0.1190937119093712|
| 0.04019789734075448| 0.12901221290122128|
| 0.04418795490224062|  0.1401735140173514|
|0.048861852433281006| 0.15200401520040152|
|0.053024356595785166| 0.16220931622093163|
| 0.05753175396032539| 0.17420711742071174|
|0.061872651158365446| 0.18424511842451186|
|  0.0662135483564055| 0.1953108

In [30]:
# Score the test split once and cache it: the three count() actions below
# would otherwise each recompute the transform from scratch.
test_result = fittedLR.transform(test).select("label", "prediction").cache()

# TT = rows correctly predicted as default (true positives),
# TF = rows correctly predicted as non-default (true negatives).
TT = test_result.where("label=1 and prediction=1").count()
TF = test_result.where("label=0 and prediction=0").count()
total = test_result.count()

# Guard against an empty test split instead of raising ZeroDivisionError.
accuracy = (TT + TF) / total if total else float("nan")
print(accuracy)

0.7977797779777978
