In [1]:
from pyspark.sql import SparkSession
from pyspark.ml.classification import LogisticRegression
from pyspark.ml.feature import RFormula, VectorAssembler, StringIndexer
from pyspark.sql.functions import round
from pyspark.ml.stat import Correlation
from pyspark.ml.evaluation import (BinaryClassificationEvaluator,
                                   MulticlassClassificationEvaluator)

In [2]:
# Start (or reuse) the Spark session for this notebook.
spark = (
    SparkSession.builder
    .appName("customer_churn_logistic_regression")
    .getOrCreate()
)

# Load the raw churn data: semicolon-separated CSV with a header row;
# column types are inferred by Spark rather than declared.
churn = (
    spark.read
    .format('csv')
    .options(header=True, inferSchema=True, sep=';')
    .load('../data/raw/churn.csv')
)
num_instances = churn.count()
print("Number of instances in Churn dataset: ", num_instances)
churn.show()

Number of instances in Churn dataset:  10000
+-----------+---------+------+---+------+--------+-------------+---------+--------------+---------------+------+
|CreditScore|Geography|Gender|Age|Tenure| Balance|NumOfProducts|HasCrCard|IsActiveMember|EstimatedSalary|Exited|
+-----------+---------+------+---+------+--------+-------------+---------+--------------+---------------+------+
|        619|   France|Female| 42|     2|       0|            1|        1|             1|       10134888|     1|
|        608|    Spain|Female| 41|     1| 8380786|            1|        0|             1|       11254258|     0|
|        502|   France|Female| 42|     8| 1596608|            3|        1|             0|       11393157|     1|
|        699|   France|Female| 39|     1|       0|            2|        0|             0|        9382663|     0|
|        850|    Spain|Female| 43|     2|12551082|            1|        1|             1|         790841|     0|
|        645|    Spain|  Male| 44|     8|11375578| 

In [4]:
# Replace the two string columns with numeric indices so that every column
# can be fed to the correlation computation below.
# NOTE: `churn` is reassigned here and the original string columns are
# dropped, so this cell cannot be re-run without reloading `churn` first.
indexer = StringIndexer(inputCols=['Geography', 'Gender'],
                        outputCols=['GeographyIndex', 'GenderIndex'])
indexer_model = indexer.fit(churn)
churn = indexer_model.transform(churn).drop('Geography', 'Gender')

# Pack all remaining columns into a single vector column, as required by
# Correlation.corr.
assembler = VectorAssembler(inputCols=churn.columns, outputCol='corr_features')
churn_assembled = assembler.transform(churn).select('corr_features')

# Correlation.corr returns a one-row, one-column DataFrame holding the
# matrix; pull it out as an array and rebuild a DataFrame labelled with the
# feature names so it can be displayed as a table.
corr_result = Correlation.corr(churn_assembled, 'corr_features')
corr_values = corr_result.collect()[0][0].toArray()
corr_matrix = spark.createDataFrame(corr_values.tolist(), churn.columns)

# `round` is pyspark.sql.functions.round (imported at the top), not the builtin.
rounded_columns = [round(name, 3).alias(name) for name in corr_matrix.columns]
corr_matrix.select(rounded_columns).show()

+-----------+------+------+-------+-------------+---------+--------------+---------------+------+--------------+-----------+
|CreditScore|   Age|Tenure|Balance|NumOfProducts|HasCrCard|IsActiveMember|EstimatedSalary|Exited|GeographyIndex|GenderIndex|
+-----------+------+------+-------+-------------+---------+--------------+---------------+------+--------------+-----------+
|        1.0|-0.004| 0.001|  0.007|        0.012|   -0.005|         0.026|         -0.001|-0.027|         0.008|      0.003|
|     -0.004|   1.0| -0.01|  0.022|       -0.031|   -0.012|         0.085|         -0.015| 0.285|         0.023|      0.028|
|      0.001| -0.01|   1.0| -0.017|        0.013|    0.023|        -0.028|          0.006|-0.014|         0.004|     -0.015|
|      0.007| 0.022|-0.017|    1.0|       -0.276|   -0.011|        -0.011|          0.006| 0.106|         0.063|     -0.007|
|      0.012|-0.031| 0.013| -0.276|          1.0|    0.003|          0.01|          0.014|-0.048|         0.004|      0.022|


In [5]:
# Build the modelling frame: RFormula turns `Exited` into the `label` column
# and assembles every other column of `churn` into the `features` vector.
# NOTE(review): if `churn` carries StringIndexer outputs (e.g. a multi-level
# GeographyIndex), RFormula treats them as continuous numeric features rather
# than categories -- confirm this ordinal encoding is intended.
r_formula = RFormula(formula="Exited ~ .")
churn_rf = r_formula.fit(churn).transform(churn)
churn_rf.select('features', 'label').show()

# 70/30 train/test split. The seed is fixed so that the split -- and with it
# every metric reported further down -- is reproducible across
# Restart & Run All instead of changing on each execution.
churn_train, churn_test = churn_rf.randomSplit([0.7, 0.3], seed=42)
print("Number of training instances: ", churn_train.count())
print("Number of testing instances: ", churn_test.count())

+--------------------+-----+
|            features|label|
+--------------------+-----+
|[619.0,42.0,2.0,0...|  1.0|
|[608.0,41.0,1.0,8...|  0.0|
|[502.0,42.0,8.0,1...|  1.0|
|[699.0,39.0,1.0,0...|  0.0|
|[850.0,43.0,2.0,1...|  0.0|
|[645.0,44.0,8.0,1...|  1.0|
|[822.0,50.0,7.0,0...|  0.0|
|[376.0,29.0,4.0,1...|  1.0|
|[501.0,44.0,4.0,1...|  0.0|
|[684.0,27.0,2.0,1...|  0.0|
|[528.0,31.0,6.0,1...|  0.0|
|[497.0,24.0,3.0,0...|  0.0|
|[476.0,34.0,10.0,...|  0.0|
|[549.0,25.0,5.0,0...|  0.0|
|[635.0,35.0,7.0,0...|  0.0|
|[616.0,45.0,3.0,1...|  0.0|
|[653.0,58.0,1.0,1...|  1.0|
|[549.0,24.0,9.0,0...|  0.0|
|[587.0,45.0,6.0,0...|  0.0|
|[726.0,24.0,6.0,0...|  0.0|
+--------------------+-----+
only showing top 20 rows

Number of training instances:  6981
Number of testing instances:  3019


In [6]:
# Fit a logistic regression with default hyperparameters on the training split.
logistic_regressor = LogisticRegression()
model = logistic_regressor.fit(churn_train)

# Report the metrics exposed by the fitted model's summary object.
summary = model.summary
print("Model evaluation on training set:")
training_metrics = (
    ("Accuracy: ", summary.accuracy),
    ("Weighted precision: ", summary.weightedPrecision),
    ("Weighted recall: ", summary.weightedRecall),
    ("Area under the ROC curve: ", summary.areaUnderROC),
)
for metric_label, metric_value in training_metrics:
    print(metric_label, metric_value)

Model evaluation on training set:
Accuracy:  0.8069044549491476
Weighted precision:  0.7746933865604648
Weighted recall:  0.8069044549491476
Area under the ROC curve:  0.7501193616290383


In [9]:
# Score the held-out split and show the true label next to the model outputs.
pred = model.transform(churn_test)
prediction_columns = ['label', 'prediction', 'probability', 'rawPrediction']
pred.select(*prediction_columns).show()

+-----+----------+--------------------+--------------------+
|label|prediction|         probability|       rawPrediction|
+-----+----------+--------------------+--------------------+
|  1.0|       0.0|[0.78900441963974...|[1.31893473530469...|
|  1.0|       0.0|[0.82634108400320...|[1.55991450040320...|
|  1.0|       1.0|[0.22891499036522...|[-1.2144479128928...|
|  1.0|       0.0|[0.65749725589955...|[0.65216092086410...|
|  1.0|       0.0|[0.76300381219924...|[1.16921897182437...|
|  1.0|       0.0|[0.55070471556826...|[0.20351843843974...|
|  0.0|       0.0|[0.70493513526288...|[0.87091057922564...|
|  0.0|       0.0|[0.90444045407400...|[2.24756689896306...|
|  0.0|       0.0|[0.87305188911869...|[1.92821656454982...|
|  0.0|       0.0|[0.71198522738733...|[0.90504539053078...|
|  0.0|       0.0|[0.83323468611866...|[1.60872782057528...|
|  0.0|       0.0|[0.90932823846959...|[2.30546015802435...|
|  1.0|       1.0|[0.31540045100419...|[-0.7749909628394...|
|  1.0|       0.0|[0.949

In [10]:
# One evaluator per metric family; the metric name is switched in place
# before each evaluation of the test-set predictions.
binary_evaluator = BinaryClassificationEvaluator(metricName='areaUnderROC')
multiclass_evaluator = MulticlassClassificationEvaluator(metricName='accuracy')

# (evaluator, metric name, printed caption), in reporting order.
test_metrics = (
    (binary_evaluator, 'areaUnderROC', "Area under the ROC curve:"),
    (binary_evaluator, 'areaUnderPR', "Area under the PR curve:"),
    (multiclass_evaluator, 'accuracy', "Accuracy:"),
    (multiclass_evaluator, 'weightedPrecision', "Weighted precision:"),
    (multiclass_evaluator, 'weightedRecall', "Weighted recall:"),
    (multiclass_evaluator, 'f1', "F1-score:"),
)

print("Model evaluation on testing set:")
for evaluator, metric_name, caption in test_metrics:
    evaluator.setMetricName(metric_name)
    print(caption, evaluator.evaluate(pred))

Model evaluation on testing set:
Area under the ROC curve: 0.7639004149377578
Area under the PR curve: 0.4416876652179284
Accuracy: 0.8095395826432593
Weighted precision: 0.7769765244649556
Weighted recall: 0.8095395826432594
F1-score: 0.7691634372471671
