In [1]:
import findspark
findspark.init('/home/ubuntu/spark-2.4.4-bin-hadoop2.7/')

In [2]:
from pyspark.sql import SparkSession
spark = SparkSession.builder.appName('customer').getOrCreate()

In [3]:
df = spark.read.csv('data/customer_churn.csv', inferSchema=True, header=True)
df.printSchema()

root
 |-- Names: string (nullable = true)
 |-- Age: double (nullable = true)
 |-- Total_Purchase: double (nullable = true)
 |-- Account_Manager: integer (nullable = true)
 |-- Years: double (nullable = true)
 |-- Num_Sites: double (nullable = true)
 |-- Onboard_date: timestamp (nullable = true)
 |-- Location: string (nullable = true)
 |-- Company: string (nullable = true)
 |-- Churn: integer (nullable = true)



In [4]:
df.show()

+-------------------+----+--------------+---------------+-----+---------+-------------------+--------------------+--------------------+-----+
|              Names| Age|Total_Purchase|Account_Manager|Years|Num_Sites|       Onboard_date|            Location|             Company|Churn|
+-------------------+----+--------------+---------------+-----+---------+-------------------+--------------------+--------------------+-----+
|   Cameron Williams|42.0|       11066.8|              0| 7.22|      8.0|2013-08-30 07:00:40|10265 Elizabeth M...|          Harvey LLC|    1|
|      Kevin Mueller|41.0|      11916.22|              0|  6.5|     11.0|2013-08-13 00:38:46|6157 Frank Garden...|          Wilson PLC|    1|
|        Eric Lozano|38.0|      12884.75|              0| 6.67|     12.0|2016-06-29 06:20:07|1331 Keith Court ...|Miller, Johnson a...|    1|
|      Phillip White|42.0|       8010.76|              0| 6.71|     10.0|2014-04-22 12:43:12|13120 Daniel Moun...|           Smith Inc|    1|
|     

In [8]:
df = df.select(['Age', 'Total_Purchase', 'Account_Manager', 'Years', 'Num_Sites', 'Churn'])

In [10]:
df.describe().show()

+-------+-----------------+-----------------+------------------+-----------------+------------------+-------------------+
|summary|              Age|   Total_Purchase|   Account_Manager|            Years|         Num_Sites|              Churn|
+-------+-----------------+-----------------+------------------+-----------------+------------------+-------------------+
|  count|              900|              900|               900|              900|               900|                900|
|   mean|41.81666666666667|10062.82403333334|0.4811111111111111| 5.27315555555555| 8.587777777777777|0.16666666666666666|
| stddev|6.127560416916251|2408.644531858096|0.4999208935073339|1.274449013194616|1.7648355920350969| 0.3728852122772358|
|    min|             22.0|            100.0|                 0|              1.0|               3.0|                  0|
|    max|             65.0|         18026.01|                 1|             9.15|              14.0|                  1|
+-------+---------------

In [11]:
from pyspark.ml.feature import VectorAssembler
from pyspark.ml.classification import LogisticRegression

In [12]:
assembler = VectorAssembler(inputCols=['Age', 'Total_Purchase', 'Account_Manager', 'Years', 'Num_Sites'],
                           outputCol='features')

In [13]:
data = assembler.transform(df)

In [15]:
data = data.select(['features', 'Churn'])

In [16]:
train_data, test_data = data.randomSplit([0.7, 0.3])

In [17]:
lr = LogisticRegression(featuresCol='features', labelCol='Churn')
lr_model = lr.fit(train_data)

In [18]:
results = lr_model.transform(test_data)

In [29]:
preds = lr_model.evaluate(test_data)

In [32]:
preds.predictions.show()

+--------------------+-----+--------------------+--------------------+----------+
|            features|Churn|       rawPrediction|         probability|prediction|
+--------------------+-----+--------------------+--------------------+----------+
|[26.0,8939.61,0.0...|    0|[6.27708782428524...|[0.99812465929295...|       0.0|
|[27.0,8628.8,1.0,...|    0|[5.3717061548787,...|[0.99537528990447...|       0.0|
|[28.0,8670.98,0.0...|    0|[7.85084189323922...|[0.99961072761520...|       0.0|
|[28.0,9090.43,1.0...|    0|[1.07930441607579...|[0.74636232796972...|       0.0|
|[28.0,11128.95,1....|    0|[4.07853425297856...|[0.98334966205808...|       0.0|
|[28.0,11204.23,0....|    0|[1.33110052950387...|[0.79102261700375...|       0.0|
|[28.0,11245.38,0....|    0|[3.62714485376138...|[0.97409681726566...|       0.0|
|[29.0,12711.15,0....|    0|[5.46209381862171...|[0.99577328208771...|       0.0|
|[29.0,13255.05,1....|    0|[4.15198465486767...|[0.98451053761381...|       0.0|
|[30.0,6744.87,0

In [33]:
from pyspark.ml.evaluation import BinaryClassificationEvaluator
evaluator = BinaryClassificationEvaluator(rawPredictionCol='prediction', labelCol='Churn')
auc = evaluator.evaluate(preds.predictions)

In [34]:
auc

0.7534414945919371