In [56]:
from pyspark.sql import SparkSession
from pyspark.ml.classification import LogisticRegression
from pyspark.ml.feature import VectorAssembler
from pyspark.ml.linalg import Vectors
from pyspark.ml.evaluation import BinaryClassificationEvaluator

In [57]:
spark = SparkSession.builder.appName('logicalRegressionProj').getOrCreate()

In [58]:
data = spark.read.csv('customer_churn.csv',header=True,inferSchema=True)

In [59]:
data.show()
data.printSchema()
data.columns

+-------------------+----+--------------+---------------+-----+---------+-------------------+--------------------+--------------------+-----+
|              Names| Age|Total_Purchase|Account_Manager|Years|Num_Sites|       Onboard_date|            Location|             Company|Churn|
+-------------------+----+--------------+---------------+-----+---------+-------------------+--------------------+--------------------+-----+
|   Cameron Williams|42.0|       11066.8|              0| 7.22|      8.0|2013-08-30 07:00:40|10265 Elizabeth M...|          Harvey LLC|    1|
|      Kevin Mueller|41.0|      11916.22|              0|  6.5|     11.0|2013-08-13 00:38:46|6157 Frank Garden...|          Wilson PLC|    1|
|        Eric Lozano|38.0|      12884.75|              0| 6.67|     12.0|2016-06-29 06:20:07|1331 Keith Court ...|Miller, Johnson a...|    1|
|      Phillip White|42.0|       8010.76|              0| 6.71|     10.0|2014-04-22 12:43:12|13120 Daniel Moun...|           Smith Inc|    1|
|     

['Names',
 'Age',
 'Total_Purchase',
 'Account_Manager',
 'Years',
 'Num_Sites',
 'Onboard_date',
 'Location',
 'Company',
 'Churn']

In [60]:
assembler = VectorAssembler(inputCols=['Age',
 'Total_Purchase',
 'Account_Manager',
 'Years',
 'Num_Sites'],
outputCol= 'features')

In [61]:
assembled_data = assembler.transform(data)

In [62]:
final_assembled_data = assembled_data.select('features','churn')

In [63]:
train,test = final_assembled_data.randomSplit([0.7,0.3])

In [64]:
lr = LogisticRegression(labelCol='churn')

In [65]:
model = lr.fit(train)

In [66]:
tested = model.evaluate(test)

In [67]:
model.summary.predictions.describe().show()

+-------+-------------------+-------------------+
|summary|              churn|         prediction|
+-------+-------------------+-------------------+
|  count|                608|                608|
|   mean|            0.15625|0.11019736842105263|
| stddev|0.36339115309576564| 0.3133934379948572|
|    min|                0.0|                0.0|
|    max|                1.0|                1.0|
+-------+-------------------+-------------------+



In [68]:
tested.predictions.show()

+--------------------+-----+--------------------+--------------------+----------+
|            features|churn|       rawPrediction|         probability|prediction|
+--------------------+-----+--------------------+--------------------+----------+
|[28.0,8670.98,0.0...|    0|[7.64127242226384...|[0.99952001347339...|       0.0|
|[28.0,11128.95,1....|    0|[4.19580251695236...|[0.98516474614924...|       0.0|
|[28.0,11245.38,0....|    0|[3.71007959915461...|[0.97610916691692...|       0.0|
|[29.0,5900.78,1.0...|    0|[4.25793020329649...|[0.98604590921807...|       0.0|
|[29.0,13255.05,1....|    0|[4.11691143812059...|[0.98396649794533...|       0.0|
|[30.0,8403.78,1.0...|    0|[5.95992737641325...|[0.99742654044002...|       0.0|
|[30.0,8677.28,1.0...|    0|[4.40793815726403...|[0.98796630732985...|       0.0|
|[30.0,8874.83,0.0...|    0|[3.08276252589166...|[0.95617608975238...|       0.0|
|[30.0,10744.14,1....|    1|[1.89872770622214...|[0.86974745931765...|       0.0|
|[31.0,5304.6,0.

In [69]:
evaluator = BinaryClassificationEvaluator(rawPredictionCol='prediction',labelCol='churn')

In [70]:
bineval = evaluator.evaluate(tested.predictions)

In [71]:
#area under curve
bineval

0.7677023398542386

In [72]:
model_assembled_data = lr.fit(final_assembled_data)

In [73]:
new_customers = spark.read.csv('new_customers.csv',header=True,inferSchema=True)

In [74]:
new_data = assembler.transform(new_customers)

In [78]:
final_new_data = model_assembled_data.transform(new_data)

In [82]:
final_new_data.select('Company','prediction').show()

+----------------+----------+
|         Company|prediction|
+----------------+----------+
|        King Ltd|       0.0|
|   Cannon-Benson|       1.0|
|Barron-Robertson|       1.0|
|   Sexton-Golden|       1.0|
|        Wood LLC|       0.0|
|   Parks-Robbins|       1.0|
+----------------+----------+

