## Predicting custumer churn using logistic regression

In [1]:
import findspark

findspark.init('/home/guipleite/spark-3.0.2-bin-hadoop3.2')

from pyspark.sql import SparkSession
from pyspark.ml.feature import VectorAssembler
from pyspark.ml import Pipeline
from pyspark.ml.classification import LogisticRegression
from pyspark.ml.evaluation import BinaryClassificationEvaluator

spark = SparkSession.builder.appName('LogReg_exe').getOrCreate()

df = spark.read.csv('customer_churn.csv', inferSchema=True, header=True)

df.show(5)

+----------------+----+--------------+---------------+-----+---------+-------------------+--------------------+--------------------+-----+
|           Names| Age|Total_Purchase|Account_Manager|Years|Num_Sites|       Onboard_date|            Location|             Company|Churn|
+----------------+----+--------------+---------------+-----+---------+-------------------+--------------------+--------------------+-----+
|Cameron Williams|42.0|       11066.8|              0| 7.22|      8.0|2013-08-30 07:00:40|10265 Elizabeth M...|          Harvey LLC|    1|
|   Kevin Mueller|41.0|      11916.22|              0|  6.5|     11.0|2013-08-13 00:38:46|6157 Frank Garden...|          Wilson PLC|    1|
|     Eric Lozano|38.0|      12884.75|              0| 6.67|     12.0|2016-06-29 06:20:07|1331 Keith Court ...|Miller, Johnson a...|    1|
|   Phillip White|42.0|       8010.76|              0| 6.71|     10.0|2014-04-22 12:43:12|13120 Daniel Moun...|           Smith Inc|    1|
|  Cynthia Norton|37.0|    

In [2]:
df.printSchema()

root
 |-- Names: string (nullable = true)
 |-- Age: double (nullable = true)
 |-- Total_Purchase: double (nullable = true)
 |-- Account_Manager: integer (nullable = true)
 |-- Years: double (nullable = true)
 |-- Num_Sites: double (nullable = true)
 |-- Onboard_date: string (nullable = true)
 |-- Location: string (nullable = true)
 |-- Company: string (nullable = true)
 |-- Churn: integer (nullable = true)



In [3]:
# Selecting only the relevant columns
rel_cols = df.select(['Age',
                      'Total_Purchase',
                      'Account_Manager',
                      'Years',
                      'Num_Sites',
                      'Churn'])

clean_data = rel_cols.na.drop() # Removing rows with missing data

In [4]:
# Assembling features
assembler = VectorAssembler(inputCols=['Age',
                                       'Total_Purchase',
                                       'Account_Manager',
                                       'Years',
                                       'Num_Sites'],
                            outputCol='features'
                            ) 

log_reg = LogisticRegression(featuresCol='features', labelCol='Churn')
# output = assembler.transform(clean_data)

In [5]:
pipeline = Pipeline(stages=[assembler,log_reg])  # Creating pipeline

In [6]:
train_data, test_data = clean_data.randomSplit([0.8,0.2]) # Splitting data into training and testing datasets

In [7]:
fit_model = pipeline.fit(train_data)

results = fit_model.transform(test_data)

eval = BinaryClassificationEvaluator(rawPredictionCol='prediction', labelCol='Churn')

In [8]:
eval.evaluate(results)

0.7467008797653959