In [1]:
from pyspark.sql import SparkSession
from pyspark.ml.classification import LogisticRegression
from pyspark.ml.evaluation import BinaryClassificationEvaluator, MulticlassClassificationEvaluator
from pyspark.sql import functions as f
from pyspark.ml.feature import (VectorAssembler, VectorIndexer, 
                                    OneHotEncoder, StringIndexer)
from pyspark.ml import Pipeline

import warnings
warnings.filterwarnings('ignore')

In [2]:
spark = SparkSession.builder.appName('customer').getOrCreate()

df = spark.read.csv('customer_churn.csv', inferSchema=True, header=True)
df.printSchema()

22/02/16 19:05:40 WARN Utils: Your hostname, ganesh-pi resolves to a loopback address: 127.0.1.1; using 192.168.1.119 instead (on interface eth0)
22/02/16 19:05:40 WARN Utils: Set SPARK_LOCAL_IP if you need to bind to another address
Using Spark's default log4j profile: org/apache/spark/log4j-defaults.properties
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
22/02/16 19:05:42 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
[Stage 1:>                                                          (0 + 1) / 1]

root
 |-- Names: string (nullable = true)
 |-- Age: double (nullable = true)
 |-- Total_Purchase: double (nullable = true)
 |-- Account_Manager: integer (nullable = true)
 |-- Years: double (nullable = true)
 |-- Num_Sites: double (nullable = true)
 |-- Onboard_date: string (nullable = true)
 |-- Location: string (nullable = true)
 |-- Company: string (nullable = true)
 |-- Churn: integer (nullable = true)



                                                                                

In [18]:
str_cols= [item[0] for item in df.dtypes if item[1].startswith('string')]
df.select(str_cols).show()

+-------------------+-------------------+--------------------+--------------------+
|              Names|       Onboard_date|            Location|             Company|
+-------------------+-------------------+--------------------+--------------------+
|   Cameron Williams|2013-08-30 07:00:40|10265 Elizabeth M...|          Harvey LLC|
|      Kevin Mueller|2013-08-13 00:38:46|6157 Frank Garden...|          Wilson PLC|
|        Eric Lozano|2016-06-29 06:20:07|1331 Keith Court ...|Miller, Johnson a...|
|      Phillip White|2014-04-22 12:43:12|13120 Daniel Moun...|           Smith Inc|
|     Cynthia Norton|2016-01-19 15:31:15|765 Tricia Row Ka...|          Love-Jones|
|   Jessica Williams|2009-03-03 23:13:37|6187 Olson Mounta...|        Kelly-Warren|
|        Eric Butler|2016-12-05 03:35:43|4846 Savannah Roa...|   Reynolds-Sheppard|
|      Zachary Walsh|2006-03-09 14:50:20|25271 Roy Express...|          Singh-Cole|
|        Ashlee Carr|2011-09-29 05:47:23|3725 Caroline Str...|           Lop

In [11]:
df.select(['Age', 'Total_Purchase','Years' ,'Account_Manager'
            ,'Num_Sites', 'Churn']).show()

+----+--------------+-----+---------------+---------+-----+
| Age|Total_Purchase|Years|Account_Manager|Num_Sites|Churn|
+----+--------------+-----+---------------+---------+-----+
|42.0|       11066.8| 7.22|              0|      8.0|    1|
|41.0|      11916.22|  6.5|              0|     11.0|    1|
|38.0|      12884.75| 6.67|              0|     12.0|    1|
|42.0|       8010.76| 6.71|              0|     10.0|    1|
|37.0|       9191.58| 5.56|              0|      9.0|    1|
|48.0|      10356.02| 5.12|              0|      8.0|    1|
|44.0|      11331.58| 5.23|              1|     11.0|    1|
|32.0|       9885.12| 6.92|              1|      9.0|    1|
|43.0|       14062.6| 5.46|              1|     11.0|    1|
|40.0|       8066.94| 7.11|              1|     11.0|    1|
|30.0|      11575.37| 5.22|              1|      8.0|    1|
|45.0|       8771.02| 6.64|              1|     11.0|    1|
|45.0|       8988.67| 4.84|              1|     11.0|    1|
|40.0|       8283.32|  5.1|             

In [12]:
selected_data_df = df.select(['Age', 'Total_Purchase','Years' ,'Account_Manager'
            ,'Num_Sites', 'Churn'])

#Checking for nulls in the df
selected_data_df.select([f.count(f.when(f.col(c).isNull(), c)).alias(c) for c in selected_data_df.columns]).show()

+---+--------------+-----+---------------+---------+-----+
|Age|Total_Purchase|Years|Account_Manager|Num_Sites|Churn|
+---+--------------+-----+---------------+---------+-----+
|  0|             0|    0|              0|        0|    0|
+---+--------------+-----+---------------+---------+-----+



In [19]:
selected_data_df.drop('Churn').columns

['Age', 'Total_Purchase', 'Years', 'Account_Manager', 'Num_Sites']

In [26]:
#Creating a vector assembler for the features
assembler = VectorAssembler(inputCols=selected_data_df.drop('Churn').columns, outputCol='features')

vectorised_df=assembler.transform(selected_data_df)

#Splitting the data into train and test
train_data, test_data = vectorised_df.randomSplit([0.7, 0.3])

#defining the l0g_reg model
lr_model = LogisticRegression(featuresCol='features', labelCol='Churn')

fitted_lr_model = lr_model.fit(train_data)

                                                                                

In [27]:
#Evaluating the model 
results = fitted_lr_model.evaluate(test_data)
results.predictions.show()

+----+--------------+-----+---------------+---------+-----+--------------------+--------------------+--------------------+----------+
| Age|Total_Purchase|Years|Account_Manager|Num_Sites|Churn|            features|       rawPrediction|         probability|prediction|
+----+--------------+-----+---------------+---------+-----+--------------------+--------------------+--------------------+----------+
|22.0|      11254.38| 4.96|              1|      8.0|    0|[22.0,11254.38,4....|[4.34575533387472...|[0.98720414212423...|       0.0|
|25.0|       9672.03| 5.49|              0|      8.0|    0|[25.0,9672.03,5.4...|[4.54459241754310...|[0.98948719116593...|       0.0|
|28.0|       8670.98| 3.99|              0|      6.0|    0|[28.0,8670.98,3.9...|[7.47240060770255...|[0.99943176131525...|       0.0|
|29.0|       5900.78| 5.56|              1|      8.0|    0|[29.0,5900.78,5.5...|[3.79099750178100...|[0.97792522166769...|       0.0|
|29.0|      12711.15| 5.74|              0|      7.0|    0|[29

In [31]:
results.predictions.printSchema()

root
 |-- Age: double (nullable = true)
 |-- Total_Purchase: double (nullable = true)
 |-- Years: double (nullable = true)
 |-- Account_Manager: integer (nullable = true)
 |-- Num_Sites: double (nullable = true)
 |-- Churn: integer (nullable = true)
 |-- features: vector (nullable = true)
 |-- rawPrediction: vector (nullable = true)
 |-- probability: vector (nullable = true)
 |-- prediction: double (nullable = false)



In [32]:
evaluator = BinaryClassificationEvaluator(rawPredictionCol='prediction', labelCol='Churn', metricName='areaUnderPR')
evaluator.evaluate(results.predictions)

                                                                                

0.6609279609279609

Predicting on new customers

In [35]:
pipeline = Pipeline(stages = [assembler, lr_model])

In [34]:
new_cust = spark.read.csv('new_customers.csv', inferSchema=True, header=True)
new_cust.printSchema()

root
 |-- Names: string (nullable = true)
 |-- Age: double (nullable = true)
 |-- Total_Purchase: double (nullable = true)
 |-- Account_Manager: integer (nullable = true)
 |-- Years: double (nullable = true)
 |-- Num_Sites: double (nullable = true)
 |-- Onboard_date: string (nullable = true)
 |-- Location: string (nullable = true)
 |-- Company: string (nullable = true)



In [37]:
fitted_new_cust = assembler.transform(new_cust)
fitted_new_cust.show()

+--------------+----+--------------+---------------+-----+---------+-------------------+--------------------+----------------+--------------------+
|         Names| Age|Total_Purchase|Account_Manager|Years|Num_Sites|       Onboard_date|            Location|         Company|            features|
+--------------+----+--------------+---------------+-----+---------+-------------------+--------------------+----------------+--------------------+
| Andrew Mccall|37.0|       9935.53|              1| 7.71|      8.0|2011-08-29 18:37:54|38612 Johnny Stra...|        King Ltd|[37.0,9935.53,7.7...|
|Michele Wright|23.0|       7526.94|              1| 9.28|     15.0|2013-07-22 18:19:54|21083 Nicole Junc...|   Cannon-Benson|[23.0,7526.94,9.2...|
|  Jeremy Chang|65.0|         100.0|              1|  1.0|     15.0|2006-12-11 07:48:13|085 Austin Views ...|Barron-Robertson|[65.0,100.0,1.0,1...|
|Megan Ferguson|32.0|        6487.5|              0|  9.4|     14.0|2016-10-28 05:32:13|922 Wright Branch...|   

In [40]:
fitted_lr_new_cust = fitted_lr_model.transform(fitted_new_cust)
fitted_lr_new_cust.show()

+--------------+----+--------------+---------------+-----+---------+-------------------+--------------------+----------------+--------------------+--------------------+--------------------+----------+
|         Names| Age|Total_Purchase|Account_Manager|Years|Num_Sites|       Onboard_date|            Location|         Company|            features|       rawPrediction|         probability|prediction|
+--------------+----+--------------+---------------+-----+---------+-------------------+--------------------+----------------+--------------------+--------------------+--------------------+----------+
| Andrew Mccall|37.0|       9935.53|              1| 7.71|      8.0|2011-08-29 18:37:54|38612 Johnny Stra...|        King Ltd|[37.0,9935.53,7.7...|[2.15178872685352...|[0.89583580838231...|       0.0|
|Michele Wright|23.0|       7526.94|              1| 9.28|     15.0|2013-07-22 18:19:54|21083 Nicole Junc...|   Cannon-Benson|[23.0,7526.94,9.2...|[-5.8604247902934...|[0.00284193315665...|       

In [41]:
fitted_lr_new_cust.select('Company','prediction').show()

+----------------+----------+
|         Company|prediction|
+----------------+----------+
|        King Ltd|       0.0|
|   Cannon-Benson|       1.0|
|Barron-Robertson|       1.0|
|   Sexton-Golden|       1.0|
|        Wood LLC|       0.0|
|   Parks-Robbins|       1.0|
+----------------+----------+



In [42]:
#Thanks for reading