# Big Data Machine Learning Classification with Spark

------------Big Data Churn prediction with Spark--------------

I will use pyspark machine learning package to predict customers' behavior.

In [3]:
#pip install pyspark

In [35]:
from pyspark.sql import SparkSession
from pyspark.ml.feature import VectorAssembler
from pyspark.ml.classification import GBTClassifier
from pyspark.ml.evaluation import BinaryClassificationEvaluator

In [36]:
spark = SparkSession.builder.appName("ChurnGBTClassifier").getOrCreate()

In [37]:
data = spark.read.csv("churn.csv",inferSchema=True,header=True)

In [7]:
data.printSchema()

root
 |-- _c0: integer (nullable = true)
 |-- Names: string (nullable = true)
 |-- Age: double (nullable = true)
 |-- Total_Purchase: double (nullable = true)
 |-- Account_Manager: integer (nullable = true)
 |-- Years: double (nullable = true)
 |-- Num_Sites: double (nullable = true)
 |-- Churn: integer (nullable = true)



In [8]:
data.show()

+---+-------------------+----+--------------+---------------+-----+---------+-----+
|_c0|              Names| Age|Total_Purchase|Account_Manager|Years|Num_Sites|Churn|
+---+-------------------+----+--------------+---------------+-----+---------+-----+
|  0|   Cameron Williams|42.0|       11066.8|              0| 7.22|      8.0|    1|
|  1|      Kevin Mueller|41.0|      11916.22|              0|  6.5|     11.0|    1|
|  2|        Eric Lozano|38.0|      12884.75|              0| 6.67|     12.0|    1|
|  3|      Phillip White|42.0|       8010.76|              0| 6.71|     10.0|    1|
|  4|     Cynthia Norton|37.0|       9191.58|              0| 5.56|      9.0|    1|
|  5|   Jessica Williams|48.0|      10356.02|              0| 5.12|      8.0|    1|
|  6|        Eric Butler|44.0|      11331.58|              1| 5.23|     11.0|    1|
|  7|      Zachary Walsh|32.0|       9885.12|              1| 6.92|      9.0|    1|
|  8|        Ashlee Carr|43.0|       14062.6|              1| 5.46|     11.0

In [11]:
data.columns

['_c0',
 'Names',
 'Age',
 'Total_Purchase',
 'Account_Manager',
 'Years',
 'Num_Sites',
 'Churn']

In [13]:
data.count()

900

In [14]:
data.groupBy("Churn").count().show()

+-----+-----+
|Churn|count|
+-----+-----+
|    1|  150|
|    0|  750|
+-----+-----+



In [15]:
data.describe().show()

+-------+------------------+-------------+-----------------+-----------------+------------------+-----------------+------------------+-------------------+
|summary|               _c0|        Names|              Age|   Total_Purchase|   Account_Manager|            Years|         Num_Sites|              Churn|
+-------+------------------+-------------+-----------------+-----------------+------------------+-----------------+------------------+-------------------+
|  count|               900|          900|              900|              900|               900|              900|               900|                900|
|   mean|             449.5|         NULL|41.81666666666667|10062.82403333334|0.4811111111111111| 5.27315555555555| 8.587777777777777|0.16666666666666666|
| stddev|259.95191863111916|         NULL|6.127560416916251|2408.644531858096|0.4999208935073339|1.274449013194616|1.7648355920350969| 0.3728852122772358|
|    min|                 0|   Aaron King|             22.0|          

In [22]:
data.groupby("Age").agg({"Churn": "count"}).show()

+----+------------+
| Age|count(Churn)|
+----+------------+
|49.0|          30|
|29.0|           9|
|47.0|          29|
|42.0|          49|
|44.0|          53|
|35.0|          32|
|39.0|          48|
|37.0|          48|
|34.0|          25|
|25.0|           1|
|36.0|          39|
|41.0|          69|
|56.0|           5|
|50.0|          15|
|45.0|          56|
|31.0|          11|
|58.0|           2|
|51.0|          21|
|48.0|          36|
|22.0|           1|
+----+------------+
only showing top 20 rows



In [23]:
data.groupby("Account_Manager").agg({"Churn": "count"}).show()

+---------------+------------+
|Account_Manager|count(Churn)|
+---------------+------------+
|              1|         433|
|              0|         467|
+---------------+------------+



In [24]:
data.groupby("Total_Purchase").agg({"Churn": "count"}).show()

+--------------+------------+
|Total_Purchase|count(Churn)|
+--------------+------------+
|      10522.21|           1|
|      10806.13|           1|
|      10697.72|           1|
|      11580.56|           1|
|       11334.3|           1|
|      11743.24|           1|
|      10617.16|           1|
|      10338.09|           1|
|       10491.4|           1|
|       7686.13|           1|
|      10183.98|           1|
|       9261.41|           1|
|      11768.71|           1|
|       9617.59|           1|
|      11119.11|           1|
|      13532.85|           1|
|       8011.38|           1|
|       9324.49|           1|
|      10746.37|           1|
|      11222.48|           1|
+--------------+------------+
only showing top 20 rows



Modelling

In [41]:
data = data.drop("Names")

In [42]:

feature_columns = data.columns[:-1]
assembler = VectorAssembler(inputCols=feature_columns, outputCol="features")
data = assembler.transform(data).select("features", "Churn")

In [43]:
train_data, test_data = data.randomSplit([0.7, 0.3], seed=42)

In [44]:
gbt = GBTClassifier(labelCol="Churn", featuresCol="features")
model = gbt.fit(train_data)

In [45]:
predictions = model.transform(test_data)

In [46]:
evaluator = BinaryClassificationEvaluator(labelCol="Churn")
accuracy = evaluator.evaluate(predictions)
print("Accuracy:", accuracy)

Accuracy: 0.9973262032085561


The accuracy of 0.9973262032085561 indicates that the model correctly predicted the churn status of 99.73% of the data points in the test set. This is a very high level of accuracy, suggesting that the model is very good at predicting churn.