In [1]:
from pyspark.sql import SparkSession

In [2]:
spark = SparkSession.builder.appName('logregconsult').getOrCreate()

Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).


23/02/23 10:45:35 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


In [3]:
data = spark.read.csv('data/customer_churn.csv', inferSchema=True, header=True)
print(data.count(), len(data.columns))
data.printSchema()

3150 16
root
 |-- Call  Failure: integer (nullable = true)
 |-- Complains: integer (nullable = true)
 |-- Subscription  Length: integer (nullable = true)
 |-- Charge  Amount: integer (nullable = true)
 |-- Seconds of Use: integer (nullable = true)
 |-- Frequency of use: integer (nullable = true)
 |-- Frequency of SMS: integer (nullable = true)
 |-- Distinct Called Numbers: integer (nullable = true)
 |-- Age Group: integer (nullable = true)
 |-- Tariff Plan: integer (nullable = true)
 |-- Status: integer (nullable = true)
 |-- Age: integer (nullable = true)
 |-- Customer Value: double (nullable = true)
 |-- FN: double (nullable = true)
 |-- FP: double (nullable = true)
 |-- Churn: integer (nullable = true)



In [4]:
data.show(5)

+-------------+---------+--------------------+--------------+--------------+----------------+----------------+-----------------------+---------+-----------+------+---+--------------+--------+-------+-----+
|Call  Failure|Complains|Subscription  Length|Charge  Amount|Seconds of Use|Frequency of use|Frequency of SMS|Distinct Called Numbers|Age Group|Tariff Plan|Status|Age|Customer Value|      FN|     FP|Churn|
+-------------+---------+--------------------+--------------+--------------+----------------+----------------+-----------------------+---------+-----------+------+---+--------------+--------+-------+-----+
|            8|        0|                  38|             0|          4370|              71|               5|                     17|        3|          1|     1| 30|        197.64| 177.876| 69.764|    0|
|            0|        0|                  39|             0|           318|               5|               7|                      4|        2|          1|     2| 25|        4

In [5]:
data.columns

['Call  Failure',
 'Complains',
 'Subscription  Length',
 'Charge  Amount',
 'Seconds of Use',
 'Frequency of use',
 'Frequency of SMS',
 'Distinct Called Numbers',
 'Age Group',
 'Tariff Plan',
 'Status',
 'Age',
 'Customer Value',
 'FN',
 'FP',
 'Churn']

In [6]:
from pyspark.ml.feature import VectorAssembler

In [12]:
assembler = VectorAssembler(inputCols=[
    'Call  Failure',
    'Complains',
    'Subscription  Length',
    'Charge  Amount',
    'Seconds of Use',
    'Frequency of use',
    'Frequency of SMS',
    'Distinct Called Numbers',
    'Age Group',
    'Tariff Plan',
    'Status',
    'Age',
    'Customer Value'], 
    outputCol='features'
)
output = assembler.transform(data)
final_data = output.select('features', 'churn')
train_churn, test_churn = final_data.randomSplit([0.7,0.3])
print(train_churn.count())
print(test_churn.count())

2209
941


In [13]:
from pyspark.ml.classification import LogisticRegression

In [14]:
lr_churn = LogisticRegression(labelCol='churn')

In [15]:
fitted_churn_model = lr_churn.fit(train_churn)

23/02/23 10:48:47 WARN InstanceBuilder$NativeBLAS: Failed to load implementation from:dev.ludovic.netlib.blas.JNIBLAS
23/02/23 10:48:47 WARN InstanceBuilder$NativeBLAS: Failed to load implementation from:dev.ludovic.netlib.blas.ForeignLinkerBLAS


In [17]:
training_sum = fitted_churn_model.summary
# training_sum.predictions.describe().show()

+-------+-------------------+-------------------+
|summary|              churn|         prediction|
+-------+-------------------+-------------------+
|  count|               2209|               2209|
|   mean|0.16070620190131282|0.10140334993209597|
| stddev| 0.3673428989388941|0.30193042078001403|
|    min|                0.0|                0.0|
|    max|                1.0|                1.0|
+-------+-------------------+-------------------+



In [18]:
from pyspark.ml.evaluation import BinaryClassificationEvaluator

In [22]:
pred_and_labels = fitted_churn_model.evaluate(test_churn)
pred_and_labels.predictions.show(10)

+--------------------+-----+--------------------+--------------------+----------+
|            features|churn|       rawPrediction|         probability|prediction|
+--------------------+-----+--------------------+--------------------+----------+
|(13,[1,2,8,9,10,1...|    1|[-3.1664550373954...|[0.04044777772265...|       1.0|
|(13,[1,2,8,9,10,1...|    1|[-3.3161101962061...|[0.03502263101725...|       1.0|
|(13,[1,2,8,9,10,1...|    1|[-4.1025803261229...|[0.01626117098832...|       1.0|
|(13,[1,2,8,9,10,1...|    1|[-4.1823593271527...|[0.01503301519606...|       1.0|
|(13,[2,3,8,9,10,1...|    0|[1.67478694091800...|[0.84221300156823...|       0.0|
|(13,[2,6,8,9,10,1...|    0|[-0.2625472697719...|[0.43473763628436...|       1.0|
|(13,[2,6,8,9,10,1...|    0|[-0.1198847830565...|[0.47006464912345...|       1.0|
|(13,[2,6,8,9,10,1...|    1|[0.64803950276190...|[0.65656853320884...|       0.0|
|(13,[2,6,8,9,10,1...|    0|[0.52582042736535...|[0.62850776658490...|       0.0|
|(13,[2,8,9,10,1

In [None]:
eval = BinaryClassificationEvaluator(rawPredictionCol='prediction',labelCol='churn')
auc = eval.evaluate(pred_and_labels.predictions)
print(auc)

0.7224317817014446


### predict on new data

In [None]:
final_lr_model = lr_churn.fit(final_data)