In [0]:
from pyspark.sql.functions import col
from pyspark.ml.feature import Imputer

spark_dr = spark.read.csv("dbfs:/FileStore/tables/data_merged_A2.csv",header=True)

spark_dr = spark_dr.drop("customerID")

spark_dr = spark_dr.withColumn("tenure", col("tenure").cast("double")) \
       .withColumn("MonthlyCharges", col("MonthlyCharges").cast("double")) \
       .withColumn("TotalCharges", col("TotalCharges").cast("double"))
       
imputer = Imputer(inputCols=["TotalCharges"], outputCols=["TotalCharges"], strategy="mean")  # Or "median"

spark_dr = imputer.fit(spark_dr).transform(spark_dr)



In [0]:
trainDF, testDF = spark_dr.randomSplit([0.8, 0.2], seed=42)
print(trainDF.cache().count()) # Cache because accessing training data multiple times
print(testDF.count())

5698
1345


In [0]:
from pyspark.ml.feature import StringIndexer, OneHotEncoder

categoricalCols = [
    "gender",
    "SeniorCitizen",
    "Partner",
    "Dependents",
    "PhoneService",
    "MultipleLines",
    "InternetService",
    "OnlineSecurity",
    "OnlineBackup",
    "DeviceProtection",
    "TechSupport",
    "StreamingTV",
    "StreamingMovies",
    "Contract",
    "PaperlessBilling",
]

stringIndexer = StringIndexer(inputCols=categoricalCols, outputCols=[x + "Index" for x in categoricalCols]) 
encoder = OneHotEncoder(inputCols=stringIndexer.getOutputCols(), outputCols=[x + "OHE" for x in categoricalCols]) 

from pyspark.ml.feature import QuantileDiscretizer

# Apply QuantileDiscretizer to the TotalCharges column
discretizer = QuantileDiscretizer(numBuckets=4, inputCol="TotalCharges", outputCol="TotalChargesBucket")

# Fit the discretizer and transform the data
spark_dr = discretizer.fit(spark_dr).transform(spark_dr)

# Show the transformed data
spark_dr.select("TotalCharges", "TotalChargesBucket").show()

labelToIndex = StringIndexer(inputCol="Churn", outputCol="label")

+------------+------------------+
|TotalCharges|TotalChargesBucket|
+------------+------------------+
|       29.85|               0.0|
|      1889.5|               2.0|
|      108.15|               0.0|
|     1840.75|               2.0|
|      151.65|               0.0|
|       820.5|               1.0|
|      1949.4|               2.0|
|       301.9|               0.0|
|     3046.05|               2.0|
|     3487.95|               2.0|
|      587.45|               1.0|
|       326.8|               0.0|
|      5681.1|               3.0|
|      5036.3|               3.0|
|     2686.05|               2.0|
|     7895.15|               3.0|
|     1022.95|               1.0|
|     7382.25|               3.0|
|      528.35|               1.0|
|      1862.9|               2.0|
+------------+------------------+
only showing top 20 rows



In [0]:
stringIndexerModel = stringIndexer.fit(trainDF)
display(stringIndexerModel.transform(trainDF))

gender,SeniorCitizen,Partner,Dependents,tenure,PhoneService,MultipleLines,InternetService,OnlineSecurity,OnlineBackup,DeviceProtection,TechSupport,StreamingTV,StreamingMovies,Contract,PaperlessBilling,PaymentMethod,MonthlyCharges,TotalCharges,Churn,genderIndex,SeniorCitizenIndex,PartnerIndex,DependentsIndex,PhoneServiceIndex,MultipleLinesIndex,InternetServiceIndex,OnlineSecurityIndex,OnlineBackupIndex,DeviceProtectionIndex,TechSupportIndex,StreamingTVIndex,StreamingMoviesIndex,ContractIndex,PaperlessBillingIndex
Female,0,No,No,1.0,No,No phone service,DSL,No,No,No,No,No,No,Month-to-month,No,Bank transfer (automatic),25.25,25.25,No,1.0,0.0,0.0,0.0,1.0,2.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
Female,0,No,No,1.0,No,No phone service,DSL,No,No,No,No,No,No,Month-to-month,No,Electronic check,24.6,24.6,Yes,1.0,0.0,0.0,0.0,1.0,2.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
Female,0,No,No,1.0,No,No phone service,DSL,No,No,No,No,No,No,Month-to-month,No,Mailed check,24.4,24.4,No,1.0,0.0,0.0,0.0,1.0,2.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
Female,0,No,No,1.0,No,No phone service,DSL,No,No,No,No,No,No,Month-to-month,No,Mailed check,25.2,25.2,Yes,1.0,0.0,0.0,0.0,1.0,2.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
Female,0,No,No,1.0,No,No phone service,DSL,No,No,No,No,No,No,Month-to-month,Yes,Mailed check,24.9,24.9,No,1.0,0.0,0.0,0.0,1.0,2.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
Female,0,No,No,1.0,No,No phone service,DSL,No,No,No,No,No,No,Month-to-month,Yes,Mailed check,25.15,25.15,No,1.0,0.0,0.0,0.0,1.0,2.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
Female,0,No,No,1.0,No,No phone service,DSL,No,No,No,No,No,Yes,Month-to-month,No,Mailed check,35.05,35.05,Yes,1.0,0.0,0.0,0.0,1.0,2.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0
Female,0,No,No,1.0,No,No phone service,DSL,No,No,No,No,Yes,No,Month-to-month,Yes,Electronic check,34.7,34.7,Yes,1.0,0.0,0.0,0.0,1.0,2.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
Female,0,No,No,1.0,No,No phone service,DSL,No,No,No,No,Yes,No,Month-to-month,Yes,Electronic check,35.75,35.75,Yes,1.0,0.0,0.0,0.0,1.0,2.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
Female,0,No,No,1.0,No,No phone service,DSL,No,No,No,Yes,No,No,Month-to-month,No,Mailed check,30.55,30.55,No,1.0,0.0,0.0,0.0,1.0,2.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0


In [0]:
from pyspark.ml.feature import VectorAssembler

assemblerInputs = [c + "OHE" for c in categoricalCols] + ["TotalChargesBucket"]
vecAssembler = VectorAssembler(inputCols=assemblerInputs, outputCol="features")

In [0]:
from pyspark.ml.classification import LogisticRegression
 
lr = LogisticRegression(featuresCol="features", labelCol="label", regParam=1.0)

In [0]:
from pyspark.ml import Pipeline
 
pipeline = Pipeline(stages=[
    labelToIndex,       
    stringIndexer,       
    encoder,
    discretizer,               
    vecAssembler,        
    lr                   
])

# Train the model using the training dataset
pipelineModel = pipeline.fit(trainDF)

# Apply the model to the test dataset
predDF = pipelineModel.transform(testDF)

In [0]:
display(predDF.select("features", "label", "prediction", "probability"))

features,label,prediction,probability
"Map(vectorType -> sparse, length -> 25, indices -> List(1, 2, 3, 8, 9, 11, 13, 15, 17, 19, 21), values -> List(1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0))",0.0,0.0,"Map(vectorType -> dense, length -> 2, values -> List(0.6446378037616957, 0.35536219623830434))"
"Map(vectorType -> sparse, length -> 25, indices -> List(1, 2, 3, 8, 9, 11, 13, 15, 17, 19, 21, 23), values -> List(1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0))",1.0,0.0,"Map(vectorType -> dense, length -> 2, values -> List(0.6220356460729793, 0.37796435392702066))"
"Map(vectorType -> sparse, length -> 25, indices -> List(1, 2, 3, 8, 9, 11, 13, 15, 17, 19), values -> List(1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0))",0.0,0.0,"Map(vectorType -> dense, length -> 2, values -> List(0.6918434612528463, 0.30815653874715365))"
"Map(vectorType -> sparse, length -> 25, indices -> List(1, 2, 3, 8, 9, 11, 14, 16, 17, 19, 21), values -> List(1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0))",0.0,0.0,"Map(vectorType -> dense, length -> 2, values -> List(0.7255022218080446, 0.2744977781919554))"
"Map(vectorType -> sparse, length -> 25, indices -> List(1, 2, 3, 4, 5, 8, 9, 11, 13, 15, 17, 19, 21), values -> List(1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0))",1.0,0.0,"Map(vectorType -> dense, length -> 2, values -> List(0.6472001142629059, 0.3527998857370941))"
"Map(vectorType -> sparse, length -> 25, indices -> List(1, 2, 3, 4, 5, 8, 9, 11, 13, 15, 17, 19, 21), values -> List(1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0))",0.0,0.0,"Map(vectorType -> dense, length -> 2, values -> List(0.6472001142629059, 0.3527998857370941))"
"Map(vectorType -> sparse, length -> 25, indices -> List(1, 2, 3, 4, 5, 8, 9, 11, 13, 15, 17, 19, 21, 23), values -> List(1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0))",0.0,0.0,"Map(vectorType -> dense, length -> 2, values -> List(0.6246660393434577, 0.37533396065654234))"
"Map(vectorType -> sparse, length -> 25, indices -> List(1, 2, 3, 4, 5, 8, 9, 11, 13, 15, 17, 19, 21, 23), values -> List(1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0))",1.0,0.0,"Map(vectorType -> dense, length -> 2, values -> List(0.6246660393434577, 0.37533396065654234))"
"Map(vectorType -> sparse, length -> 25, indices -> List(1, 2, 3, 4, 5, 8, 9, 11, 13, 16, 17, 19, 21, 23), values -> List(1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0))",0.0,0.0,"Map(vectorType -> dense, length -> 2, values -> List(0.6808149042250511, 0.3191850957749489))"
"Map(vectorType -> sparse, length -> 25, indices -> List(1, 2, 3, 4, 5, 8, 9, 11, 13, 16, 17, 19, 21, 23), values -> List(1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0))",1.0,0.0,"Map(vectorType -> dense, length -> 2, values -> List(0.6808149042250511, 0.3191850957749489))"


In [0]:
display(pipelineModel.stages[-1], predDF.drop("prediction", "rawPrediction", "probability"), "ROC")


False Positive Rate,True Positive Rate,Threshold
0.0,0.0,0.4681819943650173
0.0,0.0285714285714285,0.4681819943650173
0.0,0.0571428571428571,0.4542197723393274
0.0,0.0857142857142857,0.4500029440659517
0.0,0.1142857142857142,0.4465941737520684
0.0158730158730158,0.1142857142857142,0.4382761692068984
0.0158730158730158,0.1428571428571428,0.4380535709346733
0.0158730158730158,0.1714285714285714,0.4310060372956718
0.0158730158730158,0.2,0.4280056546266384
0.0158730158730158,0.2285714285714285,0.4226766931454758


In [0]:
from pyspark.ml.evaluation import BinaryClassificationEvaluator, MulticlassClassificationEvaluator
 
bcEvaluator = BinaryClassificationEvaluator(metricName="areaUnderROC")
print(f"Area under ROC curve: {bcEvaluator.evaluate(predDF)}")
 
mcEvaluator = MulticlassClassificationEvaluator(metricName="accuracy")
print(f"Accuracy: {mcEvaluator.evaluate(predDF)}")

Area under ROC curve: 0.8362026589523512
Accuracy: 0.7330855018587361


In [0]:
from pyspark.ml.tuning import ParamGridBuilder, CrossValidator
 
paramGrid = (ParamGridBuilder()
             .addGrid(lr.regParam, [0.01, 0.5, 2.0])
             .addGrid(lr.elasticNetParam, [0.0, 0.5, 1.0])
             .build())

In [0]:
cv = CrossValidator(estimator=pipeline, estimatorParamMaps=paramGrid, evaluator=bcEvaluator, numFolds=3, parallelism = 4)
 
# Run cross validations. This step takes a few minutes and returns the best model found from the cross validation.
cvModel = cv.fit(trainDF)

In [0]:
cvPredDF = cvModel.transform(testDF)
 
# Evaluate the model's performance based on area under the ROC curve and accuracy 
print(f"Area under ROC curve: {bcEvaluator.evaluate(cvPredDF)}")
print(f"Accuracy: {mcEvaluator.evaluate(cvPredDF)}")

Area under ROC curve: 0.8483871696791268
Accuracy: 0.7962825278810409
