In [1]:
from pyspark.sql import SparkSession
from pyspark.ml.classification import LogisticRegression
from pyspark.ml.feature import VectorAssembler
from pyspark.ml.evaluation import MulticlassClassificationEvaluator
from pyspark.ml import Pipeline


spark = SparkSession.builder.appName('LogisticRegressionWithMLflow').getOrCreate()
input_data = spark.read.csv("../data/processed/data.csv", header=True, inferSchema=True)
# input_data.printSchema()

feature_cols = [col for col in input_data.columns if col != 'Churn_indexed']
assembler = VectorAssembler(inputCols=feature_cols, outputCol='features')
train_data, test_data = input_data.randomSplit([0.8, 0.2], seed=42)

# Logistic Regression
params = {
    "featuresCol": "features",
    "labelCol": "Churn_indexed",
    "maxIter": 1000
}

lr = LogisticRegression(**params)
pipeline = Pipeline(stages=[assembler, lr])
model = pipeline.fit(train_data)
predictions = model.transform(test_data)

evaluator = MulticlassClassificationEvaluator(labelCol='Churn_indexed', predictionCol='prediction')
accuracy = evaluator.evaluate(predictions, {evaluator.metricName: "accuracy"})
recall_class_0 = evaluator.evaluate(predictions, {evaluator.metricName: "recallByLabel", evaluator.metricLabel: 0})
recall_class_1 = evaluator.evaluate(predictions, {evaluator.metricName: "recallByLabel", evaluator.metricLabel: 1})
f1_score_macro = evaluator.evaluate(predictions, {evaluator.metricName: "f1"})

print(f"Accuracy: {accuracy}")
print(f"Recall for Class 0: {recall_class_0}")
print(f"Recall for Class 1: {recall_class_1}")
print(f"F1 Score Macro: {f1_score_macro}")

root
 |-- SeniorCitizen_scaled: double (nullable = true)
 |-- tenure_scaled: double (nullable = true)
 |-- MonthlyCharges_scaled: double (nullable = true)
 |-- TotalCharges_scaled: double (nullable = true)
 |-- Partner_indexed: double (nullable = true)
 |-- Dependents_indexed: double (nullable = true)
 |-- PhoneService_indexed: double (nullable = true)
 |-- MultipleLines_indexed: double (nullable = true)
 |-- InternetService_indexed: double (nullable = true)
 |-- OnlineSecurity_indexed: double (nullable = true)
 |-- OnlineBackup_indexed: double (nullable = true)
 |-- DeviceProtection_indexed: double (nullable = true)
 |-- TechSupport_indexed: double (nullable = true)
 |-- StreamingTV_indexed: double (nullable = true)
 |-- StreamingMovies_indexed: double (nullable = true)
 |-- Contract_indexed: double (nullable = true)
 |-- PaperlessBilling_indexed: double (nullable = true)
 |-- PaymentMethod_indexed: double (nullable = true)
 |-- Churn_indexed: double (nullable = true)

Accuracy: 0.811

In [4]:
from pyspark.ml.classification import DecisionTreeClassifier

# Decision Tree Classifier
dt_params = {
    "featuresCol": "features",
    "labelCol": "Churn_indexed",
    "maxDepth": 5, 
    "impurity": "gini"
}

dt = DecisionTreeClassifier(**dt_params)
pipeline = Pipeline(stages=[assembler, dt])
model = pipeline.fit(train_data)
predictions = model.transform(test_data)

evaluator = MulticlassClassificationEvaluator(labelCol='Churn_indexed', predictionCol='prediction')
accuracy = evaluator.evaluate(predictions, {evaluator.metricName: "accuracy"})
recall_class_0 = evaluator.evaluate(predictions, {evaluator.metricName: "recallByLabel", evaluator.metricLabel: 0})
recall_class_1 = evaluator.evaluate(predictions, {evaluator.metricName: "recallByLabel", evaluator.metricLabel: 1})
f1_score_macro = evaluator.evaluate(predictions, {evaluator.metricName: "f1"})

print(f"Accuracy: {accuracy}")
print(f"Recall for Class 0: {recall_class_0}")
print(f"Recall for Class 1: {recall_class_1}")
print(f"F1 Score Macro: {f1_score_macro}")

Accuracy: 0.7990936555891238
Recall for Class 0: 0.8878787878787879
Recall for Class 1: 0.5359281437125748
F1 Score Macro: 0.7941941837006201


In [5]:
from pyspark.ml.classification import RandomForestClassifier

# Random Forest Classifier
rf_params = {
    "featuresCol": "features",
    "labelCol": "Churn_indexed",
    "numTrees": 100,  # Number of trees in the forest
    "maxDepth": 5,    # Maximum depth of each tree
    "impurity": "gini",
    "featureSubsetStrategy": "auto"  # Auto strategy for feature subset for training each tree
}

rf = RandomForestClassifier(**rf_params)
pipeline = Pipeline(stages=[assembler, rf])

# Fit the model
model = pipeline.fit(train_data)

# Make predictions
predictions = model.transform(test_data)

# Evaluate the model
evaluator = MulticlassClassificationEvaluator(labelCol='Churn_indexed', predictionCol='prediction')
accuracy = evaluator.evaluate(predictions, {evaluator.metricName: "accuracy"})
recall_class_0 = evaluator.evaluate(predictions, {evaluator.metricName: "recallByLabel", evaluator.metricLabel: 0})
recall_class_1 = evaluator.evaluate(predictions, {evaluator.metricName: "recallByLabel", evaluator.metricLabel: 1})
f1_score_macro = evaluator.evaluate(predictions, {evaluator.metricName: "f1"})

print(f"Accuracy: {accuracy}")
print(f"Recall for Class 0: {recall_class_0}")
print(f"Recall for Class 1: {recall_class_1}")
print(f"F1 Score Macro: {f1_score_macro}")

Accuracy: 0.8006042296072508
Recall for Class 0: 0.9414141414141414
Recall for Class 1: 0.38323353293413176
F1 Score Macro: 0.7791625531409281


In [7]:
from pyspark.ml.classification import LinearSVC

# Linear SVC Parameters
svc_params = {
    "featuresCol": "features",
    "labelCol": "Churn_indexed",
    "maxIter": 100,
    "regParam": 0.1  # Regularization parameter
}

svc = LinearSVC(**svc_params)
pipeline = Pipeline(stages=[assembler, svc])

# Fit the model
model = pipeline.fit(train_data)

# Make predictions
predictions = model.transform(test_data)

# Evaluate the model
evaluator = MulticlassClassificationEvaluator(labelCol='Churn_indexed', predictionCol='prediction')
accuracy = evaluator.evaluate(predictions, {evaluator.metricName: "accuracy"})
recall_class_0 = evaluator.evaluate(predictions, {evaluator.metricName: "recallByLabel", evaluator.metricLabel: 0})
recall_class_1 = evaluator.evaluate(predictions, {evaluator.metricName: "recallByLabel", evaluator.metricLabel: 1})
f1_score_macro = evaluator.evaluate(predictions, {evaluator.metricName: "f1"})

print(f"Accuracy: {accuracy}")
print(f"Recall for Class 0: {recall_class_0}")
print(f"Recall for Class 1: {recall_class_1}")
print(f"F1 Score Macro: {f1_score_macro}")


Accuracy: 0.8089123867069486
Recall for Class 0: 0.9222222222222223
Recall for Class 1: 0.47305389221556887
F1 Score Macro: 0.7968384769206691


In [8]:
from pyspark.ml.classification import GBTClassifier

# GBT Classifier
gbt_params = {
    "featuresCol": "features",
    "labelCol": "Churn_indexed",
    "maxIter": 100,
    "maxDepth": 5,
    "lossType": "logistic"
}

gbt = GBTClassifier(**gbt_params)
pipeline = Pipeline(stages=[assembler, gbt])

model = pipeline.fit(train_data)

predictions = model.transform(test_data)

evaluator = MulticlassClassificationEvaluator(labelCol='Churn_indexed', predictionCol='prediction')
accuracy = evaluator.evaluate(predictions, {evaluator.metricName: "accuracy"})
recall_class_0 = evaluator.evaluate(predictions, {evaluator.metricName: "recallByLabel", evaluator.metricLabel: 0})
recall_class_1 = evaluator.evaluate(predictions, {evaluator.metricName: "recallByLabel", evaluator.metricLabel: 1})
f1_score_macro = evaluator.evaluate(predictions, {evaluator.metricName: "f1"})

print(f"Accuracy: {accuracy}")
print(f"Recall for Class 0: {recall_class_0}")
print(f"Recall for Class 1: {recall_class_1}")
print(f"F1 Score Macro: {f1_score_macro}")

spark.stop()

Accuracy: 0.8285498489425982
Recall for Class 0: 0.901010101010101
Recall for Class 1: 0.6137724550898204
F1 Score Macro: 0.8256994555668637
