# Step 1: Create a Databricks notebook

1. **Go to Workspace → Create → Notebook**
2. Select:
- **Language:** Python
- **Cluster:** (Create cluster / use serverless)

In [0]:
from pyspark.sql import SparkSession

# Bind `spark` explicitly instead of relying on a notebook-injected global:
# getOrCreate() returns the already-active session when one exists and
# otherwise starts a new one, so this cell also runs on a fresh kernel.
spark = SparkSession.builder.getOrCreate()

# Toy churn dataset: one tuple per customer, with values in the same order
# as the names listed in `columns` below.
data = [
    (1, 899, 24, 1, 0),
    (2, 1299, 6, 5, 1),
    (3, 699, 12, 2, 0),
    (4, 1599, 3, 6, 1),
    (5, 499, 30, 0, 0)
]

columns = ["customer_id", "monthly_charge", "tenure_months", "support_calls", "churn"]

# Build the DataFrame from the in-memory rows and print it as a sanity check.
df = spark.createDataFrame(data, columns)
df.show()


+-----------+--------------+-------------+-------------+-----+
|customer_id|monthly_charge|tenure_months|support_calls|churn|
+-----------+--------------+-------------+-------------+-----+
|          1|           899|           24|            1|    0|
|          2|          1299|            6|            5|    1|
|          3|           699|           12|            2|    0|
|          4|          1599|            3|            6|    1|
|          5|           499|           30|            0|    0|
+-----------+--------------+-------------+-------------+-----+



In [0]:
from pyspark.ml.feature import VectorAssembler

# Pack the three numeric predictor columns into the single vector column
# ("features") that the Spark ML estimator below is configured to read.
feature_cols = ["monthly_charge", "tenure_months", "support_calls"]
assembler = VectorAssembler(inputCols=feature_cols, outputCol="features")

# Keep only what the model needs: the assembled vector and the label.
assembled = assembler.transform(df)
df_ml = assembled.select("features", "churn")
df_ml.show()


+----------------+-----+
|        features|churn|
+----------------+-----+
|[899.0,24.0,1.0]|    0|
|[1299.0,6.0,5.0]|    1|
|[699.0,12.0,2.0]|    0|
|[1599.0,3.0,6.0]|    1|
|[499.0,30.0,0.0]|    0|
+----------------+-----+



In [0]:
# Random train/test split with 0.8 / 0.2 weights; the seed is pinned so the
# same split can be reproduced on re-run.
split_weights = [0.8, 0.2]
train, test = df_ml.randomSplit(split_weights, seed=42)


In [0]:
from pyspark.ml.classification import LogisticRegression

# Logistic regression reading the assembled "features" vector and using the
# "churn" column as the label; fitted on the training split only.
lr = LogisticRegression(featuresCol="features", labelCol="churn")
model = lr.fit(train)


In [0]:
# Score the held-out rows, then display the label next to the predicted
# class and the probability vector without truncating column contents.
predictions = model.transform(test)
shown_cols = ["features", "churn", "prediction", "probability"]
predictions.select(*shown_cols).show(truncate=False)


+----------------+-----+----------+-----------------------------------------+
|features        |churn|prediction|probability                              |
+----------------+-----+----------+-----------------------------------------+
|[1299.0,6.0,5.0]|1    |1.0       |[4.374657798958658E-4,0.9995625342201041]|
+----------------+-----+----------+-----------------------------------------+



In [0]:
from pyspark.ml.evaluation import BinaryClassificationEvaluator

# Area under the ROC curve is computed by ranking rows on a continuous score.
# Feed the evaluator the model's "rawPrediction" column rather than the hard
# 0/1 "prediction" column, which would reduce the ROC curve to one threshold.
evaluator = BinaryClassificationEvaluator(
    labelCol="churn",
    rawPredictionCol="rawPrediction",
    metricName="areaUnderROC"
)

# NOTE(review): AUC is only informative when the evaluated split contains
# both classes; on a very small test split treat this number with caution.
auc = evaluator.evaluate(predictions)
print("AUC Score:", auc)


AUC Score: 1.0
