**Author:** Cainã Max Couto da Silva  
**LinkedIn:** [@cmcouto-silva](https://www.linkedin.com/in/cmcouto-silva/)

# Setup

## Spark Session / UI

In [1]:
!pip install -q pyngrok # for accessing Spark UI
!pip install -q pyspark # for Spark session

[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m316.9/316.9 MB[0m [31m3.7 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
  Building wheel for pyspark (setup.py) ... [?25l[?25hdone


In [2]:
# Create Spark Session
from pyspark.sql import SparkSession
spark = SparkSession.builder.config('spark.ui.port', '4050').getOrCreate()

In [4]:
spark

In [5]:
# Login into https://dashboard.ngrok.com/get-started/setup to get your own token
# ngrok_token = '27WqUGZ1SkzPl4bQunMXJyasbWZ_5pCgLCsppL7ufk9rzK7j4'

In [6]:
# # Make local Spark UI URL available at ngrok
# get_ipython().system_raw(f'ngrok authtoken {ngrok_token}')
# get_ipython().system_raw('ngrok http 4050 &')
# !sleep 3
# print('URL para interface Spark:')
# !curl -s http://localhost:4040/api/tunnels | grep -Po 'public_url":"(?=https)\K[^"]*'

## Importing libraries

In [7]:
import numpy as np
import pandas as pd

from pyspark.ml import Pipeline
from pyspark.ml.feature import VectorAssembler, StandardScaler, StringIndexer, OneHotEncoder
from pyspark.ml.classification import LogisticRegression, DecisionTreeClassifier, RandomForestClassifier

from pyspark.ml.evaluation import BinaryClassificationEvaluator
from pyspark.ml.tuning import ParamGridBuilder, TrainValidationSplit, CrossValidator

## Loading dataset

In [8]:
df = pd.read_excel('https://public.dhe.ibm.com/software/data/sw-library/cognos/mobile/C11/data/Telco_customer_churn.xlsx', index_col=0)
df['Total Charges'] = pd.to_numeric(df['Total Charges'], errors='coerce')
df.dropna(subset=['Total Charges'], inplace=True)

with pd.option_context('display.max_columns', None):
    display(df)

Unnamed: 0_level_0,Count,Country,State,City,Zip Code,Lat Long,Latitude,Longitude,Gender,Senior Citizen,Partner,Dependents,Tenure Months,Phone Service,Multiple Lines,Internet Service,Online Security,Online Backup,Device Protection,Tech Support,Streaming TV,Streaming Movies,Contract,Paperless Billing,Payment Method,Monthly Charges,Total Charges,Churn Label,Churn Value,Churn Score,CLTV,Churn Reason
CustomerID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1,Unnamed: 25_level_1,Unnamed: 26_level_1,Unnamed: 27_level_1,Unnamed: 28_level_1,Unnamed: 29_level_1,Unnamed: 30_level_1,Unnamed: 31_level_1,Unnamed: 32_level_1
3668-QPYBK,1,United States,California,Los Angeles,90003,"33.964131, -118.272783",33.964131,-118.272783,Male,No,No,No,2,Yes,No,DSL,Yes,Yes,No,No,No,No,Month-to-month,Yes,Mailed check,53.85,108.15,Yes,1,86,3239,Competitor made better offer
9237-HQITU,1,United States,California,Los Angeles,90005,"34.059281, -118.30742",34.059281,-118.307420,Female,No,No,Yes,2,Yes,No,Fiber optic,No,No,No,No,No,No,Month-to-month,Yes,Electronic check,70.70,151.65,Yes,1,67,2701,Moved
9305-CDSKC,1,United States,California,Los Angeles,90006,"34.048013, -118.293953",34.048013,-118.293953,Female,No,No,Yes,8,Yes,Yes,Fiber optic,No,No,Yes,No,Yes,Yes,Month-to-month,Yes,Electronic check,99.65,820.50,Yes,1,86,5372,Moved
7892-POOKP,1,United States,California,Los Angeles,90010,"34.062125, -118.315709",34.062125,-118.315709,Female,No,Yes,Yes,28,Yes,Yes,Fiber optic,No,No,Yes,Yes,Yes,Yes,Month-to-month,Yes,Electronic check,104.80,3046.05,Yes,1,84,5003,Moved
0280-XJGEX,1,United States,California,Los Angeles,90015,"34.039224, -118.266293",34.039224,-118.266293,Male,No,No,Yes,49,Yes,Yes,Fiber optic,No,Yes,Yes,No,Yes,Yes,Month-to-month,Yes,Bank transfer (automatic),103.70,5036.30,Yes,1,89,5340,Competitor had better devices
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2569-WGERO,1,United States,California,Landers,92285,"34.341737, -116.539416",34.341737,-116.539416,Female,No,No,No,72,Yes,No,No,No internet service,No internet service,No internet service,No internet service,No internet service,No internet service,Two year,Yes,Bank transfer (automatic),21.15,1419.40,No,0,45,5306,
6840-RESVB,1,United States,California,Adelanto,92301,"34.667815, -117.536183",34.667815,-117.536183,Male,No,Yes,Yes,24,Yes,Yes,DSL,Yes,No,Yes,Yes,Yes,Yes,One year,Yes,Mailed check,84.80,1990.50,No,0,59,2140,
2234-XADUH,1,United States,California,Amboy,92304,"34.559882, -115.637164",34.559882,-115.637164,Female,No,Yes,Yes,72,Yes,Yes,Fiber optic,No,Yes,Yes,No,Yes,Yes,One year,Yes,Credit card (automatic),103.20,7362.90,No,0,71,5560,
4801-JZAZL,1,United States,California,Angelus Oaks,92305,"34.1678, -116.86433",34.167800,-116.864330,Female,No,Yes,Yes,11,No,No phone service,DSL,Yes,No,No,No,No,No,Month-to-month,Yes,Electronic check,29.60,346.45,No,0,59,2793,


In [9]:
target_variable = 'Churn Value'

numerical_features = [
    'Tenure Months',
    'Monthly Charges',
    'CLTV',
    'Total Charges'
]

categorical_features = [
    'Senior Citizen',
    'Partner',
    'Dependents',
    'Phone Service',
    'Multiple Lines',
    'Internet Service',
    'Online Security',
    'Online Backup',
    'Device Protection',
    'Tech Support',
    'Streaming TV',
    'Streaming Movies',
    'Contract',
    'Paperless Billing',
    'Payment Method',
    ]

sdf = spark.createDataFrame(df[numerical_features+categorical_features+[target_variable]])
print(f'Num. partitions: {sdf.rdd.getNumPartitions()}')
sdf.show(5)

Num. partitions: 2
+-------------+---------------+----+-------------+--------------+-------+----------+-------------+--------------+----------------+---------------+-------------+-----------------+------------+------------+----------------+--------------+-----------------+--------------------+-----------+
|Tenure Months|Monthly Charges|CLTV|Total Charges|Senior Citizen|Partner|Dependents|Phone Service|Multiple Lines|Internet Service|Online Security|Online Backup|Device Protection|Tech Support|Streaming TV|Streaming Movies|      Contract|Paperless Billing|      Payment Method|Churn Value|
+-------------+---------------+----+-------------+--------------+-------+----------+-------------+--------------+----------------+---------------+-------------+-----------------+------------+------------+----------------+--------------+-----------------+--------------------+-----------+
|            2|          53.85|3239|       108.15|            No|     No|        No|          Yes|            No|    

# PySpark Pipeline

In [10]:
data = (
    sdf[[*numerical_features, *categorical_features, target_variable]]
    .withColumnRenamed(target_variable, 'label')
)
train, test = data.randomSplit([.8, .2])

## Categorical features

In [11]:
indexer = StringIndexer(inputCols=categorical_features, outputCols=[f'cat_{col}' for col in categorical_features])
encoder = OneHotEncoder(inputCols=[f'cat_{col}' for col in categorical_features], outputCols=[f'vec_{col}' for col in categorical_features])

categorical_preprocessor = Pipeline(stages=[
    indexer,
    encoder
])

In [None]:
categorical_preprocessor.fit(train).transform(train).show(5)

+-------------+---------------+----+-------------+--------------+-------+----------+-------------+--------------+----------------+-------------------+-------------------+-------------------+-------------------+-------------------+-------------------+--------------+-----------------+--------------------+-----+------------------+-----------+--------------+-----------------+------------------+--------------------+-------------------+-----------------+---------------------+----------------+----------------+--------------------+------------+---------------------+------------------+------------------+-------------+--------------+-----------------+------------------+--------------------+-------------------+-----------------+---------------------+----------------+----------------+--------------------+-------------+---------------------+------------------+
|Tenure Months|Monthly Charges|CLTV|Total Charges|Senior Citizen|Partner|Dependents|Phone Service|Multiple Lines|Internet Service|    Online

In [12]:
(
  categorical_preprocessor.fit(train).transform(train)
  .select('Contract','cat_Contract','vec_Contract')
  .drop_duplicates()
  .show(5)
)

+--------------+------------+-------------+
|      Contract|cat_Contract| vec_Contract|
+--------------+------------+-------------+
|Month-to-month|         0.0|(2,[0],[1.0])|
|      Two year|         1.0|(2,[1],[1.0])|
|      One year|         2.0|    (2,[],[])|
+--------------+------------+-------------+



## Numerical features

In [13]:
numerical_assembler = VectorAssembler().setInputCols(numerical_features).setOutputCol('numerical_features')
numerical_scaler = StandardScaler(inputCol='numerical_features', outputCol='scaled_numerical_features', withMean=True, withStd=True) # default: withMean=False

numerical_preprocessor = Pipeline(stages=[
    numerical_assembler,
    numerical_scaler
])

In [14]:
numerical_preprocessor.fit(train).transform(train).show(5)

+-------------+---------------+----+-------------+--------------+-------+----------+-------------+--------------+----------------+-------------------+-------------------+-------------------+-------------------+-------------------+-------------------+--------------+-----------------+--------------------+-----+--------------------+-------------------------+
|Tenure Months|Monthly Charges|CLTV|Total Charges|Senior Citizen|Partner|Dependents|Phone Service|Multiple Lines|Internet Service|    Online Security|      Online Backup|  Device Protection|       Tech Support|       Streaming TV|   Streaming Movies|      Contract|Paperless Billing|      Payment Method|label|  numerical_features|scaled_numerical_features|
+-------------+---------------+----+-------------+--------------+-------+----------+-------------+--------------+----------------+-------------------+-------------------+-------------------+-------------------+-------------------+-------------------+--------------+-----------------+-

## Categorical and numerical preprocessors

In [15]:
feature_cols = [f'vec_{col}' for col in categorical_features] + ['scaled_numerical_features']
assembler = VectorAssembler(inputCols=feature_cols, outputCol="features")

preprocessor = Pipeline(stages=[
    numerical_preprocessor,
    categorical_preprocessor,
    assembler
])

preprocessor.fit(train).transform(train).show(5)

+-------------+---------------+----+-------------+--------------+-------+----------+-------------+--------------+----------------+-------------------+-------------------+-------------------+-------------------+-------------------+-------------------+--------------+-----------------+--------------------+-----+--------------------+-------------------------+------------------+-----------+--------------+-----------------+------------------+--------------------+-------------------+-----------------+---------------------+----------------+----------------+--------------------+------------+---------------------+------------------+------------------+-------------+--------------+-----------------+------------------+--------------------+-------------------+-----------------+---------------------+----------------+----------------+--------------------+-------------+---------------------+------------------+--------------------+
|Tenure Months|Monthly Charges|CLTV|Total Charges|Senior Citizen|Partner

## Model Pipeline

In [16]:
lr = LogisticRegression(featuresCol='features', labelCol='label')
model_pipeline = Pipeline(stages=[preprocessor, lr])
type(model_pipeline)

pyspark.ml.pipeline.Pipeline

In [17]:
trained_model_pipeline = model_pipeline.fit(train) # it's not inplace!!
type(trained_model_pipeline)

pyspark.ml.pipeline.PipelineModel

In [18]:
trained_model_pipeline.transform(test)

DataFrame[Tenure Months: bigint, Monthly Charges: double, CLTV: bigint, Total Charges: double, Senior Citizen: string, Partner: string, Dependents: string, Phone Service: string, Multiple Lines: string, Internet Service: string, Online Security: string, Online Backup: string, Device Protection: string, Tech Support: string, Streaming TV: string, Streaming Movies: string, Contract: string, Paperless Billing: string, Payment Method: string, label: bigint, numerical_features: vector, scaled_numerical_features: vector, cat_Senior Citizen: double, cat_Partner: double, cat_Dependents: double, cat_Phone Service: double, cat_Multiple Lines: double, cat_Internet Service: double, cat_Online Security: double, cat_Online Backup: double, cat_Device Protection: double, cat_Tech Support: double, cat_Streaming TV: double, cat_Streaming Movies: double, cat_Contract: double, cat_Paperless Billing: double, cat_Payment Method: double, vec_Senior Citizen: vector, vec_Partner: vector, vec_Dependents: vector

In [19]:
trained_model_pipeline.stages[-1].summary # BinaryLogisticRegressionTrainingSummary

<pyspark.ml.classification.BinaryLogisticRegressionTrainingSummary at 0x7c87b21ca1d0>

In [20]:
trained_model_pipeline.stages[-1].summary.accuracy

0.8118671167170013

In [21]:
print( trained_model_pipeline.stages[-1].explainParams() )

aggregationDepth: suggested depth for treeAggregate (>= 2). (default: 2)
elasticNetParam: the ElasticNet mixing parameter, in range [0, 1]. For alpha = 0, the penalty is an L2 penalty. For alpha = 1, it is an L1 penalty. (default: 0.0)
family: The name of family which is a description of the label distribution to be used in the model. Supported options: auto, binomial, multinomial (default: auto)
featuresCol: features column name. (default: features, current: features)
fitIntercept: whether to fit an intercept term. (default: True)
labelCol: label column name. (default: label, current: label)
lowerBoundsOnCoefficients: The lower bounds on coefficients if fitting under bound constrained optimization. The bound matrix must be compatible with the shape (1, number of features) for binomial regression, or (number of classes, number of features) for multinomial regression. (undefined)
lowerBoundsOnIntercepts: The lower bounds on intercepts if fitting under bound constrained optimization. The

# Pyspark Tuning

## Train Validation Split

In [22]:
paramGrid = (
    ParamGridBuilder()
    .addGrid(lr.regParam, [0.1, 0.01])
    .build()
)

tvs = TrainValidationSplit(
    estimator=model_pipeline,
    estimatorParamMaps=paramGrid,
    evaluator=BinaryClassificationEvaluator(),
    trainRatio=0.8
)

tvs_model = tvs.fit(train)

In [23]:
type(tvs_model)

pyspark.ml.tuning.TrainValidationSplitModel

In [24]:
best_model = tvs_model.bestModel

In [25]:
best_model.stages[-1].getRegParam()

0.01

## Cross-validation

In [26]:
# dt = DecisionTreeClassifier(featuresCol='features', labelCol=target_variable)
model_pipeline = Pipeline(stages=[preprocessor, lr])

In [27]:
cross_validator = CrossValidator(
    estimator=model_pipeline,
    estimatorParamMaps=paramGrid,
    evaluator=BinaryClassificationEvaluator(),
    numFolds=3
)

cv_model = cross_validator.fit(train)

In [28]:
type(cv_model)

pyspark.ml.tuning.CrossValidatorModel

In [29]:
best_model = cv_model.bestModel

In [30]:
best_model.stages[-1].getRegParam()

0.01