In [1]:
from pyspark import SparkContext, SparkConf
from pyspark.sql import SparkSession

In [2]:
sc = SparkContext(conf=SparkConf())
spark = SparkSession(sparkContext=sc)

Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).


23/04/25 16:44:35 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


## Logistic regression with pyspark

## Import data

In [3]:
df = spark.read.csv('data/SparkData/bank.csv', header=True, inferSchema=True, sep=";")
#df.drop('day','month','poutcome').show(5)

In [4]:
df.show(5)
df.printSchema()

+---+-----------+-------+---------+-------+-------+-------+----+--------+---+-----+--------+--------+-----+--------+--------+---+
|age|        job|marital|education|default|balance|housing|loan| contact|day|month|duration|campaign|pdays|previous|poutcome|  y|
+---+-----------+-------+---------+-------+-------+-------+----+--------+---+-----+--------+--------+-----+--------+--------+---+
| 30| unemployed|married|  primary|     no|   1787|     no|  no|cellular| 19|  oct|      79|       1|   -1|       0| unknown| no|
| 33|   services|married|secondary|     no|   4789|    yes| yes|cellular| 11|  may|     220|       1|  339|       4| failure| no|
| 35| management| single| tertiary|     no|   1350|    yes|  no|cellular| 16|  apr|     185|       1|  330|       1| failure| no|
| 30| management|married| tertiary|     no|   1476|    yes| yes| unknown|  3|  jun|     199|       4|   -1|       0| unknown| no|
| 59|blue-collar|married|secondary|     no|      0|    yes|  no| unknown|  5|  may|     22

### Apply abs() to all numeric columns as Naive Bayes does not allow negative features

In [5]:
df.createOrReplaceTempView("dfView")
df=spark.sql("select job,marital,education,default,housing,loan,contact,poutcome,\
  abs(balance) balance, abs(duration) duration, abs(campaign) campaign,\
  abs(pdays) pdays,abs(previous) previous, y from dfView")

###  Deal with categorical data and Convert the data to  vector

In [6]:
catcols = ['job','marital','education','default','housing','loan','contact','poutcome']
num_cols = ['balance', 'duration','campaign','pdays','previous']
labelCol = 'y'


## Process categorical columns

The following code does three things with pipeline:

* **`StringIndexer`** all categorical columns
* **`OneHotEncoder`** all categorical index columns
* **`VectorAssembler`** all feature columns into one vector column

### Categorical columns

In [7]:
from pyspark.ml.feature import StringIndexer, OneHotEncoder, VectorAssembler, Normalizer
from pyspark.ml import Pipeline
from pyspark.sql.functions import col

# categorical columns
categorical_columns = catcols

In [8]:
indexers = [ StringIndexer(inputCol=c, outputCol="{0}_indexed".format(c)) for c in categorical_columns ]

In [9]:
encoders = [ OneHotEncoder(inputCol=indexer.getOutputCol(), \
                           outputCol="{0}_encoded".format(indexer.getOutputCol())) for indexer in indexers ]

In [10]:
assembler = VectorAssembler(inputCols=[encoder.getOutputCol() \
                                       for encoder in encoders] + num_cols, outputCol="features")

In [11]:
pipeline = Pipeline(stages=indexers + encoders + [assembler])
model=pipeline.fit(df)
data = model.transform(df)
data = data.withColumn('label',col(labelCol))
data=data.select('features','label')
data.show(5, truncate=False)

23/04/25 16:44:58 WARN package: Truncated the string representation of a plan since it was too large. This behavior can be adjusted by setting 'spark.sql.debug.maxToStringFields'.
+---------------------------------------------------------------------------------------------------------+-----+
|features                                                                                                 |label|
+---------------------------------------------------------------------------------------------------------+-----+
|(29,[8,11,15,16,18,19,21,24,25,26,27],[1.0,1.0,1.0,1.0,1.0,1.0,1.0,1787.0,79.0,1.0,1.0])                 |no   |
|(29,[4,11,13,16,17,19,22,24,25,26,27,28],[1.0,1.0,1.0,1.0,1.0,1.0,1.0,4789.0,220.0,1.0,339.0,4.0])       |no   |
|(29,[0,12,14,16,17,18,19,22,24,25,26,27,28],[1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1350.0,185.0,1.0,330.0,1.0])|no   |
|(29,[0,11,14,16,17,20,21,24,25,26,27],[1.0,1.0,1.0,1.0,1.0,1.0,1.0,1476.0,199.0,4.0,1.0])                |no   |
|(29,[1,11,13,16,17,18

### We need normalize all features to make the value range of each feature similar scale

In [12]:
normalizer = Normalizer(inputCol="features",outputCol="normFeatures",p=1.0)

In [13]:
data=normalizer.transform(data)


In [14]:
data.show(5, False)


+---------------------------------------------------------------------------------------------------------+-----+------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
|features                                                                                                 |label|normFeatures                                                                                                                                                                                                                                                                                                |
+---------------------------------------------------------------------------------------------------------+-----+---------------------------------------------------------

### We need to deal with label, which is string, yes or no, need to make them numbers
### Build StringIndexer stages

In [15]:
# Index labels, adding metadata to the label column 
labelIndexer = StringIndexer(inputCol='label',
                             outputCol='indexedLabel').fit(data)
data=labelIndexer.transform(data)

In [16]:
data.show(5)

+--------------------+-----+--------------------+------------+
|            features|label|        normFeatures|indexedLabel|
+--------------------+-----+--------------------+------------+
|(29,[8,11,15,16,1...|   no|(29,[8,11,15,16,1...|         0.0|
|(29,[4,11,13,16,1...|   no|(29,[4,11,13,16,1...|         0.0|
|(29,[0,12,14,16,1...|   no|(29,[0,12,14,16,1...|         0.0|
|(29,[0,11,14,16,1...|   no|(29,[0,11,14,16,1...|         0.0|
|(29,[1,11,13,16,1...|   no|(29,[1,11,13,16,1...|         0.0|
+--------------------+-----+--------------------+------------+
only showing top 5 rows



In [17]:
from pyspark.ml.feature import VectorIndexer
# Automatically identify categorical features, and index them.
# Set maxCategories so features with > 4 distinct values are treated as continuous. 
# Update metadata accordingly.
featureIndexer =VectorIndexer(inputCol="normFeatures", \
                                  outputCol="indexedFeatures", \
                                  maxCategories=4).fit(data)


In [18]:
data=featureIndexer.transform(data)
data.show(5)

+--------------------+-----+--------------------+------------+--------------------+
|            features|label|        normFeatures|indexedLabel|     indexedFeatures|
+--------------------+-----+--------------------+------------+--------------------+
|(29,[8,11,15,16,1...|   no|(29,[8,11,15,16,1...|         0.0|(29,[8,11,15,16,1...|
|(29,[4,11,13,16,1...|   no|(29,[4,11,13,16,1...|         0.0|(29,[4,11,13,16,1...|
|(29,[0,12,14,16,1...|   no|(29,[0,12,14,16,1...|         0.0|(29,[0,12,14,16,1...|
|(29,[0,11,14,16,1...|   no|(29,[0,11,14,16,1...|         0.0|(29,[0,11,14,16,1...|
|(29,[1,11,13,16,1...|   no|(29,[1,11,13,16,1...|         0.0|(29,[1,11,13,16,1...|
+--------------------+-----+--------------------+------------+--------------------+
only showing top 5 rows



### Split the data to training and test data sets

In [19]:
# Split the data into training and test sets (40% held out for testing)
(trainingData, testData) = data.randomSplit([0.6, 0.4])
trainingData.show(5,False)
testData.show(5,False)

+-----------------------------------------------------------------------------------------------+-----+-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+------------+-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
|features                                                                                       |label|normFeatures                                                                                                                                                                    

## Build cross-validation model


### Cross-Validation

In general, one round of cross-validation involves partitioning a sample of data into complementary subsets, performing the analysis on one subset (called the training set), and validating the analysis on the other subset (called the validation set or testing set).
CrossValidator begins by splitting the dataset into a set of folds which are used as separate training and test datasets. E.g., with k=3 folds, CrossValidator will generate 3 (training, test) dataset pairs, each of which uses 2/3 of the data for training and 1/3 for testing. To evaluate a particular ParamMap, CrossValidator computes the average evaluation metric for the 3 Models produced by fitting the Estimator on the 3 different (training, test) dataset pairs.

After identifying the best ParamMap, CrossValidator finally re-fits the Estimator using the best ParamMap and the entire dataset.  In simple term, pickup the ParamMap that produces the best model and to use that model for subsequent transform().


### Estimator

In [20]:
from pyspark.ml.classification import NaiveBayes
naivebayes = NaiveBayes(featuresCol="normFeatures", labelCol="indexedLabel")

### Pipeline Architecture

In [21]:
# Convert indexed labels back to original labels.
from pyspark.ml.feature import IndexToString
labelConverter = IndexToString(inputCol="prediction", outputCol="predictedLabel",
                               labels=labelIndexer.labels)


In [22]:
# Chain indexers and tree in a Pipeline
pipeline = Pipeline(stages=[naivebayes,labelConverter])

In [23]:
 # Train model.  This also runs the indexers.
model = pipeline.fit(trainingData)

### Make predictions

In [24]:
# Make predictions.
predictions = model.transform(testData)
# Select example rows to display. 
predictions.select("features","label","predictedLabel", "rawPrediction").show(5)
#predictions.show(5, False)

+--------------------+-----+--------------+--------------------+
|            features|label|predictedLabel|       rawPrediction|
+--------------------+-----+--------------+--------------------+
|(29,[0,11,13,16,1...|   no|            no|[-1.1737243598968...|
|(29,[0,11,13,16,1...|   no|            no|[-1.1874075823042...|
|(29,[0,11,13,16,1...|   no|            no|[-0.8444796851804...|
|(29,[0,11,13,16,1...|   no|            no|[-0.8101712555432...|
|(29,[0,11,13,16,1...|   no|            no|[-0.7335767665447...|
+--------------------+-----+--------------+--------------------+
only showing top 5 rows



### Evaluation

### BinaryClassificationEvaluator only provide accuracy metrics, you need MulticlassClassificationEvaluator to provide all metrics, MulticlassClassificationEvaluator work with binary classification too

In [25]:
from pyspark.ml.evaluation import MulticlassClassificationEvaluator
# Select (prediction, true label) and compute test error
evaluator = MulticlassClassificationEvaluator(
    labelCol="indexedLabel", predictionCol="prediction", metricName="accuracy")
accuracy = evaluator.evaluate(predictions)
print(f"Accuracy = {accuracy}")
print("Test Error = %g" % (1.0 - accuracy))


Accuracy = 0.887213847012842
Test Error = 0.112786


### Evaluate training model

area under ROC  https://developers.google.com/machine-learning/crash-course/classification/roc-and-auc

accuracy

False positive rate by label

True positive rate by label

Precision by label

Recall by label

F-measure by label


In [26]:
print('training data (f1):', evaluator.setMetricName('f1').evaluate(predictions), "\n",
     'training data (weightedPrecision): ', evaluator.setMetricName('weightedPrecision').evaluate(predictions),"\n",
     'training data (weightedRecall): ', evaluator.setMetricName('weightedRecall').evaluate(predictions),"\n",
     'training data (accuracy): ', evaluator.setMetricName('accuracy').evaluate(predictions))


training data (f1): 0.8341910076351514 
 training data (weightedPrecision):  0.7871484103313267 
 training data (weightedRecall):  0.887213847012842 
 training data (accuracy):  0.887213847012842
