In [1]:
from pyspark import SparkContext, SparkConf
from pyspark.sql import SparkSession

In [2]:
# Reuse the active SparkContext / SparkSession when this cell is re-executed:
# constructing SparkContext directly raises if one is already running in the kernel.
sc = SparkContext.getOrCreate(conf=SparkConf())
spark = SparkSession.builder.getOrCreate()

Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).


23/04/19 17:21:29 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


## Logistic regression with pyspark

## Import data

In [3]:
# Load the semicolon-delimited CSV; the first row is the header and column types are inferred.
df = spark.read.csv(
    'data/SparkData/bank.csv',
    sep=";",
    header=True,
    inferSchema=True,
)
# Preview five rows. The three columns are dropped from the preview only; `df` keeps them.
preview = df.drop('day', 'month', 'poutcome')
preview.show(5)

+---+-----------+-------+---------+-------+-------+-------+----+--------+--------+--------+-----+--------+---+
|age|        job|marital|education|default|balance|housing|loan| contact|duration|campaign|pdays|previous|  y|
+---+-----------+-------+---------+-------+-------+-------+----+--------+--------+--------+-----+--------+---+
| 30| unemployed|married|  primary|     no|   1787|     no|  no|cellular|      79|       1|   -1|       0| no|
| 33|   services|married|secondary|     no|   4789|    yes| yes|cellular|     220|       1|  339|       4| no|
| 35| management| single| tertiary|     no|   1350|    yes|  no|cellular|     185|       1|  330|       1| no|
| 30| management|married| tertiary|     no|   1476|    yes| yes| unknown|     199|       4|   -1|       0| no|
| 59|blue-collar|married|secondary|     no|      0|    yes|  no| unknown|     226|       1|   -1|       0| no|
+---+-----------+-------+---------+-------+-------+-------+----+--------+--------+--------+-----+--------+---+
only showing top 5 rows

In [4]:
# Print the column names and inferred types of `df`.
df.printSchema()

root
 |-- age: integer (nullable = true)
 |-- job: string (nullable = true)
 |-- marital: string (nullable = true)
 |-- education: string (nullable = true)
 |-- default: string (nullable = true)
 |-- balance: integer (nullable = true)
 |-- housing: string (nullable = true)
 |-- loan: string (nullable = true)
 |-- contact: string (nullable = true)
 |-- day: integer (nullable = true)
 |-- month: string (nullable = true)
 |-- duration: integer (nullable = true)
 |-- campaign: integer (nullable = true)
 |-- pdays: integer (nullable = true)
 |-- previous: integer (nullable = true)
 |-- poutcome: string (nullable = true)
 |-- y: string (nullable = true)



### Encode the categorical columns and assemble all features into a single vector column

In [5]:
# Categorical (string) predictors: these are indexed and one-hot encoded.
catcols = [
    'job',
    'marital',
    'education',
    'default',
    'housing',
    'loan',
    'contact',
    'poutcome',
]
# Numeric predictors: these go straight into the assembled feature vector.
# NOTE(review): `age`, `day` and `month` appear in the schema but in neither
# list, so they are not used as features — confirm this is intended.
num_cols = [
    'balance',
    'duration',
    'campaign',
    'pdays',
    'previous',
]
# Name of the target column in the raw data.
labelCol = 'y'


## Process categorical columns

The following code builds a `Pipeline` that does three things:

* **`StringIndexer`**: maps each categorical column to a numeric index column
* **`OneHotEncoder`**: one-hot encodes each of those index columns
* **`VectorAssembler`**: combines all feature columns into a single vector column

### Categorical columns

In [6]:
from pyspark.ml.feature import StringIndexer, OneHotEncoder, VectorAssembler
from pyspark.ml import Pipeline
from pyspark.sql.functions import col

# Alias for the categorical column list `catcols`; the indexer stages iterate over it.
categorical_columns = catcols

In [7]:
# One StringIndexer per categorical column; each writes its indices to "<column>_indexed".
indexers = [
    StringIndexer(inputCol=name, outputCol=f"{name}_indexed")
    for name in categorical_columns
]

In [8]:
# One OneHotEncoder per indexer: reads the indexer's output column and
# writes "<that column>_encoded".
encoders = [
    OneHotEncoder(
        inputCol=stage.getOutputCol(),
        outputCol=f"{stage.getOutputCol()}_encoded",
    )
    for stage in indexers
]

In [9]:
# Concatenate every one-hot encoded column, followed by the numeric columns,
# into a single "features" vector column.
encoded_cols = [encoder.getOutputCol() for encoder in encoders]
assembler = VectorAssembler(
    inputCols=encoded_cols + num_cols,
    outputCol="features",
)

In [10]:
# Feature pipeline: all indexers, then all encoders, then the assembler.
pipeline = Pipeline(stages=[*indexers, *encoders, assembler])
model = pipeline.fit(df)
# Apply the fitted stages, copy the target column to "label", and keep only
# the assembled features and the label.
data = (
    model.transform(df)
    .withColumn('label', col(labelCol))
    .select('features', 'label')
)
data.show(5, truncate=False)

23/04/19 17:23:13 WARN package: Truncated the string representation of a plan since it was too large. This behavior can be adjusted by setting 'spark.sql.debug.maxToStringFields'.
+---------------------------------------------------------------------------------------------------------+-----+
|features                                                                                                 |label|
+---------------------------------------------------------------------------------------------------------+-----+
|(29,[8,11,15,16,18,19,21,24,25,26,27],[1.0,1.0,1.0,1.0,1.0,1.0,1.0,1787.0,79.0,1.0,-1.0])                |no   |
|(29,[4,11,13,16,17,19,22,24,25,26,27,28],[1.0,1.0,1.0,1.0,1.0,1.0,1.0,4789.0,220.0,1.0,339.0,4.0])       |no   |
|(29,[0,12,14,16,17,18,19,22,24,25,26,27,28],[1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1350.0,185.0,1.0,330.0,1.0])|no   |
|(29,[0,11,14,16,17,20,21,24,25,26,27],[1.0,1.0,1.0,1.0,1.0,1.0,1.0,1476.0,199.0,4.0,-1.0])               |no   |
|(29,[1,11,13,16,17,18

### The label is a string (`yes` or `no`), so it must be converted to a numeric index

### Build StringIndexer stages

In [11]:
# Fit a StringIndexer on the string labels, then append the numeric
# "indexedLabel" column (with label metadata) to the frame.
label_indexer_stage = StringIndexer(inputCol='label', outputCol='indexedLabel')
labelIndexer = label_indexer_stage.fit(data)
data = labelIndexer.transform(data)

In [12]:
# Preview the frame, which now includes the indexedLabel column.
data.show(5)

+--------------------+-----+------------+
|            features|label|indexedLabel|
+--------------------+-----+------------+
|(29,[8,11,15,16,1...|   no|         0.0|
|(29,[4,11,13,16,1...|   no|         0.0|
|(29,[0,12,14,16,1...|   no|         0.0|
|(29,[0,11,14,16,1...|   no|         0.0|
|(29,[1,11,13,16,1...|   no|         0.0|
+--------------------+-----+------------+
only showing top 5 rows



In [13]:
from pyspark.ml.feature import VectorIndexer

# Identify categorical features automatically and index them, updating the
# column metadata. With maxCategories=4, features with more than 4 distinct
# values are treated as continuous.
featureIndexer = VectorIndexer(
    inputCol="features",
    outputCol="indexedFeatures",
    maxCategories=4,
).fit(data)


In [14]:
# Append the "indexedFeatures" column produced by the fitted VectorIndexer.
# NOTE(review): `data` is overwritten with its own transform, so this cell is
# presumably not safe to run twice on the same kernel state (the output column
# would already exist) — confirm.
data=featureIndexer.transform(data)
data.show(5)

+--------------------+-----+------------+--------------------+
|            features|label|indexedLabel|     indexedFeatures|
+--------------------+-----+------------+--------------------+
|(29,[8,11,15,16,1...|   no|         0.0|(29,[8,11,15,16,1...|
|(29,[4,11,13,16,1...|   no|         0.0|(29,[4,11,13,16,1...|
|(29,[0,12,14,16,1...|   no|         0.0|(29,[0,12,14,16,1...|
|(29,[0,11,14,16,1...|   no|         0.0|(29,[0,11,14,16,1...|
|(29,[1,11,13,16,1...|   no|         0.0|(29,[1,11,13,16,1...|
+--------------------+-----+------------+--------------------+
only showing top 5 rows



### Split the data into training and test sets

In [15]:
# Split the data into training and test sets (40% held out for testing).
# A fixed seed makes the split repeatable, so the reported metrics do not
# change from one run to the next.
SPLIT_SEED = 42
(trainingData, testData) = data.randomSplit([0.6, 0.4], seed=SPLIT_SEED)
trainingData.show(5, truncate=False)
testData.show(5, truncate=False)

+------------------------------------------------------------------------------------------------+-----+------------+------------------------------------------------------------------------------------------------+
|features                                                                                        |label|indexedLabel|indexedFeatures                                                                                 |
+------------------------------------------------------------------------------------------------+-----+------------+------------------------------------------------------------------------------------------------+
|(29,[0,11,13,16,17,18,19,21,24,25,26,27],[1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,-588.0,81.0,4.0,-1.0])|no   |0.0         |(29,[0,11,13,16,17,18,19,21,24,25,26,27],[1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,-588.0,81.0,4.0,-1.0])|
|(29,[0,11,13,16,17,18,19,21,24,25,26,27],[1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,-105.0,60.0,2.0,-1.0])|no   |0.0         |(29,[0,11,13,16,17,18,1

## Build the logistic regression model

### Estimator

In [16]:
from pyspark.ml.classification import LogisticRegression

# Logistic-regression estimator that reads the indexed feature vector and the numeric label.
logr = LogisticRegression(
    featuresCol='indexedFeatures',
    labelCol='indexedLabel',
)

### Pipeline Architecture

In [17]:
from pyspark.ml.feature import IndexToString

# Map the numeric "prediction" column back to the original label strings,
# using the labels learned by labelIndexer.
labelConverter = IndexToString(
    inputCol="prediction",
    outputCol="predictedLabel",
    labels=labelIndexer.labels,
)


In [18]:
# Chain the logistic-regression estimator and the label converter in a Pipeline.
# NOTE: this rebinds `pipeline`, which previously held the feature-engineering pipeline.
pipeline = Pipeline(stages=[logr,labelConverter])

In [19]:
 # Train the model: fit the pipeline (logistic regression + label converter) on the training split.
 # The label and feature indexers are not part of this pipeline; they were already applied to `data`.
 # NOTE: this rebinds `model`, which previously held the fitted feature-engineering pipeline.
model = pipeline.fit(trainingData)

### Make predictions

In [20]:
# Score the held-out test set with the fitted pipeline.
predictions = model.transform(testData)
# Show a few example rows: input features, true label, predicted label and raw scores.
preview_cols = ["features", "label", "predictedLabel", "rawPrediction"]
predictions.select(*preview_cols).show(5)

+--------------------+-----+--------------+--------------------+
|            features|label|predictedLabel|       rawPrediction|
+--------------------+-----+--------------+--------------------+
|(29,[0,11,13,16,1...|   no|            no|[3.47194124746098...|
|(29,[0,11,13,16,1...|   no|            no|[0.41059638603484...|
|(29,[0,11,13,16,1...|   no|            no|[3.57423703214259...|
|(29,[0,11,13,16,1...|   no|            no|[3.19746826464175...|
|(29,[0,11,13,16,1...|   no|            no|[2.10430104061696...|
+--------------------+-----+--------------+--------------------+
only showing top 5 rows



### Evaluation

In [21]:
from pyspark.ml.evaluation import MulticlassClassificationEvaluator

# Accuracy on the test predictions: compare the numeric prediction column
# with the indexed true label, then report the complementary error rate.
evaluator = MulticlassClassificationEvaluator(
    labelCol="indexedLabel",
    predictionCol="prediction",
    metricName="accuracy",
)
accuracy = evaluator.evaluate(predictions)
test_error = 1.0 - accuracy
print(f"Accuracy = {accuracy}")
print("Test Error = %g" % test_error)


Accuracy = 0.8996579247434435
Test Error = 0.100342


### Evaluate the model on the training data

area under ROC  https://developers.google.com/machine-learning/crash-course/classification/roc-and-auc

accuracy

False positive rate by label

True positive rate by label

Precision by label

Recall by label

F-measure by label


In [22]:
# Stage 0 of the fitted pipeline is the logistic-regression model (stage 1 is the label converter).
lrModel = model.stages[0]
# Summary object that exposes the metrics reported in the following cells.
# NOTE(review): presumably computed on the training split the model was fitted on — confirm.
trainingSummary = lrModel.summary


In [23]:
# Show the receiver-operating characteristic as a DataFrame, then print the
# area under the curve and the accuracy from the summary.
# (trainingSummary.objectiveHistory holds the per-iteration objective if it is needed.)
trainingSummary.roc.show(5)
print(f"areaUnderROC: {trainingSummary.areaUnderROC}")
print(f"accuracy: {trainingSummary.accuracy}")


+--------------------+--------------------+
|                 FPR|                 TPR|
+--------------------+--------------------+
|                 0.0|                 0.0|
|                 0.0|0.006153846153846154|
|4.095004095004095E-4|0.009230769230769232|
| 8.19000819000819E-4|0.012307692307692308|
|0.001228501228501...|0.015384615384615385|
+--------------------+--------------------+
only showing top 5 rows

areaUnderROC: 0.8859264159264151
accuracy: 0.9013371882905674


In [24]:
# Display the first rows of the test-set predictions with every column
# (features, labels, raw scores, probabilities and predictions).
predictions.show()

+--------------------+-----+------------+--------------------+--------------------+--------------------+----------+--------------+
|            features|label|indexedLabel|     indexedFeatures|       rawPrediction|         probability|prediction|predictedLabel|
+--------------------+-----+------------+--------------------+--------------------+--------------------+----------+--------------+
|(29,[0,11,13,16,1...|   no|         0.0|(29,[0,11,13,16,1...|[3.47194124746098...|[0.96987878174128...|       0.0|            no|
|(29,[0,11,13,16,1...|   no|         0.0|(29,[0,11,13,16,1...|[0.41059638603484...|[0.60123087240682...|       0.0|            no|
|(29,[0,11,13,16,1...|   no|         0.0|(29,[0,11,13,16,1...|[3.57423703214259...|[0.97272781620283...|       0.0|            no|
|(29,[0,11,13,16,1...|   no|         0.0|(29,[0,11,13,16,1...|[3.19746826464175...|[0.96073889228755...|       0.0|            no|
|(29,[0,11,13,16,1...|   no|         0.0|(29,[0,11,13,16,1...|[2.10430104061696...|

In [25]:
# Metrics can be inspected per label (this also works in the multiclass case).
print("False positive rate by label:")
for label_index, rate in enumerate(trainingSummary.falsePositiveRateByLabel):
    print("label {}:{}".format(label_index, rate))

False positive rate by label:
label 0:0.68
label 1:0.021294021294021293


In [26]:
# True positive rate for each label index, read from the model summary.
print("True positive rate by label:")
for label_index, rate in enumerate(trainingSummary.truePositiveRateByLabel):
    print("label {}:{}".format(label_index, rate))


True positive rate by label:
label 0:0.9787059787059788
label 1:0.32


In [27]:
# Precision for each label index, read from the model summary.
print("Precision by label:")
for label_index, value in enumerate(trainingSummary.precisionByLabel):
    print("label {}:{}".format(label_index, value))


Precision by label:
label 0:0.9153581003446956
label 1:0.6666666666666666


In [28]:
# Recall for each label index, read from the model summary.
print("Recall by label:")
for label_index, value in enumerate(trainingSummary.recallByLabel):
    print("label {}:{}".format(label_index, value))


Recall by label:
label 0:0.9787059787059788
label 1:0.32


In [29]:
# F-measure for each label index; fMeasureByLabel() is a method call, so
# evaluate it once and iterate over the returned values.
print("F-measure by label:")
for label_index, value in enumerate(trainingSummary.fMeasureByLabel()):
    print("label {}:{}".format(label_index, value))

F-measure by label:
label 0:0.9459726894913913
label 1:0.43243243243243246


In [30]:
# Aggregate (label-weighted) metrics from the model summary.
# NOTE: `accuracy` is rebound here; it previously held the test-set accuracy
# computed by the evaluator.
accuracy = trainingSummary.accuracy
falsePositiveRate = trainingSummary.weightedFalsePositiveRate
truePositiveRate = trainingSummary.weightedTruePositiveRate
fMeasure = trainingSummary.weightedFMeasure()
precision = trainingSummary.weightedPrecision
recall = trainingSummary.weightedRecall
# One metric per line; the final label is "Recall".
print(
    "Accuracy: {0}\nFPR: {1}\nTPR: {2}\nF-measure: {3}\nPrecision: {4}\nRecall: {5}".format(
        accuracy, falsePositiveRate, truePositiveRate, fMeasure, precision, recall
    )
)


Accuracy: 0.9013371882905674
FPR: 0.6026312095845887
TPR: 0.9013371882905674
F-measure: 0.8856544446254131
Precision: 0.8861478668985953
Rec all: 0.9013371882905674


### Here are the weights $w$ and the intercept $b$ of the linear predictor $z = w \cdot x + b$

In [31]:
# Fitted coefficient vector of the logistic-regression model (one weight per feature-vector entry).
lrModel.coefficients

DenseVector([-0.6348, -0.9714, -0.5636, -0.3424, -0.7155, 0.0696, -0.5471, -1.0791, -1.1008, -0.5507, 0.1898, -0.3011, -0.1876, 0.5846, 0.9396, 0.5759, -0.26, -0.5714, 0.5781, 0.0163, -1.2559, -2.5878, -2.6568, -1.8953, 0.0, 0.0041, -0.094, 0.0005, -0.044])

In [32]:
# Fitted intercept (bias term) of the logistic-regression model.
lrModel.intercept

-0.43740267340031425