In [1]:
from pyspark import SparkContext, SparkConf
from pyspark.sql import SparkSession
from pyspark.ml.classification import LogisticRegression

In [2]:
sc = SparkContext(conf=SparkConf())
spark = SparkSession(sparkContext=sc)

Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).


23/04/19 17:29:24 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


In [3]:
#Load training data
data = spark.read.format("libsvm").load("data/SparkData/sample_multiclass_classification_data.txt")

23/04/19 17:29:28 WARN LibSVMFileFormat: 'numFeatures' option not specified, determining the number of features by going though the input. If you know the number in advance, please specify it via 'numFeatures' option to avoid the extra scan.


In [4]:
data.select('label').distinct().show()

+-----+
|label|
+-----+
|  0.0|
|  1.0|
|  2.0|
+-----+



In [5]:
data.show()

+-----+--------------------+
|label|            features|
+-----+--------------------+
|  1.0|(4,[0,1,2,3],[-0....|
|  1.0|(4,[0,1,2,3],[-0....|
|  1.0|(4,[0,1,2,3],[-0....|
|  1.0|(4,[0,1,2,3],[-0....|
|  0.0|(4,[0,1,2,3],[0.1...|
|  1.0|(4,[0,2,3],[-0.83...|
|  2.0|(4,[0,1,2,3],[-1....|
|  2.0|(4,[0,1,2,3],[-1....|
|  1.0|(4,[0,1,2,3],[-0....|
|  0.0|(4,[0,2,3],[0.611...|
|  0.0|(4,[0,1,2,3],[0.2...|
|  1.0|(4,[0,1,2,3],[-0....|
|  1.0|(4,[0,1,2,3],[-0....|
|  2.0|(4,[0,1,2,3],[-0....|
|  2.0|(4,[0,1,2,3],[-0....|
|  2.0|(4,[0,1,2,3],[-0....|
|  1.0|(4,[0,2,3],[-0.94...|
|  2.0|(4,[0,1,2,3],[-0....|
|  0.0|(4,[0,1,2,3],[0.1...|
|  2.0|(4,[0,1,2,3],[-0....|
+-----+--------------------+
only showing top 20 rows



In [6]:
trainingData, testData = data.randomSplit([0.8,0.2])

In [7]:
lr = LogisticRegression(maxIter=10,regParam=0.3,elasticNetParam=0.8, \
                        featuresCol='features', labelCol='label')

In [8]:
#Fit the model
model=lr.fit(trainingData)

23/04/19 17:29:41 WARN InstanceBuilder$NativeBLAS: Failed to load implementation from:dev.ludovic.netlib.blas.JNIBLAS
23/04/19 17:29:41 WARN InstanceBuilder$NativeBLAS: Failed to load implementation from:dev.ludovic.netlib.blas.ForeignLinkerBLAS


In [9]:
# Make predictions.
predictions = model.transform(testData)
# Select example rows to display. 
predictions.show(5)
#predictions.show(5, False)

+-----+--------------------+--------------------+--------------------+----------+
|label|            features|       rawPrediction|         probability|prediction|
+-----+--------------------+--------------------+--------------------+----------+
|  0.0|(4,[0,1,2,3],[-0....|[0.23312639945627...|[0.43607729162398...|       0.0|
|  0.0|(4,[0,1,2,3],[-0....|[0.36153537709673...|[0.47927564629652...|       0.0|
|  0.0|(4,[0,1,2,3],[-0....|[0.20744472720070...|[0.42737998054428...|       0.0|
|  0.0|(4,[0,1,2,3],[-1....|[0.20744472720070...|[0.42331846871949...|       0.0|
|  0.0|(4,[0,1,2,3],[0.0...|[0.33585339665986...|[0.47600553321122...|       0.0|
+-----+--------------------+--------------------+--------------------+----------+
only showing top 5 rows



In [10]:
predictions.select('label', 'rawPrediction', 'probability', 'prediction').filter('label = 2.0').show(5, False)

+-----+---------------------------------------------------------------+------------------------------------------------------------+----------+
|label|rawPrediction                                                  |probability                                                 |prediction|
+-----+---------------------------------------------------------------+------------------------------------------------------------+----------+
|2.0  |[0.0019904246122268987,0.00927824373871125,0.12291786546544656]|[0.3188920452345591,0.321224561937253,0.359883392828188]    |2.0       |
|2.0  |[0.13039940225268692,-0.4077311753784001,0.12291786546544656]  |[0.38814073944016914,0.22661157460288997,0.3852476859569409]|0.0       |
|2.0  |[0.027672096867798986,-0.10010548192079238,0.12291786546544656]|[0.33557291915227994,0.2953206451368201,0.36910643571089996]|2.0       |
|2.0  |[0.07903573712287779,-0.2669093097923858,0.12291786546544656]  |[0.36331789152363697,0.2570660712276313,0.3796160372487318] |2.0 

In [11]:
# Print the coefficients and intercept for multinomial logistic regression 
print("Coefficients: {}".format(model.coefficientMatrix)) 
print("Intercepts: {}".format(model.interceptVector))

Coefficients: 3 X 4 CSRMatrix
(0,3) 0.3082
(1,2) -0.7665
(1,3) -0.3772
Intercepts: [0.07903574956024295,-0.2019536150256895,0.12291786546544656]


In [12]:
trainingSummary = model.summary

In [13]:
# for multiclass, we can inspect metrics on a per-label basis 
print("False positive rate by label:")
for i in range(len(trainingSummary.falsePositiveRateByLabel)):
    print("label {}:{}".format(i,trainingSummary.falsePositiveRateByLabel[i]))

False positive rate by label:
label 0:0.17567567567567569
label 1:0.0
label 2:0.0136986301369863


In [14]:
print("True positive rate by label:")
for i in range(len(trainingSummary.truePositiveRateByLabel)):
    print("label {}:{}".format(i,trainingSummary.truePositiveRateByLabel[i]))


True positive rate by label:
label 0:0.9743589743589743
label 1:1.0
label 2:0.675


In [15]:
print("Precision by label:")
for i in range(len(trainingSummary.precisionByLabel)):
    print("label {}:{}".format(i,trainingSummary.precisionByLabel[i]))


Precision by label:
label 0:0.7450980392156863
label 1:1.0
label 2:0.9642857142857143


In [16]:
print("Recall by label:")
for i in range(len(trainingSummary.recallByLabel)):
    print("label {}:{}".format(i,trainingSummary.recallByLabel[i]))


Recall by label:
label 0:0.9743589743589743
label 1:1.0
label 2:0.675


In [17]:
print("F-measure by label:")
for i in range(len(trainingSummary.fMeasureByLabel())):
    print("label {}:{}".format(i,trainingSummary.fMeasureByLabel()[i])) 

F-measure by label:
label 0:0.8444444444444443
label 1:1.0
label 2:0.7941176470588236


In [18]:
accuracy = trainingSummary.accuracy

In [19]:
falsePositiveRate = trainingSummary.weightedFalsePositiveRate 
truePositiveRate = trainingSummary.weightedTruePositiveRate 
fMeasure = trainingSummary.weightedFMeasure()
precision = trainingSummary.weightedPrecision
recall = trainingSummary.weightedRecall
print("Accuracy: {0}\nFPR: {1}\nTPR: {2}\nF-measure: {3}\nPrecision: {4}\nRec all: {5}".format(accuracy,falsePositiveRate,truePositiveRate,fMeasure,precision,recall))


Accuracy: 0.8761061946902655
FPR: 0.06548050050292747
TPR: 0.8761061946902655
F-measure: 0.8734339753600555
Precision: 0.8993827619543393
Rec all: 0.8761061946902655


In [20]:
from pyspark.ml.evaluation import MulticlassClassificationEvaluator
# Select (prediction, true label) and compute test error
evaluator = MulticlassClassificationEvaluator(
    labelCol="label", predictionCol="prediction", metricName="accuracy")
accuracy = evaluator.evaluate(predictions)
print("Test Error = %g" % (1.0 - accuracy))


Test Error = 0.0540541


### There are 3 lines determines 3 classes (3 predictive values), hence, 3 slopes and 3 intercepts. 

In [26]:
model.coefficientMatrix.toDense()

DenseMatrix(3, 4, [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, -0.7665, 0.0, 0.3082, -0.3772, 0.0], False)

In [24]:
model.interceptVector

DenseVector([0.079, -0.202, 0.1229])