Code based on: Bowles, M. (2019). Machine Learning with Spark and Python: Essential Techniques for Predictive Analytics. John Wiley & Sons.

In [None]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from pandas import DataFrame
from pyspark.sql import SparkSession
from pyspark.ml import Pipeline
from pyspark.ml.feature import VectorAssembler, StandardScaler, StringIndexer
from pyspark.ml.classification import LogisticRegression
from pyspark.ml.evaluation import BinaryClassificationEvaluator

In [None]:
# Obtain (or reuse) the Spark session that every later cell works through.
spark = (
    SparkSession.builder
    .appName("connectionist-bench")
    .getOrCreate()
)

In [None]:
# Read the sonar data as CSV: the file has no header row, and column types
# are inferred by Spark rather than declared here.
rm = (
    spark.read.format("csv")
    .option("header", "False")
    .option("inferSchema", "True")
    .load("Data/connectionist-bench/sonar.all-data")
)

In [None]:
# Every column except the final one is used as a model input.
inputs = rm.columns[:-1]

# Encode the string class column "_c60" (presumably the last column of the
# file -- confirm against the data) as a numeric 'label' column.
string_indexer_class = StringIndexer(inputCol="_c60", outputCol="label")

In [None]:
# Pack the individual input columns into one vector column named "features".
vecAssembler = VectorAssembler(
    inputCols=inputs,
    outputCol="features",
)

# Pipeline order: index the class label first, then assemble the features.
stages = [
    string_indexer_class,
    vecAssembler,
]

In [None]:
# Fit the preprocessing pipeline on the raw frame and apply it to that same
# frame, producing the 'label' and 'features' columns.
pipeline = Pipeline(stages=stages)
pipeline_mod = pipeline.fit(rm)
rm_transfd = pipeline_mod.transform(rm)

# Keep only the two columns the classifier consumes.
inp_rm = rm_transfd.select("features", "label")

In [None]:
# Quick sanity check: print the first three prepared rows.
inp_rm.show(n=3)

In [None]:
# Random train/test split with weights 0.7 / 0.3; the fixed seed keeps the
# split reproducible between runs.
train_rm, test_rm = inp_rm.randomSplit(weights=[0.7, 0.3], seed=1234)

In [None]:
# Logistic-regression estimator reading the assembled vector and the indexed
# label; the optimiser is capped at 10 iterations.
logr_rm = LogisticRegression(
    maxIter=10,
    featuresCol="features",
    labelCol="label",
)

In [None]:
# Train the classifier on the training partition only.
logr_mod_rm = logr_rm.fit(dataset=train_rm)

In [None]:
# Ascending copy of the fitted coefficients. np.sort returns a new array, so
# the model's own coefficient vector is left untouched.
weights_sorted = np.sort(logr_mod_rm.coefficients.toArray())

In [None]:
# Plot the coefficients in ascending order; the x-axis is just the rank of
# each coefficient within the sorted array.
plt.plot(weights_sorted)
plt.title('Ordered Coefficients')
plt.xlabel('Order')
plt.ylabel('Coefficient Value')
plt.show()

In [None]:
# Plot each coefficient against the index of the feature it belongs to.
n_weights = len(logr_mod_rm.coefficients)

# Use a plain range for the x-values: the previous list(zip(range(...)))
# produced a list of 1-tuples, i.e. an accidental (n, 1)-shaped x argument.
plt.plot(range(n_weights), logr_mod_rm.coefficients.toArray())
plt.xlabel('Feature Index')
plt.ylabel('Coefficient Value')
plt.title('Coefficients by Feature Index')
plt.show()

In [None]:
# Training-set summary of the fitted model.
train_rm_summary = logr_mod_rm.summary

# Bring the ROC points to the driver as a pandas frame so that matplotlib
# can plot them.
roc_rm = train_rm_summary.roc.toPandas()

In [None]:
# Training ROC curve: false-positive rate on x, true-positive rate on y.
plt.plot(roc_rm['FPR'], roc_rm['TPR'])
plt.title("Training ROC curve")
# The axis labels were previously swapped relative to the plotted columns.
plt.xlabel("FPR")
plt.ylabel("TPR")
plt.show()

# Left as the final expression so the notebook displays the training AUC.
train_rm_summary.areaUnderROC

In [None]:
# Score the held-out partition and preview a few predictions.
logr_rm_preds = logr_mod_rm.transform(test_rm)
logr_rm_preds.select('rawPrediction', 'prediction', 'probability').show(4)

# Area under the ROC curve on the test set. These arguments are the
# evaluator's defaults, spelled out so the reported metric is unambiguous.
binclass_eval = BinaryClassificationEvaluator(
    rawPredictionCol='rawPrediction',
    labelCol='label',
    metricName='areaUnderROC',
)
# Fixed label typo: the metric is AUC (area under curve), not "AOC".
print('Test AUC:', binclass_eval.evaluate(logr_rm_preds))