In [None]:
# (1) Import the required Python dependencies
%matplotlib inline
import matplotlib.pyplot as plt
import pandas as pd
from pyspark import SparkContext, SparkConf
from pyspark.sql import SQLContext
from pyspark.sql.functions import col
from pyspark.ml.feature import VectorAssembler
from pyspark.ml.feature import StringIndexer
from pyspark.ml.classification import LogisticRegression
from pyspark.ml.evaluation import BinaryClassificationEvaluator
from pyspark.mllib.evaluation import MulticlassMetrics

In [None]:
# (2) Instantiate a Spark Context
# Configure a local-master Spark application for this notebook.
conf = SparkConf().setMaster("local").setAppName("Logistic Regression - Breast Cancer")
# getOrCreate() returns the already-active SparkContext when there is one, so
# re-running this cell no longer fails with "Cannot run multiple SparkContexts
# at once"; on a fresh kernel it creates the context from `conf` as before.
sc = SparkContext.getOrCreate(conf=conf)
# SQL entry point used below to read the CSV file into a DataFrame.
sqlContext = SQLContext(sc)

In [None]:
# (3) Load the Breast Cancer dataset (data/breast-cancer-data/dataR2.csv) into a Spark DataFrame
# Read the CSV with a header row and inferred column types, then rename the
# 'MCP.1' column to 'MCP_1' (presumably to avoid the dot in the column name).
breast_cancer_df = (
    sqlContext.read
    .format('com.databricks.spark.csv')
    .options(header='true', inferschema='true')
    .load('./data/breast-cancer-data/dataR2.csv')
    .withColumnRenamed('MCP.1', 'MCP_1')
)

# Fit a StringIndexer on the 'Classification' column and append its output as
# the 'label' column that the model is trained on further down.
indexer = StringIndexer(
    inputCol="Classification",
    outputCol="label",
).fit(breast_cancer_df)
breast_cancer_df = indexer.transform(breast_cancer_df)

# Preview the first 10 rows, including the new 'label' column.
breast_cancer_df.show(10)

In [None]:
# (4) Calculate standard statistical descriptive analytics on the raw Breast Cancer Spark DataFrame
# describe() is computed by Spark; the small result is pulled into pandas and
# transposed so that each DataFrame column is shown as one row.
breast_cancer_df.describe().toPandas().T

In [None]:
# (5) Generate Input Feature Vectors from the Raw Spark DataFrame
# Columns that are packed into the model's input vector.
feature_columns = [
    'Age',
    'BMI',
    'Glucose',
    'Insulin',
    'HOMA',
    'Leptin',
    'Adiponectin',
    'Resistin',
    'MCP_1',
]
# Name of the indexed target column.
label_column = 'label'

# Assemble the feature columns into a single 'features' vector column and keep
# only that vector together with the label.
vector_assembler = VectorAssembler(inputCols=feature_columns, outputCol='features')
breast_cancer_features_df = (
    vector_assembler
    .transform(breast_cancer_df)
    .select('features', label_column)
)
breast_cancer_features_df.head(10)

In [None]:
# (6) Split the Raw DataFrame into a Training DataFrame and a Test DataFrame
# 75% / 25% random split; the fixed seed keeps the split repeatable across runs.
train_df, test_df = breast_cancer_features_df.randomSplit(
    [0.75, 0.25],
    seed=12345,
)
# Display the number of rows that ended up in each split.
(train_df.count(), test_df.count())

In [None]:
# (7) Train a Logistic Regression Model on the Training DataFrame
# The estimator reads the assembled 'features' vector and the indexed label column.
logistic_regression = LogisticRegression(
    featuresCol='features',
    labelCol=label_column,
)
# Fitting on the training split yields the model used in the cells below.
logistic_regression_model = logistic_regression.fit(train_df)

In [None]:
# (8) Output Logistic Regression Model Summary Statistics to evaluate the Training Model
# Pull the fitted parameters off the model before printing them.
coefficient_matrix = logistic_regression_model.coefficientMatrix
intercept_vector = logistic_regression_model.interceptVector
print("Model Coefficients: " + str(coefficient_matrix))
print("Intercept: " + str(intercept_vector))

In [None]:
# (9) Apply the Trained Logistic Regression Model to the Test DataFrame to make predictions
# transform() returns the test rows together with the model's output columns.
test_logistic_regression_predictions_df = logistic_regression_model.transform(test_df)

print("TEST DATASET PREDICTIONS AGAINST ACTUAL LABEL: ")
# Show the model outputs next to the actual label and the input features.
(
    test_logistic_regression_predictions_df
    .select("probability", "rawPrediction", "prediction", label_column, "features")
    .show()
)

In [None]:
# (10) Evaluate the performance of our Logistic Regression Model on the Test DataFrame using Area under a ROC curve
# Evaluate the fitted model on the held-out split and pull the ROC points
# (columns 'FPR' and 'TPR') into pandas for plotting.
test_summary = logistic_regression_model.evaluate(test_df)
roc = test_summary.roc.toPandas()

# FPR is plotted on the x-axis and TPR on the y-axis, so the axis labels must
# follow that order (they were previously swapped).
plt.plot(roc['FPR'], roc['TPR'])
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('ROC Curve')
plt.show()

# Area under the ROC curve, computed from the raw prediction column of the
# test-set predictions.
evaluator_roc_area = BinaryClassificationEvaluator(rawPredictionCol = "rawPrediction", labelCol = label_column, metricName = "areaUnderROC")
print("Area Under ROC Curve on Test Data = %g" % evaluator_roc_area.evaluate(test_logistic_regression_predictions_df))

In [None]:
# (11) Generate a Confusion/Classification Matrix
def _safe_ratio(numerator, denominator):
    """Return numerator / denominator as a float, or NaN when the denominator is zero."""
    if not denominator:
        # Avoids ZeroDivisionError when the test split has no rows of a class
        # (or no rows at all); "%g" renders the result as "nan".
        return float("nan")
    return float(numerator) / denominator


# Count every (prediction, label) combination in a single aggregation instead
# of running one filter + count job per cell of the matrix.
confusion_counts = {
    (row["prediction"], row[label_column]): row["count"]
    for row in test_logistic_regression_predictions_df
        .groupBy("prediction", label_column)
        .count()
        .collect()
}

# Every row belongs to exactly one group, so the group sizes sum to the total
# number of test rows.
N = sum(confusion_counts.values())

# Label 1.0 is treated as the positive class; combinations that never occur
# are counted as 0.
# NOTE(review): which original 'Classification' value is indexed as 1.0 is
# decided by the StringIndexer fit earlier in the notebook - confirm it is the
# class you intend to call "positive".
true_positives = confusion_counts.get((1.0, 1.0), 0)
true_negatives = confusion_counts.get((0.0, 0.0), 0)
false_positives = confusion_counts.get((1.0, 0.0), 0)
false_negatives = confusion_counts.get((0.0, 1.0), 0)

print("N = %g" % N)
print("Overall Accuracy = %g" % _safe_ratio(true_negatives + true_positives, N))
print("Overall Error Rate = %g" % _safe_ratio(false_negatives + false_positives, N))
print("Sensitivity = %g" % _safe_ratio(true_positives, true_positives + false_negatives))
print("Specificity = %g" % _safe_ratio(true_negatives, true_negatives + false_positives))

In [None]:
# (12) Alternatively we can generate the same Classification Matrix using the MLLib RDD API (Maintenance Mode as of Spark 2.3.2)
# MulticlassMetrics is fed an RDD, so hand it the (prediction, label) pairs of
# the test predictions as an RDD.
predictions_and_label = (
    test_logistic_regression_predictions_df
    .select(["prediction", "label"])
    .rdd
)
metrics = MulticlassMetrics(predictions_and_label)

# N is the test-set row count computed in the previous cell.
print("N = %g" % N)
print(metrics.confusionMatrix())

In [None]:
# (13) Stop the Spark Context
# Shut down the SparkContext created in step (2); cells that use `sc`,
# `sqlContext` or the DataFrames derived from them need step (2) onwards to be
# re-run after this point.
sc.stop()