In [53]:
# One-time environment setup: uncomment and run these lines if the packages
# are missing from the kernel's conda environment, then comment them out again.
# NOTE(review): findspark is installed here but is not imported by this notebook.
# %conda install -y openjdk
# %conda install -y pyspark
# %conda install -y -c conda-forge findspark

In [54]:
from pyspark.sql import SparkSession
from pyspark.ml.evaluation import MulticlassClassificationEvaluator
from pyspark.ml.feature import VectorAssembler
from sklearn.metrics import confusion_matrix
from pyspark.ml.classification import FMClassifier, FMClassificationSummary
from sklearn.datasets import load_breast_cancer
from pandas import DataFrame, Series

In [55]:
# Create the Spark session for this notebook, or reuse one that is already running.
spark = (
    SparkSession.builder
    .appName("FMClassifier Example")
    .getOrCreate()
)

23/12/18 14:18:00 WARN Utils: Service 'SparkUI' could not bind on port 4040. Attempting port 4041.


### Preparing the data

In [56]:
# Load the breast cancer dataset bundled with scikit-learn and print the
# description text that ships with it.
bc_dataset = load_breast_cancer()
print(bc_dataset.DESCR)

.. _breast_cancer_dataset:

Breast cancer wisconsin (diagnostic) dataset
--------------------------------------------

**Data Set Characteristics:**

    :Number of Instances: 569

    :Number of Attributes: 30 numeric, predictive attributes and the class

    :Attribute Information:
        - radius (mean of distances from center to points on the perimeter)
        - texture (standard deviation of gray-scale values)
        - perimeter
        - area
        - smoothness (local variation in radius lengths)
        - compactness (perimeter^2 / area - 1.0)
        - concavity (severity of concave portions of the contour)
        - concave points (number of concave portions of the contour)
        - symmetry
        - fractal dimension ("coastline approximation" - 1)

        The mean, standard error, and "worst" or largest (mean of the three
        worst/largest values) of these features were computed for each image,
        resulting in 30 features.  For instance, field 0 is Mean Radi

In [57]:
# List the fields available on the loaded dataset object; 'data', 'target'
# and 'feature_names' are the ones used in the cells below.
bc_dataset.keys()

dict_keys(['data', 'target', 'frame', 'target_names', 'DESCR', 'feature_names', 'filename', 'data_module'])

In [58]:
# Build a pandas frame with one column per feature plus the target as 'label',
# then hand it to Spark and display the resulting schema.
df_pandas = DataFrame(
    bc_dataset.data,
    columns=bc_dataset.feature_names,
).assign(label=bc_dataset.target)

df = spark.createDataFrame(df_pandas)
df.printSchema()

root
 |-- mean radius: double (nullable = true)
 |-- mean texture: double (nullable = true)
 |-- mean perimeter: double (nullable = true)
 |-- mean area: double (nullable = true)
 |-- mean smoothness: double (nullable = true)
 |-- mean compactness: double (nullable = true)
 |-- mean concavity: double (nullable = true)
 |-- mean concave points: double (nullable = true)
 |-- mean symmetry: double (nullable = true)
 |-- mean fractal dimension: double (nullable = true)
 |-- radius error: double (nullable = true)
 |-- texture error: double (nullable = true)
 |-- perimeter error: double (nullable = true)
 |-- area error: double (nullable = true)
 |-- smoothness error: double (nullable = true)
 |-- compactness error: double (nullable = true)
 |-- concavity error: double (nullable = true)
 |-- concave points error: double (nullable = true)
 |-- symmetry error: double (nullable = true)
 |-- fractal dimension error: double (nullable = true)
 |-- worst radius: double (nullable = true)
 |-- worst 

In [59]:
# Sanity check: report the column count (features + label) and preview a few rows.
n_columns = len(df.columns)
print("Number of columns:", n_columns)
df.show(5)

Number of columns: 31


[Stage 0:>                                                          (0 + 1) / 1]

+-----------+------------+--------------+---------+---------------+----------------+--------------+-------------------+-------------+----------------------+------------+-------------+---------------+----------+----------------+-----------------+---------------+--------------------+--------------+-----------------------+------------+-------------+---------------+----------+----------------+-----------------+---------------+--------------------+--------------+-----------------------+-----+
|mean radius|mean texture|mean perimeter|mean area|mean smoothness|mean compactness|mean concavity|mean concave points|mean symmetry|mean fractal dimension|radius error|texture error|perimeter error|area error|smoothness error|compactness error|concavity error|concave points error|symmetry error|fractal dimension error|worst radius|worst texture|worst perimeter|worst area|worst smoothness|worst compactness|worst concavity|worst concave points|worst symmetry|worst fractal dimension|label|
+-----------+-

                                                                                

In [60]:
# Pack the individual feature columns into a single vector column named 'features'.
features = bc_dataset.feature_names
va = VectorAssembler(inputCols=features, outputCol='features')

# Assemble, then keep only the two columns used by the training cells below.
va_df = va.transform(df).select(['features', 'label'])
va_df.show(5)

+--------------------+-----+
|            features|label|
+--------------------+-----+
|[17.99,10.38,122....|    0|
|[20.57,17.77,132....|    0|
|[19.69,21.25,130....|    0|
|[11.42,20.38,77.5...|    0|
|[20.29,14.34,135....|    0|
+--------------------+-----+
only showing top 5 rows



In [61]:
# train-test split (90% / 10%)
# A fixed seed keeps the split -- and therefore every metric reported below --
# the same across "Restart Kernel -> Run All" executions.
(train, test) = va_df.randomSplit([0.9, 0.1], seed=42)

### Training and testing

In [62]:
# train
# The estimator gets its own name so this cell can be re-run safely: `fmc` ends
# up bound to the fitted model, and calling `.fit` on that model (which is what
# a re-run did when the estimator was also called `fmc`) would fail.
# The explicit seed makes training reproducible between runs.
fm_estimator = FMClassifier(labelCol="label", stepSize=0.001, seed=42)
fmc = fm_estimator.fit(train)

# predict
# transform() appends the rawPrediction, probability and prediction columns.
pred = fmc.transform(test)
pred.show(5)

+--------------------+-----+--------------------+--------------------+----------+
|            features|label|       rawPrediction|         probability|prediction|
+--------------------+-----+--------------------+--------------------+----------+
|[9.029,17.33,58.7...|    1|[-5.5927500970868...|[0.00371094785825...|       1.0|
|[11.31,19.04,71.8...|    1|[-6.7154146600895...|[0.00121061593776...|       1.0|
|[11.76,21.6,74.72...|    1|[-4.6617818358611...|[0.00936115051921...|       1.0|
|[12.68,23.84,82.6...|    0|[4.39960268592619...|[0.98786680373943...|       0.0|
|[13.03,18.42,82.6...|    1|[-8.7119715090586...|[1.64576327414511...|       1.0|
+--------------------+-----+--------------------+--------------------+----------+
only showing top 5 rows



In [63]:
# evaluate prediction
# Accuracy is computed by Spark from the 'label' and 'prediction' columns of `pred`.
evaluator = MulticlassClassificationEvaluator(
    labelCol="label",
    predictionCol="prediction",
    metricName="accuracy",
)
acc = evaluator.evaluate(pred)

print("Prediction Accuracy: ", acc)

# Collect labels and predictions together in a single job so each label stays
# paired with its own prediction; two separate collect() calls would rely on
# both jobs returning rows in the same order.
rows = pred.select("label", "prediction").collect()

# Unwrap the Row objects into plain ints (Spark emits predictions as doubles).
y_orig = [int(row["label"]) for row in rows]
y_pred = [int(row["prediction"]) for row in rows]

# Pin the class order so the matrix is always 2x2 with rows/columns in
# [0, 1] order, even if one class is missing from the test split.
cm = confusion_matrix(y_orig, y_pred, labels=[0, 1])
print("Confusion Matrix:")
print(cm)

Prediction Accuracy:  0.9193548387096774
Confusion Matrix:
[[24  3]
 [ 2 33]]


In [64]:
# Stop the Spark session now that the evaluation has finished; cells above
# that use `spark` need the session-creation cell re-run before they work again.
spark.stop()