In [1]:
from pyspark import SparkContext, SparkConf
from pyspark.sql import SparkSession

In [None]:
#This is one time on your PC; you may need to run it everytime on colab and databrick notebooks
!pip install -U scikit-learn

In [2]:
from pyspark.sql import SQLContext
from pyspark.ml.classification import LinearSVC
from pyspark.ml.evaluation import MulticlassClassificationEvaluator
from pyspark.ml.feature import VectorAssembler
from sklearn.metrics import confusion_matrix

In [3]:
sc = SparkContext(conf=SparkConf())
spark = SparkSession(sparkContext=sc)

Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).


23/04/16 22:32:41 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
23/04/16 22:32:42 WARN Utils: Service 'SparkUI' could not bind on port 4040. Attempting port 4041.
23/04/16 22:32:42 WARN Utils: Service 'SparkUI' could not bind on port 4041. Attempting port 4042.


### Linear Support Vector Machine with pyspark¶

Import data

In [4]:
df = spark.read.csv('data/SparkData/bank.csv', header=True, inferSchema=True, sep=";")
df.drop('day','month','poutcome').show(5)

+---+-----------+-------+---------+-------+-------+-------+----+--------+--------+--------+-----+--------+---+
|age|        job|marital|education|default|balance|housing|loan| contact|duration|campaign|pdays|previous|  y|
+---+-----------+-------+---------+-------+-------+-------+----+--------+--------+--------+-----+--------+---+
| 30| unemployed|married|  primary|     no|   1787|     no|  no|cellular|      79|       1|   -1|       0| no|
| 33|   services|married|secondary|     no|   4789|    yes| yes|cellular|     220|       1|  339|       4| no|
| 35| management| single| tertiary|     no|   1350|    yes|  no|cellular|     185|       1|  330|       1| no|
| 30| management|married| tertiary|     no|   1476|    yes| yes| unknown|     199|       4|   -1|       0| no|
| 59|blue-collar|married|secondary|     no|      0|    yes|  no| unknown|     226|       1|   -1|       0| no|
+---+-----------+-------+---------+-------+-------+-------+----+--------+--------+--------+-----+--------+---+
o

In [5]:
df.printSchema()

root
 |-- age: integer (nullable = true)
 |-- job: string (nullable = true)
 |-- marital: string (nullable = true)
 |-- education: string (nullable = true)
 |-- default: string (nullable = true)
 |-- balance: integer (nullable = true)
 |-- housing: string (nullable = true)
 |-- loan: string (nullable = true)
 |-- contact: string (nullable = true)
 |-- day: integer (nullable = true)
 |-- month: string (nullable = true)
 |-- duration: integer (nullable = true)
 |-- campaign: integer (nullable = true)
 |-- pdays: integer (nullable = true)
 |-- previous: integer (nullable = true)
 |-- poutcome: string (nullable = true)
 |-- y: string (nullable = true)



### Deal with categorical data and Convert the data to dense vector¶

In [6]:
catcols = ['job','marital','education','default','housing','loan','contact','poutcome']
num_cols = ['balance', 'duration','campaign','pdays','previous']
labelCol = 'y'


### Process categorical columns

The following code does three things with pipeline:

StringIndexer all categorical columns

OneHotEncoder all categorical index columns

VectorAssembler all feature columns into one vector column

Categorical columns

In [7]:
from pyspark.ml.feature import StringIndexer, OneHotEncoder, VectorAssembler
from pyspark.ml import Pipeline
from pyspark.sql.functions import col

# categorical columns
categorical_columns = catcols

In [8]:
indexers = [ StringIndexer(inputCol=c, outputCol="{0}_indexed".format(c)) for c in categorical_columns ]

In [9]:
encoders = [ OneHotEncoder(inputCol=indexer.getOutputCol(), \
                           outputCol="{0}_encoded".format(indexer.getOutputCol())) for indexer in indexers ]

In [10]:
assembler = VectorAssembler(inputCols=[encoder.getOutputCol() \
                                       for encoder in encoders] + num_cols, outputCol="features")

In [11]:
pipeline = Pipeline(stages=indexers + encoders + [assembler])
model=pipeline.fit(df)
data = model.transform(df)
data = data.withColumn('label',col(labelCol))
data=data.select('features','label')
data.show(5, truncate=False)

23/04/16 22:33:00 WARN package: Truncated the string representation of a plan since it was too large. This behavior can be adjusted by setting 'spark.sql.debug.maxToStringFields'.
+---------------------------------------------------------------------------------------------------------+-----+
|features                                                                                                 |label|
+---------------------------------------------------------------------------------------------------------+-----+
|(29,[8,11,15,16,18,19,21,24,25,26,27],[1.0,1.0,1.0,1.0,1.0,1.0,1.0,1787.0,79.0,1.0,-1.0])                |no   |
|(29,[4,11,13,16,17,19,22,24,25,26,27,28],[1.0,1.0,1.0,1.0,1.0,1.0,1.0,4789.0,220.0,1.0,339.0,4.0])       |no   |
|(29,[0,12,14,16,17,18,19,22,24,25,26,27,28],[1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1350.0,185.0,1.0,330.0,1.0])|no   |
|(29,[0,11,14,16,17,20,21,24,25,26,27],[1.0,1.0,1.0,1.0,1.0,1.0,1.0,1476.0,199.0,4.0,-1.0])               |no   |
|(29,[1,11,13,16,17,18

### We need to deal with label, which is string, yes or no, need to make them numbers

Build StringIndexer stages

In [12]:
# Index labels, adding metadata to the label column 
labelIndexer = StringIndexer(inputCol='label',
                             outputCol='indexedLabel').fit(data)
data=labelIndexer.transform(data)

In [13]:
data.show(5)

+--------------------+-----+------------+
|            features|label|indexedLabel|
+--------------------+-----+------------+
|(29,[8,11,15,16,1...|   no|         0.0|
|(29,[4,11,13,16,1...|   no|         0.0|
|(29,[0,12,14,16,1...|   no|         0.0|
|(29,[0,11,14,16,1...|   no|         0.0|
|(29,[1,11,13,16,1...|   no|         0.0|
+--------------------+-----+------------+
only showing top 5 rows



In [14]:
from pyspark.ml.feature import VectorIndexer
# Automatically identify categorical features, and index them.
# Set maxCategories so features with > 4 distinct values are treated as continuous. 
featureIndexer =VectorIndexer(inputCol="features", \
                                  outputCol="indexedFeatures", \
                                  maxCategories=3).fit(data)


In [15]:
data=featureIndexer.transform(data)
data.show(5)

+--------------------+-----+------------+--------------------+
|            features|label|indexedLabel|     indexedFeatures|
+--------------------+-----+------------+--------------------+
|(29,[8,11,15,16,1...|   no|         0.0|(29,[8,11,15,16,1...|
|(29,[4,11,13,16,1...|   no|         0.0|(29,[4,11,13,16,1...|
|(29,[0,12,14,16,1...|   no|         0.0|(29,[0,12,14,16,1...|
|(29,[0,11,14,16,1...|   no|         0.0|(29,[0,11,14,16,1...|
|(29,[1,11,13,16,1...|   no|         0.0|(29,[1,11,13,16,1...|
+--------------------+-----+------------+--------------------+
only showing top 5 rows



### Split the data to training and test data sets¶

In [16]:
# Split the data into training and test sets (40% held out for testing)
(trainingData, testData) = data.randomSplit([0.6, 0.4])
trainingData.show(5,False)
testData.show(5,False)

+------------------------------------------------------------------------------------------------+-----+------------+------------------------------------------------------------------------------------------------+
|features                                                                                        |label|indexedLabel|indexedFeatures                                                                                 |
+------------------------------------------------------------------------------------------------+-----+------------+------------------------------------------------------------------------------------------------+
|(29,[0,11,13,16,17,18,19,21,24,25,26,27],[1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,11.0,104.0,3.0,-1.0]) |no   |0.0         |(29,[0,11,13,16,17,18,19,21,24,25,26,27],[1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,11.0,104.0,3.0,-1.0]) |
|(29,[0,11,13,16,17,18,19,21,24,25,26,27],[1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,117.0,635.0,1.0,-1.0])|no   |0.0         |(29,[0,11,13,16,17,18,1

### Build cross-validation model¶

In [22]:
from pyspark.ml.classification import LinearSVC
lsvc = LinearSVC(featuresCol="indexedFeatures", labelCol="indexedLabel", maxIter=50)

In [26]:
# Convert indexed labels back to original labels.
from pyspark.ml.feature import IndexToString
labelConverter = IndexToString(inputCol="prediction", outputCol="predictedLabel",
                               labels=labelIndexer.labels)


In [27]:
# Chain indexers and tree in a Pipeline
pipeline = Pipeline(stages=[lsvc,labelConverter])
# Train model.  This also runs the indexers.
lsvcModel = pipeline.fit(trainingData)

### Make predictions

In [28]:
# Make predictions.
predictions = lsvcModel.transform(testData)
# Select example rows to display. 
predictions.show(5)

+--------------------+-----+------------+--------------------+--------------------+----------+--------------+
|            features|label|indexedLabel|     indexedFeatures|       rawPrediction|prediction|predictedLabel|
+--------------------+-----+------------+--------------------+--------------------+----------+--------------+
|(29,[0,11,13,16,1...|   no|         0.0|(29,[0,11,13,16,1...|[1.51371088455641...|       0.0|            no|
|(29,[0,11,13,16,1...|   no|         0.0|(29,[0,11,13,16,1...|[1.51841371999673...|       0.0|            no|
|(29,[0,11,13,16,1...|   no|         0.0|(29,[0,11,13,16,1...|[0.62331574570784...|       0.0|            no|
|(29,[0,11,13,16,1...|   no|         0.0|(29,[0,11,13,16,1...|[1.35275732000305...|       0.0|            no|
|(29,[0,11,13,16,1...|  yes|         1.0|(29,[0,11,13,16,1...|[0.31426261086845...|       0.0|            no|
+--------------------+-----+------------+--------------------+--------------------+----------+--------------+
only showi

### Evaluation

In [29]:
from pyspark.ml.evaluation import MulticlassClassificationEvaluator
# Select (prediction, true label) and compute test error
evaluator = MulticlassClassificationEvaluator(
    labelCol="indexedLabel", predictionCol="prediction", metricName="accuracy")
accuracy = evaluator.evaluate(predictions)
print(f"Accuracy = {accuracy}")
print("Test Error = %g" % (1.0 - accuracy))


Accuracy = 0.9058370044052864
Test Error = 0.094163


### Get confusion matrix

In [32]:
y_pred=predictions.select("prediction").collect()
y_orig=predictions.select("indexedLabel").collect()


In [33]:
cm = confusion_matrix(y_orig, y_pred)
print("Confusion Matrix:")
print(cm) 

Confusion Matrix:
[[1596   25]
 [ 146   49]]


### Tear down machine learning pipeline

In [34]:
# Stop session 
sc.stop()  
