Some PySpark Supervised Learning Algorithms

In [18]:
# import Libraries here
from pyspark.sql import SparkSession
from pyspark.ml.feature import VectorAssembler
from pyspark.ml.regression import LinearRegression, DecisionTreeRegressor, RandomForestRegressor, GBTRegressor
from pyspark.ml.classification import LogisticRegression, DecisionTreeClassifier, RandomForestClassifier, GBTClassifier, LinearSVC, MultilayerPerceptronClassifier, OneVsRest, NaiveBayes
from pyspark.ml import Pipeline
from pyspark.ml.evaluation import MulticlassClassificationEvaluator
from pyspark.sql import functions as F

import pandas as pd
import numpy as np

In [3]:
filename = r'bank/bank-full.csv'

spark = SparkSession.builder.getOrCreate()
data = spark.read.csv(filename, header=True, inferSchema=True, sep=";")

data.show(5, truncate=False)

+---+------------+-------+---------+-------+-------+-------+----+-------+---+-----+--------+--------+-----+--------+--------+---+
|age|job         |marital|education|default|balance|housing|loan|contact|day|month|duration|campaign|pdays|previous|poutcome|y  |
+---+------------+-------+---------+-------+-------+-------+----+-------+---+-----+--------+--------+-----+--------+--------+---+
|58 |management  |married|tertiary |no     |2143   |yes    |no  |unknown|5  |may  |261     |1       |-1   |0       |unknown |no |
|44 |technician  |single |secondary|no     |29     |yes    |no  |unknown|5  |may  |151     |1       |-1   |0       |unknown |no |
|33 |entrepreneur|married|secondary|no     |2      |yes    |yes |unknown|5  |may  |76      |1       |-1   |0       |unknown |no |
|47 |blue-collar |married|unknown  |no     |1506   |yes    |no  |unknown|5  |may  |92      |1       |-1   |0       |unknown |no |
|33 |unknown     |single |unknown  |no     |1      |no     |no  |unknown|5  |may  |198    

In [4]:
# function to assemble individual columns to one column
def assemble_vectors(df, features_list, target_variable_name):
    stages = []
    assembler = VectorAssembler(inputCols=features_list, outputCol='features')
    stages = [assembler]
    # select all columns + target + newly created 'features
    selectedCols = [target_variable_name, 'features'] + features_list
    # use Pipeline to process Sequentially
    pipeline = Pipeline(stages=stages)
    # assembler model
    assembleModel = pipeline.fit(df)
    # apply assembler model on data
    df = assembleModel.transform(df).select(selectedCols)
    return df


Linear Regression

In [5]:
# select the Variables
linear_df = data.select(['age', 'balance', 'day', 'duration', 'campaign', 'pdays', 'previous'])
target_variable_name = 'balance'

# exclude the target varibles and select all other features
features_list = linear_df.columns
features_list.remove(target_variable_name)

# apply the above function on our DataFrame
df = assemble_vectors(linear_df, features_list, target_variable_name)

# fit the regression model
reg = LinearRegression(featuresCol='features', labelCol='balance')
reg_model = reg.fit(df)

# view the coefficients and intercepts for each variable
for k, v in df.schema['features'].metadata['ml_attr']['attrs'].items():
    features_df = pd.DataFrame(v)

# print coefficient and intercept
print(reg_model.coefficients, reg_model.intercept)
features_df['coefficients'] = reg_model.coefficients

# predict results
pred_result = reg_model.transform(df)

# the intercecpt is the last figure outside the brackets

[28.08397290892997,3.3055463619496286,0.24882841970901756,-14.142676297161454,-0.08248810233032043,23.462992800762525] 124.92130092818479


Logistic Regression

In [6]:
# convert the target variable to numeric values
data = data.select(['age', 'balance', 'day', 'duration', 'campaign', 'pdays', 'previous', 'y']).withColumn('y', F.when(data['y'] == 'no', 0).otherwise(1))
# data = data.withColumnRenamed('y', 'y1')
data.groupBy('y').count().show()

+---+-----+
|  y|count|
+---+-----+
|  1| 5289|
|  0|39922|
+---+-----+



In [7]:
target_variable_name = 'y'
logistic_df = data.select(['age', 'balance', 'day', 'duration', 'campaign', 'pdays', 'previous', 'y'])
features_list = logistic_df.columns
features_list.remove(target_variable_name)

df = assemble_vectors(logistic_df, features_list, target_variable_name)

# We will try both the Binary and multinomial fit using the same target, for testing.

binary_clf = LogisticRegression(featuresCol='features', labelCol='y', family='binomial')
multinomial_clf = LogisticRegression(featuresCol='features', labelCol='y', family='multinomial')

binary_clf_model = binary_clf.fit(df)
multinomial_clf_model = multinomial_clf.fit(df)

np.set_printoptions(precision=3, suppress=True)
print('Binary model coeff: \n', binary_clf_model.coefficients)
print(' ')
np.set_printoptions(precision=4, suppress=True)

print('Multinomial model coeff-matrix: \n', multinomial_clf_model.coefficientMatrix)
print(' ')
print('Binary model Intercept: ', binary_clf_model.intercept)
print('Multinomial model Intercept-vector: ', multinomial_clf_model.interceptVector)

Binary model coeff: 
 [0.007959289994272856,3.7181275569922365e-05,-0.0016500733126505523,0.0036371977017850476,-0.1280432835497321,0.00211357134928098,0.08593801086067758]
 
Multinomial model coeff-matrix: 
 DenseMatrix([[-0.004 , -0.    ,  0.0008, -0.0018,  0.064 , -0.0011, -0.043 ],
             [ 0.004 ,  0.    , -0.0008,  0.0018, -0.064 ,  0.0011,  0.043 ]])
 
Binary model Intercept:  -3.469901065557049
Multinomial model Intercept-vector:  [1.7349520795818267,-1.7349520795818267]


Decision Trees

In [8]:
# using Decision trees in PySpark
clf = DecisionTreeClassifier(featuresCol='features', labelCol='y', impurity='gini')
clf_model = clf.fit(df)

clf2 = DecisionTreeClassifier(featuresCol='features', labelCol='y', impurity='entropy')
clf_model2 = clf2.fit(df)

clf_predictions = clf_model.transform(df)
print('CLF_model: \n', clf_model.featureImportances)
print('CLF_model2: \n', clf_model2.featureImportances)


CLF_model: 
 (7,[0,2,3,4,5],[0.06239787428032645,0.012687037427051219,0.7317420892129068,0.0006485856970014362,0.1925244133827141])
CLF_model2: 
 (7,[0,2,3,4,5],[0.023055989762381938,0.0014458855774714733,0.9219831433075157,0.0006749598581260977,0.05284002149450472])


In [9]:
# Decision Tree Regressor

reg = DecisionTreeRegressor(featuresCol='features', labelCol='balance', impurity='variance')
reg_model = reg.fit(df)
reg_model_predictions = reg_model.transform(df)
print(reg_model.featureImportances)

(7,[0,1,2,3,4,6],[0.010801553129947131,0.9539639121009177,0.006382944598051706,0.014119130276031608,0.0026773820495634417,0.012055077845488276])


In [50]:
# shows the Decision making process, you can visualize using GraphViz
# clf_model.toDebugString
reg_model.toDebugString

'DecisionTreeRegressionModel: uid=DecisionTreeRegressor_8f0c71b415cc, depth=5, numNodes=53, numFeatures=7\n  If (feature 1 <= 7535.5)\n   If (feature 1 <= 1972.5)\n    If (feature 1 <= 679.5)\n     If (feature 1 <= 185.5)\n      If (feature 1 <= -318.5)\n       Predict: -634.4687045123726\n      Else (feature 1 > -318.5)\n       Predict: 22.46003274254395\n     Else (feature 1 > 185.5)\n      If (feature 1 <= 441.5)\n       Predict: 304.5719965918773\n      Else (feature 1 > 441.5)\n       Predict: 553.7191844300278\n    Else (feature 1 > 679.5)\n     If (feature 1 <= 1233.0)\n      If (feature 1 <= 902.5)\n       Predict: 784.483606557377\n      Else (feature 1 > 902.5)\n       Predict: 1058.079779917469\n     Else (feature 1 > 1233.0)\n      If (feature 1 <= 1680.0)\n       Predict: 1438.287089871612\n      Else (feature 1 > 1680.0)\n       Predict: 1819.8941267387945\n   Else (feature 1 > 1972.5)\n    If (feature 1 <= 3698.0)\n     If (feature 1 <= 2911.5)\n      If (feature 1 <= 23

Random Forests Classifier

In [10]:
# install 'GrpahViz' extension on VScode to view the graph
clf = RandomForestClassifier(featuresCol='features', labelCol='y')
clf_model = clf.fit(df)
print(clf_model.featureImportances)
print('-----------')
print(clf_model.toDebugString)


(7,[0,1,2,3,4,5,6],[0.07542788252752147,0.014268955724185812,0.014950726869614702,0.7167550064459548,0.006244545539471518,0.1260058581877463,0.04634702470550536])
-----------
RandomForestClassificationModel: uid=RandomForestClassifier_2a3d9469ab3c, numTrees=20, numClasses=2, numFeatures=7
  Tree 0 (weight 1.0):
    If (feature 3 <= 485.5)
     If (feature 6 <= 0.5)
      If (feature 0 <= 60.5)
       Predict: 0.0
      Else (feature 0 > 60.5)
       If (feature 3 <= 191.5)
        Predict: 0.0
       Else (feature 3 > 191.5)
        If (feature 3 <= 349.5)
         Predict: 0.0
        Else (feature 3 > 349.5)
         Predict: 1.0
     Else (feature 6 > 0.5)
      If (feature 3 <= 157.5)
       If (feature 4 <= 2.5)
        Predict: 0.0
       Else (feature 4 > 2.5)
        If (feature 5 <= 1.5)
         Predict: 1.0
        Else (feature 5 > 1.5)
         Predict: 0.0
      Else (feature 3 > 157.5)
       If (feature 0 <= 60.5)
        Predict: 0.0
       Else (feature 0 > 60.5)
    

In [55]:
# random forest regressor

reg = RandomForestRegressor(featuresCol='features', labelCol='y')
reg_model = reg.fit(df)
print(reg_model.featureImportances)
print('-------------')
print(reg_model.toDebugString)


(7,[0,1,2,3,4,5,6],[0.07481314433373935,0.017878100623244072,0.02923655024432077,0.6854700912357911,0.008142108767077747,0.13253423237144418,0.05192577242438292])
-------------
RandomForestRegressionModel: uid=RandomForestRegressor_2c3320c65304, numTrees=20, numFeatures=7
  Tree 0 (weight 1.0):
    If (feature 3 <= 485.5)
     If (feature 5 <= 16.0)
      If (feature 3 <= 204.5)
       If (feature 1 <= 70.5)
        If (feature 2 <= 1.5)
         Predict: 0.05128205128205128
        Else (feature 2 > 1.5)
         Predict: 0.009237875288683603
       Else (feature 1 > 70.5)
        If (feature 1 <= 513.5)
         Predict: 0.022822621245175364
        Else (feature 1 > 513.5)
         Predict: 0.0313533112935907
      Else (feature 3 > 204.5)
       If (feature 3 <= 345.5)
        If (feature 0 <= 60.5)
         Predict: 0.06378840917930766
        Else (feature 0 > 60.5)
         Predict: 0.45918367346938777
       Else (feature 3 > 345.5)
        If (feature 1 <= 1703.0)
         Pre

Gradient Boosting

--useful for modelling imbalanced target classes

In [57]:
clf = GBTClassifier(featuresCol='features', labelCol='y')
clf_model = clf.fit(df)
print(clf_model.featureImportances)
print('----------')
print(clf_model.toDebugString)


(7,[0,1,2,3,4,5,6],[0.11456070318724074,0.08612537860796485,0.14497648796257884,0.44608170285448306,0.04715635871428555,0.14113403567138982,0.01996533300205716])
----------
GBTClassificationModel: uid = GBTClassifier_68cd4bcfe835, numTrees=20, numClasses=2, numFeatures=7
  Tree 0 (weight 1.0):
    If (feature 3 <= 472.5)
     If (feature 5 <= 16.0)
      If (feature 0 <= 59.5)
       If (feature 3 <= 204.5)
        If (feature 0 <= 28.5)
         Predict: -0.8707360861759426
        Else (feature 0 > 28.5)
         Predict: -0.9660419670030435
       Else (feature 3 > 204.5)
        If (feature 0 <= 25.5)
         Predict: -0.5157894736842106
        Else (feature 0 > 25.5)
         Predict: -0.8532676899314605
      Else (feature 0 > 59.5)
       If (feature 3 <= 204.5)
        If (feature 3 <= 127.5)
         Predict: -0.9193083573487032
        Else (feature 3 > 127.5)
         Predict: -0.5804195804195804
       Else (feature 3 > 204.5)
        If (feature 1 <= 235.5)
         Pred

In [58]:
reg = GBTRegressor(featuresCol='features', labelCol='y')
reg_model = reg.fit(df)
print(reg_model.featureImportances)
print(reg_model.toDebugString)


(7,[0,1,2,3,4,5,6],[0.12129057842481654,0.06364388248368288,0.15266455528021788,0.3666261341727334,0.039382809229594314,0.22790813230279752,0.028483908106157578])
GBTRegressionModel: uid=GBTRegressor_ae2ec8380a06, numTrees=20, numFeatures=7
  Tree 0 (weight 1.0):
    If (feature 3 <= 511.5)
     If (feature 5 <= 9.5)
      If (feature 0 <= 60.5)
       If (feature 3 <= 226.5)
        If (feature 0 <= 28.5)
         Predict: 0.07244785949506037
        Else (feature 0 > 28.5)
         Predict: 0.018836699926989536
       Else (feature 3 > 226.5)
        If (feature 0 <= 25.5)
         Predict: 0.24788732394366197
        Else (feature 0 > 25.5)
         Predict: 0.08533870454786388
      Else (feature 0 > 60.5)
       If (feature 3 <= 127.5)
        If (feature 2 <= 9.5)
         Predict: 0.0
        Else (feature 2 > 9.5)
         Predict: 0.10344827586206896
       Else (feature 3 > 127.5)
        If (feature 3 <= 226.5)
         Predict: 0.35555555555555557
        Else (feature 3 > 

Support Vector Machines - SVM

In [60]:
clf = LinearSVC(featuresCol='features', labelCol='y')
clf_model = clf.fit(df)
print(clf_model.intercept)
print('----------------')
print(clf_model.coefficients)


-1.0149273280183646
----------------
[0.00018771260632926062,5.257883051529256e-09,-7.948222678092276e-05,2.0908016936219175e-05,-0.0006652180215710573,3.426437268909148e-06,0.0003418718600290087]


MultiLayer Perceptron

In [None]:

# you should use TensorFlow for Deep Learning
# layers end with '2' cause this is a Binary classification problem
clf = MultilayerPerceptronClassifier(featuresCol='features', labelCol='y', layers=[4,4,2])
clf_model = clf.fit(df)


One-vs_Rest Classiier

In [None]:

# 'EDUCATION' column has 4 categories, and would be a good example
target_variable_name = 'education'
multiclass_df = data.select(['age', 'balance', 'day', 'duration', 'campaign', 'pdays', 'previous', 'education'])

features_list = multiclass_df.columns
features_list.remove(target_variable_name)

multiclass_df = assemble_vectors(multiclass_df, features_list,target_variable_name)

# fitting the OvR classifier
(train, test) = multiclass_df.randomSplit([0.7, 0.3])
# instantiate the base classifier
clf = RandomForestClassifier(featuresCol='features', labelCol=target_variable_name)
# instantiate the OvR classifier
ovr = OneVsRest(classifier=clf, featuresCol='features', labelCol=target_variable_name)
ovrModel = ovr.fit(train)

# score the model on the test data
predictions = ovrModel.transform(test)
evaluator = MulticlassClassificationEvaluator(metricName='accuracy', labelCol='education')
accuracy = evaluator.evaluate(predictions)
# print("Test Error = %g" % (1.0 - accuracy))


Naive Bayes Classifier

In [None]:
target_variable_name = 'y'
nonneg_df = data.select(['age', 'day', 'duration', 'campaign', 'previous', 'y'])
features_list = nonneg_df.columns
features_list.remove(target_variable_name)
nonneg_df = assemble_vectors(nonneg_df, features_list, target_variable_name)

clf = NaiveBayes(featuresCol='features', labelCol='y')
clf_model = clf.fit(nonneg_df)
