# SPARK MACHINE LEARNING LIBRARY  
ELIF CANSU YILDIZ

In [1]:
from pyspark.sql import SparkSession

from pyspark.ml.feature import VectorAssembler, StringIndexer, IndexToString
from pyspark.ml import Pipeline

from pyspark.ml.classification import LogisticRegression, RandomForestClassifier, GBTClassifier, DecisionTreeClassifier, MultilayerPerceptronClassifier
from pyspark.ml.evaluation import BinaryClassificationEvaluator, MulticlassClassificationEvaluator

from pyspark.ml.clustering import KMeans

In [2]:
spark = SparkSession\
 .builder\
 .appName("MLLib-Algorithms")\
 .getOrCreate()

__EXAMPLE DATA SET FROM KAGGLE:__

In [3]:
df = spark.read.csv('bank.csv', header = True, inferSchema = True)
df.printSchema()

root
 |-- age: integer (nullable = true)
 |-- job: string (nullable = true)
 |-- marital: string (nullable = true)
 |-- education: string (nullable = true)
 |-- default: string (nullable = true)
 |-- balance: integer (nullable = true)
 |-- housing: string (nullable = true)
 |-- loan: string (nullable = true)
 |-- contact: string (nullable = true)
 |-- day: integer (nullable = true)
 |-- month: string (nullable = true)
 |-- duration: integer (nullable = true)
 |-- campaign: integer (nullable = true)
 |-- pdays: integer (nullable = true)
 |-- previous: integer (nullable = true)
 |-- poutcome: string (nullable = true)
 |-- deposit: string (nullable = true)



In [4]:
#numeric_features = [t[0] for t in df.dtypes if t[1] == 'double']
categorical_features = [t[0] for t in df.dtypes if t[1]=='string']
print("categorical col:", categorical_features)
df.select(categorical_features).show(5)

numeric_features = [t[0] for t in df.dtypes if t[1]=='int']
df.select(numeric_features).show(5)
#df[numeric_features].show()

categorical col: ['job', 'marital', 'education', 'default', 'housing', 'loan', 'contact', 'month', 'poutcome', 'deposit']
+----------+-------+---------+-------+-------+----+-------+-----+--------+-------+
|       job|marital|education|default|housing|loan|contact|month|poutcome|deposit|
+----------+-------+---------+-------+-------+----+-------+-----+--------+-------+
|    admin.|married|secondary|     no|    yes|  no|unknown|  may| unknown|    yes|
|    admin.|married|secondary|     no|     no|  no|unknown|  may| unknown|    yes|
|technician|married|secondary|     no|    yes|  no|unknown|  may| unknown|    yes|
|  services|married|secondary|     no|    yes|  no|unknown|  may| unknown|    yes|
|    admin.|married| tertiary|     no|     no|  no|unknown|  may| unknown|    yes|
+----------+-------+---------+-------+-------+----+-------+-----+--------+-------+
only showing top 5 rows

+---+-------+---+--------+--------+-----+--------+
|age|balance|day|duration|campaign|pdays|previous|
+---

__HOW MANY DISTINCT VALUE DO COLUMNS HAVE?__

In [5]:
from pyspark.sql.functions import col, countDistinct

df.agg(*(countDistinct(col(c)).alias(c) for c in numeric_features)).show()
df.agg(*(countDistinct(col(c)).alias(c) for c in categorical_features)).show()

+---+-------+---+--------+--------+-----+--------+
|age|balance|day|duration|campaign|pdays|previous|
+---+-------+---+--------+--------+-----+--------+
| 76|   3805| 31|    1428|      36|  472|      34|
+---+-------+---+--------+--------+-----+--------+

+---+-------+---------+-------+-------+----+-------+-----+--------+-------+
|job|marital|education|default|housing|loan|contact|month|poutcome|deposit|
+---+-------+---------+-------+-------+----+-------+-----+--------+-------+
| 12|      3|        4|      2|      2|   2|      3|   12|       4|      2|
+---+-------+---------+-------+-------+----+-------+-----+--------+-------+



# -------------------------------- CLASSIFICATION ----------------------------------------

## LOGISTIC REGRESSION

In [6]:
# Split the data into training and test sets (30% held out for testing)
(trainingData, testData) = df.randomSplit([0.7, 0.3])
print("total data count: ", df.count())
print("train data count: ", trainingData.count())
print("test data count: ", testData.count())

total data count:  11162
train data count:  7800
test data count:  3362


__TRAINING__

NOTE: __fit()__ should be used with __StringIndexer()__ function

In [7]:
labelIndexer = StringIndexer(inputCol = 'housing', outputCol='indexedLabel',
                             handleInvalid="error", stringOrderType="frequencyDesc")\
                            .fit(trainingData)

assembler = VectorAssembler(inputCols = numeric_features, outputCol='features')

lr = LogisticRegression(featuresCol='features', labelCol="indexedLabel",
                        maxIter=10, regParam=0.3, elasticNetParam=0.8)

labelConverter = IndexToString(inputCol= "prediction", outputCol="predictedLabel",
                               labels=labelIndexer.labels)

stages = [assembler,labelIndexer, lr, labelConverter]
partialPipeline = Pipeline().setStages(stages)
model = partialPipeline.fit(trainingData)

__MAKE PREDICTIONS__

In [8]:
predictions = model.transform(testData)
predictions = predictions.select("features", "indexedLabel", "prediction", "predictedLabel",
                                 col("housing").alias("label"))
predictions.show(5, truncate=False)

+-------------------------------------+------------+----------+--------------+-----+
|features                             |indexedLabel|prediction|predictedLabel|label|
+-------------------------------------+------------+----------+--------------+-----+
|[19.0,108.0,9.0,273.0,2.0,182.0,1.0] |0.0         |0.0       |no            |no   |
|[20.0,292.0,5.0,385.0,2.0,93.0,1.0]  |0.0         |0.0       |no            |no   |
|[20.0,0.0,1.0,143.0,5.0,91.0,8.0]    |0.0         |0.0       |no            |no   |
|[20.0,6991.0,12.0,178.0,2.0,-1.0,0.0]|0.0         |0.0       |no            |no   |
|[20.0,130.0,4.0,75.0,3.0,-1.0,0.0]   |0.0         |0.0       |no            |no   |
+-------------------------------------+------------+----------+--------------+-----+
only showing top 5 rows



__EVALUATION__

In [9]:
evaluator = BinaryClassificationEvaluator(labelCol="indexedLabel", rawPredictionCol="prediction", metricName="areaUnderROC")
areaUnderROC = evaluator.evaluate(predictions)
print("Area under ROC = %g" % areaUnderROC)

evaluator = BinaryClassificationEvaluator(labelCol="indexedLabel", rawPredictionCol="prediction", metricName="areaUnderPR")
areaUnderPR = evaluator.evaluate(predictions)
print("areaUnderPR = %g" % areaUnderPR)

Area under ROC = 0.5
areaUnderPR = 0.463415


## -------------------------------------------------------------------------------------

## RANDOM FOREST CLASSIFIER

In [10]:
data = df
labelIndexer = StringIndexer(inputCol = 'housing', outputCol='indexedLabel',
                             handleInvalid="error", stringOrderType="frequencyDesc").fit(data)

(trainingData, testData) = df.randomSplit([0.7, 0.3])

assembler = VectorAssembler(inputCols = numeric_features, outputCol='features')

rf = RandomForestClassifier(labelCol="indexedLabel", featuresCol="features", numTrees=10)

labelConverter = IndexToString(inputCol= "prediction", outputCol="predictedLabel",
                               labels=labelIndexer.labels)

stages = [assembler,labelIndexer, rf, labelConverter]
partialPipeline = Pipeline().setStages(stages)
model = partialPipeline.fit(trainingData)

In [13]:
predictions = model.transform(testData)
predictions = predictions.select("features", "indexedLabel", "prediction", "predictedLabel",
                                 col("housing").alias("label"))
predictions.show(5)

+--------------------+------------+----------+--------------+-----+
|            features|indexedLabel|prediction|predictedLabel|label|
+--------------------+------------+----------+--------------+-----+
|[18.0,608.0,13.0,...|         0.0|       0.0|            no|   no|
|[18.0,108.0,9.0,9...|         0.0|       0.0|            no|   no|
|[19.0,608.0,12.0,...|         0.0|       0.0|            no|   no|
|[19.0,779.0,1.0,1...|         0.0|       0.0|            no|   no|
|[20.0,336.0,5.0,1...|         0.0|       0.0|            no|   no|
+--------------------+------------+----------+--------------+-----+
only showing top 5 rows



In [14]:
evaluator2 = MulticlassClassificationEvaluator(labelCol="indexedLabel", predictionCol="prediction", metricName="accuracy")
accuracy = evaluator2.evaluate(predictions)
print("accuracy = %g" % accuracy)

rfModel = model.stages[2]
print(rfModel)  # summary only

accuracy = 0.626502
RandomForestClassificationModel (uid=RandomForestClassifier_b1960f89c5b3) with 10 trees


## GRADIENT-BOOSTED TREE CLASSIFIER

In [15]:
data = df
labelIndexer = StringIndexer(inputCol = 'housing', outputCol='indexedLabel',
                             handleInvalid="error", stringOrderType="frequencyDesc").fit(data)

(trainingData, testData) = df.randomSplit([0.7, 0.3])

assembler = VectorAssembler(inputCols = numeric_features, outputCol='features')

gbt = GBTClassifier(labelCol="indexedLabel", featuresCol="features", maxIter=10)

labelConverter = IndexToString(inputCol= "prediction", outputCol="predictedLabel",
                               labels=labelIndexer.labels)

stages = [assembler,labelIndexer, gbt, labelConverter]
partialPipeline = Pipeline().setStages(stages)
model = partialPipeline.fit(trainingData)

In [16]:
predictions = model.transform(testData)
predictions = predictions.select("features", "indexedLabel", "prediction", "predictedLabel",
                                 col("housing").alias("label"))
predictions.show(5)

+--------------------+------------+----------+--------------+-----+
|            features|indexedLabel|prediction|predictedLabel|label|
+--------------------+------------+----------+--------------+-----+
|[18.0,608.0,12.0,...|         0.0|       0.0|            no|   no|
|[18.0,5.0,24.0,14...|         0.0|       0.0|            no|   no|
|[18.0,108.0,8.0,1...|         0.0|       0.0|            no|   no|
|[19.0,103.0,10.0,...|         0.0|       0.0|            no|   no|
|[19.0,302.0,16.0,...|         0.0|       0.0|            no|   no|
+--------------------+------------+----------+--------------+-----+
only showing top 5 rows



In [17]:
evaluator = MulticlassClassificationEvaluator(labelCol="indexedLabel", predictionCol="prediction", metricName="accuracy")
accuracy = evaluator.evaluate(predictions)
print("accuracy = %g" % accuracy)

gbtModel = model.stages[2]
print(gbtModel)  # summary only

accuracy = 0.641452
GBTClassificationModel (uid=GBTClassifier_d48295992724) with 10 trees


## DECISION TREE CLASSIFIER

In [18]:
data = df
labelIndexer = StringIndexer(inputCol = 'housing', outputCol='indexedLabel',
                             handleInvalid="error", stringOrderType="frequencyDesc").fit(data)

(trainingData, testData) = df.randomSplit([0.7, 0.3])

assembler = VectorAssembler(inputCols = numeric_features, outputCol='features')

dt = DecisionTreeClassifier(labelCol="indexedLabel", featuresCol="features")

labelConverter = IndexToString(inputCol= "prediction", outputCol="predictedLabel",
                               labels=labelIndexer.labels)

stages = [assembler,labelIndexer, dt, labelConverter]
partialPipeline = Pipeline().setStages(stages)
treeModel = partialPipeline.fit(trainingData)

In [19]:
predictions = treeModel.transform(testData)
predictions = predictions.select("features", "indexedLabel", "prediction", "predictedLabel",
                                 col("housing").alias("label"))
predictions.show(5)

+--------------------+------------+----------+--------------+-----+
|            features|indexedLabel|prediction|predictedLabel|label|
+--------------------+------------+----------+--------------+-----+
|[18.0,5.0,24.0,14...|         0.0|       0.0|            no|   no|
|[19.0,55.0,6.0,89...|         0.0|       0.0|            no|   no|
|[19.0,302.0,16.0,...|         0.0|       0.0|            no|   no|
|[19.0,108.0,9.0,2...|         0.0|       0.0|            no|   no|
|[19.0,779.0,1.0,1...|         0.0|       0.0|            no|   no|
+--------------------+------------+----------+--------------+-----+
only showing top 5 rows



In [20]:
evaluator = MulticlassClassificationEvaluator(labelCol="indexedLabel", predictionCol="prediction", metricName="accuracy")
accuracy = evaluator.evaluate(predictions)
print("accuracy = %g" % accuracy)

dtModel = treeModel.stages[2]
print(dtModel)  # summary only

accuracy = 0.588875
DecisionTreeClassificationModel (uid=DecisionTreeClassifier_009cad06afd3) of depth 5 with 27 nodes


## MULTI LAYER PERCEPTRON CLASSIFIER

In [21]:
data = df

In [22]:
labelIndexer = StringIndexer(inputCol='housing', outputCol="indexedLabel").fit(data)

splits = data.randomSplit([0.6,0.4], 1234)
train = splits[0]
test = splits[1]

Specify layers for the neural network:  
Input layer of size 7 (features), two intermediate of size 5 and 4  
And output of size 2 (classes)

In [23]:
assembler = VectorAssembler(inputCols=numeric_features, outputCol="features")

layers = [7, 5, 4, 2]
trainer = MultilayerPerceptronClassifier(maxIter= 500, layers=layers, blockSize=128, seed=1234,
                                         featuresCol='features' , labelCol = "indexedLabel")

labelConverter = IndexToString(inputCol="prediction", outputCol="predictedLabel", labels=labelIndexer.labels)

partialPipeline = Pipeline(stages=[labelIndexer, assembler, trainer, labelConverter])
model = partialPipeline.fit(train)

In [24]:
predictions = model.transform(test)
predictions = predictions.select("features", "indexedLabel", "prediction", "predictedLabel",
                                 col("housing").alias("label"))
predictions.show(5)

+--------------------+------------+----------+--------------+-----+
|            features|indexedLabel|prediction|predictedLabel|label|
+--------------------+------------+----------+--------------+-----+
|[18.0,608.0,12.0,...|         0.0|       0.0|            no|   no|
|[18.0,608.0,13.0,...|         0.0|       0.0|            no|   no|
|[18.0,5.0,24.0,14...|         0.0|       0.0|            no|   no|
|[18.0,348.0,5.0,4...|         0.0|       0.0|            no|   no|
|[19.0,103.0,10.0,...|         0.0|       0.0|            no|   no|
+--------------------+------------+----------+--------------+-----+
only showing top 5 rows



In [25]:
evaluator = MulticlassClassificationEvaluator(labelCol="indexedLabel", predictionCol="prediction", metricName="accuracy")
accuracy = evaluator.evaluate(predictions)
print("accuracy = %g" % accuracy)

perceptronModel = model.stages[2]
print(perceptronModel)  # summary only

accuracy = 0.561494
MultilayerPerceptronClassifier_e512e4399148


## -------------------------------------------------------------------------------------------------------------------

__DATA SHOW WITH SOME FUNCTIONS__

In [27]:
df.count()
len(df.columns)
data[data.columns[:5]].show(5)
data.describe(['duration']).show(5)

data[['marital']].show(5)

columnss = data.columns[:2] + data.columns[3:]
print(columnss)

label = ['marital']
features = data.select([column for column in data.columns if column not in label])
print(features.columns)

+---+----------+-------+---------+-------+
|age|       job|marital|education|default|
+---+----------+-------+---------+-------+
| 59|    admin.|married|secondary|     no|
| 56|    admin.|married|secondary|     no|
| 41|technician|married|secondary|     no|
| 55|  services|married|secondary|     no|
| 54|    admin.|married| tertiary|     no|
+---+----------+-------+---------+-------+
only showing top 5 rows

+-------+------------------+
|summary|          duration|
+-------+------------------+
|  count|             11162|
|   mean|371.99381831213043|
| stddev|347.12838571630687|
|    min|                 2|
|    max|              3881|
+-------+------------------+

+-------+
|marital|
+-------+
|married|
|married|
|married|
|married|
|married|
+-------+
only showing top 5 rows

['age', 'job', 'education', 'default', 'balance', 'housing', 'loan', 'contact', 'day', 'month', 'duration', 'campaign', 'pdays', 'previous', 'poutcome', 'deposit']
['age', 'job', 'education', 'default', 'balance

# ---------------------------------CLUSTERING--------------------------------------

## K-MEANS

In [28]:
data = df

assembler = VectorAssembler(inputCols=numeric_features, outputCol="features")

partialPipeline = Pipeline(stages=[assembler])

model = partialPipeline.fit(data)

newdata = model.transform(data)
newdata[newdata.columns[-5:]].show(5, truncate=False)

+-----+--------+--------+-------+-------------------------------------+
|pdays|previous|poutcome|deposit|features                             |
+-----+--------+--------+-------+-------------------------------------+
|-1   |0       |unknown |yes    |[59.0,2343.0,5.0,1042.0,1.0,-1.0,0.0]|
|-1   |0       |unknown |yes    |[56.0,45.0,5.0,1467.0,1.0,-1.0,0.0]  |
|-1   |0       |unknown |yes    |[41.0,1270.0,5.0,1389.0,1.0,-1.0,0.0]|
|-1   |0       |unknown |yes    |[55.0,2476.0,5.0,579.0,1.0,-1.0,0.0] |
|-1   |0       |unknown |yes    |[54.0,184.0,5.0,673.0,2.0,-1.0,0.0]  |
+-----+--------+--------+-------+-------------------------------------+
only showing top 5 rows



In [29]:
def kmeansModel(df, k=3 , inputCol="features"):
    
    kmeans = KMeans(featuresCol=inputCol, k=k, seed=1, distanceMeasure="euclidean")

    model = kmeans.fit(df)

    # Evaluate clustering by computing Within Set Sum of Squared Errors.
    wssse = model.computeCost(df)
    print("--------------------Within Set Sum of Squared Errors = " + str(wssse))

    # Shows the result.
    """centers = model.clusterCenters()
    print("Cluster Centers: ")
    for center in centers:
        print(center)"""

In [30]:
for i in range(3,10):
    print("i = ", i, "----------------")
    kmeansModel(newdata, i, "features")

i =  3 ----------------
--------------------Within Set Sum of Squared Errors = 31776910042.39926
i =  4 ----------------
--------------------Within Set Sum of Squared Errors = 20961544898.84958
i =  5 ----------------
--------------------Within Set Sum of Squared Errors = 12491087263.526875
i =  6 ----------------
--------------------Within Set Sum of Squared Errors = 9388949070.739447
i =  7 ----------------
--------------------Within Set Sum of Squared Errors = 7716547135.553156
i =  8 ----------------
--------------------Within Set Sum of Squared Errors = 6705593727.66295
i =  9 ----------------
--------------------Within Set Sum of Squared Errors = 6137406183.470572


# SOME EXAMPLES USING SPARK DATASETS

## LOGISTIC REGRESSION

In [None]:
lr = LogisticRegression(maxIter=10, regParam=0.3, elasticNetParam=0.8)

# Load training data
training = spark.read.format("libsvm").load("/home/ubuntu/spark-2.4.0-bin-hadoop2.7/data/mllib/sample_libsvm_data.txt")

training.printSchema()
#training.show(2, truncate=False)

# Fit the model
lrModel = lr.fit(training)

# Print the coefficients and intercept for logistic regression
print("\nCoefficients: " + str(lrModel.coefficients))
print("\nIntercept: " + str(lrModel.intercept))

# We can also use the multinomial family for binary classification
mlr = LogisticRegression(maxIter=10, regParam=0.3, elasticNetParam=0.8, family="multinomial")

# Fit the model
mlrModel = mlr.fit(training)

# Print the coefficients and intercepts for logistic regression with multinomial family
print("\nMultinomial coefficients: " + str(mlrModel.coefficientMatrix))
print("\nMultinomial intercepts: " + str(mlrModel.interceptVector))

# Extract the summary from the returned LogisticRegressionModel instance trained
# in the earlier example
trainingSummary = lrModel.summary

# Obtain the objective per iteration
objectiveHistory = trainingSummary.objectiveHistory
print("\nobjectiveHistory:")
for objective in objectiveHistory:
    print(objective)

# Obtain the receiver-operating characteristic as a dataframe and areaUnderROC.
trainingSummary.roc.show()
print("areaUnderROC: " + str(trainingSummary.areaUnderROC))

# Set the model threshold to maximize F-Measure
fMeasure = trainingSummary.fMeasureByThreshold
maxFMeasure = fMeasure.groupBy().max('F-Measure').select('max(F-Measure)').head()
bestThreshold = fMeasure.where(fMeasure['F-Measure'] == maxFMeasure['max(F-Measure)']) \
    .select('threshold').head()['threshold']
lr.setThreshold(bestThreshold)

## K-MEANS

In [32]:
from pyspark.ml.clustering import KMeans

# Loads data.
dataset = spark.read.format("libsvm").load("/home/ubuntu/spark-2.4.0-bin-hadoop2.7/data/mllib/sample_kmeans_data.txt")
dataset.show(truncate=False)

# Trains a k-means model.
kmeans = KMeans().setK(3).setSeed(1)
model = kmeans.fit(dataset)

# Evaluate clustering by computing Within Set Sum of Squared Errors.
wssse = model.computeCost(dataset)
print("Within Set Sum of Squared Errors = " + str(wssse))

# Shows the result.
centers = model.clusterCenters()
print("\nCluster Centers: ")
for center in centers:
    print(center)
    


+-----+-------------------------+
|label|features                 |
+-----+-------------------------+
|0.0  |(3,[],[])                |
|1.0  |(3,[0,1,2],[0.1,0.1,0.1])|
|2.0  |(3,[0,1,2],[0.2,0.2,0.2])|
|3.0  |(3,[0,1,2],[9.0,9.0,9.0])|
|4.0  |(3,[0,1,2],[9.1,9.1,9.1])|
|5.0  |(3,[0,1,2],[9.2,9.2,9.2])|
+-----+-------------------------+

Within Set Sum of Squared Errors = 0.07499999999994544

Cluster Centers: 
[9.1 9.1 9.1]
[0.05 0.05 0.05]
[0.2 0.2 0.2]
