In [1]:
from pyspark import SparkContext, SparkConf
from pyspark.sql import SparkSession

In [2]:
sc = SparkContext(conf=SparkConf())
spark = SparkSession(sparkContext=sc)

Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).


23/05/19 22:38:24 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


## Logistic regression with pyspark

## Import data

In [3]:
df = spark.read.csv('data/SparkData/bank.csv', header=True, inferSchema=True, sep=";")
df.drop('day','month','poutcome').show(5)

+---+-----------+-------+---------+-------+-------+-------+----+--------+--------+--------+-----+--------+---+
|age|        job|marital|education|default|balance|housing|loan| contact|duration|campaign|pdays|previous|  y|
+---+-----------+-------+---------+-------+-------+-------+----+--------+--------+--------+-----+--------+---+
| 30| unemployed|married|  primary|     no|   1787|     no|  no|cellular|      79|       1|   -1|       0| no|
| 33|   services|married|secondary|     no|   4789|    yes| yes|cellular|     220|       1|  339|       4| no|
| 35| management| single| tertiary|     no|   1350|    yes|  no|cellular|     185|       1|  330|       1| no|
| 30| management|married| tertiary|     no|   1476|    yes| yes| unknown|     199|       4|   -1|       0| no|
| 59|blue-collar|married|secondary|     no|      0|    yes|  no| unknown|     226|       1|   -1|       0| no|
+---+-----------+-------+---------+-------+-------+-------+----+--------+--------+--------+-----+--------+---+
o

In [4]:
df.printSchema()

root
 |-- age: integer (nullable = true)
 |-- job: string (nullable = true)
 |-- marital: string (nullable = true)
 |-- education: string (nullable = true)
 |-- default: string (nullable = true)
 |-- balance: integer (nullable = true)
 |-- housing: string (nullable = true)
 |-- loan: string (nullable = true)
 |-- contact: string (nullable = true)
 |-- day: integer (nullable = true)
 |-- month: string (nullable = true)
 |-- duration: integer (nullable = true)
 |-- campaign: integer (nullable = true)
 |-- pdays: integer (nullable = true)
 |-- previous: integer (nullable = true)
 |-- poutcome: string (nullable = true)
 |-- y: string (nullable = true)



###  Deal with categorical data and Convert the data to dense vector

In [5]:
catcols = ['job','marital','education','default','housing','loan','contact','poutcome']
num_cols = ['balance', 'duration','campaign','pdays','previous']
labelCol = 'y'



## Process categorical columns

The following code does three things with pipeline:

* **`StringIndexer`** all categorical columns
* **`OneHotEncoder`** all categorical index columns
* **`VectorAssembler`** all feature columns into one vector column

### Categorical columns

In [6]:
from pyspark.ml.feature import StringIndexer, OneHotEncoder, VectorAssembler
from pyspark.ml import Pipeline
from pyspark.sql.functions import col

# categorical columns
categorical_columns = catcols

In [7]:
indexers = [ StringIndexer(inputCol=c, outputCol="{0}_indexed".format(c)) for c in categorical_columns ]

In [8]:
encoders = [ OneHotEncoder(inputCol=indexer.getOutputCol(), \
                           outputCol="{0}_encoded".format(indexer.getOutputCol())) for indexer in indexers ]

In [9]:
assembler = VectorAssembler(inputCols=[encoder.getOutputCol() \
                                       for encoder in encoders] + num_cols, outputCol="features")

In [10]:
pipeline = Pipeline(stages=indexers + encoders + [assembler])
model=pipeline.fit(df)
data = model.transform(df)
data = data.withColumn('label',col(labelCol))
data=data.select('features','label')
data.show(5, truncate=False)

23/05/19 22:38:55 WARN package: Truncated the string representation of a plan since it was too large. This behavior can be adjusted by setting 'spark.sql.debug.maxToStringFields'.
+---------------------------------------------------------------------------------------------------------+-----+
|features                                                                                                 |label|
+---------------------------------------------------------------------------------------------------------+-----+
|(29,[8,11,15,16,18,19,21,24,25,26,27],[1.0,1.0,1.0,1.0,1.0,1.0,1.0,1787.0,79.0,1.0,-1.0])                |no   |
|(29,[4,11,13,16,17,19,22,24,25,26,27,28],[1.0,1.0,1.0,1.0,1.0,1.0,1.0,4789.0,220.0,1.0,339.0,4.0])       |no   |
|(29,[0,12,14,16,17,18,19,22,24,25,26,27,28],[1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1350.0,185.0,1.0,330.0,1.0])|no   |
|(29,[0,11,14,16,17,20,21,24,25,26,27],[1.0,1.0,1.0,1.0,1.0,1.0,1.0,1476.0,199.0,4.0,-1.0])               |no   |
|(29,[1,11,13,16,17,18

### We need to deal with label, which is string, yes or no, need to make them numbers

### Build StringIndexer stages

In [11]:
# Index labels, adding metadata to the label column 
labelIndexer = StringIndexer(inputCol='label',
                             outputCol='indexedLabel').fit(data)
data=labelIndexer.transform(data)

In [12]:
data.show(5)

+--------------------+-----+------------+
|            features|label|indexedLabel|
+--------------------+-----+------------+
|(29,[8,11,15,16,1...|   no|         0.0|
|(29,[4,11,13,16,1...|   no|         0.0|
|(29,[0,12,14,16,1...|   no|         0.0|
|(29,[0,11,14,16,1...|   no|         0.0|
|(29,[1,11,13,16,1...|   no|         0.0|
+--------------------+-----+------------+
only showing top 5 rows



In [13]:
labelIndexer.labels

['no', 'yes']

In [14]:
from pyspark.ml.feature import VectorIndexer
# Automatically identify categorical features, and index them.
# Set maxCategories so features with > 4 distinct values are treated as continuous. 
# Update metadata accordingly.
featureIndexer =VectorIndexer(inputCol="features", \
                                  outputCol="indexedFeatures", \
                                  maxCategories=4).fit(data)


In [15]:
data=featureIndexer.transform(data)
data.show(5)

+--------------------+-----+------------+--------------------+
|            features|label|indexedLabel|     indexedFeatures|
+--------------------+-----+------------+--------------------+
|(29,[8,11,15,16,1...|   no|         0.0|(29,[8,11,15,16,1...|
|(29,[4,11,13,16,1...|   no|         0.0|(29,[4,11,13,16,1...|
|(29,[0,12,14,16,1...|   no|         0.0|(29,[0,12,14,16,1...|
|(29,[0,11,14,16,1...|   no|         0.0|(29,[0,11,14,16,1...|
|(29,[1,11,13,16,1...|   no|         0.0|(29,[1,11,13,16,1...|
+--------------------+-----+------------+--------------------+
only showing top 5 rows



### Split the data to training and test data sets

In [16]:
# Split the data into training and test sets (40% held out for testing)
(trainingData, testData) = data.randomSplit([0.6, 0.4])
trainingData.show(5,False)
testData.show(5,False)

+------------------------------------------------------------------------------------------------+-----+------------+------------------------------------------------------------------------------------------------+
|features                                                                                        |label|indexedLabel|indexedFeatures                                                                                 |
+------------------------------------------------------------------------------------------------+-----+------------+------------------------------------------------------------------------------------------------+
|(29,[0,11,13,16,17,18,19,21,24,25,26,27],[1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,-588.0,81.0,4.0,-1.0])|no   |0.0         |(29,[0,11,13,16,17,18,19,21,24,25,26,27],[1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,-588.0,81.0,4.0,-1.0])|
|(29,[0,11,13,16,17,18,19,21,24,25,26,27],[1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,-105.0,60.0,2.0,-1.0])|no   |0.0         |(29,[0,11,13,16,17,18,1

## Build cross-validation model


### Cross-Validation

In general, one round of cross-validation involves partitioning a sample of data into complementary subsets, performing the analysis on one subset (called the training set), and validating the analysis on the other subset (called the validation set or testing set).
CrossValidator begins by splitting the dataset into a set of folds which are used as separate training and test datasets. E.g., with k=3 folds, CrossValidator will generate 3 (training, test) dataset pairs, each of which uses 2/3 of the data for training and 1/3 for testing. To evaluate a particular ParamMap, CrossValidator computes the average evaluation metric for the 3 Models produced by fitting the Estimator on the 3 different (training, test) dataset pairs.

After identifying the best ParamMap, CrossValidator finally re-fits the Estimator using the best ParamMap and the entire dataset.  In simple term, pickup the ParamMap that produces the best model and to use that model for subsequent transform().


### Estimator

For decision trees, this parameter is set by default as none, which often is the main reason for the overfitting, as each tree will expand until every leaf is pure. We see the higher the value of the 'maxDepth” parameter is, the stronger the overfitting of the model.

### minimal gain
The gain of a node is calculated before splitting it. The node is split if its gain is greater than the minimal gain. A higher value of minimal gain results in fewer splits and thus smaller trees. A value that is too high will completely prevent splitting and trees with single nodes are generated.

In [17]:
from pyspark.ml.classification import RandomForestClassifier

random_forest = RandomForestClassifier(featuresCol='indexedFeatures', labelCol='indexedLabel')

In [18]:
from pyspark.ml.tuning import ParamGridBuilder

param_grid = ParamGridBuilder().\
    addGrid(random_forest.maxDepth, [2, 3, 4]).\
    addGrid(random_forest.minInfoGain, [0.1, 0.2, 0.3]).\
    build()

In [19]:
from pyspark.ml.evaluation import BinaryClassificationEvaluator

evaluator = BinaryClassificationEvaluator(labelCol="indexedLabel")

In [20]:
from pyspark.ml.tuning import CrossValidator

crossvalidation = CrossValidator(estimator=random_forest, estimatorParamMaps=param_grid, evaluator=evaluator)

In [21]:
crossvalidation_mod = crossvalidation.fit(data)

In [22]:
pred_test = crossvalidation_mod.transform(data)
pred_test.show(5)

+--------------------+-----+------------+--------------------+--------------------+--------------------+----------+
|            features|label|indexedLabel|     indexedFeatures|       rawPrediction|         probability|prediction|
+--------------------+-----+------------+--------------------+--------------------+--------------------+----------+
|(29,[8,11,15,16,1...|   no|         0.0|(29,[8,11,15,16,1...|[17.7161486983916...|[0.88580743491958...|       0.0|
|(29,[4,11,13,16,1...|   no|         0.0|(29,[4,11,13,16,1...|[17.7161486983916...|[0.88580743491958...|       0.0|
|(29,[0,12,14,16,1...|   no|         0.0|(29,[0,12,14,16,1...|[17.7161486983916...|[0.88580743491958...|       0.0|
|(29,[0,11,14,16,1...|   no|         0.0|(29,[0,11,14,16,1...|[17.7161486983916...|[0.88580743491958...|       0.0|
|(29,[1,11,13,16,1...|   no|         0.0|(29,[1,11,13,16,1...|[17.7161486983916...|[0.88580743491958...|       0.0|
+--------------------+-----+------------+--------------------+----------

In [23]:
print('Accuracy on training data (areaUnderROC): ', evaluator.setMetricName('areaUnderROC').evaluate(pred_test))


Accuracy on training data (areaUnderROC):  0.5


In [24]:
label_pred_train = pred_test.select('label', 'prediction')
label_pred_train.rdd.zipWithIndex().countByKey()

                                                                                

defaultdict(int,
            {Row(label='no', prediction=0.0): 4000,
             Row(label='yes', prediction=0.0): 521})

In [25]:
print('max depth: ', crossvalidation_mod.bestModel._java_obj.getMaxDepth(), "\n",
     'min information gain: ', crossvalidation_mod.bestModel._java_obj.getMinInfoGain())


max depth:  2 
 min information gain:  0.1


In [32]:
crossvalidation_mod.bestModel.getMinInfoGain()

0.1

23/05/21 01:22:21 WARN HeartbeatReceiver: Removing executor driver with no recent heartbeats: 1799156 ms exceeds timeout 120000 ms
23/05/21 01:22:21 WARN SparkContext: Killing executors is not supported by current scheduler.


In [None]:
# Convert indexed labels back to original labels.
from pyspark.ml.feature import IndexToString
labelConverter = IndexToString(inputCol="prediction", outputCol="predictedLabel",
                               labels=labelIndexer.labels)
pred_test=labelConverter.transform(pred_test)

In [None]:
pred_test.show(5)

### BinaryClassificationEvaluator only provide accuracy metrics, you need MulticlassClassificationEvaluator to provide all metrics, MulticlassClassificationEvaluator work with binary classification too

In [None]:
from pyspark.ml.evaluation import MulticlassClassificationEvaluator
# Select (prediction, true label) and compute test error
evaluator = MulticlassClassificationEvaluator(
    labelCol="indexedLabel", predictionCol="prediction", metricName="accuracy")
accuracy = evaluator.evaluate(pred_test)
print(f"Accuracy = {accuracy}")
print("Test Error = %g" % (1.0 - accuracy))

### Evaluate training model

area under ROC  https://developers.google.com/machine-learning/crash-course/classification/roc-and-auc

accuracy

False positive rate by label

True positive rate by label

Precision by label

Recall by label

F-measure by label


### List all available metric names

In [None]:
evaluator.metricName

### Get f1 metric

In [None]:
evaluator.setMetricName('f1')
f1=evaluator.evaluate(pred_test)
print(f'f1 = {f1}')