In [1]:
from pyspark.sql import SparkSession
from pyspark.ml import Pipeline
from pyspark.ml.evaluation import BinaryClassificationEvaluator, RegressionEvaluator
from pyspark.ml.classification import LogisticRegression
from pyspark.ml.tuning import ParamGridBuilder, TrainValidationSplit, CrossValidator
from pyspark.sql import functions as F 
from pyspark.ml.regression import LinearRegression
from pyspark.ml.feature import VectorAssembler
from pyspark.sql.types import IntegerType, StructType, StructField

import numpy as np

In [2]:
# reuse the active Spark session, or start one if none exists yet
spark = SparkSession.builder.getOrCreate()

# the CSV is ';'-delimited with a header row; Spark infers the column types
df = spark.read.csv('bank/bank-full.csv', sep=';', header=True, inferSchema=True)

# preview the first five rows without truncating long cell values
df.show(n=5, truncate=False)

+---+------------+-------+---------+-------+-------+-------+----+-------+---+-----+--------+--------+-----+--------+--------+---+
|age|job         |marital|education|default|balance|housing|loan|contact|day|month|duration|campaign|pdays|previous|poutcome|y  |
+---+------------+-------+---------+-------+-------+-------+----+-------+---+-----+--------+--------+-----+--------+--------+---+
|58 |management  |married|tertiary |no     |2143   |yes    |no  |unknown|5  |may  |261     |1       |-1   |0       |unknown |no |
|44 |technician  |single |secondary|no     |29     |yes    |no  |unknown|5  |may  |151     |1       |-1   |0       |unknown |no |
|33 |entrepreneur|married|secondary|no     |2      |yes    |yes |unknown|5  |may  |76      |1       |-1   |0       |unknown |no |
|47 |blue-collar |married|unknown  |no     |1506   |yes    |no  |unknown|5  |may  |92      |1       |-1   |0       |unknown |no |
|33 |unknown     |single |unknown  |no     |1      |no     |no  |unknown|5  |may  |198    

Performing Stratified Sampling in PySpark

In [3]:
# Option 1: seeded random split
# two-way variant: weights 0.7 (train) / 0.3 (test)
train, test = df.randomSplit(weights=[0.7, 0.3], seed=12345)

# three-way variant with a holdout set: weights 0.7 / 0.2 / 0.1
# (this rebinds train and test from the two-way split above)
train, test, holdout = df.randomSplit(weights=[0.7, 0.2, 0.1], seed=12345)

In [None]:
# Option 2
# this option assumes you used "VectorAssembler" and there's a column named FEATURES
# (the estimator below also reads a column named 'label')

# model initialization
lr = LogisticRegression(maxIter=10, featuresCol='features', labelCol='label')

# model parameters to try, this is GridSearchCV for PySpark
# (2 regParam values x 3 elasticNetParam values = 6 candidate settings)
paramGrid = (
    ParamGridBuilder()
    .addGrid(lr.regParam, [0.1, 0.01])
    .addGrid(lr.elasticNetParam, [0.0, 0.5, 1.0])
    .build()
)

# 70% would be used for training, 30% for evaluation
# the seed is fixed so the random train/validation split is reproducible,
# matching the seeded randomSplit calls used elsewhere in this notebook
train_valid_clf = TrainValidationSplit(estimator=lr, estimatorParamMaps=paramGrid,
                            evaluator=BinaryClassificationEvaluator(), trainRatio=0.7,
                            seed=12345)

model = train_valid_clf.fit(df)


In [None]:
# Option 3: stratify by hand -- split each class separately, then recombine

# partition the rows by class label
zero_df = df.where(F.col('label') == 0)
one_df = df.where(F.col('label') == 1)

# apply the same seeded 0.7 / 0.3 split inside each class
train_zero, test_zero = zero_df.randomSplit(weights=[0.7, 0.3], seed=12345)
train_one, test_one = one_df.randomSplit(weights=[0.7, 0.3], seed=12345)

# stack the per-class pieces back into one train set and one test set
train = train_zero.union(train_one)
test = test_zero.union(test_one)


K-Fold Cross-Validation in PySpark

In [None]:
# estimator whose hyper-parameters are tuned; expects 'features' and 'label' columns
lr = LogisticRegression(maxIter=10, featuresCol='features', labelCol='label')

# candidate settings: 2 regParam values x 3 elasticNetParam values
paramGrid = (
    ParamGridBuilder()
    .addGrid(lr.regParam, [0.1, 0.01])
    .addGrid(lr.elasticNetParam, [0.0, 0.5, 1.0])
    .build()
)

# number of folds = 3
# the seed is fixed so the random assignment of rows to folds is reproducible,
# matching the seeded randomSplit calls used elsewhere in this notebook
crossval_clf = CrossValidator(estimator=lr, estimatorParamMaps=paramGrid,
                    evaluator=BinaryClassificationEvaluator(), numFolds=3,
                    seed=12345)

# fit the data
model = crossval_clf.fit(df)


Leave-One-Group-Out CV

In [7]:

# derive a 0/1 'label' from the yes/no column 'y', drop 'y', then keep only the
# grouping column, the predictor columns and the new label
df = (
    df
    .withColumn('label', F.when(F.col('y') == 'yes', 1).otherwise(0))
    .drop('y')
    .select(['education', 'age', 'balance', 'day', 'duration',
             'campaign', 'pdays', 'previous', 'label'])
)

# columns that will be packed into the feature vector
features_list = ['age', 'balance', 'day', 'duration', 'campaign', 'pdays', 'previous']

# function to assemble cols together
def assemble_vectors(df, features_list, target, group_variable):
    """Pack the feature columns into a single vector column.

    Args:
        df: Spark DataFrame holding the feature, target and group columns.
        features_list: names of the columns combined into the 'features' vector.
        target: name of the target column to keep.
        group_variable: name of the grouping column to keep.

    Returns:
        DataFrame with exactly three columns, in this order:
        group_variable, target, 'features'.
    """
    assembler = VectorAssembler(inputCols=features_list, outputCol='features')
    # one-stage pipeline; fitting it yields the transformer that adds 'features'
    fitted = Pipeline(stages=[assembler]).fit(df)
    keep_cols = [group_variable, target, 'features']
    return fitted.transform(df).select(keep_cols)
    

joined_df = assemble_vectors(df, features_list, 'label', 'education')
# deduplicate inside Spark so only the distinct group values -- not the whole
# column -- are brought back to the driver; the order of the values is not guaranteed
groups = [row['education'] for row in joined_df.select('education').distinct().collect()]

def leave_one_group_out_validator(df, var_name, groups):
    """Leave-one-group-out validation of a logistic regression model.

    For every value in ``groups``, a model is trained on the rows whose
    ``var_name`` differs from that value and is tested on the rows whose
    ``var_name`` equals it. The per-group and mean area-under-ROC scores
    are printed.

    Args:
        df: Spark DataFrame with 'features' and 'label' columns plus ``var_name``.
        var_name: name of the column that defines the groups.
        groups: iterable of group values; each is held out for one round.

    Returns:
        None; results are reported via print().
    """
    train_metric_score, test_metric_score = [], []
    # neither the estimator nor the evaluator depends on the held-out group,
    # so they are built once instead of on every iteration
    lr = LogisticRegression(maxIter=10, featuresCol='features', labelCol='label')
    evaluator = BinaryClassificationEvaluator(labelCol='label', rawPredictionCol='rawPrediction',
                                        metricName='areaUnderROC')
    for i in groups:
        # train on every group except the current one
        train = df.filter(df[var_name] != i)
        # test on the held-out group only (this previously repeated the training
        # filter, so the model was scored on the very rows it was trained on)
        test = df.filter(df[var_name] == i)
        # fit model
        lrModel = lr.fit(train)
        # make predictions
        predict_train = lrModel.transform(train)
        predict_test = lrModel.transform(test)
        train_metric_score.append(evaluator.evaluate(predict_train))
        test_metric_score.append(evaluator.evaluate(predict_test))
        print(str(i) + "Group evluation")
        print(" Train AUC - ", train_metric_score[-1])
        print(" Test ROC - ", test_metric_score[-1])
    # average the per-group scores across all rounds
    print('Final evaluation for model')
    print('Train ROC', np.mean(train_metric_score))
    print('Test ROC', np.mean(test_metric_score))



Model Assessment for Continuous Targets

In [4]:
# creating ASSEMBLE_VECTORS function

def assemble_vectors(df, features_list, target_variable_name):
    """Add an assembled 'features' vector column and keep the modelling columns.

    Args:
        df: Spark DataFrame holding the feature and target columns.
        features_list: list of column names combined into the 'features' vector.
        target_variable_name: name of the target column to keep.

    Returns:
        DataFrame with the target column first, then 'features', then the
        original feature columns.
    """
    feature_assembler = VectorAssembler(inputCols=features_list, outputCol='features')
    # one-stage pipeline; fitting it yields the transformer that adds 'features'
    fitted_pipeline = Pipeline(stages=[feature_assembler]).fit(df)
    output_cols = [target_variable_name, 'features'] + features_list
    return fitted_pipeline.transform(df).select(output_cols)

In [None]:
# toy dataset: two integer predictors (x1, x2) and an integer target (y)
cSchema = StructType([StructField(col_name, IntegerType()) for col_name in ('x1', 'x2', 'y')])
df_list = [
    [58, 50, 12],
    [37, 95, 27],
    [29, 137, 39],
    [19, 150, 45],
]

# note: this rebinds df to the toy frame
df = spark.createDataFrame(df_list, schema=cSchema)
assembled_df = assemble_vectors(df, ['x1', 'x2'], 'y')

# fit an ordinary linear regression of y on the assembled feature vector
reg = LinearRegression(labelCol='y', featuresCol='features')
reg_model = reg.fit(assembled_df)

# first coefficient and the intercept
print(reg_model.coefficients[0], reg_model.intercept)

# training summary and in-sample predictions, kept for later inspection
reg_summary = reg_model.summary
pred_result = reg_model.transform(assembled_df)

In [None]:
# use Regression Evaluator with MSE

# score the fitted model's predictions on the assembled data; the evaluator needs
# a DataFrame carrying the label and prediction columns, not a placeholder string
pred_result = reg_model.transform(assembled_df)

# just change the METRIC-NAME parameters to try out different metrics
evaluator = RegressionEvaluator(labelCol='y', predictionCol='prediction', metricName='mse')
evaluator.evaluate(pred_result)

