In [1]:
# Load the source table and remember its original column names so they can be
# selected again after the pipeline has added its derived columns.
dataset = spark.table('bank_full')
columns = dataset.columns

# Pipeline stages are collected in this list by the cells that follow.
stages = []

In [2]:
from pyspark.ml import Pipeline
from pyspark.ml.feature import OneHotEncoderEstimator, StringIndexer, VectorAssembler

categoricalColumns = [
    "job", "marital", "education", "default", "housing",
    "loan", "contact", "month", "poutcome",
]

# For every categorical column, queue two stages:
#   1. a StringIndexer writing "<column>index"
#   2. a OneHotEncoderEstimator turning that index into "<column>class_vec"
# Nothing is executed here; the stages only run when the Pipeline is fitted.
for categoricalColumn in categoricalColumns:
    indexedColumn = categoricalColumn + "index"
    encodedColumn = categoricalColumn + "class_vec"

    stringIndexer = StringIndexer(inputCol=categoricalColumn, outputCol=indexedColumn)
    encoder = OneHotEncoderEstimator(inputCols=[indexedColumn], outputCols=[encodedColumn])

    stages.extend([stringIndexer, encoder])

In [3]:
# Index the target column "y" into a numeric "label" column and queue that stage.
label_stringIdx = StringIndexer(inputCol="y", outputCol="label")
stages.append(label_stringIdx)

In [4]:
# Assemble every model input into a single "features" vector column:
# the one-hot encoded categorical columns first, then the raw numeric columns.
numericColumns = ["age", "balance", "day", "duration", "campaign", "pdays", "previous"]
encodedColumns = [name + "class_vec" for name in categoricalColumns]

assemblerInputs = encodedColumns + numericColumns
assembler = VectorAssembler(inputCols=assemblerInputs, outputCol="features")
stages.append(assembler)

In [5]:
from pyspark.ml.classification import LogisticRegression

# Build the feature-preparation pipeline from the stages collected so far
# (no classifier stage is included, hence "partial").
partialPipeline = Pipeline(stages=stages)

# Fit the preparation stages on the raw data, then apply them to the same data
# to obtain the prepared DataFrame.
pipelineModel = partialPipeline.fit(dataset)
preppedDataDF = pipelineModel.transform(dataset)

In [6]:
# Render the prepared DataFrame: the original columns plus the "...index",
# "...class_vec", "label" and "features" columns added by the pipeline.
display(preppedDataDF)

age,job,marital,education,default,balance,housing,loan,contact,day,month,duration,campaign,pdays,previous,poutcome,y,jobindex,jobclass_vec,maritalindex,maritalclass_vec,educationindex,educationclass_vec,defaultindex,defaultclass_vec,housingindex,housingclass_vec,loanindex,loanclass_vec,contactindex,contactclass_vec,monthindex,monthclass_vec,poutcomeindex,poutcomeclass_vec,label,features
58,management,married,tertiary,no,2143,yes,no,unknown,5,may,261,1,-1,0,unknown,no,1.0,"List(0, 11, List(1), List(1.0))",0.0,"List(0, 2, List(0), List(1.0))",1.0,"List(0, 3, List(1), List(1.0))",0.0,"List(0, 1, List(0), List(1.0))",0.0,"List(0, 1, List(0), List(1.0))",0.0,"List(0, 1, List(0), List(1.0))",1.0,"List(0, 2, List(1), List(1.0))",0.0,"List(0, 11, List(0), List(1.0))",0.0,"List(0, 3, List(0), List(1.0))",0.0,"List(0, 42, List(1, 11, 14, 16, 17, 18, 20, 21, 32, 35, 36, 37, 38, 39, 40), List(1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 58.0, 2143.0, 5.0, 261.0, 1.0, -1.0))"
44,technician,single,secondary,no,29,yes,no,unknown,5,may,151,1,-1,0,unknown,no,2.0,"List(0, 11, List(2), List(1.0))",1.0,"List(0, 2, List(1), List(1.0))",0.0,"List(0, 3, List(0), List(1.0))",0.0,"List(0, 1, List(0), List(1.0))",0.0,"List(0, 1, List(0), List(1.0))",0.0,"List(0, 1, List(0), List(1.0))",1.0,"List(0, 2, List(1), List(1.0))",0.0,"List(0, 11, List(0), List(1.0))",0.0,"List(0, 3, List(0), List(1.0))",0.0,"List(0, 42, List(2, 12, 13, 16, 17, 18, 20, 21, 32, 35, 36, 37, 38, 39, 40), List(1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 44.0, 29.0, 5.0, 151.0, 1.0, -1.0))"
33,entrepreneur,married,secondary,no,2,yes,yes,unknown,5,may,76,1,-1,0,unknown,no,7.0,"List(0, 11, List(7), List(1.0))",0.0,"List(0, 2, List(0), List(1.0))",0.0,"List(0, 3, List(0), List(1.0))",0.0,"List(0, 1, List(0), List(1.0))",0.0,"List(0, 1, List(0), List(1.0))",1.0,"List(0, 1, List(), List())",1.0,"List(0, 2, List(1), List(1.0))",0.0,"List(0, 11, List(0), List(1.0))",0.0,"List(0, 3, List(0), List(1.0))",0.0,"List(0, 42, List(7, 11, 13, 16, 17, 20, 21, 32, 35, 36, 37, 38, 39, 40), List(1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 33.0, 2.0, 5.0, 76.0, 1.0, -1.0))"
47,blue-collar,married,unknown,no,1506,yes,no,unknown,5,may,92,1,-1,0,unknown,no,0.0,"List(0, 11, List(0), List(1.0))",0.0,"List(0, 2, List(0), List(1.0))",3.0,"List(0, 3, List(), List())",0.0,"List(0, 1, List(0), List(1.0))",0.0,"List(0, 1, List(0), List(1.0))",0.0,"List(0, 1, List(0), List(1.0))",1.0,"List(0, 2, List(1), List(1.0))",0.0,"List(0, 11, List(0), List(1.0))",0.0,"List(0, 3, List(0), List(1.0))",0.0,"List(0, 42, List(0, 11, 16, 17, 18, 20, 21, 32, 35, 36, 37, 38, 39, 40), List(1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 47.0, 1506.0, 5.0, 92.0, 1.0, -1.0))"
33,unknown,single,unknown,no,1,no,no,unknown,5,may,198,1,-1,0,unknown,no,11.0,"List(0, 11, List(), List())",1.0,"List(0, 2, List(1), List(1.0))",3.0,"List(0, 3, List(), List())",0.0,"List(0, 1, List(0), List(1.0))",1.0,"List(0, 1, List(), List())",0.0,"List(0, 1, List(0), List(1.0))",1.0,"List(0, 2, List(1), List(1.0))",0.0,"List(0, 11, List(0), List(1.0))",0.0,"List(0, 3, List(0), List(1.0))",0.0,"List(0, 42, List(12, 16, 18, 20, 21, 32, 35, 36, 37, 38, 39, 40), List(1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 33.0, 1.0, 5.0, 198.0, 1.0, -1.0))"
35,management,married,tertiary,no,231,yes,no,unknown,5,may,139,1,-1,0,unknown,no,1.0,"List(0, 11, List(1), List(1.0))",0.0,"List(0, 2, List(0), List(1.0))",1.0,"List(0, 3, List(1), List(1.0))",0.0,"List(0, 1, List(0), List(1.0))",0.0,"List(0, 1, List(0), List(1.0))",0.0,"List(0, 1, List(0), List(1.0))",1.0,"List(0, 2, List(1), List(1.0))",0.0,"List(0, 11, List(0), List(1.0))",0.0,"List(0, 3, List(0), List(1.0))",0.0,"List(0, 42, List(1, 11, 14, 16, 17, 18, 20, 21, 32, 35, 36, 37, 38, 39, 40), List(1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 35.0, 231.0, 5.0, 139.0, 1.0, -1.0))"
28,management,single,tertiary,no,447,yes,yes,unknown,5,may,217,1,-1,0,unknown,no,1.0,"List(0, 11, List(1), List(1.0))",1.0,"List(0, 2, List(1), List(1.0))",1.0,"List(0, 3, List(1), List(1.0))",0.0,"List(0, 1, List(0), List(1.0))",0.0,"List(0, 1, List(0), List(1.0))",1.0,"List(0, 1, List(), List())",1.0,"List(0, 2, List(1), List(1.0))",0.0,"List(0, 11, List(0), List(1.0))",0.0,"List(0, 3, List(0), List(1.0))",0.0,"List(0, 42, List(1, 12, 14, 16, 17, 20, 21, 32, 35, 36, 37, 38, 39, 40), List(1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 28.0, 447.0, 5.0, 217.0, 1.0, -1.0))"
42,entrepreneur,divorced,tertiary,yes,2,yes,no,unknown,5,may,380,1,-1,0,unknown,no,7.0,"List(0, 11, List(7), List(1.0))",2.0,"List(0, 2, List(), List())",1.0,"List(0, 3, List(1), List(1.0))",1.0,"List(0, 1, List(), List())",0.0,"List(0, 1, List(0), List(1.0))",0.0,"List(0, 1, List(0), List(1.0))",1.0,"List(0, 2, List(1), List(1.0))",0.0,"List(0, 11, List(0), List(1.0))",0.0,"List(0, 3, List(0), List(1.0))",0.0,"List(0, 42, List(7, 14, 17, 18, 20, 21, 32, 35, 36, 37, 38, 39, 40), List(1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 42.0, 2.0, 5.0, 380.0, 1.0, -1.0))"
58,retired,married,primary,no,121,yes,no,unknown,5,may,50,1,-1,0,unknown,no,5.0,"List(0, 11, List(5), List(1.0))",0.0,"List(0, 2, List(0), List(1.0))",2.0,"List(0, 3, List(2), List(1.0))",0.0,"List(0, 1, List(0), List(1.0))",0.0,"List(0, 1, List(0), List(1.0))",0.0,"List(0, 1, List(0), List(1.0))",1.0,"List(0, 2, List(1), List(1.0))",0.0,"List(0, 11, List(0), List(1.0))",0.0,"List(0, 3, List(0), List(1.0))",0.0,"List(0, 42, List(5, 11, 15, 16, 17, 18, 20, 21, 32, 35, 36, 37, 38, 39, 40), List(1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 58.0, 121.0, 5.0, 50.0, 1.0, -1.0))"
43,technician,single,secondary,no,593,yes,no,unknown,5,may,55,1,-1,0,unknown,no,2.0,"List(0, 11, List(2), List(1.0))",1.0,"List(0, 2, List(1), List(1.0))",0.0,"List(0, 3, List(0), List(1.0))",0.0,"List(0, 1, List(0), List(1.0))",0.0,"List(0, 1, List(0), List(1.0))",0.0,"List(0, 1, List(0), List(1.0))",1.0,"List(0, 2, List(1), List(1.0))",0.0,"List(0, 11, List(0), List(1.0))",0.0,"List(0, 3, List(0), List(1.0))",0.0,"List(0, 42, List(2, 12, 13, 16, 17, 18, 20, 21, 32, 35, 36, 37, 38, 39, 40), List(1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 43.0, 593.0, 5.0, 55.0, 1.0, -1.0))"


In [7]:
# Baseline: a LogisticRegression with default settings, fitted on the full prepped data
baselineEstimator = LogisticRegression()
logisticRegressionModel = baselineEstimator.fit(preppedDataDF)

# ROC view, computed on the same data the model was fitted on (i.e. training data)
display(logisticRegressionModel, preppedDataDF, "ROC")

False Positive Rate,True Positive Rate,Threshold
0.0,0.0,0.9102080989617556
0.0,0.1111111111111111,0.9102080989617556
0.0119047619047619,0.1111111111111111,0.8727759997784728
0.0119047619047619,0.2222222222222222,0.7336536465107624
0.0238095238095238,0.2222222222222222,0.7259079653018172
0.0357142857142857,0.2222222222222222,0.6134736427382863
0.0357142857142857,0.3333333333333333,0.5949502237042397
0.0476190476190476,0.3333333333333333,0.5214377378376418
0.0595238095238095,0.3333333333333333,0.4940334048594755
0.0714285714285714,0.3333333333333333,0.3992205115280179


In [8]:
# Same model/data pair as the ROC view but without a plot type; the recorded output
# for this call is a "fitted values" vs "residuals" table.
display(logisticRegressionModel, preppedDataDF)

fitted values,residuals
-4.463155554981384,-0.011394601680481
-2.5914586517811085,-0.0696901540898525
-4.970051147367216,-0.0068949228982034
-4.894938299514752,-0.0074287713183618
-4.464246210287158,-0.0113823222499832
-5.007168775932872,-0.006645360644032
-4.730144389014285,-0.0087479939193287
-2.22271352705246,-0.0977292688833315
-4.38058025598015,-0.0123633275334815
-3.3834761128355844,-0.0328158870851597


In [9]:
# Keep only the model inputs ("label", "features") followed by the original raw columns,
# dropping the intermediate index / one-hot columns.
modelColumns = ["label", "features"]
selectedColumns = modelColumns + columns

# NOTE: `dataset` is rebound here from the raw table to the prepared, trimmed DataFrame.
dataset = preppedDataDF.select(selectedColumns)

display(dataset)

label,features,age,job,marital,education,default,balance,housing,loan,contact,day,month,duration,campaign,pdays,previous,poutcome,y
0.0,"List(0, 42, List(1, 11, 14, 16, 17, 18, 20, 21, 32, 35, 36, 37, 38, 39, 40), List(1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 58.0, 2143.0, 5.0, 261.0, 1.0, -1.0))",58,management,married,tertiary,no,2143,yes,no,unknown,5,may,261,1,-1,0,unknown,no
0.0,"List(0, 42, List(2, 12, 13, 16, 17, 18, 20, 21, 32, 35, 36, 37, 38, 39, 40), List(1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 44.0, 29.0, 5.0, 151.0, 1.0, -1.0))",44,technician,single,secondary,no,29,yes,no,unknown,5,may,151,1,-1,0,unknown,no
0.0,"List(0, 42, List(7, 11, 13, 16, 17, 20, 21, 32, 35, 36, 37, 38, 39, 40), List(1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 33.0, 2.0, 5.0, 76.0, 1.0, -1.0))",33,entrepreneur,married,secondary,no,2,yes,yes,unknown,5,may,76,1,-1,0,unknown,no
0.0,"List(0, 42, List(0, 11, 16, 17, 18, 20, 21, 32, 35, 36, 37, 38, 39, 40), List(1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 47.0, 1506.0, 5.0, 92.0, 1.0, -1.0))",47,blue-collar,married,unknown,no,1506,yes,no,unknown,5,may,92,1,-1,0,unknown,no
0.0,"List(0, 42, List(12, 16, 18, 20, 21, 32, 35, 36, 37, 38, 39, 40), List(1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 33.0, 1.0, 5.0, 198.0, 1.0, -1.0))",33,unknown,single,unknown,no,1,no,no,unknown,5,may,198,1,-1,0,unknown,no
0.0,"List(0, 42, List(1, 11, 14, 16, 17, 18, 20, 21, 32, 35, 36, 37, 38, 39, 40), List(1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 35.0, 231.0, 5.0, 139.0, 1.0, -1.0))",35,management,married,tertiary,no,231,yes,no,unknown,5,may,139,1,-1,0,unknown,no
0.0,"List(0, 42, List(1, 12, 14, 16, 17, 20, 21, 32, 35, 36, 37, 38, 39, 40), List(1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 28.0, 447.0, 5.0, 217.0, 1.0, -1.0))",28,management,single,tertiary,no,447,yes,yes,unknown,5,may,217,1,-1,0,unknown,no
0.0,"List(0, 42, List(7, 14, 17, 18, 20, 21, 32, 35, 36, 37, 38, 39, 40), List(1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 42.0, 2.0, 5.0, 380.0, 1.0, -1.0))",42,entrepreneur,divorced,tertiary,yes,2,yes,no,unknown,5,may,380,1,-1,0,unknown,no
0.0,"List(0, 42, List(5, 11, 15, 16, 17, 18, 20, 21, 32, 35, 36, 37, 38, 39, 40), List(1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 58.0, 121.0, 5.0, 50.0, 1.0, -1.0))",58,retired,married,primary,no,121,yes,no,unknown,5,may,50,1,-1,0,unknown,no
0.0,"List(0, 42, List(2, 12, 13, 16, 17, 18, 20, 21, 32, 35, 36, 37, 38, 39, 40), List(1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 43.0, 593.0, 5.0, 55.0, 1.0, -1.0))",43,technician,single,secondary,no,593,yes,no,unknown,5,may,55,1,-1,0,unknown,no


In [10]:
### 70/30 random split into training and test sets; the fixed seed keeps it reproducible
splitWeights = [0.7, 0.3]
trainingData, testData = dataset.randomSplit(splitWeights, seed=100)

# Row count of each split, training first
for split in (trainingData, testData):
    print(split.count())

In [11]:
from pyspark.ml.classification import LogisticRegression

# Initial estimator: reads the "features" column, learns the "label" column, maxIter=10
logisticRegression = LogisticRegression(
    labelCol="label",
    featuresCol="features",
    maxIter=10,
)

# Fit on the training split only
logisticRegressionModel = logisticRegression.fit(trainingData)

In [12]:
# Score the held-out test split with the model fitted on trainingData.
# The fitted model's transform() reads its inputs from the "features" column
# (the featuresCol the estimator was configured with); the other columns are carried along.
predictions = logisticRegressionModel.transform(testData)

In [13]:
# View the model's predictions and per-class probabilities next to a handful of the
# original columns. Any other column of `predictions` could be selected here as well;
# age, job, marital, education and balance are picked purely as an example.
previewColumns = ["label", "prediction", "probability", "age", "job", "marital", "education", "balance"]
selected = predictions.select(previewColumns)
display(selected)

label,prediction,probability,age,job,marital,education,balance
0.0,1.0,"List(1, 2, List(), List(0.32995060141536453, 0.6700493985846354))",27,blue-collar,married,secondary,1295
0.0,0.0,"List(1, 2, List(), List(0.9541049403202051, 0.04589505967979488))",28,blue-collar,married,secondary,4
0.0,0.0,"List(1, 2, List(), List(0.9725206608306798, 0.027479339169320268))",28,blue-collar,married,secondary,8
0.0,0.0,"List(1, 2, List(), List(0.9687663599537482, 0.03123364004625187))",28,blue-collar,married,secondary,83
0.0,0.0,"List(1, 2, List(), List(0.9899070547085939, 0.01009294529140608))",29,blue-collar,married,secondary,25
0.0,0.0,"List(1, 2, List(), List(0.9747422743479525, 0.02525772565204751))",29,blue-collar,married,secondary,275
0.0,0.0,"List(1, 2, List(), List(0.9824740312824931, 0.017525968717507005))",29,blue-collar,married,secondary,415
0.0,0.0,"List(1, 2, List(), List(0.977441714825347, 0.022558285174653088))",29,blue-collar,married,secondary,1052
0.0,0.0,"List(1, 2, List(), List(0.9770909868533164, 0.022909013146683605))",30,blue-collar,married,secondary,-71
0.0,0.0,"List(1, 2, List(), List(0.9671978536597848, 0.032802146340215145))",30,blue-collar,married,secondary,90


In [14]:
from pyspark.ml.evaluation import BinaryClassificationEvaluator

# Evaluate model
# The evaluator reads the "rawPrediction" column of the scored DataFrame. No metricName
# is passed, so it reports whatever evaluator.getMetricName() returns by default.
evaluator = BinaryClassificationEvaluator(rawPredictionCol="rawPrediction")
# Last expression of the cell: the metric value is echoed as the cell's output.
evaluator.evaluate(predictions)

In [15]:
# Show which metric the evaluator computes (none was specified when it was created).
evaluator.getMetricName()

In [16]:
# explainParams() returns one multi-line string describing every parameter of the
# estimator. Print it so the line breaks are rendered; as a bare last expression the
# cell would echo the string's repr with escaped "\n" sequences instead.
print(logisticRegression.explainParams())

In [17]:
from pyspark.ml.tuning import ParamGridBuilder, CrossValidator

# Hyper-parameter search space for cross validation: every combination of the values
# below is tried (3 x 3 x 3 = 27 parameter maps).
searchSpace = [
    (logisticRegression.regParam, [0.01, 0.5, 2.0]),
    (logisticRegression.elasticNetParam, [0.0, 0.5, 1.0]),
    (logisticRegression.maxIter, [1, 5, 10]),
]

gridBuilder = ParamGridBuilder()
for gridParam, gridValues in searchSpace:
    gridBuilder = gridBuilder.addGrid(gridParam, gridValues)

paramGrid = gridBuilder.build()

In [18]:
# Create 5-fold CrossValidator.
# An explicit seed fixes how rows are assigned to folds, so the cross-validation result
# is repeatable across runs -- in line with the seeded train/test split.
crossValidator = CrossValidator(
    estimator=logisticRegression,
    estimatorParamMaps=paramGrid,
    evaluator=evaluator,
    numFolds=5,
    seed=100,
)

# Run cross validation on the training split.
# This will likely take a fair amount of time: one model is fitted and evaluated for
# every parameter map in paramGrid on each of the 5 folds.
crossValidatorModel = crossValidator.fit(trainingData)

In [19]:
# Score the held-out test set with the cross-validated model. The quality metric itself
# is computed separately by passing these predictions to the evaluator.
# NOTE: this rebinds `predictions`, replacing the predictions of the earlier single model.
predictions = crossValidatorModel.transform(testData)

In [20]:
# crossValidatorModel uses the best model found from the Cross Validation
# Evaluate best model: same evaluator object as for the first model, applied to the
# predictions produced by crossValidatorModel. Being the last expression of the cell,
# the metric value is echoed as the cell's output.
evaluator.evaluate(predictions)

In [21]:
# Intercept term of the best model selected by cross validation.
print('Model Intercept: ', crossValidatorModel.bestModel.intercept)

In [22]:
# Coefficient vector of the best model selected by cross validation.
coefficients = crossValidatorModel.bestModel.coefficients

# createDataFrame needs plain Python rows: convert each numpy value to float and wrap it
# in a 1-tuple so every coefficient becomes one single-column row.
weights = [(float(w),) for w in coefficients]

# Use the `spark` session this notebook already relies on to load its data, instead of
# the legacy `sqlContext` global, which is never defined in the notebook itself.
weightsDF = spark.createDataFrame(weights, ["Feature Weight"])
display(weightsDF)

Feature Weight
-0.2222247276205723
-0.0416248988050477
-0.0024997744071399
0.2014744286358824
-0.0934098791705363
0.4235262987344917
-0.1815665968419371
-0.1780356947015741
0.0454152487350491
-0.3608122814540584


In [23]:
# View the best model's predictions and per-class probabilities next to a handful of
# the original columns.
bestPreviewColumns = ["label", "prediction", "probability", "age", "job", "marital", "education", "balance"]
selected = predictions.select(bestPreviewColumns)
display(selected)

label,prediction,probability,age,job,marital,education,balance
0.0,1.0,"List(1, 2, List(), List(0.4105052682633678, 0.5894947317366321))",27,blue-collar,married,secondary,1295
0.0,0.0,"List(1, 2, List(), List(0.9458251953412922, 0.05417480465870783))",28,blue-collar,married,secondary,4
0.0,0.0,"List(1, 2, List(), List(0.9650328278612931, 0.03496717213870693))",28,blue-collar,married,secondary,8
0.0,0.0,"List(1, 2, List(), List(0.9609892808271774, 0.03901071917282252))",28,blue-collar,married,secondary,83
0.0,0.0,"List(1, 2, List(), List(0.9851817607469413, 0.01481823925305861))",29,blue-collar,married,secondary,25
0.0,0.0,"List(1, 2, List(), List(0.9673804235346667, 0.03261957646533326))",29,blue-collar,married,secondary,275
0.0,0.0,"List(1, 2, List(), List(0.9762629137565535, 0.02373708624344653))",29,blue-collar,married,secondary,415
0.0,0.0,"List(1, 2, List(), List(0.9704565528900098, 0.02954344710999016))",29,blue-collar,married,secondary,1052
0.0,0.0,"List(1, 2, List(), List(0.9702419049992519, 0.029758095000748185))",30,blue-collar,married,secondary,-71
0.0,0.0,"List(1, 2, List(), List(0.9593778948131877, 0.040622105186812286))",30,blue-collar,married,secondary,90
