In [1]:
# tell jupyter where pyspark is
import findspark
findspark.init()

In [2]:
# Import useful stuff
from pyspark.sql import SparkSession
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

# Import Models and support
from pyspark.sql.functions import col, avg
from pyspark.ml.feature import VectorAssembler
from pyspark.ml.evaluation import MulticlassClassificationEvaluator

from pyspark.ml.classification import LogisticRegression
from pyspark.ml.classification import LinearSVC
from pyspark.ml.classification import GBTClassifier
from pyspark.ml.classification import DecisionTreeClassifier
from pyspark.ml.classification import RandomForestClassifier

from pyspark.ml.classification import MultilayerPerceptronClassifier

In [3]:
# Build a SparkSession; 
spark = SparkSession\
    .builder\
    .appName("Final Project")\
    .getOrCreate()

In [4]:
# Load preprocessed dataset 
df = spark.read.csv("data/oversampled_GLM_VIF_heat.csv",inferSchema =True,header=True)
df = df.drop('_c0')
n_features = len(df.columns) - 1 

In [5]:
# Create a feature vector from the data
ignore = ['loan_status']
assembler = VectorAssembler(
    inputCols=[x for x in df.columns if x not in ignore],
    outputCol='features')

df = assembler.transform(df)
df = df.select(['loan_status','features'])

In [6]:
# Let's first split into a test and training set
(trainingData, testData) = df.randomSplit([0.8, 0.2])

In [7]:
# Now let's create all our models (Logistic Regression, GBT, Linear SVC, and MLP)
lr = LogisticRegression(maxIter=10, regParam=0.1, elasticNetParam=0.8,\
                        labelCol = 'loan_status', featuresCol = 'features')

dt = DecisionTreeClassifier(labelCol='loan_status', featuresCol="features")

rf = RandomForestClassifier(numTrees=10,\
                            labelCol="loan_status", featuresCol="features")

gbt = GBTClassifier(maxIter=10,\
                    labelCol = 'loan_status', featuresCol = 'features')

lsvc = LinearSVC(maxIter=10, regParam=0.1,\
                 labelCol = 'loan_status', featuresCol = 'features')

mlp = MultilayerPerceptronClassifier(maxIter=100, layers=[3, 5, 4, 2], blockSize=128,seed=1234,\
                                     labelCol = 'loan_status', featuresCol = 'features')

In [None]:
# This is to combine dataframes
from functools import reduce  # For Python 3.x
from pyspark.sql import DataFrame

def unionAll(*dfs):
    return reduce(DataFrame.unionAll, dfs)

# This is for generating OOF training sets
from itertools import combinations

# first split training set into k-folds
training_splits = trainingData.randomSplit([1.0, 1.0, 1.0], 1234)

# generate list of training folds
fold_training = []
for folds in combinations([2,1,0], 2):
    fold_training.append(unionAll(training_splits[folds[0]], training_splits[folds[1]]))

In [None]:
# OOF function that takes the clf and the training_splits list as inputs
# and outputs the OOF predictions in a pandas dataframe

def oof(clf, fold_training, training_splits, testData):
    # fit k-1 folds on 1st stage model and predict on kth fold for all folds
    fold_prediction = []
    test_prediction = []
    for i in range(0, 3):
        model = clf.fit(fold_training[i])
        fold_prediction.append(model.transform(training_splits[i]).select(['loan_status', 'prediction']).toPandas())
        test_prediction.append(model.transform(testData).select('prediction').toPandas())
    
    # we now have a list of pandas dataframes which we will concat and average (for test set)
    layer2_training = pd.concat(fold_prediction)
    layer2_test_predictions = pd.concat(test_prediction, axis=1).mean(axis=1)
    test_loans = testData.select('loan_status').toPandas()
    layer2_test = pd.concat([test_loans, layer2_test_predictions], axis=1)
    
    return layer2_training, layer2_test

Mixed models layer1

In [None]:
lr_layer2_training, lr_layer2_test = oof(lr, fold_training, training_splits, testData)
print('Logistic Regression OOF predictions complete.')

gbt_layer2_training, gbt_layer2_test = oof(gbt, fold_training, training_splits, testData)
print('Gradient-Boosted OOF predictions complete.')

lsvc_layer2_training, lsvc_layer2_test = oof(lsvc, fold_training, training_splits, testData)
print('Linear SVC OOF predictions complete.')

# concat them all
layer2_training = pd.concat([lr_layer2_training, gbt_layer2_training, lsvc_layer2_training], axis=1)
layer2_test = pd.concat([lr_layer2_test, gbt_layer2_test, lsvc_layer2_test], axis=1)

# send to csv
layer2_training.to_csv('data/stacking/layer2_oversampled_training_mixed_new.csv')
print('Layer 2 training set csv written.')

layer2_test.to_csv('data/stacking/layer2_oversampled_test_mixed_new.csv')
print('Layer 2 test set csv written.')

Tree models layer1

In [None]:
dt_layer2_training, dt_layer2_test = oof(dt, fold_training, training_splits, testData)
print('Decision Tree OOF predictions complete.')

rf_layer2_training, rf_layer2_test = oof(rf, fold_training, training_splits, testData)
print('Random Forest OOF predictions complete.')

#gbt_layer2_training, gbt_layer2_test = oof(gbt, fold_training, training_splits, testData)
#print('Gradient-Boosted OOF predictions complete.')

# concat them all
layer2_training = pd.concat([dt_layer2_training, rf_layer2_training, gbt_layer2_training], axis=1)
layer2_test = pd.concat([dt_layer2_test, rf_layer2_test, gbt_layer2_test], axis=1)

# send to csv
layer2_training.to_csv('data/stacking/layer2_oversampled_training_trees_new.csv')
print('Layer 2 trees training set csv written.')

layer2_test.to_csv('data/stacking/layer2_oversampled_test_trees_new.csv')
print('Layer 2 trees test set csv written.')

Read and reformat data suitable for modeling

In [8]:
# layer2 into pyspark dataframes again (remove index column again and clean names/filter)
l2_train_df = spark.read.csv("data/stacking/layer2_oversampled_training_mixed_new.csv",inferSchema =True,header=True)
l2_train_df = l2_train_df.drop('_c0')
l2_train_df = l2_train_df.select(col('loan_status1').alias('loan_status'), col('prediction2').alias('feature1'),\
                                col('prediction4').alias('feature2'), col('prediction6').alias('feature3'))

l2_test_df = spark.read.csv("data/stacking/layer2_oversampled_test_mixed_new.csv",inferSchema =True,header=True)
l2_test_df = l2_test_df.drop('_c0')
l2_test_df = l2_test_df.select(col('loan_status1').alias('loan_status'), col('02').alias('feature1'),\
                               col('04').alias('feature2'), col('06').alias('feature3'))

In [9]:
# Vector assembler again
ignore = ['loan_status']
assembler = VectorAssembler(
    inputCols=[x for x in l2_train_df.columns if x not in ignore],
    outputCol='features')

train_df = assembler.transform(l2_train_df)
train_df = train_df.select(['loan_status','features'])

test_df = assembler.transform(l2_test_df)
test_df = test_df.select(['loan_status','features'])

In [10]:
# Convert to DenseVector since VectorAssembler optimized some entries out
from pyspark.ml.linalg import Vectors, VectorUDT
from pyspark.sql import functions as F

ud_f = F.udf(lambda r : Vectors.dense(r),VectorUDT())

train_df = train_df.withColumn('features_array',ud_f('features'))
train_df = train_df.select('loan_status', col('features_array').alias('features'))

test_df = test_df.withColumn('features_array',ud_f('features'))
test_df = test_df.select('loan_status', col('features_array').alias('features'))

Train our models and predict on test set

In [11]:
mlp_model = mlp.fit(train_df)
print('Multi-layer Perceptron training complete')

mlp_predictions = mlp_model.transform(test_df)

# Compute test error
evaluator = MulticlassClassificationEvaluator(
    labelCol='loan_status', predictionCol="prediction", metricName="accuracy")

mlp_accuracy = evaluator.evaluate(mlp_predictions)
print("Mixed Stacked Multi-layer Perceptron Test Error = %g" % (1.0 - mlp_accuracy))

# Compute f1 score
f1_evaluator = MulticlassClassificationEvaluator(
    labelCol='loan_status', predictionCol="prediction", metricName="f1")

mlp_f1 = f1_evaluator.evaluate(mlp_predictions)
print("Mixed Stacked Multi-layer Perceptron F1 Score = %g" % mlp_f1)

from pyspark.ml.evaluation import BinaryClassificationEvaluator
# Compute auc score
auc_evaluator = BinaryClassificationEvaluator(
    labelCol='loan_status', rawPredictionCol="prediction", metricName="areaUnderROC")

mlp_auc = auc_evaluator.evaluate(mlp_predictions)
print("Mixed Stacked Multi-layer Perceptron AUC Score = %g" % mlp_auc)

Multi-layer Perceptron training complete
Mixed Stacked Multi-layer Perceptron Test Error = 0.0639938
Mixed Stacked Multi-layer Perceptron F1 Score = 0.935998
Mixed Stacked Multi-layer Perceptron AUC Score = 0.936004


Now lets do the tree based layer1 with the same layer2.

In [12]:
# layer2 into pyspark dataframes again (remove index column again and clean names/filter)
l2_train_df = spark.read.csv("data/stacking/layer2_oversampled_training_trees_new.csv",inferSchema =True,header=True)
l2_train_df = l2_train_df.drop('_c0')
l2_train_df = l2_train_df.select(col('loan_status1').alias('loan_status'), col('prediction2').alias('feature1'),\
                                col('prediction4').alias('feature2'), col('prediction6').alias('feature3'))

l2_test_df = spark.read.csv("data/stacking/layer2_oversampled_test_trees_new.csv",inferSchema =True,header=True)
l2_test_df = l2_test_df.drop('_c0')
l2_test_df = l2_test_df.select(col('loan_status1').alias('loan_status'), col('02').alias('feature1'),\
                               col('04').alias('feature2'), col('06').alias('feature3'))

In [13]:
# Vector assembler again
ignore = ['loan_status']
assembler = VectorAssembler(
    inputCols=[x for x in l2_train_df.columns if x not in ignore],
    outputCol='features')

train_df = assembler.transform(l2_train_df)
train_df = train_df.select(['loan_status','features'])

test_df = assembler.transform(l2_test_df)
test_df = test_df.select(['loan_status','features'])

In [14]:
# Convert to DenseVector since VectorAssembler optimized some entries out
from pyspark.ml.linalg import Vectors, VectorUDT
from pyspark.sql import functions as F

ud_f = F.udf(lambda r : Vectors.dense(r),VectorUDT())

train_df = train_df.withColumn('features_array',ud_f('features'))
train_df = train_df.select('loan_status', col('features_array').alias('features'))

test_df = test_df.withColumn('features_array',ud_f('features'))
test_df = test_df.select('loan_status', col('features_array').alias('features'))

In [15]:
mlp_model = mlp.fit(train_df)
print('Multi-layer Perceptron training complete')

mlp_predictions = mlp_model.transform(test_df)

# Compute test error
evaluator = MulticlassClassificationEvaluator(
    labelCol='loan_status', predictionCol="prediction", metricName="accuracy")

mlp_accuracy = evaluator.evaluate(mlp_predictions)
print("Tree Stacked Multi-layer Perceptron Layer 2 Test Error = %g" % (1.0 - mlp_accuracy))

# Compute f1 score
f1_evaluator = MulticlassClassificationEvaluator(
    labelCol='loan_status', predictionCol="prediction", metricName="f1")

mlp_f1 = f1_evaluator.evaluate(mlp_predictions)
print("Tree Stacked Multi-layer Perceptron F1 Score = %g" % mlp_f1)

from pyspark.ml.evaluation import BinaryClassificationEvaluator
# Compute auc score
auc_evaluator = BinaryClassificationEvaluator(
    labelCol='loan_status', rawPredictionCol="prediction", metricName="areaUnderROC")

mlp_auc = auc_evaluator.evaluate(mlp_predictions)
print("Tree Stacked Multi-layer Perceptron AUC Score = %g" % mlp_auc)

Multi-layer Perceptron training complete
Tree Stacked Multi-layer Perceptron Layer 2 Test Error = 0.057365
Tree Stacked Multi-layer Perceptron F1 Score = 0.942635
Tree Stacked Multi-layer Perceptron AUC Score = 0.942635
