In [15]:
# tell jupyter where pyspark is
import findspark
findspark.init()
import xgboost as xgb

In [50]:
# Import useful stuff
from pyspark.sql import SparkSession
import pandas as pd
import numpy as np

# Import Models and support
from pyspark.sql.functions import col, avg
from pyspark.ml.feature import VectorAssembler
from pyspark.ml.evaluation import MulticlassClassificationEvaluator

from pyspark.ml.classification import LogisticRegression
from pyspark.ml.classification import GBTClassifier
from pyspark.ml.classification import LinearSVC

from pyspark.ml.classification import MultilayerPerceptronClassifier

In [3]:
# Build a SparkSession; 
spark = SparkSession\
    .builder\
    .appName("Final Project")\
    .getOrCreate()

In [4]:
# Load preprocessed dataset 
df = spark.read.csv("data/std.csv",inferSchema =True,header=True)
df = df.drop('_c0')
n_features = len(df.columns) - 1 

In [5]:
# Create a feature vector from the data
ignore = ['loan_status']
assembler = VectorAssembler(
    inputCols=[x for x in df.columns if x not in ignore],
    outputCol='features')

df = assembler.transform(df)
df = df.select(['loan_status','features'])

In [6]:
# Let's first split into a test and training set
(trainingData, testData) = df.randomSplit([0.8, 0.2])

# Now let's create all our models (Logistic Regression, GBT, Linear SVC, and MLP)
lr = LogisticRegression(maxIter=10, regParam=0.3, elasticNetParam=0.8,\
                        labelCol = 'loan_status', featuresCol = 'features')

gbt = GBTClassifier(maxIter=10,\
                    labelCol = 'loan_status', featuresCol = 'features')

lsvc = LinearSVC(maxIter=10, regParam=0.1,\
                 labelCol = 'loan_status', featuresCol = 'features')

mlp = MultilayerPerceptronClassifier(maxIter=100, layers=[n_features, 5, 4, 2], blockSize=128,seed=1234,\
                                     labelCol = 'loan_status', featuresCol = 'features')

In [7]:
# Train all the models
lr_model = lr.fit(trainingData)
print('Logistic Regression training complete')

gbt_model = gbt.fit(trainingData)
print('Gradient Boosted training complete')

lsvc_model = lsvc.fit(trainingData)
print('Linear SVC training complete')

mlp_model = mlp.fit(trainingData)
print('Multi-layer Perceptron training complete')

Logistic Regression training complete
Gradient Boosted training complete
Linear SVC training complete
Multi-layer Perceptron training complete


In [8]:
# Make predictions with all the models
lr_predictions = lr_model.transform(testData)
gbt_predictions = gbt_model.transform(testData)
lsvc_predictions = lsvc_model.transform(testData)
mlp_predictions = mlp_model.transform(testData)

In [9]:
# Compute test error
evaluator = MulticlassClassificationEvaluator(
    labelCol='loan_status', predictionCol="prediction", metricName="accuracy")

lr_accuracy = evaluator.evaluate(lr_predictions)
print("Logistic Regression Test Error = %g" % (1.0 - lr_accuracy))

gbt_accuracy = evaluator.evaluate(gbt_predictions)
print("Gradient Boosted Test Error = %g" % (1.0 - gbt_accuracy))

lsvc_accuracy = evaluator.evaluate(lsvc_predictions)
print("Linear SVC Test Error = %g" % (1.0 - lsvc_accuracy))

mlp_accuracy = evaluator.evaluate(mlp_predictions)
print("Multi-layer Perceptron Test Error = %g" % (1.0 - mlp_accuracy))

Logistic Regression Test Error = 0.0753789
Gradient Boosted Test Error = 0.0422353
Linear SVC Test Error = 0.0590807
Multi-layer Perceptron Test Error = 0.0754032


Now we stack the models. To do this need to use out-of-fold predictions. Pyspark sucks and I have no idea how to implement this in pyspark. 

*** CREATE EMPTY NP ARRAYS/VECTORS ***

*** NEED TO REINTRODUCE AN INDEX COLUMN TO SAVE INDICES ***

*** AFTER  RANDOM SPLITTING, GET ALL THE INDICES FROM EACH SPLIT ***

*** THIS ALLOWS US TO GENERATE PREDICTIONS BY FILTERING AND STORE THEM APPROPRIATELY ***

*** THEN THESE BITCH ASS ARRAYS CAN BE CONCATED AND SENT TO A CSV FOR 2ND LAYER BULLSHIT ***

*** THEN WE CONVERT THIS NEW FEATURE MATRIX FROM A CSV BACK INTO A PYSPARK DATAFRAME FOR MODELING ***


*** RAW_PREDICTIONS CHOICE IS A VECTOR, NOT SURE HOW TO HANDLE IT, COULD JUST LEAVE IT AS A VECTOR ***

*** FEATURE IMPORTANCE WOULD BE NICE AT THE END TOO ***

In [None]:
# This is to combine dataframes
from functools import reduce  # For Python 3.x
from pyspark.sql import DataFrame

def unionAll(*dfs):
    return reduce(DataFrame.unionAll, dfs)

# Out-of-fold predictions (for stacked models, pure prediction)
def get_oof(clf, trainingData, testData, x_test, choice):
    training_splits = trainingData.randomSplit([1.0, 1.0, 1.0], 1234)
    test_splits = testData.randomSplit([1.0, 1.0, 1.0], 1234)

    #oof_train = np.zeros((len(x_train),))
    #oof_test = np.zeros((len(x_test),))
    #oof_test_skf = np.empty((NFOLDS, len(x_test)))

    for i in range(0,len(training_splits)):
        #x_tr = x_train.iloc[train_index,:]
        #y_tr = y_train[train_index]
        #x_te = x_train.iloc[test_index,:]
    
        #clf.train(x_tr, y_tr)
        
        model = clf.fit(training_splits[i])
        oof_train[test_index] = clf.transform().select(choice)
        
    
        if choice == 'predict':
            oof_train[test_index] = clf.predict(x_te)
            oof_test_skf[i,:] = clf.predict(x_test)
        
        elif choice == 'decision_function':
            oof_train[test_index] = clf.decision_function(x_te)
            oof_test_skf[i,:] = clf.decision_function(x_test)
            
        i += 1

    oof_test[:] = oof_test_skf.mean(axis=0)    
    return oof_train.reshape(-1, 1), oof_test.reshape(-1, 1)