In [1]:
import pandas as pd
import numpy as np

import matplotlib.pyplot as plt
import matplotlib.pylab as pylab
import seaborn as sns
from pyspark.sql import SparkSession
from pyspark.ml.feature import VectorAssembler
from pyspark.ml import Pipeline
from pyspark.ml.classification import LogisticRegression
from pyspark.ml.feature import StandardScaler
from pyspark.ml.evaluation import BinaryClassificationEvaluator

spark = SparkSession \
    .builder \
    .appName("Spark ML") \
    .config("spark.some.config.option", "some-value") \
    .getOrCreate()

df = spark.read.csv("creditcard.csv", header=True)

In [2]:
colnames = [col.name for col in df.schema.fields]
for col in colnames:
    df = df.withColumn(col, df[col].cast("float"))

Number of rows:

In [3]:
df.count()

284807

Number of columns:

In [4]:
len(df.schema)

31

Undersample the negative class to reach class balanceness:

In [5]:
pos = df.filter(df["Class"] == 1)
neg = df.filter(df["Class"] == 0)
ratio = pos.count() / float(neg.count())
neg = neg.sample(False, ratio)

In [6]:
print pos.count(), neg.count()

492 490


Combine positive and negative class and split into training and set

In [7]:
data = pos.union(neg)

data = data.withColumnRenamed("Class", "label")

train, test = data.randomSplit([0.8, 0.2], seed=123)

Create pipeline that first assemble the individual values into vectors, and standardize the matrix with zero mean and unit standar deviation, finally trains the logistic regression model.

In [8]:
assembler = VectorAssembler(inputCols=colnames[:-1],
                            outputCol="features")

scaler = StandardScaler(inputCol="features", outputCol="scaledFeatures",
                        withStd=True, withMean=True)

lr = LogisticRegression(maxIter=10,
                        regParam=0.3, elasticNetParam=0.8)

pipeline = Pipeline(stages=[assembler, scaler, lr])

pipelineModel = pipeline.fit(train)

testPred = pipelineModel.transform(test)

Spark ML supports only area under ROC, and area under Precision-recall curve

In [9]:
from pyspark.ml.evaluation import BinaryClassificationEvaluator

evaluator = BinaryClassificationEvaluator()
evaluator.explainParams()

'labelCol: label column name. (default: label)\nmetricName: metric name in evaluation (areaUnderROC|areaUnderPR) (default: areaUnderROC)\nrawPredictionCol: raw prediction (a.k.a. confidence) column name. (default: rawPrediction)'

Evaluating predicted labels:

In [10]:
print "Area under PR:", evaluator.evaluate(testPred, {evaluator.metricName: "areaUnderPR"})

print "Area under ROC:", evaluator.evaluate(testPred, {evaluator.metricName: "areaUnderROC"})

pd_df = testPred.toPandas()

print "Accuracy:", (pd_df["prediction"] == pd_df["label"]).mean()

Area under PR: 0.981362982758
Area under ROC: 0.970281790197
Accuracy: 0.886486486486
