In [28]:
import pandas as pd
import numpy as np

import matplotlib.pyplot as plt
import matplotlib.pylab as pylab
import seaborn as sns
from pyspark.sql import SparkSession
from pyspark.ml.feature import VectorAssembler

spark = SparkSession \
    .builder \
    .appName("Spark ML") \
    .config("spark.some.config.option", "some-value") \
    .getOrCreate()

df = spark.read.csv("creditcard.csv", header=True)

In [29]:
colnames = [col.name for col in df.schema.fields]
for col in colnames:
    df = df.withColumn(col, df[col].cast("float"))
    

In [27]:
colnames

['Time',
 'V1',
 'V2',
 'V3',
 'V4',
 'V5',
 'V6',
 'V7',
 'V8',
 'V9',
 'V10',
 'V11',
 'V12',
 'V13',
 'V14',
 'V15',
 'V16',
 'V17',
 'V18',
 'V19',
 'V20',
 'V21',
 'V22',
 'V23',
 'V24',
 'V25',
 'V26',
 'V27',
 'V28',
 'Amount',
 'Class']

In [3]:
df = VectorAssembler(inputCols=colnames[:-1], outputCol="features").transform(df).select(["features", "Class"])
df = df.withColumnRenamed("Class", "label")
print df.count()
df.persist()

284807


DataFrame[features: vector, label: float]

In [4]:
df.count()

284807

In [5]:
len(df.schema)

2

In [8]:
pos = df.filter(df["label"] == 1)

In [9]:
neg = df.filter(df["label"] == 0)

In [10]:
ratio = df.filter(df["label"] == 1).count() / float(df.filter(df["label"] == 0).count())

In [11]:
neg = neg.sample(False, ratio)

In [12]:
neg.count()

499

In [13]:
pos.count()

492

In [14]:
pos.persist()
neg.persist()

DataFrame[features: vector, label: float]

In [15]:
data = pos.union(neg)

In [16]:
data.count()

991

In [17]:
from pyspark.ml.feature import StandardScaler

scaler = StandardScaler(inputCol="features", outputCol="scaledFeatures",
                        withStd=True, withMean=False)

data_scaled = scaler.fit(data).transform(data)

In [20]:
data_scaled.persist()

DataFrame[features: vector, label: float, scaledFeatures: vector]

In [23]:
from pyspark.ml.classification import LogisticRegression

lr = LogisticRegression(maxIter=10, regParam=0.3, elasticNetParam=0.8)
lrModel = lr.fit(data_scaled)
print("Coefficients: " + str(lrModel.coefficients))
print("Intercept: " + str(lrModel.intercept))

Coefficients: (30,[4,11,14],[0.0511753023478,0.00386523684717,-0.0738534296013])
Intercept: -0.384517250541


In [25]:
data_scaled.head()

Row(features=DenseVector([406.0, -2.3122, 1.952, -1.6099, 3.9979, -0.5222, -1.4265, -2.5374, 1.3917, -2.7701, -2.7723, 3.202, -2.8999, -0.5952, -4.2893, 0.3897, -1.1407, -2.8301, -0.0168, 0.417, 0.1269, 0.5172, -0.035, -0.4652, 0.3202, 0.0445, 0.1778, 0.2611, -0.1433, 0.0]), label=1.0, scaledFeatures=DenseVector([0.0084, -0.4199, 0.5254, -0.2594, 1.2491, -0.1244, -0.8127, -0.435, 0.2883, -1.1881, -0.6119, 1.1486, -0.6346, -0.5554, -0.9207, 0.4014, -0.3293, -0.4751, -0.007, 0.3293, 0.1218, 0.1865, -0.03, -0.3946, 0.5769, 0.0666, 0.3703, 0.2609, -0.3196, 0.0]))