In [None]:
import os
import sys

# Point PySpark at the cluster's Spark installation and Python interpreter and
# select the in-memory catalog. These are set before shell.py is executed below.
os.environ["PYSPARK_SUBMIT_ARGS"] = '--conf spark.sql.catalogImplementation=in-memory pyspark-shell'
os.environ["PYSPARK_PYTHON"] = '/opt/anaconda/envs/bd9/bin/python'
os.environ["SPARK_HOME"] = '/usr/hdp/current/spark2-client'

spark_home = os.environ.get('SPARK_HOME', None)
if not spark_home:
    raise ValueError('SPARK_HOME environment variable is not set')

# Make the pyspark package and the py4j archive shipped with Spark importable.
sys.path.insert(0, os.path.join(spark_home, 'python'))
sys.path.insert(0, os.path.join(spark_home, 'python/lib/py4j-0.10.7-src.zip'))

# Execute PySpark's shell bootstrap in this namespace; later cells rely on the
# `spark` session it creates. The file is opened in a context manager so the
# handle is closed, and compiled with its real path so tracebacks point at it.
shell_path = os.path.join(spark_home, 'python/pyspark/shell.py')
with open(shell_path) as shell_file:
    exec(compile(shell_file.read(), shell_path, 'exec'))

In [None]:
# Load the credit-fraud dataset from its Parquet directory into a DataFrame.
data = spark.read.parquet("/user/pavel.klemenkov/lectures/lecture05/credit_fraud/")

In [None]:
# Show the column names and types of the loaded dataset.
data.printSchema()

We'll be using 3 columns of this dataset:


- `pcaVector`: The PCA transformation of raw transaction data. For this example we'll assume that this PCA transformation occurs as part of some data pipeline before the data reaches us.
- `amountRange`: A value between 0 and 7. The approximate amount of a transaction. The values correspond to 0-1, 1-5, 5-10, 10-20, 20-50, 50-100, 100-200, and 200+ in dollars.
- `label`: 0 or 1. Indicates whether a transaction was fraudulent.

We want to build a model that will predict the label using the `pcaVector` and `amountRange` data. We'll do this by using a pipeline with 3 stages.


1. A `OneHotEncoderEstimator` to build a vector from the `amountRange` column. 
2. A `VectorAssembler` to merge our `pcaVector` and the one-hot encoded `amountVect` vector into our `features` vector. 
3. A `GBTClassifier` to serve as our `Estimator`.

Let's start by creating the objects that represent these stages.

In [None]:
from pyspark.ml.feature import OneHotEncoderEstimator, VectorAssembler
from pyspark.ml.classification import GBTClassifier

In [None]:
# Stage 1: one-hot encode the `amountRange` column into the new vector column
# `amountVect`.
one_hot = OneHotEncoderEstimator(inputCols=["amountRange"], outputCols=["amountVect"])

In [None]:
# Stage 2: concatenate the one-hot `amountVect` and the `pcaVector` columns
# into a single `features` vector column.
assembler = VectorAssembler(inputCols=["amountVect", "pcaVector"], outputCol="features")

In [None]:
# Stage 3: gradient-boosted tree classifier, created with all default
# parameters (no column names or hyperparameters are overridden here).
estimator = GBTClassifier()

In [None]:
from pyspark.ml.feature import VectorSizeHint

In [None]:
# Declare that every `pcaVector` has 28 elements.
# NOTE(review): presumably needed so VectorAssembler knows the vector size
# when the pipeline is later applied to a streaming DataFrame - confirm
# against the VectorSizeHint documentation.
hint = VectorSizeHint(inputCol="pcaVector", size=28)

In [None]:
# Split the data roughly 80/20 into training and held-out test sets.
# A fixed seed keeps the split (and therefore the fitted model and the test
# data written out later) reproducible across notebook runs.
train, test = data.randomSplit([0.8, 0.2], seed=42)

In [None]:
from pyspark.ml import Pipeline

# Chain the stages in execution order: one-hot encoding, vector size hint,
# feature assembly, and finally the classifier.
pipeline = Pipeline(stages=[one_hot, hint, assembler, estimator])

In [None]:
# Fit every pipeline stage on the training split; the result is the fitted
# model that is applied to the streaming data further down.
pipeline_model = pipeline.fit(train)

In [None]:
# Directory that holds the held-out split; it is read back as a stream later.
testDataPath = "/user/pavel.klemenkov/lectures/lecture05/credit-card-fraud-test-data"

# Repartition into 20 partitions before writing so the output is spread over
# several Parquet files, replacing anything already stored at that path.
test.repartition(20).write.parquet(testDataPath, mode="overwrite")

In [None]:
from pyspark.sql.types import *
from pyspark.ml.linalg import VectorUDT

In [None]:
# Explicit schema of the stored test data: three integer columns plus the
# ML vector column holding the PCA features.
schema = (
    StructType()
    .add("time", IntegerType())
    .add("amountRange", IntegerType())
    .add("label", IntegerType())
    .add("pcaVector", VectorUDT())
)

In [None]:
# Read the stored test data as a stream using the explicit schema.
# `maxFilesPerTrigger` is set to 1 so the files are consumed gradually
# instead of all at once.
streamingData = (
    spark.readStream
    .schema(schema)
    .option("maxFilesPerTrigger", 1)
    .parquet(testDataPath)
)

In [None]:
# Apply the fitted pipeline to the stream and count the rows per `label` value.
# NOTE(review): the grouping key is the true `label` column, not the model's
# prediction, so these counts do not depend on the model output - confirm
# this is intended.
streamingRates = pipeline_model.transform(streamingData).groupBy("label").count()

In [None]:
# Bare expression: in a notebook this displays the DataFrame's representation.
streamingRates

In [None]:
# Start the streaming aggregation. Results go to the in-memory sink in
# "complete" output mode and are queryable through SQL under the name "labels".
streaming_query = (
    streamingRates.writeStream
    .format("memory")
    .outputMode("complete")
    .queryName("labels")
    .start()
)

In [None]:
import time
from IPython.display import clear_output

In [None]:
# Refresh the displayed contents of the in-memory "labels" table every
# 5 seconds for as long as the streaming query is running.
try:
    # Looping on isActive (instead of `while True`) ends the polling once the
    # query has been stopped or has failed.
    while streaming_query.isActive:
        # wait=True defers clearing until the new table is ready to be drawn,
        # which avoids the output flickering between refreshes.
        clear_output(wait=True)
        spark.sql("select * from labels").show()
        time.sleep(5)
except KeyboardInterrupt:
    # Interrupting the kernel is the expected way to leave the loop early.
    pass

In [None]:
# Check whether the streaming query is still running.
streaming_query.isActive

In [None]:
# Stop the streaming query started above.
streaming_query.stop()