# NOTEBOOK 3.6 SparkML Pipeline
Adapted from the Apache Spark example `examples/src/main/python/ml/pipeline_example.py`.

### Pipeline Example.

In [None]:
from pyspark.sql import SparkSession
from pyspark.ml import Pipeline
from pyspark.ml.classification import LogisticRegression
from pyspark.ml.feature import HashingTF, Tokenizer

In [None]:
# Get (or create) the SparkSession used by every cell below.
spark = (
    SparkSession.builder
    .appName("PipelineExample")
    .getOrCreate()
)

24/06/05 10:52:08 WARN Utils: Your hostname, PC25. resolves to a loopback address: 127.0.1.1; using 192.168.76.195 instead (on interface eth0)
24/06/05 10:52:08 WARN Utils: Set SPARK_LOCAL_IP if you need to bind to another address
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
24/06/05 10:52:09 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


In [None]:
# Toy labelled corpus: one (id, text, label) tuple per document.
training = spark.createDataFrame(
    data=[
        (0, "a b c d e spark", 1.0),
        (1, "b d", 0.0),
        (2, "spark f g h", 1.0),
        (3, "hadoop mapreduce", 0.0),
    ],
    schema=["id", "text", "label"],
)

In [None]:
# Assemble the three-stage ML pipeline: tokenizer -> hashingTF -> lr.

# Stage 1: read the "text" column and write the result to "words".
tokenizer = Tokenizer(
    inputCol="text",
    outputCol="words",
)

# Stage 2: consume the tokenizer's output column and write "features".
hashingTF = HashingTF(
    inputCol=tokenizer.getOutputCol(),
    outputCol="features",
)

# Stage 3: logistic regression, capped at 10 iterations with regParam 0.001.
lr = LogisticRegression(
    maxIter=10,
    regParam=0.001,
)

# Stages run in list order when the pipeline is fitted.
pipeline = Pipeline(
    stages=[tokenizer, hashingTF, lr],
)

In [None]:
# Train every pipeline stage on the labelled documents; the result is the
# fitted model used for prediction below.
model = pipeline.fit(dataset=training)

                                                                                

In [None]:
# Toy unlabelled corpus to score: one (id, text) tuple per document.
test = spark.createDataFrame(
    data=[
        (4, "spark i j k"),
        (5, "l m n"),
        (6, "spark hadoop spark"),
        (7, "apache hadoop"),
    ],
    schema=["id", "text"],
)

In [None]:
# Score the test documents with the fitted pipeline and print the columns of
# interest for each one.
prediction = model.transform(test)
selected = prediction.select("id", "text", "probability", "prediction")

# The per-row label gets its own name (`predicted_label`) so that it does not
# overwrite the `prediction` DataFrame above; otherwise re-running or extending
# the notebook after this cell would find a float where a DataFrame is expected.
for rid, text, prob, predicted_label in selected.collect():
    print(f"({rid}, {text}) --> prob={prob!s}, prediction={predicted_label}")

(4, spark i j k) --> prob=[0.6292098489668484,0.3707901510331516], prediction=0.0
(5, l m n) --> prob=[0.984770006762304,0.015229993237696027], prediction=0.0
(6, spark hadoop spark) --> prob=[0.13412348342566097,0.8658765165743391], prediction=1.0
(7, apache hadoop) --> prob=[0.9955732114398529,0.00442678856014711], prediction=0.0


In [None]:
# Stop the SparkSession now that the example has finished.
spark.stop()