# NOTEBOOK 3.4 SparkML
Adapted from: https://github.com/apache/spark/blob/master/examples/src/main/python/ml/estimator_transformer_param_example.py

### Estimator Transformer Param Example.

In [None]:
from pyspark.sql import SparkSession
from pyspark.ml.linalg import Vectors
from pyspark.ml.classification import LogisticRegression

In [None]:
# Obtain the SparkSession for this notebook: reuse the active one if it
# already exists, otherwise start a new session under the given app name.
spark = (
    SparkSession.builder
    .appName("EstimatorTransformerParamExample")
    .getOrCreate()
)

24/06/05 10:51:09 WARN Utils: Your hostname, PC25. resolves to a loopback address: 127.0.1.1; using 192.168.76.195 instead (on interface eth0)
24/06/05 10:51:09 WARN Utils: Set SPARK_LOCAL_IP if you need to bind to another address
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
24/06/05 10:51:11 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


In [None]:
# Toy training set: each row is a (label, features) tuple, where the features
# are a dense 3-element vector.
training_rows = [
    (1.0, Vectors.dense([0.0, 1.1, 0.1])),
    (0.0, Vectors.dense([2.0, 1.0, -1.0])),
    (0.0, Vectors.dense([2.0, 1.3, 1.0])),
    (1.0, Vectors.dense([0.0, 1.2, -0.5])),
]
training = spark.createDataFrame(training_rows, ["label", "features"])

In [None]:
# Create a LogisticRegression instance as the Estimator.
lr = LogisticRegression(
    maxIter=10,     # upper bound on optimizer iterations
    regParam=0.01,  # regularization strength
)

# Show every parameter of the estimator with its documentation and its
# default/current value.
param_docs = lr.explainParams()
print("LogisticRegression parameters:\n" + param_docs + "\n")

LogisticRegression parameters:
aggregationDepth: suggested depth for treeAggregate (>= 2). (default: 2)
elasticNetParam: the ElasticNet mixing parameter, in range [0, 1]. For alpha = 0, the penalty is an L2 penalty. For alpha = 1, it is an L1 penalty. (default: 0.0)
family: The name of family which is a description of the label distribution to be used in the model. Supported options: auto, binomial, multinomial (default: auto)
featuresCol: features column name. (default: features)
fitIntercept: whether to fit an intercept term. (default: True)
labelCol: label column name. (default: label)
lowerBoundsOnCoefficients: The lower bounds on coefficients if fitting under bound constrained optimization. The bound matrix must be compatible with the shape (1, number of features) for binomial regression, or (number of classes, number of features) for multinomial regression. (undefined)
lowerBoundsOnIntercepts: The lower bounds on intercepts if fitting under bound constrained optimization. The bou

In [None]:
# Learn a LogisticRegression model by fitting the estimator on the training
# DataFrame. fit() uses the parameters currently stored in lr (maxIter=10 and
# regParam=0.01, as passed to its constructor) and returns the fitted model.
model1 = lr.fit(training)

                                                                                

In [None]:
# model1 is a Model, i.e. the transformer that the Estimator's fit() produced,
# so the parameters used during fitting can be read back from it.
# The map is printed as (name: value) pairs; each Param's parent is the unique
# ID of this LogisticRegression instance.
print("Model 1 was fit using parameters: ")
param_map = model1.extractParamMap()
print(param_map)

Model 1 was fit using parameters: 
{Param(parent='LogisticRegression_502cc7ac7531', name='aggregationDepth', doc='suggested depth for treeAggregate (>= 2).'): 2, Param(parent='LogisticRegression_502cc7ac7531', name='elasticNetParam', doc='the ElasticNet mixing parameter, in range [0, 1]. For alpha = 0, the penalty is an L2 penalty. For alpha = 1, it is an L1 penalty.'): 0.0, Param(parent='LogisticRegression_502cc7ac7531', name='family', doc='The name of family which is a description of the label distribution to be used in the model. Supported options: auto, binomial, multinomial'): 'auto', Param(parent='LogisticRegression_502cc7ac7531', name='featuresCol', doc='features column name.'): 'features', Param(parent='LogisticRegression_502cc7ac7531', name='fitIntercept', doc='whether to fit an intercept term.'): True, Param(parent='LogisticRegression_502cc7ac7531', name='labelCol', doc='label column name.'): 'label', Param(parent='LogisticRegression_502cc7ac7531', name='maxBlockSizeInMB', do

In [None]:
# Toy test set with the same (label, features) layout as the training data.
test_rows = [
    (1.0, Vectors.dense([-1.0, 1.5, 1.3])),
    (0.0, Vectors.dense([3.0, 2.0, -0.1])),
    (1.0, Vectors.dense([0.0, 2.2, -1.5])),
]
test = spark.createDataFrame(test_rows, ["label", "features"])

In [None]:
# Score the test data with the fitted model through Transformer.transform().
# Only the 'features' column is consumed by LogisticRegression.transform.
prediction = model1.transform(test)

# Keep the columns of interest and bring the rows back to the driver as a list.
result = prediction.select(
    "features", "label", "probability", "prediction"
).collect()

In [None]:
# Report one line per collected row: the inputs, then the model's class
# probabilities and predicted label.
for row in result:
    print(
        f"features={row.features}, label={row.label} -> "
        f"prob={row.probability}, prediction={row.prediction}"
    )

features=[-1.0,1.5,1.3], label=1.0 -> prob=[0.0019392203169556182,0.9980607796830444], prediction=1.0
features=[3.0,2.0,-0.1], label=0.0 -> prob=[0.995731919571047,0.004268080428952992], prediction=0.0
features=[0.0,2.2,-1.5], label=1.0 -> prob=[0.012004630236371003,0.987995369763629], prediction=1.0


In [None]:
# Shut down the SparkSession now that the example has finished.
spark.stop()