# Logistic Regression Classification

In [0]:
# Importing spark_rapids_ml.install lets pyspark.ml imports of supported estimators resolve to their accelerated versions.
# Comment out or skip this cell for CPU-only runs.
import spark_rapids_ml.install

In [0]:
import pandas as pd
from gen_data_distributed import ClassificationDataGen, SparseRegressionDataGen
from sklearn.model_selection import train_test_split
from pyspark.ml.evaluation import BinaryClassificationEvaluator, MulticlassClassificationEvaluator
import time

Disable MLflow autologging, as it is resource intensive.

In [0]:
import mlflow
# Turn off MLflow autologging for this session; the notebook treats it as too resource intensive.
mlflow.autolog(disable=True)

### Create synthetic dataset

The number of classes in the dataset is set to 2 below.  Larger values for `n_classes` are also supported.

In [0]:
# Number of target classes for the synthetic dataset (2 = binary classification).
# NOTE(review): no later cell visible in this notebook references n_classes — confirm it is still needed.
n_classes = 2

In [0]:
# Synthetic classification data: 20 billion rows x 3 columns, with 2000 output files requested.
n_rows = 20_000_000_000
n_cols = 3
n_files = 2_000
# The destination path encodes the size as "<rows in billions>bx<columns>".
output = f"s3://eordentlich/polynomial-lr/{int(n_rows/1e9)}bx{n_cols}"

# The generator is configured through a command-line style argument list.
data_generator = ClassificationDataGen(
    argv=[
        "--no_shutdown",
        "--num_rows", f"{n_rows}",
        "--num_cols", f"{n_cols}",
        "--output_num_files", f"{n_files}",
        "--output_dir", output,
        "--n_redundant", "1",
    ]
)

generated_df = data_generator.gen_dataframe(spark)

Passing {'n_redundant': 1, 'random_state': 1} to make_classification


In [0]:
# Sparse variant: 20 billion rows x 300 columns, with 2000 output files requested.
# Running this cell rebinds n_cols, output, data_generator and generated_df from the previous cell.
n_rows = 20_000_000_000
n_cols = 300
n_files = 2_000
output = f"s3://eordentlich/polynomial-lr/{int(n_rows/1e9)}bx{n_cols}_sparse"

data_generator = SparseRegressionDataGen(
    argv=[
        "--no_shutdown",
        "--num_rows", f"{n_rows}",
        "--num_cols", f"{n_cols}",
        "--output_num_files", f"{n_files}",
        "--output_dir", output,
        "--n_redundant", "1",
        "--density_curve", "Linear",
    ]
)

generated_df = data_generator.gen_dataframe(spark)

In [0]:
# Element 0 of gen_dataframe's result is used as the DataFrame here; report its partition count.
generated_df[0].rdd.getNumPartitions()

2000

In [0]:
# Persist the generated data as Parquet at the path held in `output`.
generated_df[0].write.parquet(output)

In [0]:
# Display the destination path for reference.
output

's3://eordentlich/polynomial-lr/10bx4'

### Load dataset

In [0]:
# Load only the part files matched by these two glob patterns rather than the whole directory.
df = spark.read.parquet(
    's3://eordentlich/polynomial-lr/10bx4/part-00{0,1}*',
    's3://eordentlich/polynomial-lr/10bx4/part-002{0,1,2,3,4}*',
)
# Row count of what was loaded.
df.count()

1250000000

In [0]:
# Display the loaded DataFrame's repr (column names and types).
df

DataFrame[c0: float, c1: float, c2: float, c3: float, label: float]

In [0]:
# Every column except the target is a model input.
feature_cols = list(df.columns)
# list.remove raises ValueError if there is no "label" column.
feature_cols.remove("label")
feature_cols

['c0', 'c1', 'c2', 'c3']

In [0]:
from pyspark.ml import Pipeline
from pyspark.ml.feature import PolynomialExpansion, VectorAssembler

# Stage 1: pack the raw feature columns into a single vector column.
assembler = VectorAssembler(inputCols=feature_cols, outputCol="features")
# Stage 2: degree-2 polynomial expansion of that vector.
expander = PolynomialExpansion(inputCol="features", outputCol="expanded_features", degree=2)

# This is a transform-only pipeline, so no data processing is actually run by fit().
# Note that `pipeline` holds the fitted result of .fit(df), not the unfitted Pipeline.
pipeline = Pipeline(stages=[assembler, expander]).fit(df)

In [0]:
# Apply the pipeline, then drop the raw input columns and the un-expanded vector
# so that only the label and the expanded features are kept for training.
train_df = pipeline.transform(df).drop(*feature_cols, "features")

In [0]:
# Display the training DataFrame's repr to check which columns remain.
train_df

DataFrame[label: float, expanded_features: vector]

In [0]:
# Rename the expanded vector to "features", the column name the classifier is configured to read.
train_df=train_df.withColumnRenamed("expanded_features", "features")
# Peek at a few rows.
train_df.take(5)

[Row(label=0.0, features=DenseVector([-2.2766, 5.1828, -3.0791, 7.0099, 9.481, 0.242, -0.551, -0.7453, 0.0586, -0.4621, 1.052, 1.4228, -0.1118, 0.2135])),
 Row(label=0.0, features=DenseVector([-0.7339, 0.5386, 0.7457, -0.5472, 0.556, -0.2095, 0.1537, -0.1562, 0.0439, 0.7099, -0.521, 0.5293, -0.1487, 0.5039])),
 Row(label=0.0, features=DenseVector([-1.4893, 2.218, -1.4956, 2.2274, 2.2369, 0.0726, -0.1081, -0.1085, 0.0053, 0.8614, -1.2829, -1.2883, 0.0625, 0.742])),
 Row(label=1.0, features=DenseVector([0.7798, 0.6081, -1.1091, -0.8648, 1.23, 0.275, 0.2144, -0.3049, 0.0756, 0.6444, 0.5025, -0.7146, 0.1772, 0.4152])),
 Row(label=1.0, features=DenseVector([0.8752, 0.766, 1.8113, 1.5852, 3.2807, -0.1968, -0.1723, -0.3565, 0.0387, -0.8866, -0.7759, -1.6058, 0.1745, 0.786]))]

## Train with CrossValidator

In [0]:
def build_lr_classifier(estimator_class):
    """Create a logistic regression estimator with this notebook's baseline settings.

    The estimator is constructed with ``verbose=7`` and then configured through
    its setter methods, one call at a time.

    NOTE(review): a function with this same name is defined again in the next
    cell; whichever cell runs last determines the active definition.

    Args:
        estimator_class: Estimator class to instantiate. It must accept a
            ``verbose`` keyword and provide the setters used below, each
            returning the estimator to continue with.

    Returns:
        The estimator returned by the final setter call.
    """
    lr = estimator_class(verbose=7)
    lr = lr.setFeaturesCol("features")
    lr = lr.setLabelCol("label")
    lr = lr.setRegParam(0.001)
    lr = lr.setElasticNetParam(0.5)
    lr = lr.setMaxIter(100)
    lr = lr.setTol(1.0e-30)
    return lr

In [0]:
def build_lr_classifier(estimator_class, **estimator_kwargs):
    """Create a logistic regression estimator with this notebook's baseline settings.

    Constructor keyword arguments are forwarded unchanged, so one builder covers
    both the plain case (``build_lr_classifier(cls)``) and variants that need
    extra constructor options (e.g. ``build_lr_classifier(cls, verbose=7)`` for
    an estimator class that accepts ``verbose``). This avoids keeping several
    same-named definitions that silently shadow one another.

    Args:
        estimator_class: Estimator class to instantiate. It must provide the
            chained setters used below.
        **estimator_kwargs: Optional keyword arguments passed to
            ``estimator_class(...)``. With none given, the class is
            instantiated with no arguments.

    Returns:
        The configured estimator (the value returned by the final setter).
    """
    return (
        estimator_class(**estimator_kwargs)
        .setFeaturesCol("features")
        .setLabelCol("label")
        .setRegParam(0.001)
        .setElasticNetParam(0.5)
        .setMaxIter(100)
        # Tolerance as set by the original notebook; left unchanged.
        .setTol(1.0e-30)
    )

In [0]:
from pyspark.ml.classification import LogisticRegression
# Build the estimator from the class imported under the pyspark.ml name.
# NOTE(review): with spark_rapids_ml.install imported, this is expected to be the accelerated class — the type check that follows shows which one was used.
classifier = build_lr_classifier(LogisticRegression)

In [0]:
# Check which implementation was actually instantiated.
type(classifier)

spark_rapids_ml.classification.LogisticRegression

In [0]:
# Single fit with the baseline hyperparameters set by build_lr_classifier.
# NOTE(review): `model` is reassigned by the CrossValidator runs further down.
model=classifier.fit(train_df)

INFO:spark_rapids_ml.classification.LogisticRegression:CUDA managed memory enabled.
INFO:spark_rapids_ml.classification.LogisticRegression:Training tasks require the resource(cores=16, gpu=1.0)
INFO:spark_rapids_ml.classification.LogisticRegression:Training spark-rapids-ml with 4 worker(s) ...
INFO:spark_rapids_ml.classification.LogisticRegression:Finished training


In [0]:
# GPU-accelerated cross-validation does not yet support AUC (and similar metrics); it would fall back to
# suboptimal processing, so use logLoss, which is supported.
# NOTE(review): the name `eval` shadows Python's built-in eval(). Renaming it also requires updating the
# CrossValidator cell, which passes evaluator=eval.
eval = MulticlassClassificationEvaluator(metricName="logLoss", labelCol="label")

In [0]:
def with_benchmark(phrase, action):
    """Run ``action`` once, print how long it took, and return its result.

    Args:
        phrase: Label placed at the start of the printed timing line.
        action: Zero-argument callable to execute and time.

    Returns:
        Whatever ``action()`` returns. If ``action`` raises, the exception
        propagates and no timing line is printed.
    """
    # perf_counter() is a monotonic, high-resolution clock, so the measured
    # duration cannot be skewed by wall-clock adjustments as time.time() can.
    start = time.perf_counter()
    result = action()
    elapsed = time.perf_counter() - start
    print("{} takes {} seconds".format(phrase, elapsed))
    return result

In [0]:
from pyspark.ml.tuning import CrossValidator, ParamGridBuilder

# 4 regParam values x 4 elasticNetParam values = 16 parameter combinations.
grid_builder = ParamGridBuilder()
grid_builder = grid_builder.addGrid(classifier.regParam, [0.00005, 0.001, 0.005, 0.01])
grid_builder = grid_builder.addGrid(classifier.elasticNetParam, [0.25, 0.5, 0.75, 0.9])
grid = grid_builder.build()

# One candidate at a time (parallelism=1), with a fixed seed.
cv = CrossValidator(
    estimator=classifier,
    estimatorParamMaps=grid,
    evaluator=eval,
    parallelism=1,
    seed=1,
)



In [0]:

# Check which CrossValidator implementation was actually instantiated.
type(cv)

spark_rapids_ml.tuning.CrossValidator

In [0]:
# gpu 16 param sweeps, 4x g5.4xlarge
# Time the whole cross-validated fit; this rebinds `model` to the CrossValidator result.
model = with_benchmark("Training CV", lambda: cv.fit(train_df))
# avgMetrics is printed as a list; presumably one logLoss value per grid entry — TODO confirm ordering against the grid.
print(f"average metrics: {model.avgMetrics}")

INFO:spark_rapids_ml.classification.LogisticRegression:CUDA managed memory enabled.
INFO:spark_rapids_ml.classification.LogisticRegression:Training tasks require the resource(cores=16, gpu=1.0)
INFO:spark_rapids_ml.classification.LogisticRegression:Training spark-rapids-ml with 4 worker(s) ...
INFO:spark_rapids_ml.classification.LogisticRegression:Finished training
INFO:spark_rapids_ml.classification.LogisticRegressionModel:CUDA managed memory enabled.
INFO:spark_rapids_ml.classification.LogisticRegression:CUDA managed memory enabled.
INFO:spark_rapids_ml.classification.LogisticRegression:Training tasks require the resource(cores=16, gpu=1.0)
INFO:spark_rapids_ml.classification.LogisticRegression:Training spark-rapids-ml with 4 worker(s) ...
INFO:spark_rapids_ml.classification.LogisticRegression:Finished training
INFO:spark_rapids_ml.classification.LogisticRegressionModel:CUDA managed memory enabled.
INFO:spark_rapids_ml.classification.LogisticRegression:CUDA managed memory enabled.
IN

Training CV takes 3468.310155391693 seconds
average metrics: [0.0874251412252141, 0.08742422382011435, 0.08742667654949954, 0.08742304175880637, 0.08935976529076677, 0.08886029039149217, 0.08839960716405448, 0.08812804930149491, 0.10306701046433374, 0.10057026574495233, 0.09723698525041587, 0.0943568591232843, 0.11900523566389483, 0.11523139049544802, 0.10892823571240758, 0.10383245881082369]


In [0]:
# 40x g5.2xlarge
# Same timed cross-validated fit as the previous run cell, recorded on a different cluster size.
# NOTE(review): the pasted log below lists 4 average metrics, whereas the grid defined in this notebook
# has 16 combinations — the log appears to come from a run with a different grid; confirm before comparing timings.
model = with_benchmark("Training CV", lambda: cv.fit(train_df))
print(f"average metrics: {model.avgMetrics}")

# INFO:spark_rapids_ml.classification.LogisticRegression:CUDA managed memory enabled.
# INFO:spark_rapids_ml.classification.LogisticRegression:Training tasks require the resource(cores=8, gpu=1.0)
# INFO:spark_rapids_ml.classification.LogisticRegression:Training spark-rapids-ml with 40 worker(s) ...
# INFO:spark_rapids_ml.classification.LogisticRegression:Finished training
# INFO:spark_rapids_ml.classification.LogisticRegressionModel:CUDA managed memory enabled.
# INFO:spark_rapids_ml.classification.LogisticRegression:CUDA managed memory enabled.
# INFO:spark_rapids_ml.classification.LogisticRegression:Training tasks require the resource(cores=8, gpu=1.0)
# INFO:spark_rapids_ml.classification.LogisticRegression:Training spark-rapids-ml with 40 worker(s) ...
# INFO:spark_rapids_ml.classification.LogisticRegression:Finished training
# INFO:spark_rapids_ml.classification.LogisticRegressionModel:CUDA managed memory enabled.
# INFO:spark_rapids_ml.classification.LogisticRegression:CUDA managed memory enabled.
# INFO:spark_rapids_ml.classification.LogisticRegression:Training tasks require the resource(cores=8, gpu=1.0)
# INFO:spark_rapids_ml.classification.LogisticRegression:Training spark-rapids-ml with 40 worker(s) ...
# INFO:spark_rapids_ml.classification.LogisticRegression:Finished training
# INFO:spark_rapids_ml.classification.LogisticRegressionModel:CUDA managed memory enabled.
# INFO:spark_rapids_ml.classification.LogisticRegression:CUDA managed memory enabled.
# INFO:spark_rapids_ml.classification.LogisticRegression:Training tasks require the resource(cores=8, gpu=1.0)
# INFO:spark_rapids_ml.classification.LogisticRegression:Training spark-rapids-ml with 40 worker(s) ...
# INFO:spark_rapids_ml.classification.LogisticRegression:Finished training
# Training CV takes 2047.1421689987183 seconds
# average metrics: [0.2756548008318942, 0.2784477526711268, 0.6326207601499868, 0.693147183212503]