In [0]:
import os
import sys


import pyspark
from pyspark.ml import PipelineModel
from pyspark.ml.feature import FeatureHasher
import papermill as pm
import scrapbook as sb

from recommenders.utils.notebook_utils import is_databricks
from recommenders.utils.spark_utils import start_or_get_spark
from recommenders.datasets.criteo import load_spark_df
from recommenders.datasets.spark_splitters import spark_random_split

# Setup MML Spark
#from recommenders.utils.spark_utils import MMLSPARK_REPO, MMLSPARK_PACKAGE
#print(MMLSPARK_REPO)
#print(MMLSPARK_PACKAGE)
#packages = [MMLSPARK_PACKAGE]
#repos = [MMLSPARK_REPO]
#spark = start_or_get_spark(packages=packages, repositories=repos)
#dbutils = None
#print("MMLSpark version: {} from {}".format(MMLSPARK_PACKAGE,MMLSPARK_REPO))
#from mmlspark.train import ComputeModelStatistics
#from mmlspark.lightgbm import LightGBMClassifier

# SynapseML (the `synapse.ml` package) is the successor to MMLSpark
import synapse.ml
from synapse.ml.train import ComputeModelStatistics
from synapse.ml.lightgbm import LightGBMClassifier

# Report the interpreter and PySpark versions so the run can be reproduced.
python_version = sys.version
pyspark_version = pyspark.version.__version__
print("System version: {}".format(python_version))
print("PySpark version: {}".format(pyspark_version))

System version: 3.8.10 (default, Nov 26 2021, 20:14:08) 
[GCC 9.3.0]
PySpark version: 3.1.2


In [0]:
#!pip list

Package             Version
------------------- --------------------
ansiwrap            0.8.4
appdirs             1.4.4
argon2-cffi         20.1.0
async-generator     1.10
attrs               20.3.0
backcall            0.2.0
bleach              3.3.0
boto3               1.16.7
botocore            1.19.7
Bottleneck          1.3.2
category-encoders   1.3.0
certifi             2020.12.5
cffi                1.14.5
chardet             4.0.0
click               8.0.3
cornac              1.14.1
cycler              0.10.0
Cython              0.29.23
dbus-python         1.2.16
decorator           5.0.6
defusedxml          0.7.1
distlib             0.3.2
distro              1.4.0
distro-info         0.23ubuntu1
entrypoints         0.3
facets-overview     1.0.0
filelock            3.0.12
huggingface-hub     0.4.0
hypothesis          6.36.1
idna                2.10
ipykernel           5.3.4
ipython             7.22.0
ipython-genutils    0.2.0
ipywidgets         

In [0]:
# --- Dataset -----------------------------------------------------------------
# Which Criteo dataset to load: either "sample" or "full".
DATA_SIZE = "sample"

# --- LightGBM hyper-parameters -----------------------------------------------
# Tuning guidance: https://lightgbm.readthedocs.io/en/latest/Parameters-Tuning.html
# Each value is forwarded to the matching LightGBMClassifier argument below.
NUM_LEAVES = 32            # -> numLeaves
NUM_ITERATIONS = 50        # -> numIterations
LEARNING_RATE = 0.1        # -> learningRate
FEATURE_FRACTION = 0.8     # -> featureFraction
EARLY_STOPPING_ROUND = 10  # -> earlyStoppingRound

# --- Output ------------------------------------------------------------------
# Path the fitted pipeline (feature hasher + model) is saved under.
MODEL_NAME = "lightgbm_criteo.mml"

In [0]:
# Create (or reuse) the Spark session that every later cell runs against.
from pyspark.sql import SparkSession
spark = SparkSession.builder.appName("SparkSessionLightGBM").getOrCreate()

# `pyspark.dbutils` is only importable on a Databricks runtime, so guard the
# import with the already-imported `is_databricks` helper. Elsewhere fall back
# to `dbutils = None`, as the legacy MMLSpark setup did.
# NOTE(review): assumes load_spark_df accepts dbutils=None when not running on
# Databricks -- confirm against recommenders.datasets.criteo.
if is_databricks():
    from pyspark.dbutils import DBUtils
    dbutils = DBUtils(spark)
else:
    dbutils = None

In [0]:
# Load the Criteo dataset of the configured size into a Spark DataFrame.
raw_data = load_spark_df(
    size=DATA_SIZE,
    spark=spark,
    dbutils=dbutils,
)

# Preview: pull just two rows to the driver and render them as a pandas frame.
preview_pdf = raw_data.limit(2).toPandas()
preview_pdf.head()

  0%|          | 0.00/8.58k [00:00<?, ?KB/s]  0%|          | 13.0/8.58k [00:00<01:19, 108KB/s]  0%|          | 41.0/8.58k [00:00<00:42, 199KB/s]  1%|          | 68.0/8.58k [00:00<00:52, 163KB/s]  1%|          | 98.0/8.58k [00:00<00:48, 175KB/s]  2%|▏         | 140/8.58k [00:00<00:40, 210KB/s]   2%|▏         | 173/8.58k [00:00<00:39, 211KB/s]  2%|▏         | 210/8.58k [00:01<00:37, 221KB/s]  3%|▎         | 253/8.58k [00:01<00:34, 239KB/s]  3%|▎         | 286/8.58k [00:01<00:35, 231KB/s]  4%|▍         | 324/8.58k [00:01<00:35, 235KB/s]  5%|▍         | 427/8.58k [00:01<00:22, 367KB/s]  7%|▋         | 596/8.58k [00:01<00:13, 588KB/s] 10%|▉         | 818/8.58k [00:01<00:09, 844KB/s] 17%|█▋        | 1.44k/8.58k [00:02<00:03, 1.81kKB/s] 24%|██▍       | 2.10k/8.58k [00:02<00:02, 2.54kKB/s] 43%|████▎     | 3.66k/8.58k [00:02<00:01, 4.81kKB/s] 57%|█████▋    | 4.87k/8.58k [00:02<00:00, 5.72kKB/s] 75%|███████▍  | 6.43k/8.58k [00:02<00:00, 7.03kKB/s] 89%|████████▉ | 7.66k/8.58k 

Unnamed: 0,label,int00,int01,int02,int03,int04,int05,int06,int07,int08,...,cat16,cat17,cat18,cat19,cat20,cat21,cat22,cat23,cat24,cat25
0,0,1,1,5,0,1382,4,15,2,181,...,e5ba7672,f54016b9,21ddcdc9,b1252a9d,07b5194c,,3a171ecb,c5c50484,e8b83407,9727dd16
1,0,2,0,44,1,102,8,2,2,4,...,07c540c4,b04e4670,21ddcdc9,5840adea,60f6221e,,3a171ecb,43f13e8b,e8b83407,731c3655


In [0]:
# Randomly split the raw data into train and test sets. The fixed seed keeps
# the split reproducible across runs.
# (ratio=0.8 presumably means 80% train / 20% test -- see spark_random_split.)
raw_train, raw_test = spark_random_split(
    raw_data,
    ratio=0.8,
    seed=42,
)

In [0]:
# Every column except the target is used as a model input.
columns = [name for name in raw_data.columns if name != "label"]

# Hash all input columns into a single vector column named "features".
feature_processor = FeatureHasher(
    inputCols=columns,
    outputCol="features",
)

In [0]:
# Apply the same feature hashing to both splits so that train and test share
# one feature space.
train, test = (
    feature_processor.transform(split) for split in (raw_train, raw_test)
)

In [0]:
# Collect the estimator settings in one place: fixed choices first, then the
# tunable values taken from the parameters cell.
# NOTE(review): no validationIndicatorCol is set here -- confirm whether
# earlyStoppingRound has any effect without validation data.
lgbm_params = {
    "labelCol": "label",
    "featuresCol": "features",
    "objective": "binary",
    "isUnbalance": True,
    "boostingType": "gbdt",
    "boostFromAverage": True,
    "baggingSeed": 42,
    "numLeaves": NUM_LEAVES,
    "numIterations": NUM_ITERATIONS,
    "learningRate": LEARNING_RATE,
    "featureFraction": FEATURE_FRACTION,
    "earlyStoppingRound": EARLY_STOPPING_ROUND,
}

# Binary classifier over the hashed "features" column, predicting "label".
lgbm = LightGBMClassifier(**lgbm_params)

In [0]:
# Train the LightGBM classifier on the hashed training split.
model = lgbm.fit(dataset=train)

In [0]:
# Score the held-out test split with the fitted model.
predictions = model.transform(dataset=test)

In [0]:
# Configure the SynapseML statistics transformer to report AUC, comparing the
# "prediction" column against the ground-truth "label" column.
evaluator = ComputeModelStatistics(
    scoredLabelsCol="prediction",
    labelCol="label",
    evaluationMetric="AUC",
)

result = evaluator.transform(predictions)

# Pull the scalar AUC out of the first row of the metrics frame, then display
# the whole frame as the cell output.
auc_row = result.select("AUC").collect()[0]
auc = auc_row[0]
result.show()

+---------------+------------------+
|evaluation_type|               AUC|
+---------------+------------------+
| Classification|0.6213613998849201|
+---------------+------------------+



In [0]:
# Record the AUC as a scrapbook scrap named "auc" (`sb` is scrapbook, not
# papermill) -- presumably so that automated runs of this notebook in tests
# can read the value back.
sb.glue("auc", auc)

In [0]:
# Bundle the feature hasher and the fitted classifier into one pipeline so the
# saved artifact can score raw rows directly.
pipeline = PipelineModel(stages=[feature_processor, model])

# Persist the pipeline under MODEL_NAME, replacing any previous save.
# NOTE(review): the recorded output of this cell shows a Py4JJavaError raised
# by save(); the traceback is truncated, so the root cause is unconfirmed.
pipeline_writer = pipeline.write().overwrite()
pipeline_writer.save(MODEL_NAME)

[0;31m---------------------------------------------------------------------------[0m
[0;31mPy4JJavaError[0m                             Traceback (most recent call last)
[0;32m<command-1623646647458095>[0m in [0;36m<module>[0;34m[0m
[1;32m      1[0m [0;31m# save model[0m[0;34m[0m[0;34m[0m[0;34m[0m[0m
[1;32m      2[0m [0mpipeline[0m [0;34m=[0m [0mPipelineModel[0m[0;34m([0m[0mstages[0m[0;34m=[0m[0;34m[[0m[0mfeature_processor[0m[0;34m,[0m [0mmodel[0m[0;34m][0m[0;34m)[0m[0;34m[0m[0;34m[0m[0m
[0;32m----> 3[0;31m [0mpipeline[0m[0;34m.[0m[0mwrite[0m[0;34m([0m[0;34m)[0m[0;34m.[0m[0moverwrite[0m[0;34m([0m[0;34m)[0m[0;34m.[0m[0msave[0m[0;34m([0m[0mMODEL_NAME[0m[0;34m)[0m[0;34m[0m[0;34m[0m[0m
[0m
[0;32m/databricks/spark/python/pyspark/ml/util.py[0m in [0;36msave[0;34m(self, path)[0m
[1;32m    306[0m         [0;32mif[0m [0;32mnot[0m [0misinstance[0m[0;34m([0m[0mpath[0m[0;34m,[0m [0mstr[0m[0