In [1]:
from pyspark.sql import SparkSession
from pyspark.ml.tuning import ParamGridBuilder, TrainValidationSplit
from pyspark.ml.evaluation import RegressionEvaluator
from pyspark.ml.recommendation import ALS
from pyspark.ml import Pipeline
from conf import catalog, paths

# import hyperopt as hp
from hyperopt import Trials, fmin, hp, tpe, STATUS_OK
import mlflow
import os


In [2]:
# Settings applied to the local Spark session, kept together so they are easy
# to review and tune in one place.
spark_settings = {
    "spark.driver.maxResultSize": "96g",
    "spark.driver.memory": "96g",
    "spark.executor.memory": "8g",
}

# Local master with three worker threads.
session_builder = SparkSession.builder.appName("Next Watch ML").master("local[3]")
for setting_name, setting_value in spark_settings.items():
    session_builder = session_builder.config(setting_name, setting_value)

spark = session_builder.getOrCreate()


23/05/20 18:09:42 WARN Utils: Your hostname, bruno resolves to a loopback address: 127.0.1.1; using 192.168.1.150 instead (on interface wlp0s20f3)
23/05/20 18:09:42 WARN Utils: Set SPARK_LOCAL_IP if you need to bind to another address
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
23/05/20 18:09:42 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


In [3]:
# mlflow.set_tracking_uri("sqlite:///mlflow.db")
# Location of the MLflow tracking store. It can be overridden without editing
# the notebook by exporting MLFLOW_TRACKING_URI; when that variable is unset or
# empty, the original local file store is used.
MLFLOW_TRACKING_URI = (
    os.environ.get("MLFLOW_TRACKING_URI")
    or "file:///home/bruno/mlops-project/mlops-next-watch/mldata"
)
mlflow.set_tracking_uri(MLFLOW_TRACKING_URI)
# Last expression on purpose: the notebook echoes the selected Experiment.
mlflow.set_experiment("movie-recommendations")


<Experiment: artifact_location='file:///home/bruno/mlops-project/mlops-next-watch/mldata/699812880901745651', creation_time=1684077031098, experiment_id='699812880901745651', last_update_time=1684077031098, lifecycle_stage='active', name='movie-recommendations', tags={}>

In [4]:
# Build the ratings file location with the project's path helper: the three
# positional segments come from the project catalog, the suffix selects CSV.
ratings_path_segments = (
    paths.DATA_01EXTERNAL,
    catalog.Sources.MOVIELENS,
    catalog.Datasets.RATINGS,
)
ratings_path = paths.get_path(*ratings_path_segments, suffix=catalog.FileFormat.CSV)

In [5]:
# Column layout of the ratings CSV, written as a Spark DDL string. Declaring it
# up front avoids the extra full pass over the file that `inferSchema=True`
# needs; the types are the ones schema inference reported for this file
# (userId/movieId/timestamp as integer, rating as double).
RATINGS_SCHEMA = "userId INT, movieId INT, rating DOUBLE, timestamp INT"

ratings = spark.read.load(
    str(ratings_path), format="csv", header=True, schema=RATINGS_SCHEMA
)


                                                                                

In [6]:
# Print the column names and types of the loaded ratings DataFrame.
ratings.printSchema()


root
 |-- userId: integer (nullable = true)
 |-- movieId: integer (nullable = true)
 |-- rating: double (nullable = true)
 |-- timestamp: integer (nullable = true)



In [7]:
# Seeded random split: roughly 20% stays as the working `ratings` set and the
# remaining ~80% is kept aside in `serve`.
# NOTE: `ratings` is rebound to a subset of itself, so this cell is not
# idempotent -- re-running it without re-running the load cell shrinks the
# data again.
ratings, serve = ratings.randomSplit([0.2, 0.80], seed=42)


In [8]:
# Number of rows in the working `ratings` subset (this runs a Spark job).
ratings.count()


                                                                                

5555688

In [9]:
# Preview the first five rows of the working `ratings` subset.
ratings.show(5)



[Stage 5:>                                                          (0 + 1) / 1]

+------+-------+------+----------+
|userId|movieId|rating| timestamp|
+------+-------+------+----------+
|     1|   2134|   4.5|1256677464|
|     2|    170|   3.5|1192913581|
|     2|   1186|   3.5|1192913611|
|     2|   2352|   4.0|1192913575|
|     3|   1645|   4.0| 945141611|
+------+-------+------+----------+
only showing top 5 rows




                                                                                

In [10]:
from collaborative.pre_processing.pre_processing_nodes import PreProcessingNodes


In [11]:
# Instantiate the project's pre-processing helper for the MovieLens source.
preprocnodes = PreProcessingNodes(spark, catalog.Sources.MOVIELENS)

# NOTE(review): `preprocnodes` is rebound to whatever `drop_column` returns, so
# the PreProcessingNodes instance is no longer reachable under this name, and
# re-running this cell alone would call `drop_column` on that return value.
# NOTE(review): the trailing strings are presumably the columns to drop; "rating"
# is passed here, whereas the later `drops("timestamp")` call keeps it -- confirm
# which one is intended.
preprocnodes = preprocnodes.drop_column(
    ratings, catalog.Datasets.RATINGS, catalog.DatasetType.TRAIN, "timestamp", "rating"
)

                                                                                

In [None]:
def drops(*columns, frame=None):
    """Return a DataFrame without the named columns.

    Args:
        *columns: Names of the columns to remove.
        frame: Object exposing ``drop(*names)`` (e.g. a Spark DataFrame) to
            remove the columns from. When omitted, the notebook-level
            ``ratings`` DataFrame is used, which is the original behaviour.

    Returns:
        The result of calling ``drop(*columns)`` on the selected frame.
    """
    # Fall back to the notebook global only when no frame is passed, so
    # existing calls such as `drops("timestamp")` behave exactly as before.
    source = ratings if frame is None else frame
    return source.drop(*columns)


# Only the timestamp is removed; "rating" stays because the ALS model below
# reads it as its rating column.
processed_ratings = drops("timestamp")

# Seeded, approximately 80/20 train/test partition of the processed ratings.
train_test_weights = [0.80, 0.20]
train, test = processed_ratings.randomSplit(train_test_weights, seed=42)

In [None]:
# Echo the training DataFrame as the cell output.
train


In [None]:
import evidently
from evidently.test_suite import TestSuite
from evidently.test_preset import DataQualityTestPreset

# Evidently test suite built from the data-quality preset.
data_quality = TestSuite(tests=[DataQualityTestPreset()])

# No reference data is supplied, so only the training set itself is checked.
# `toPandas()` collects the whole training DataFrame onto the driver.
data_quality.run(
    reference_data=None, current_data=train.toPandas()
)  # just to check that the training dataset is OK
# Render the report inline in the notebook.
data_quality.show(mode="inline")


In [None]:
# train.show(3)


In [None]:
# Switch MLflow's Spark autologging off; runs below log explicitly.
mlflow.spark.autolog(disable=True)


In [None]:
# with mlflow.start_run(run_name="als_baseline") as run:
#     params = {
#         "ranks": [8, 10, 12, 15, 18, 20],
#         "reg_params": [0.001, 0.01, 0.05, 0.1, 0.2],
#         "cold_start_strategy": ["drop"],  # ["nan", "drop"]
#         "max_iter": [5],
#         "train_ratio": 0.800,
#     }

#     mlflow.set_tag("model_name", "ALS")
#     mlflow.log_params(params)

#     als = ALS(
#         userCol="userId",
#         itemCol="movieId",
#         ratingCol="rating",
#     )

#     param_grid = (
#         ParamGridBuilder()
#         .addGrid(als.regParam, params["reg_params"])
#         .addGrid(als.rank, params["ranks"])
#         .addGrid(als.maxIter, params["max_iter"])
#         .addGrid(als.coldStartStrategy, params["cold_start_strategy"])
#         .build()
#     )

#     evaluator = RegressionEvaluator(
#         metricName="rmse", labelCol="rating", predictionCol="prediction"
#     )

#     als_model = Pipeline(stages=[als])

#     train_val_split = TrainValidationSplit(
#         estimator=als_model,
#         estimatorParamMaps=param_grid,
#         evaluator=evaluator,
#         trainRatio=params["train_ratio"],
#     )

#     model = train_val_split.fit(train)

#     predictions = model.transform(test)

#     rmse = evaluator.evaluate(predictions)

#     mlflow.log_metric("test_rmse", rmse)
#     mlflow.spark.log_model(als_model, "spark_models")

#     predictions.select("userId", "movieId", "rating", "prediction").show()


In [None]:
# Seeded random split: roughly 20% of the training data becomes the validation
# set used to score each hyper-parameter trial.
# NOTE: `train` is rebound to a subset of itself, so this cell is not
# idempotent -- re-running it without re-running the earlier train/test split
# shrinks the training data again.
train, val = train.randomSplit([0.80, 0.20], seed=42)


def train_als(train, val):
    """Build the hyperopt objective that fits and scores one ALS configuration.

    Args:
        train: DataFrame the ALS pipeline is fitted on.
        val: DataFrame the fitted pipeline is scored on.

    Returns:
        A function that takes one sampled ``params`` dict (keys ``rank``,
        ``reg_param``, ``cold_start_strategy``, ``max_iter``) and returns the
        hyperopt result dict with keys ``loss``, ``params`` and ``status``.
    """

    def train_wrapper(params):
        """Fit, evaluate and log a single trial as a nested MLflow run."""
        with mlflow.start_run(nested=True):
            mlflow.set_tag("model", "ALS")
            # `.4f` prints four decimals. The previous `:4f` was a minimum
            # *width* of 4 with the default six decimals (e.g. 0.153200).
            mlflow.set_tag(
                "mlflow.runName",
                f"als_rank_{params['rank']}_reg_{params['reg_param']:.4f}",
            )
            mlflow.log_params(params)
            # TODO: also log the train/validation data paths.

            als = ALS(
                userCol="userId",
                itemCol="movieId",
                ratingCol="rating",
                rank=params["rank"],
                regParam=params["reg_param"],
                coldStartStrategy=params["cold_start_strategy"],
                maxIter=params["max_iter"],
            )

            evaluator = RegressionEvaluator(
                metricName="rmse", labelCol="rating", predictionCol="prediction"
            )

            als_model = Pipeline(stages=[als])

            # Fit on the training split, score on the validation split.
            model = als_model.fit(train)
            predictions = model.transform(val)
            rmse = evaluator.evaluate(predictions)

            mlflow.log_metric("rmse", rmse)
            # The fitted pipeline is stored under the "als_model" artifact path
            # of this run and registered as a version of "spark_als_model".
            mlflow.spark.log_model(
                model,
                "als_model",
                registered_model_name="spark_als_model",
            )

            # TODO: log pre processing...

            # hyperopt minimises "loss"; the sampled params ride along so the
            # best trial can be reported afterwards.
            return {"loss": rmse, "params": params, "status": STATUS_OK}

    return train_wrapper


In [None]:
# Trial history of the search; kept at notebook level so later cells can read it.
trials = Trials()  # TODO: use `SparkTrials` to distribute trials to the workers

# Search space handed to hyperopt: rank from a fixed list, regularisation drawn
# uniformly, and single-option choices for cold start and iteration count.
params = dict(
    rank=hp.choice("rank", [8, 10, 12, 15, 18, 20]),
    reg_param=hp.uniform("reg_param", 0.001, 0.2),
    cold_start_strategy=hp.choice("cold_start_strategy", ["drop"]),  # ["nan", "drop"]
    max_iter=hp.choice("max_iter", [5]),
)

# Parent run; every trial started by the objective is logged as a nested run.
with mlflow.start_run(run_name="ALS Hyper Opt") as run:
    objective = train_als(train=train, val=val)
    best = fmin(
        fn=objective,
        space=params,
        algo=tpe.suggest,
        max_evals=5,
        trials=trials,
    )
    # Lowest-loss result dict among the recorded trials.
    best_trial = min(trials.results, key=lambda result: result["loss"])
    mlflow.log_dict(best_trial, "best_params.json")
    # TODO: Log best model and pre processing ...


In [None]:
# Print the whole best-trial record.
print(best_trial)
# The lookups below are evaluated in order, but a notebook only echoes the
# value of the cell's final expression -- here the status. The loss and params
# lookups are evaluated and discarded.
best_trial["loss"]
best_trial["params"]
best_trial["status"]
# rmse = evaluator.evaluate(predictions)


In [None]:
from mlflow.tracking import MlflowClient
from mlflow.entities import ViewType

# Reuse the tracking URI configured at the top of the notebook instead of
# repeating the literal path, so the client always reads the same store the
# runs were written to.
client = MlflowClient(tracking_uri=MLFLOW_TRACKING_URI)

# Resolve the experiment by name rather than by a hard-coded id: the id differs
# between tracking stores, and the previous literal did not match the
# "movie-recommendations" experiment this notebook logs to.
experiment = client.get_experiment_by_name("movie-recommendations")
if experiment is None:
    raise ValueError(
        "Experiment 'movie-recommendations' not found in the tracking store"
    )

# Five best active runs with rmse below 1, lowest rmse first.
runs = client.search_runs(
    experiment_ids=[experiment.experiment_id],
    filter_string="metrics.rmse < 1",
    run_view_type=ViewType.ACTIVE_ONLY,
    max_results=5,
    order_by=["metrics.rmse ASC"],
)


In [None]:
# One line per run returned by the search: its id and its rmse metric,
# formatted to four decimals.
for run in runs:
    print(f"run id: {run.info.run_id}, rmse: {run.data.metrics['rmse']:.4f}")


In [None]:
mlflow.set_tracking_uri(MLFLOW_TRACKING_URI)

# Run whose logged model is promoted to the registry.
run_id = "d97c9712282a4cf281f94b65b4ba947e"
# The last URI segment must be the artifact path the model was logged under.
# `train_als` logs with `mlflow.spark.log_model(model, "als_model", ...)`, so
# the path is "als_model" (the previous "model" pointed at a location that
# the training code never writes).
model_uri = f"runs:/{run_id}/als_model"
model_name = "als_rank_8_reg_0.153200"
# Last expression on purpose: the notebook echoes the created ModelVersion.
mlflow.register_model(model_uri=model_uri, name=model_name)


In [None]:
# NOTE(review): the result of this search is neither stored nor shown (it is
# not the cell's last expression), so the call currently has no visible effect.
client.search_registered_models()
# Latest version(s) registered under the given model name.
latest_versions = client.get_latest_versions(name="als_rank_8_reg_0.153200")

# NOTE(review): placeholder loop -- the body is `pass`, so nothing is done with
# the versions yet beyond leaving `version` bound to the last element.
for version in latest_versions:
    pass
