In [2]:
!python -V

Python 3.11.9


### Import packages 

In [3]:
import pandas as pd

# Set the float format to display numbers without scientific notation
pd.options.display.float_format = '{:.2f}'.format
# Package: pickle
# Purpose: This package provides functionality for working with pickle files.
# Pickle is a Python module used for serializing and deserializing Python objects.
# It allows you to save and load Python objects to and from disk, preserving their state.

import pickle
# Package: seaborn
# Purpose: Seaborn is a data visualization library based on Matplotlib.
# It provides a high-level interface for creating informative and attractive statistical graphics.
import seaborn as sns

# Package: matplotlib.pyplot
# Purpose: Matplotlib is a plotting library for creating static, animated, and interactive visualizations in Python. 
# pyplot is a collection of functions that provide a simple interface for creating plots and visualizations.
import matplotlib.pyplot as plt
# Package: DictVectorizer 
# Purpose: Convert a collection of dictionaries into a matrix representation
from sklearn.feature_extraction import DictVectorizer  

# Package: LinearRegression
# Purpose: Implement linear regression models
from sklearn.linear_model import LinearRegression  

# Package: Lasso
# Purpose: Implement Lasso regression models
from sklearn.linear_model import Lasso  

# Package: Ridge
# Purpose: Implement Ridge regression models
from sklearn.linear_model import Ridge  

# Package: mean_squared_error
# Purpose: Calculate the mean squared error between two arrays
from sklearn.metrics import mean_squared_error  
# Package: mlflow
# Purpose: MLflow is an open-source platform for managing the end-to-end machine learning lifecycle.

import mlflow
# Package: pyspark
# Purpose: PySpark is the Python API for Apache Spark. Apache Spark is a fast and general-purpose cluster computing system.

import pyspark

#Package: xgboost
#Purpose: XGBoost is an optimized distributed gradient boosting library designed to be highly efficient, flexible and portable.

import xgboost as xgb

#Package: hyperopt
#Purpose: Hyperopt is a Python library for serial and parallel optimization over awkward search spaces, which may include real-valued, discrete, and conditional dimensions.

from hyperopt import fmin, tpe, hp, STATUS_OK, Trials
from hyperopt import SparkTrials
from hyperopt.pyll import scope

In [13]:
# Point MLflow at the tracking server. The address can be overridden with the
# MLFLOW_TRACKING_URI environment variable (e.g. when the server's IP changes)
# so the notebook does not have to be edited; when the variable is unset or
# empty, the original remote server is used.
import os

mlflow.set_tracking_uri(
    os.environ.get("MLFLOW_TRACKING_URI") or "http://13.53.123.37:5000"
)

# Select the experiment that subsequent runs are logged under. This stays the
# last expression of the cell so the resulting Experiment object is displayed.
mlflow.set_experiment("nyc_taxi_experiment")

<Experiment: artifact_location='s3://mlflow-artifacts-remote-91/1', creation_time=1716207412509, experiment_id='1', last_update_time=1716207412509, lifecycle_stage='active', name='nyc_taxi_experiment', tags={}>

### Preparation

- `cd` into `~/mlops-zoomcamp/cohorts/2024/02-experiment-tracking/homework/`
- Create a new notebook named "homework_module_2"
- Create a `data` folder
- `cd` into the `data` folder
- Download the January, February, and March 2023 Green Taxi data

### Question 1

In [3]:
!mlflow --version

mlflow, version 2.13.0


### Question 2

In [9]:
!python preprocess_data.py --raw_data_path ~/mlops-zoomcamp/cohorts/2024/02-experiment-tracking/homework/data --dest_path ./output

In [10]:
!ls ~/mlops-zoomcamp/cohorts/2024/02-experiment-tracking/homework/output

dv.pkl	test.pkl  train.pkl  val.pkl


Four files were created in the output folder: `dv.pkl`, `test.pkl`, `train.pkl`, and `val.pkl`.

### Question 3

In [31]:
%run train.py --data_path ./output



#### Testing the script as a notebook cell

In [29]:
import os
import pickle
import mlflow  # Added import for mlflow
import mlflow.sklearn  # Added import for mlflow.sklearn

from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error

# Point MLflow at the tracking server. The address can be overridden with the
# MLFLOW_TRACKING_URI environment variable (e.g. when the server's IP changes);
# when the variable is unset or empty, the original remote server is used.
mlflow.set_tracking_uri(
    os.environ.get("MLFLOW_TRACKING_URI") or "http://13.53.123.37:5000"
)

# Select the experiment that the training run below is logged under.
mlflow.set_experiment("nyc_taxi_experiment")


def load_pickle(filename: str):
    """Deserialize and return the object stored in the pickle file ``filename``.

    The file is opened in binary mode and closed again before returning.
    Unpickling can execute arbitrary code, so only pass files produced by a
    trusted source.

    Args:
        filename: Path of the pickle file to read.

    Returns:
        Whatever Python object was pickled into the file.
    """
    with open(filename, mode="rb") as pickle_file:
        loaded = pickle.load(pickle_file)
    return loaded


def run_train(data_path: str, max_depth: int = 10, random_state: int = 0):
    """Train a random-forest regressor and record the run in MLflow.

    Loads ``train.pkl`` and ``val.pkl`` from ``data_path`` (each is unpacked as
    an ``(X, y)`` pair), fits a ``RandomForestRegressor`` on the training split,
    predicts on the validation split, and logs the hyperparameters, the
    validation RMSE and the fitted model inside a single MLflow run. The RMSE
    is also printed once the run has finished.

    Args:
        data_path: Directory containing ``train.pkl`` and ``val.pkl``.
        max_depth: ``max_depth`` passed to the forest (default 10).
        random_state: ``random_state`` passed to the forest (default 0).
    """
    X_train, y_train = load_pickle(os.path.join(data_path, "train.pkl"))
    X_val, y_val = load_pickle(os.path.join(data_path, "val.pkl"))

    with mlflow.start_run():
        rf = RandomForestRegressor(max_depth=max_depth, random_state=random_state)
        rf.fit(X_train, y_train)
        y_pred = rf.predict(X_val)

        # `squared=False` was deprecated in scikit-learn 1.4 and removed in 1.6.
        # Taking the square root of the MSE works on every version and yields
        # the same RMSE (assumes y is a single target; for multi-output data
        # the old call averaged per-output RMSEs instead).
        rmse = mean_squared_error(y_val, y_pred) ** 0.5

        # Log the hyperparameters as read back from the estimator so the
        # recorded values cannot drift from what was actually used.
        mlflow.log_param("max_depth", rf.max_depth)
        mlflow.log_param("random_state", rf.random_state)
        mlflow.log_param("min_samples_split", rf.min_samples_split)
        mlflow.log_metric("rmse", rmse)
        mlflow.sklearn.log_model(rf, "model")

    print(f"RMSE: {rmse}")


# Train on the preprocessed pickles in ./output and log the run to MLflow
run_train("./output")




RMSE: 5.431162180141208


### Question 4

# You also need to pass "default-artifact-root"

$ mlflow server \
    --backend-store-uri sqlite:///mlruns/mlflow.db \
    --default-artifact-root ./artifacts \
    --host 0.0.0.0 \
    --port 5000

### Question 5

In [33]:
%run hpo.py

  7%|▋         | 1/15 [00:10<02:32, 10.92s/trial, best loss: 5.370086069268862]




 13%|█▎        | 2/15 [00:11<01:05,  5.06s/trial, best loss: 5.370086069268862]




 20%|██        | 3/15 [00:12<00:36,  3.08s/trial, best loss: 5.370086069268862]




 27%|██▋       | 4/15 [00:20<00:53,  4.82s/trial, best loss: 5.357490752366866]




 33%|███▎      | 5/15 [00:23<00:43,  4.32s/trial, best loss: 5.357490752366866]




 40%|████      | 6/15 [00:36<01:05,  7.30s/trial, best loss: 5.354695072530291]




 47%|████▋     | 7/15 [00:49<01:13,  9.22s/trial, best loss: 5.354695072530291]




 53%|█████▎    | 8/15 [00:51<00:46,  6.70s/trial, best loss: 5.354695072530291]




 60%|██████    | 9/15 [00:59<00:42,  7.16s/trial, best loss: 5.354695072530291]




 67%|██████▋   | 10/15 [01:05<00:35,  7.00s/trial, best loss: 5.354695072530291]




 73%|███████▎  | 11/15 [01:11<00:25,  6.48s/trial, best loss: 5.335419588556921]




 80%|████████  | 12/15 [01:14<00:16,  5.62s/trial, best loss: 5.335419588556921]




 87%|████████▋ | 13/15 [01:16<00:08,  4.29s/trial, best loss: 5.335419588556921]




 93%|█████████▎| 14/15 [01:20<00:04,  4.46s/trial, best loss: 5.335419588556921]




100%|██████████| 15/15 [01:28<00:00,  5.87s/trial, best loss: 5.335419588556921]





### Question 6

Register the best model on the test set 

In [2]:
%run register_model.py --data_path ./output --top_n 5

Registered model 'random-forest-regressor' already exists. Creating a new version of this model...
2024/05/25 17:53:19 INFO mlflow.store.model_registry.abstract_store: Waiting up to 300 seconds for model version to finish creation. Model name: random-forest-regressor, version 2
Created version '2' of model 'random-forest-regressor'.


In [None]:
%run train.py --data_path ./output

