## MLflow 5 minute Tracking Quickstart

This notebook demonstrates using a local MLflow Tracking Server to log, register, and then load a model as a generic Python Function (pyfunc) to perform inference on a Pandas dfFrame.

Throughout this notebook, we'll be using the MLflow fluent API to perform all interactions with the MLflow Tracking Server.

In [21]:
!pip install mlflow

Defaulting to user installation because normal site-packages is not writeable
[0m

In [22]:
!pip install pandas
!pip install scikit-learn


Defaulting to user installation because normal site-packages is not writeable
[0mDefaulting to user installation because normal site-packages is not writeable
[0m

In [23]:
!pip show mlflow

Name: mlflow
Version: 2.9.1
Summary: MLflow: A Platform for ML Development and Productionization
Home-page: https://mlflow.org/
Author: Databricks
Author-email: 
License: Apache License 2.0
Location: /home/tarik/.local/lib/python3.10/site-packages
Requires: alembic, click, cloudpickle, databricks-cli, docker, entrypoints, Flask, gitpython, gunicorn, importlib-metadata, Jinja2, markdown, matplotlib, numpy, packaging, pandas, protobuf, pyarrow, pytz, pyyaml, querystring-parser, requests, scikit-learn, scipy, sqlalchemy, sqlparse
Required-by: 


In [24]:
!pip show seaborn

Name: seaborn
Version: 0.12.2
Summary: Statistical data visualization
Home-page: 
Author: 
Author-email: Michael Waskom <mwaskom@gmail.com>
License: 
Location: /home/tarik/.local/lib/python3.10/site-packages
Requires: matplotlib, numpy, pandas
Required-by: 


In [25]:
import mlflow
from mlflow.models import infer_signature
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.multioutput import MultiOutputRegressor
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.metrics import mean_absolute_error, r2_score
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder, RobustScaler
from sklearn.pipeline import Pipeline  # Import Pipeline class
import joblib



### Preprocessing csv

In [26]:
# Load your dataset
df = pd.read_csv('data/2020b_Building_Energy_Benchmarking.csv', sep=',')

# df.dropna(axis=0, inplace=True)
df.columns = df.columns.str.lower()
df.columns = df.columns.str.replace("(", "")
df.columns = df.columns.str.replace(")", "")
df.columns


if 'yearsenergystarcertified' in df.columns and 'outlier' in df.columns:
    df.drop(['yearsenergystarcertified', 'outlier'], axis=1, inplace=True)

if 'compliancestatus' in df.columns:
    # Filter the dfFrame to keep only df with Compliant in ComplianceStatus
    df = df[df["compliancestatus"] == 'Compliant']
# Drop the column after check only compliance in compliancesstatus
df.drop(['compliancestatus'], axis=1, inplace=True)


# Filter the dfFrame to keep only rows where siteenergyusekbtu is not null
df = df[df["siteenergyusekbtu"].notnull()]
# fill Nan Null with np.nan
df = df.fillna(np.nan)
# # Replace "NULL" with np.nan in your df
# df = df.replace("NULL", np.nan).replace("NA", np.nan)
# Replace "NULL" with np.nan in your df
df = df.replace("NA", np.nan)

# Add column is elec etc
# Create new columns with 1 or 0 based on conditions
df['is_using_steamusekWh'] = np.where(df['steamusekbtu'] > 0, 1, 0)
df['is_using_electricitykWh'] = np.where(df['electricitykbtu'] > 0, 1, 0)
df['is_using_naturalgaskWh'] = np.where(df['naturalgaskbtu'] > 0, 1, 0)

# filter column
selected_columns = ["siteenergyusekbtu", 'totalghgemissions','yearbuilt','is_using_electricitykWh', 'is_using_naturalgaskWh', 'is_using_steamusekWh', 'largestpropertyusetypegfa', 'numberofbuildings', 'numberoffloors', 'propertygfabuildings','buildingtype', 'primarypropertytype']

# Filter the DataFrame to select only the desired columns
df = df[selected_columns]

# drop nan 
df.dropna(axis=0, inplace=True)

coerced_columns = {
    'yearbuilt': float,
    'numberoffloors': float,
    'propertygfabuildings': float,
    'numberofbuildings': float,
    'is_using_electricitykWh': int,
    'is_using_naturalgaskWh': int,
    'is_using_steamusekWh': int,
}

df = df.astype(coerced_columns)


# save result as csv
df.to_csv("data/dataset_2020b.csv", sep=",", index=False)

    

In [27]:
df.shape

(3196, 12)

In [28]:
df.isna().sum()

siteenergyusekbtu            0
totalghgemissions            0
yearbuilt                    0
is_using_electricitykWh      0
is_using_naturalgaskWh       0
is_using_steamusekWh         0
largestpropertyusetypegfa    0
numberofbuildings            0
numberoffloors               0
propertygfabuildings         0
buildingtype                 0
primarypropertytype          0
dtype: int64

In [29]:
column_types = df.dtypes
print(column_types)

siteenergyusekbtu            float64
totalghgemissions            float64
yearbuilt                    float64
is_using_electricitykWh        int64
is_using_naturalgaskWh         int64
is_using_steamusekWh           int64
largestpropertyusetypegfa    float64
numberofbuildings            float64
numberoffloors               float64
propertygfabuildings         float64
buildingtype                  object
primarypropertytype           object
dtype: object


In [30]:
# Load your dataset complet for dtypes
df_2016 = pd.read_csv('data/dataset_2016.csv', sep=',')
column_types = df.dtypes
print(column_types)

siteenergyusekbtu            float64
totalghgemissions            float64
yearbuilt                    float64
is_using_electricitykWh        int64
is_using_naturalgaskWh         int64
is_using_steamusekWh           int64
largestpropertyusetypegfa    float64
numberofbuildings            float64
numberoffloors               float64
propertygfabuildings         float64
buildingtype                  object
primarypropertytype           object
dtype: object


In [31]:
# Load your dataset complet for dtypes
df_2020 = pd.read_csv('data/dataset_2020.csv', sep=',')
column_types = df.dtypes
print(column_types)

siteenergyusekbtu            float64
totalghgemissions            float64
yearbuilt                    float64
is_using_electricitykWh        int64
is_using_naturalgaskWh         int64
is_using_steamusekWh           int64
largestpropertyusetypegfa    float64
numberofbuildings            float64
numberoffloors               float64
propertygfabuildings         float64
buildingtype                  object
primarypropertytype           object
dtype: object


In [32]:
# Assuming df_2016 and df_2020 are your two DataFrames
comparison_result = df_2016.compare(df_2020)


# Display the rows with differences
print(comparison_result)

     yearbuilt        
          self   other
3156    2015.0  2020.0
3157    2015.0  2020.0
3158    2015.0  2020.0
3161    2015.0  2020.0
3166    2015.0  2020.0
3167    2015.0  2020.0
3168    2015.0  2020.0
3169    2015.0  2020.0
3170    2015.0  2020.0
3171    2015.0  2020.0


## Load our saved model as a Python Function

Although we can load our model back as a native scikit-learn format with `mlflow.sklearn.load_model()`, below we are loading the model as a generic Python Function, which is how this model would be loaded for online model serving. We can still use the `pyfunc` representation for batch use cases, though, as is shown below.

### Set the MLflow Tracking URI 
In this step, we're going to use the local MLflow tracking server that we started. 

If you chose to define a different port when starting the server, apply that port to the following cell. 

In [33]:
mlflow.set_tracking_uri(uri="http://127.0.0.1:8090")

## Load training data and train a simple model

For our quickstart, we're going to be using the familiar iris dataset that is included in scikit-learn. Following the split of the data, we're going to train a simple logistic regression classifier on the training data and calculate some error metrics on our holdout test data. 

Note that the only MLflow-related activities in this portion are around the fact that we're using a `param` dictionary to supply our model's hyperparameters; this is to make logging these settings easier when we're ready to log our model and its associated metadata.

In [34]:
# Load your dataset
df = pd.read_csv('data/dataset_sample.csv', sep=',')
df.dropna(axis=0, inplace=True)
df.columns = df.columns.str.lower()
df.columns
print(df.columns)
# Separate your target variables from features
X = df.drop(["siteenergyusekbtu", 'totalghgemissions'], axis=1)
Y = df[["siteenergyusekbtu", 'totalghgemissions']]

# Define column transformers for numeric and categorical features
numeric_features = ['yearbuilt','is_using_electricitykwh', 'is_using_naturalgaskwh', 'is_using_steamusekwh', 'largestpropertyusetypegfa', 'numberofbuildings', 'numberoffloors', 'propertygfabuildings']
categorical_features = ['buildingtype', 'primarypropertytype']


numeric_transformer = Pipeline(steps=[
    ('scaler', RobustScaler())
])

categorical_transformer = Pipeline(steps=[
    ('onehot', OneHotEncoder(handle_unknown='ignore'))
])

preprocessor = ColumnTransformer(
    transformers=[
        ('num', numeric_transformer, numeric_features),
        ('cat', categorical_transformer, categorical_features)
    ])

# Combine preprocessing and modeling into a single pipeline
pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('model', MultiOutputRegressor(GradientBoostingRegressor()))
])

# Split your dataset into training and testing sets
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.2, random_state=42)

# Define your model hyperparameters
model_params = {
    'model__estimator__loss': 'huber',
    'model__estimator__n_estimators': 500,
    'model__estimator__max_depth': 5,
    'model__estimator__learning_rate': 0.01
}

# Train your model
pipeline.set_params(**model_params)
pipeline.fit(X_train, Y_train)  # Fit the model

# Predict on the test set
Y_pred = pipeline.predict(X_test)

# Calculate evaluation metrics
r2_score_test = r2_score(Y_test, Y_pred)
mae_test_score = mean_absolute_error(Y_test, Y_pred)


Index(['siteenergyusekbtu', 'totalghgemissions', 'yearbuilt',
       'is_using_electricitykwh', 'is_using_naturalgaskwh',
       'is_using_steamusekwh', 'largestpropertyusetypegfa',
       'numberofbuildings', 'numberoffloors', 'propertygfabuildings',
       'buildingtype', 'primarypropertytype'],
      dtype='object')


In [35]:
column_types = df.dtypes
print(column_types)

siteenergyusekbtu            float64
totalghgemissions            float64
yearbuilt                    float64
is_using_electricitykwh        int64
is_using_naturalgaskwh         int64
is_using_steamusekwh           int64
largestpropertyusetypegfa    float64
numberofbuildings            float64
numberoffloors               float64
propertygfabuildings         float64
buildingtype                  object
primarypropertytype           object
dtype: object


## Define an MLflow Experiment

In order to group any distinct runs of a particular project or idea together, we can define an Experiment that will group each iteration (runs) together. 
Defining a unique name that is relevant to what we're working on helps with organization and reduces the amount of work (searching) to find our runs later on. 

In [36]:
# Log the experiment in MLflow
mlflow.set_experiment("Seatle_co2_pred_maud_tarik_2")


<Experiment: artifact_location='mlflow-artifacts:/14', creation_time=1702395467376, experiment_id='14', last_update_time=1702395467376, lifecycle_stage='active', name='Seatle_co2_pred_maud_tarik_2', tags={}>

## Log the model, hyperparameters, and loss metrics to MLflow.

In order to record our model and the hyperparameters that were used when fitting the model, as well as the metrics associated with validating the fit model upon holdout data, we initiate a run context, as shown below. Within the scope of that context, any fluent API that we call (such as `mlflow.log_params()` or `mlflow.sklearn.log_model()`) will be associated and logged together to the same run. 

In [37]:
with mlflow.start_run():
    # Log model hyperparameters
    mlflow.log_params(model_params)

    # Log evaluation metrics
    mlflow.log_metric("R2_score_test", r2_score_test)
    mlflow.log_metric("MAE_test_score", mae_test_score)

    # Set tags for additional information
    mlflow.set_tag("Training Info", "GradientBoostingRegressor for your use case")

    # Infer the model signature
    signature = infer_signature(X_train, Y_pred)

    # Log the model
    mlflow.sklearn.log_model(
        sk_model=pipeline,
        artifact_path="data/best_model_GradientBoostingRegressor.pkl",
        signature=signature,
        input_example=X_train,
        registered_model_name="data/model_tracking",
    )

  inputs = _infer_schema(model_input) if model_input is not None else None
Registered model 'data/model_tracking' already exists. Creating a new version of this model...
2023/12/13 09:09:31 INFO mlflow.store.model_registry.abstract_store: Waiting up to 300 seconds for model version to finish creation. Model name: data/model_tracking, version 4
Created version '4' of model 'data/model_tracking'.


In [38]:
# Load the model and make predictions
loaded_model = joblib.load("data/best_model_GradientBoostingRegressor.pkl")

https://scikit-learn.org/stable/model_persistence.html#security-maintainability-limitations
https://scikit-learn.org/stable/model_persistence.html#security-maintainability-limitations
https://scikit-learn.org/stable/model_persistence.html#security-maintainability-limitations
https://scikit-learn.org/stable/model_persistence.html#security-maintainability-limitations
https://scikit-learn.org/stable/model_persistence.html#security-maintainability-limitations
https://scikit-learn.org/stable/model_persistence.html#security-maintainability-limitations
https://scikit-learn.org/stable/model_persistence.html#security-maintainability-limitations
https://scikit-learn.org/stable/model_persistence.html#security-maintainability-limitations
https://scikit-learn.org/stable/model_persistence.html#security-maintainability-limitations


## Use our model to predict the iris class type on a Pandas DataFrame

In [39]:
# Load your dataset
df = pd.read_csv('data/dataset_2020.csv', sep=',')
df.dropna(axis=0, inplace=True)
df.columns = df.columns.str.lower()
# Define the mapping of old column names to new column names
column_name_mapping = {
    'is_using_electricitykwh': 'is_using_electricitykWh',
    'is_using_naturalgaskwh': 'is_using_naturalgaskWh',
    'is_using_steamusekwh': 'is_using_steamusekWh'
}

# Rename the columns in the DataFrame
df.rename(columns=column_name_mapping, inplace=True)

print(df.columns)
# Separate your target variables from features
X_2020 = df.drop(["siteenergyusekbtu", 'totalghgemissions'], axis=1)
Y_2020 = df[["siteenergyusekbtu", 'totalghgemissions']]

Index(['siteenergyusekbtu', 'totalghgemissions', 'yearbuilt',
       'is_using_electricitykWh', 'is_using_naturalgaskWh',
       'is_using_steamusekWh', 'largestpropertyusetypegfa',
       'numberofbuildings', 'numberoffloors', 'propertygfabuildings',
       'buildingtype', 'primarypropertytype'],
      dtype='object')


In [40]:
X_2020.columns

Index(['yearbuilt', 'is_using_electricitykWh', 'is_using_naturalgaskWh',
       'is_using_steamusekWh', 'largestpropertyusetypegfa',
       'numberofbuildings', 'numberoffloors', 'propertygfabuildings',
       'buildingtype', 'primarypropertytype'],
      dtype='object')

In [41]:
predictions = loaded_model.predict(X_2020)

# Display the results
print("R2 score on test data:", r2_score_test)
print("MAE on test data:", mae_test_score)
print("Predictions:", predictions)

R2 score on test data: 0.8801043109858808
MAE on test data: 1622054.09953706
Predictions: [[6.46835656e+06 2.95572270e+02]
 [6.79266062e+06 2.19370201e+02]
 [7.24977561e+07 1.98145561e+03]
 ...
 [3.46600136e+06 1.18387389e+02]
 [3.22734684e+06 1.46312352e+02]
 [2.44527715e+06 5.83881132e+01]]


In [42]:
# List of strings to check
strings_to_check = ['is_using_steamusekwh', 'is_using_electricitykwh', 'is_using_naturalgaskwh']

# Check if all strings in the list exist in the DataFrame columns
if all(col in X_2020.columns for col in strings_to_check):
    print("All strings exist in the DataFrame columns.")
else:
    print("Not all strings exist in the DataFrame columns.")

Not all strings exist in the DataFrame columns.
