In [1]:
# ! pip install --upgrade mlflow

In [2]:
from sklearn.preprocessing import StandardScaler, MinMaxScaler, LabelEncoder, OneHotEncoder, OrdinalEncoder
from sklearn.linear_model import LinearRegression, ElasticNet
from sklearn.model_selection import train_test_split
from sklearn.metrics import r2_score, mean_absolute_error, mean_squared_error

import pandas as pd
import numpy as np
import mlflow

import os
import sys
import warnings
from six.moves import urllib
import tarfile


In [3]:
def fetch_housing_data(housing_url=None, housing_path=None):
    '''Download the housing archive and unpack it into a local directory.

    Parameters
    ----------
    housing_url : str, optional
        URL of the ``.tgz`` archive to download. Defaults to the copy hosted
        in the handson-ml GitHub repository.
    housing_path : str, optional
        Directory that receives ``housing.tgz`` and its extracted contents.
        Defaults to ``datasets/housing`` relative to the working directory.
        It is created if it does not exist.

    Both arguments are optional, so a bare ``fetch_housing_data()`` call
    downloads from and writes to the same locations as before.
    '''

    DOWNLOAD_ROOT = (
        "https://raw.githubusercontent.com/ageron/handson-ml/master/"
    )
    if housing_url is None:
        housing_url = DOWNLOAD_ROOT + "datasets/housing/housing.tgz"
    if housing_path is None:
        housing_path = os.path.join("datasets", "housing")

    os.makedirs(housing_path, exist_ok=True)
    tgz_path = os.path.join(housing_path, "housing.tgz")
    urllib.request.urlretrieve(housing_url, tgz_path)

    # The context manager closes the archive even if extraction raises.
    with tarfile.open(tgz_path) as housing_tgz:
        # Prefer the "data" extraction filter when this Python provides it, so
        # that a tampered archive cannot write outside housing_path.
        if hasattr(tarfile, "data_filter"):
            housing_tgz.extractall(path=housing_path, filter="data")
        else:
            housing_tgz.extractall(path=housing_path)

In [4]:
# Download and extract the housing archive (requires network access).
fetch_housing_data()

In [5]:
# CSV location relative to the working directory; expected to exist once
# fetch_housing_data() has unpacked the archive into datasets/housing.
data_path = "datasets/housing/housing.csv"

data = pd.read_csv(data_path)

# Preview 10 randomly chosen rows; no seed is passed, so the rows differ between runs.
data.sample(10)

Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,median_house_value,ocean_proximity
18713,-122.45,40.56,17.0,1712.0,307.0,963.0,329.0,3.9375,148700.0,INLAND
399,-122.29,37.89,52.0,2269.0,380.0,1004.0,371.0,5.1696,261400.0,NEAR BAY
15967,-122.4,37.72,40.0,1948.0,413.0,1434.0,396.0,3.0313,219100.0,NEAR BAY
17914,-121.99,37.36,32.0,1754.0,324.0,917.0,330.0,4.6761,298300.0,<1H OCEAN
17557,-121.89,37.33,42.0,1279.0,358.0,1254.0,340.0,2.2583,192500.0,<1H OCEAN
13243,-117.65,34.14,9.0,3877.0,490.0,1815.0,490.0,8.4839,406700.0,INLAND
15566,-117.05,33.1,13.0,5516.0,746.0,2119.0,662.0,6.1868,320200.0,<1H OCEAN
18889,-122.23,38.1,47.0,1303.0,278.0,694.0,269.0,2.5969,92800.0,NEAR BAY
11490,-118.01,33.69,3.0,945.0,115.0,337.0,123.0,11.5199,500001.0,<1H OCEAN
827,-122.09,37.63,35.0,1213.0,221.0,790.0,243.0,4.7019,174100.0,NEAR BAY


In [6]:
# Inspect the column types to see which features are non-numeric.
data.dtypes

longitude             float64
latitude              float64
housing_median_age    float64
total_rooms           float64
total_bedrooms        float64
population            float64
households            float64
median_income         float64
median_house_value    float64
ocean_proximity        object
dtype: object

In [7]:
# Drop the only non-numeric column. errors='ignore' keeps this cell safe to
# re-run after the column has already been removed (otherwise drop raises KeyError).
data = data.drop(columns=['ocean_proximity'], errors='ignore')

In [8]:
# Display the frame to confirm that only the numeric columns remain.
data

Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,median_house_value
0,-122.23,37.88,41.0,880.0,129.0,322.0,126.0,8.3252,452600.0
1,-122.22,37.86,21.0,7099.0,1106.0,2401.0,1138.0,8.3014,358500.0
2,-122.24,37.85,52.0,1467.0,190.0,496.0,177.0,7.2574,352100.0
3,-122.25,37.85,52.0,1274.0,235.0,558.0,219.0,5.6431,341300.0
4,-122.25,37.85,52.0,1627.0,280.0,565.0,259.0,3.8462,342200.0
...,...,...,...,...,...,...,...,...,...
20635,-121.09,39.48,25.0,1665.0,374.0,845.0,330.0,1.5603,78100.0
20636,-121.21,39.49,18.0,697.0,150.0,356.0,114.0,2.5568,77100.0
20637,-121.22,39.43,17.0,2254.0,485.0,1007.0,433.0,1.7000,92300.0
20638,-121.32,39.43,18.0,1860.0,409.0,741.0,349.0,1.8672,84700.0


In [9]:
# Show the smallest and largest target value, separated by a space.
print(*data['median_house_value'].agg(['min', 'max']))

14999.0 500001.0


<b> 1. Tracking experiments </b>

Tracking notes: MLflow supports two types of backend store: a file store and a database-backed store.

<b> Artifact stores </b>

- Amazon S3
- Azure Blob Storage
- Google Cloud Storage
- FTP server
- NFS
- HDFS

Start the MLflow tracking server by running the command below:

mlflow server \
    --backend-store-uri mlruns/ \
    --default-artifact-root mlruns/ \
    --host 127.0.0.1 \
    --port 5000

In [10]:
# Point the MLflow client at the tracking server; host and port match the
# `mlflow server` command shown above (127.0.0.1:5000).
remote_server_uri = "http://127.0.0.1:5000"
mlflow.set_tracking_uri(remote_server_uri)

In [11]:
# Confirm which tracking URI the client is now configured to use.
mlflow.tracking.get_tracking_uri()

'http://127.0.0.1:5000'

In [12]:
# Select the experiment that subsequent runs are recorded under.
# NOTE(review): the name says "Wine" but this notebook trains on the housing
# dataset; consider renaming. Changing it will record runs under a different experiment.
exp_name = "ElasticNet_Wine"
mlflow.set_experiment(exp_name)

<Experiment: artifact_location='/mnt/c/Users/aditya.das/Documents/demo-project/mlflow-training/mlruns/157034954202768703', creation_time=1720246995368, experiment_id='157034954202768703', last_update_time=1720246995368, lifecycle_stage='active', name='ElasticNet_Wine', tags={}>

In [13]:
def eval_metrics(actual, pred):
    '''Score predictions against the true values with regression metrics.

    Returns a 4-tuple ``(mse, rmse, mae, r2)``, where ``rmse`` is the square
    root of ``mse``.
    '''
    squared_error = mean_squared_error(actual, pred)
    return (
        squared_error,
        np.sqrt(squared_error),
        mean_absolute_error(actual, pred),
        r2_score(actual, pred),
    )

In [14]:
def load_data(data_path):
    '''Read the housing CSV and split it into train and test sets.

    Rows containing any missing value are discarded. The target is
    ``median_house_value``; the features are all remaining columns except
    ``ocean_proximity``. A per-column count of missing values in the cleaned
    frame is printed before the split.

    Returns ``(X_train, X_test, y_train, y_test)`` with 25% of the rows held
    out as the test set. No ``random_state`` is passed to the split.
    '''
    cleaned = pd.read_csv(data_path).dropna()

    target = cleaned['median_house_value']
    features = cleaned.drop(columns=['median_house_value', 'ocean_proximity'])

    print(cleaned.isna().sum())

    train_features, test_features, train_target, test_target = train_test_split(
        features, target, test_size=0.25
    )
    return train_features, test_features, train_target, test_target

In [15]:
# load_data(data_path)

In [25]:
def train(alpha=0.5, l1_ratio=0.5):
    '''Fit an ElasticNet model on the housing data and record the run in MLflow.

    Parameters
    ----------
    alpha : float
        Passed to ``ElasticNet`` as ``alpha``.
    l1_ratio : float
        Passed to ``ElasticNet`` as ``l1_ratio``.

    The data is loaded and split by ``load_data``. Inside one MLflow run the
    model is fitted and scored on the test split; the scores are printed, and
    ``alpha``, ``l1_ratio``, ``rmse``, ``mae`` and ``r2_score`` are logged. The
    fitted model is logged under the artifact path ``"model"``.
    Returns ``None``.
    '''
    # Silences every warning for the rest of the session, not just inside this call.
    warnings.filterwarnings("ignore")
    # load_data() calls train_test_split without a random_state, so seeding
    # NumPy's global RNG here is what keeps the split repeatable.
    np.random.seed(40)

    # Build the path portably: the previous backslash literal contained the
    # invalid escape "\h" and only resolved on Windows.
    data_path = os.path.join("datasets", "housing", "housing.csv")
    X_train, X_test, y_train, y_test = load_data(data_path)

    with mlflow.start_run():

        model = ElasticNet(alpha=alpha, l1_ratio=l1_ratio, random_state=42)
        model.fit(X_train, y_train)

        pred = model.predict(X_test)

        mse, rmse, mae, r2 = eval_metrics(y_test, pred)

        print(y_test, pred)

        print("Elastic Net model: ", alpha, l1_ratio)
        print("mse:", mse)
        print("rmse: ", rmse)
        print("mae: ", mae)
        print("r2_score:", r2)

        # Log the parameters, metrics and model to MLflow (mse is printed but not logged).
        mlflow.log_param(key="alpha", value=alpha)
        mlflow.log_param(key="l1_ratio", value=l1_ratio)
        mlflow.log_metric(key="rmse", value=rmse)
        mlflow.log_metric(key="mae", value=mae)
        mlflow.log_metric(key="r2_score", value=r2)

        print("Save to: {}".format(mlflow.get_artifact_uri()))

        mlflow.sklearn.log_model(model, "model")


In [18]:
# train()

In [19]:
# train(0.75, 0.75)

In [26]:
# Run one tracked experiment with alpha=0.5 and l1_ratio=0.3.
train(0.5, 0.3)

longitude             0
latitude              0
housing_median_age    0
total_rooms           0
total_bedrooms        0
population            0
households            0
median_income         0
median_house_value    0
ocean_proximity       0
dtype: int64
20230    395500.0
16928    237900.0
11644    241000.0
2670      72500.0
15578    167200.0
           ...   
14950    141700.0
12943    213500.0
4413     169200.0
4519     125000.0
7629     187900.0
Name: median_house_value, Length: 5109, dtype: float64 [245899.64164356 192520.03939497 261896.54742026 ... 193613.3850148
 137340.77130087 212084.5098276 ]
Elastic Net model:  0.5 0.3
mse: 5195692861.638649
rmse:  72081.15469135223
mae:  52780.2482112898
r2_score: 0.6133722542910762
Save to: /mnt/c/Users/aditya.das/Documents/demo-project/mlflow-training/mlruns/157034954202768703/c71aea83e3fe4fc3b80572470cc26634/artifacts
