In this notebook, you should implement a first version of a working machine learning model to predict the age of an abalone.

A few guidelines:
- The model does not have to be complex. A simple linear regression model is enough.
- You should use MLflow to track your experiments. You can use the MLflow UI to compare your experiments.
- Do not push any MLflow data to the repository. Only the code to run the experiments is interesting and should be pushed.

## Preprocessing, Modeling, Evaluation
The basic steps followed in any data-modeling pipeline are:

- pre-processing
- suitable model selection
- modeling
- hyperparameter tuning using GridSearchCV
- evaluation

In [2]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

%load_ext autoreload
%autoreload 2

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
%matplotlib inline

from sklearn.metrics import mean_squared_error

from sklearn.linear_model import LinearRegression

from scipy.sparse import csr_matrix
import mlflow
from mlflow.tracking import MlflowClient
from typing import List, Tuple, Dict, Union
from pathlib import Path
from sklearn.feature_extraction import DictVectorizer

In [3]:
# Directory holding the dataset CSV files, relative to the notebook's working directory
DATA_PATH = '../data'

# One path per dataset split, all resolved against DATA_PATH
TRAIN_PATH = Path(DATA_PATH) / 'abalone_train.csv'
TEST_PATH = Path(DATA_PATH) / 'abalone_test.csv'
PREDICT_PATH = Path(DATA_PATH) / 'abalone_predict.csv'


In [4]:
def load_data(path: Union[str, Path]) -> pd.DataFrame:
    """
    Load data from a CSV file and discard the 'Unnamed: 0' column when present.

    Parameters:
    - path (Union[str, Path]): Path to the CSV file.

    Returns:
    - pd.DataFrame: Loaded dataframe without the 'Unnamed: 0' column.
    """
    df = pd.read_csv(path)
    # errors="ignore" keeps the loader usable on CSV files that do not carry
    # an 'Unnamed: 0' column, instead of failing with a KeyError.
    return df.drop(columns="Unnamed: 0", errors="ignore")

def compute_target(df: pd.DataFrame) -> pd.DataFrame:
    """
    Compute the 'Age' column based on the 'Rings' column and drop the 'Rings' column.

    The input dataframe is not modified; a new dataframe is returned.

    Parameters:
    - df (pd.DataFrame): Input dataframe containing a 'Rings' column.

    Returns:
    - pd.DataFrame: New dataframe with 'Age' (= 'Rings' + 1.5) added and 'Rings' dropped.

    Raises:
    - KeyError: If the 'Rings' column is missing.
    """
    # assign() builds a new frame, so the caller's dataframe keeps its
    # original columns and the function can safely be re-run on it.
    return df.assign(Age=df["Rings"] + 1.5).drop(columns="Rings")


def encode_categorical_cols(
    df: pd.DataFrame, categorical_cols: Union[List[str], None] = None
) -> pd.DataFrame:
    """
    Cast categorical columns to strings.

    The input dataframe is not modified; the cast is applied to a copy.

    Parameters:
    - df (pd.DataFrame): Input dataframe.
    - categorical_cols (List[str] or None): Columns to cast. Defaults to ["Sex"].

    Returns:
    - pd.DataFrame: Copy of the dataframe with the categorical columns cast to "str".

    Raises:
    - KeyError: If one of the requested columns is missing from the dataframe.
    """
    if categorical_cols is None:
        categorical_cols = ["Sex"]
    # Work on a copy so the caller's dataframe keeps its original dtypes.
    encoded = df.copy()
    encoded[categorical_cols] = encoded[categorical_cols].astype("str")
    return encoded

def extract_x_y(
    df: pd.DataFrame,
    categorical_cols: Union[List[str], None] = None,
    dv: Union[DictVectorizer, None] = None,
    with_target: bool = True,
) -> Tuple[csr_matrix, Union[np.ndarray, None], DictVectorizer]:
    """
    Extract features and target from the dataframe.

    Parameters:
    - df (pd.DataFrame): Input dataframe.
    - categorical_cols (List[str] or None): Columns turned into features. Defaults to ["Sex"].
    - dv (DictVectorizer or None): Vectorizer used to transform the features. When it is
      None and with_target is True, a new DictVectorizer is fitted on `df`.
    - with_target (bool): Whether to extract the 'Age' target or not.

    Returns:
    - Tuple: Features matrix, target array (None when with_target is False),
      and the DictVectorizer instance that produced the features.

    Raises:
    - ValueError: If with_target is False and no DictVectorizer is supplied.
    - KeyError: If a requested column (or 'Age' when with_target is True) is missing.
    """
    # Without a target we are in inference mode: fitting a fresh vectorizer is
    # not done here, so a missing `dv` is reported explicitly rather than
    # failing later with an AttributeError on `None.transform`.
    if dv is None and not with_target:
        raise ValueError(
            "A fitted DictVectorizer must be passed as `dv` when with_target=False."
        )

    if categorical_cols is None:
        categorical_cols = ["Sex"]
    # NOTE(review): only `categorical_cols` are turned into features; every other
    # column of `df` is ignored by this function.
    dicts = df[categorical_cols].to_dict(orient="records")

    y = None
    if with_target:
        if dv is None:
            dv = DictVectorizer()
            dv.fit(dicts)
        y = df["Age"].values

    x = dv.transform(dicts)
    return x, y, dv

def train_model(x_train: csr_matrix, y_train: np.ndarray) -> LinearRegression:
    """
    Fit a linear regression on the given training data.

    Parameters:
    - x_train (csr_matrix): Training features matrix.
    - y_train (np.ndarray): Training target array.

    Returns:
    - LinearRegression: The fitted estimator.
    """
    # fit() returns the estimator itself, so construction and fitting can be chained
    return LinearRegression().fit(x_train, y_train)

def predict_age(input_data: csr_matrix, model: LinearRegression) -> np.ndarray:
    """
    Run the trained model on a features matrix.

    Parameters:
    - input_data (csr_matrix): Features matrix to predict on.
    - model (LinearRegression): Trained linear regression model.

    Returns:
    - np.ndarray: Values returned by the model's predict method.
    """
    predictions = model.predict(input_data)
    return predictions

def evaluate_model(y_true: np.ndarray, y_pred: np.ndarray) -> float:
    """
    Evaluate the model using root mean squared error.

    Parameters:
    - y_true (np.ndarray): True target values.
    - y_pred (np.ndarray): Predicted target values.

    Returns:
    - float: Root mean squared error.
    """
    # Take the square root explicitly instead of passing `squared=False`:
    # that keyword is deprecated in recent scikit-learn releases and later
    # removed, while the plain MSE call works on every version.
    return float(np.sqrt(mean_squared_error(y_true, y_pred)))

In [8]:
# Prepare each split in one pass: load the CSV, derive the 'Age' target,
# then cast the categorical columns to strings
train_df = encode_categorical_cols(compute_target(load_data(TRAIN_PATH)))
test_df = encode_categorical_cols(compute_target(load_data(TEST_PATH)))

# Fit the vectorizer on the training split and reuse it for the test split
X_train, y_train, dv = extract_x_y(train_df)
X_test, y_test, _ = extract_x_y(test_df, dv=dv)

# Fit the linear regression on the training features
model = train_model(X_train, y_train)

# Root mean squared error on the training split
prediction = predict_age(X_train, model)
train_me = evaluate_model(y_train, prediction)

# Root mean squared error on the held-out test split
y_pred_test = predict_age(X_test, model)
test_me = evaluate_model(y_test, y_pred_test)

In [26]:
# Plain string: the experiment path has no placeholders, so no f-string is needed
mlflow_experiment_path = "/mlflow/linear_reg_test"
mlflow.set_experiment(mlflow_experiment_path)

# Start a run; everything logged inside the block is attached to it
with mlflow.start_run() as run:
    run_id = run.info.run_id

    # Set tags for the run
    mlflow.set_tag("Level", "Development")
    mlflow.set_tag("Team", "Data Science")

    # Load data
    train_df = load_data(TRAIN_PATH)
    test_df = load_data(TEST_PATH)

    # Record the number of rows of each split as run parameters
    mlflow.log_param("train_set_size", train_df.shape[0])
    mlflow.log_param("test_set_size", test_df.shape[0])

    # Compute target
    train_df = compute_target(train_df)

    # Encode categorical columns
    train_df = encode_categorical_cols(train_df)

    # Extract X and y (this also fits the DictVectorizer `dv`)
    X_train, y_train, dv = extract_x_y(train_df)

    # Train model
    model = train_model(X_train, y_train)

    # Evaluate model on the training set (evaluate_model returns the RMSE)
    prediction = predict_age(X_train, model)
    train_me = evaluate_model(y_train, prediction)

    mlflow.log_metric("train_me", train_me)

    # Evaluate model on test set, reusing the vectorizer fitted on the training set
    test_df = compute_target(test_df)
    test_df = encode_categorical_cols(test_df)
    X_test, y_test, _ = extract_x_y(test_df, dv=dv)
    y_pred_test = predict_age(X_test, model)
    test_me = evaluate_model(y_test, y_pred_test)
    mlflow.log_metric("test_me", test_me)

    # Log the fitted model under the run's "models" artifact path
    # NOTE(review): the fitted DictVectorizer `dv` is not logged in this cell, so the
    # stored model alone cannot reproduce the feature encoding - consider logging it too.
    mlflow.sklearn.log_model(model, "models")

    # Register the logged model as a new version of "linear_reg_test".
    # This call only creates a registry version; nothing in this cell promotes
    # that version to a production stage or alias.
    mlflow.register_model(f"runs:/{run_id}/models", "linear_reg_test")

Successfully registered model 'linear_reg_test'.
2023/10/23 09:27:22 INFO mlflow.tracking._model_registry.client: Waiting up to 300 seconds for model version to finish creation. Model name: linear_reg_test, version 1
Created version '1' of model 'linear_reg_test'.


In [27]:
# Launch the MLflow UI on localhost, port 5002, to browse the tracked runs.
# The command stays in the foreground: in the recorded output below the server
# keeps running until it is interrupted (^C), so this cell blocks until then.
!mlflow ui --host localhost --port 5002

[2023-10-23 09:27:58 +0200] [158179] [INFO] Starting gunicorn 21.2.0
[2023-10-23 09:27:58 +0200] [158179] [INFO] Listening at: http://127.0.0.1:5002 (158179)
[2023-10-23 09:27:58 +0200] [158179] [INFO] Using worker: sync
[2023-10-23 09:27:58 +0200] [158183] [INFO] Booting worker with pid: 158183
[2023-10-23 09:27:58 +0200] [158184] [INFO] Booting worker with pid: 158184
[2023-10-23 09:27:58 +0200] [158185] [INFO] Booting worker with pid: 158185
[2023-10-23 09:27:58 +0200] [158186] [INFO] Booting worker with pid: 158186
^C
[2023-10-23 09:29:12 +0200] [158179] [INFO] Handling signal: int
[2023-10-23 09:29:12 +0200] [158183] [INFO] Worker exiting (pid: 158183)
[2023-10-23 09:29:12 +0200] [158184] [INFO] Worker exiting (pid: 158184)
[2023-10-23 09:29:12 +0200] [158186] [INFO] Worker exiting (pid: 158186)
[2023-10-23 09:29:12 +0200] [158185] [INFO] Worker exiting (pid: 158185)
