In this notebook, you should implement a first version of a working machine learning model to predict the age of an Abalone.

A few guidelines:
- The model does not have to be complex. A simple linear regression model is enough.
- You should use MLflow to track your experiments. You can use the MLflow UI to compare your experiments.
- Do not push any MLflow data to the repository. Only the code to run the experiments is interesting and should be pushed.

## Preprocessing, Modeling, Evaluation
The basic steps followed in any data modeling pipeline are:
               - pre-processing 
               - suitable model selection
               - modeling
               - hyperparameter tuning using GridSearchCV
               - evaluation

In [10]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load:

%load_ext autoreload
%autoreload 2

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
%matplotlib inline

from sklearn.preprocessing import  StandardScaler
from sklearn.model_selection import  train_test_split, cross_val_score
from sklearn.feature_selection import SelectKBest
from sklearn.metrics import r2_score, mean_squared_error

from sklearn.neighbors import KNeighborsRegressor
from sklearn.ensemble import  RandomForestRegressor
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import  GradientBoostingRegressor
from sklearn.linear_model import  Ridge
from sklearn.svm import SVR

from scipy.sparse import csr_matrix
import mlflow
from mlflow.tracking import MlflowClient
from typing import List
from pathlib import Path
from sklearn.feature_extraction import DictVectorizer

import os
# Sanity check: list the files present in the relative data directory
print(os.listdir("../data"))

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload
['abalone_train.csv:Zone.Identifier', 'abalone.csv:Zone.Identifier', 'abalone.csv', '.gitkeep', 'abalone_test.csv:Zone.Identifier', 'abalone_predict.csv', 'abalone_train.csv', 'abalone_test.csv', 'abalone_predict.csv:Zone.Identifier']


In [6]:
# Load the full Abalone dataset and replace the raw ring count by the
# derived target column: age = rings + 1.5
pd.set_option('display.max_columns', 500)
data = (
    pd.read_csv('../data/abalone.csv')
    .assign(age=lambda frame: frame['Rings'] + 1.5)
    .drop(columns='Rings')
)

In [13]:
# Locations of the train / test splits, relative to the notebook
DATA_PATH = '../data'
TRAIN_PATH = Path(DATA_PATH) / 'abalone_train.csv'
TEST_PATH = Path(DATA_PATH) / 'abalone_test.csv'

# Preview the test split without the serialized index column
pd.read_csv(TEST_PATH).drop(columns='Unnamed: 0')

Unnamed: 0,Sex,Length,Diameter,Height,Whole weight,Shucked weight,Viscera weight,Shell weight,Rings
0,I,0.400,0.315,0.090,0.3300,0.1510,0.0680,0.0800,6
1,F,0.655,0.525,0.190,1.3595,0.5640,0.3215,0.3985,10
2,F,0.700,0.525,0.190,1.6015,0.7070,0.3650,0.4300,10
3,F,0.610,0.485,0.165,1.0870,0.4255,0.2320,0.3800,11
4,M,0.750,0.555,0.215,2.2010,1.0615,0.5235,0.5285,11
...,...,...,...,...,...,...,...,...,...
830,F,0.540,0.425,0.160,0.9455,0.3675,0.2005,0.2950,9
831,F,0.610,0.480,0.160,1.2340,0.5980,0.2380,0.3150,12
832,F,0.595,0.475,0.160,1.1405,0.5470,0.2310,0.2710,6
833,F,0.635,0.510,0.185,1.2860,0.5260,0.2950,0.4105,12


In [25]:
# Data directory and the two CSV splits used by the pipeline below
DATA_PATH = '../data'
TRAIN_PATH = Path(DATA_PATH) / 'abalone_train.csv'
TEST_PATH = Path(DATA_PATH) / 'abalone_test.csv'


def load_data(path: "str | Path") -> pd.DataFrame:
    """Read a CSV split and strip the serialized index column.

    Args:
        path: Location of the CSV file (a string or a ``pathlib.Path``).

    Returns:
        The file contents as a DataFrame, without the ``'Unnamed: 0'``
        column that pandas writes when a frame is saved with its index.
    """
    df = pd.read_csv(path)
    # errors="ignore": a CSV written with index=False has no 'Unnamed: 0'
    # column, and should load just the same instead of raising KeyError.
    return df.drop(columns="Unnamed: 0", errors="ignore")

def compute_target(
    df: pd.DataFrame,
) -> pd.DataFrame:
    """Derive the regression target ``Age`` from the ``Rings`` column.

    The target is computed as ``Rings + 1.5`` and the raw ``Rings`` column
    is removed from the result.

    Args:
        df: Frame containing a numeric ``'Rings'`` column.

    Returns:
        A new DataFrame with an ``'Age'`` column and without ``'Rings'``.
        The input frame is not modified.

    Raises:
        KeyError: If ``df`` has no ``'Rings'`` column.
    """
    # assign() builds a new frame, so the caller's DataFrame keeps its
    # original columns (no 'Age' column leaks into it as a side effect).
    return df.assign(Age=df["Rings"] + 1.5).drop(columns="Rings")

# MIN_DURATION = 1
# MAX_DURATION = 60
def filter_outliers(
    df: pd.DataFrame,
    min_duration: int = 1,
    max_duration: int = 60,
    column: str = "duration",
) -> pd.DataFrame:
    """Keep only the rows whose value in ``column`` lies within bounds.

    Args:
        df: Frame to filter.
        min_duration: Lower bound (inclusive).
        max_duration: Upper bound (inclusive).
        column: Name of the column the bounds apply to. Defaults to
            ``"duration"``; pass e.g. ``column="Age"`` for frames that have
            no ``"duration"`` column.

    Returns:
        The subset of ``df`` whose ``column`` value is in
        ``[min_duration, max_duration]``.

    Raises:
        KeyError: If ``column`` is not present in ``df``.
    """
    in_range = df[column].between(min_duration, max_duration)
    return df[in_range]

# Categorical feature columns of the Abalone dataset (same value the
# encoding / extraction helpers fall back to when no list is given).
CATEGORICAL_COLS = ["Sex"]


def encode_categorical_cols(
    df: pd.DataFrame, categorical_cols: List[str] = None
) -> pd.DataFrame:
    """Cast the categorical columns to strings.

    Args:
        df: Frame containing the columns to convert.
        categorical_cols: Columns to cast; defaults to ``["Sex"]`` when
            ``None``.

    Returns:
        A new DataFrame in which every column of ``categorical_cols`` holds
        string values. The input frame is not modified.

    Raises:
        KeyError: If one of ``categorical_cols`` is missing from ``df``.
    """
    if categorical_cols is None:
        categorical_cols = ["Sex"]
    # Work on a copy so the caller's frame keeps its original dtypes and no
    # SettingWithCopyWarning is triggered when `df` is a slice of another frame.
    encoded = df.copy()
    encoded[categorical_cols] = encoded[categorical_cols].astype("str")
    return encoded

def extract_x_y(
    df: pd.DataFrame,
    categorical_cols: List[str] = None,
    dv: DictVectorizer = None,
    with_target: bool = True,
) -> tuple:
    """Build the feature matrix (and optionally the target) from a frame.

    Only the columns listed in ``categorical_cols`` are used as features;
    they are turned into record dicts and vectorized with a
    ``DictVectorizer``.

    Args:
        df: Source frame. Must contain ``categorical_cols`` and, when
            ``with_target`` is true, an ``'Age'`` column.
        categorical_cols: Feature columns; defaults to ``["Sex"]``.
        dv: An already fitted vectorizer to reuse. When ``None`` and
            ``with_target`` is true, a new one is fitted on ``df``.
        with_target: Whether to also extract the ``'Age'`` target.

    Returns:
        A tuple ``(x, y, dv)``: the vectorized features, the target values
        (``None`` when ``with_target`` is false) and the vectorizer used.

    Raises:
        ValueError: If ``dv`` is ``None`` while ``with_target`` is false.
        KeyError: If a required column is missing from ``df``.
    """
    if categorical_cols is None:
        categorical_cols = ["Sex"]
    dicts = df[categorical_cols].to_dict(orient="records")

    if dv is None:
        if not with_target:
            # Without a target we are in inference mode: fitting a fresh
            # vectorizer here could yield a feature layout that does not
            # match the trained model, so fail loudly instead of crashing
            # later on `None.transform`.
            raise ValueError(
                "A fitted DictVectorizer must be provided when with_target=False"
            )
        dv = DictVectorizer()
        dv.fit(dicts)

    y = df["Age"].values if with_target else None
    x = dv.transform(dicts)
    return x, y, dv

def train_model(x_train: csr_matrix, y_train: np.ndarray):
    """Fit a linear regression on the training data.

    Args:
        x_train: Feature matrix.
        y_train: Target values aligned with the rows of ``x_train``.

    Returns:
        The fitted ``LinearRegression`` estimator.
    """
    # fit() returns the estimator itself, so it can be returned directly
    return LinearRegression().fit(x_train, y_train)

def predict_duration(input_data: csr_matrix, model: LinearRegression):
    """Run the fitted model on a feature matrix.

    Args:
        input_data: Feature matrix to predict on.
        model: Fitted estimator exposing ``predict``.

    Returns:
        Whatever ``model.predict`` returns for ``input_data``.
    """
    predictions = model.predict(input_data)
    return predictions


def evaluate_model(y_true: np.ndarray, y_pred: np.ndarray):
    """Compute the root mean squared error between targets and predictions.

    Args:
        y_true: Ground-truth target values.
        y_pred: Predicted values, same length as ``y_true``.

    Returns:
        The RMSE as a float.
    """
    # `mean_squared_error(..., squared=False)` is deprecated since
    # scikit-learn 1.4 and removed in 1.6; taking the square root of the MSE
    # gives the same value on every scikit-learn version.
    return np.sqrt(mean_squared_error(y_true, y_pred))

In [24]:
# Load both splits from disk
train_df = load_data(TRAIN_PATH)
test_df = load_data(TEST_PATH)

# Derive the target on the training split: age = rings + 1.5.
# The 'Rings' column is kept here (compute_target would drop it).
train_df = train_df.assign(Age=train_df['Rings'] + 1.5)

In [26]:
# Experiment under which the run below is recorded
mlflow_experiment_path = f"/mlflow/linear_reg_test"
mlflow.set_experiment(mlflow_experiment_path)

# Start a run; every tag, param, metric and model logged inside the
# `with` block is attached to it
with mlflow.start_run() as run:
    # Kept to build the "runs:/<id>/..." model URI at the end of the block
    run_id = run.info.run_id

    # Set tags for the run
    mlflow.set_tag("Level", "Development")
    mlflow.set_tag("Team", "Data Science")

    # Load data (fresh from disk, independent of earlier cells)
    train_df = load_data(TRAIN_PATH)
    test_df = load_data(TEST_PATH)

    # Record the number of rows of each split
    mlflow.log_param("train_set_size", train_df.shape[0])
    mlflow.log_param("test_set_size", test_df.shape[0])

    # Compute target ('Age' derived from 'Rings')
    train_df = compute_target(train_df)

    # Encode categorical columns (defaults to the 'Sex' column)
    train_df = encode_categorical_cols(train_df)

    # Extract X and y; `dv` is the vectorizer fitted on the training split
    X_train, y_train, dv = extract_x_y(train_df)

    # Train model
    model = train_model(X_train, y_train)

    # Evaluate model on the training set; evaluate_model returns the
    # root mean squared error, logged here under the name "train_me"
    prediction = predict_duration(X_train, model)
    train_me = evaluate_model(y_train, prediction)
    
    mlflow.log_metric("train_me", train_me)

    # Evaluate model on test set, reusing the vectorizer fitted on train
    test_df = compute_target(test_df)
    test_df = encode_categorical_cols(test_df)
    X_test, y_test, _ = extract_x_y(test_df, dv=dv)
    y_pred_test = predict_duration(X_test, model)
    test_me = evaluate_model(y_test, y_pred_test)
    mlflow.log_metric("test_me", test_me)

    # Log the fitted model as a run artifact under "models"
    mlflow.sklearn.log_model(model, "models")

    # Register the logged model under the name "linear_reg_test".
    # NOTE(review): this creates a model version in the registry; nothing
    # here assigns a "production" stage or alias to it — confirm whether
    # that is intended.
    mlflow.register_model(f"runs:/{run_id}/models", "linear_reg_test")

Successfully registered model 'linear_reg_test'.
2023/10/23 09:27:22 INFO mlflow.tracking._model_registry.client: Waiting up to 300 seconds for model version to finish creation. Model name: linear_reg_test, version 1
Created version '1' of model 'linear_reg_test'.


In [27]:
# Serve the MLflow UI on http://localhost:5002 to compare runs.
# The server runs in the foreground: this cell keeps executing until it
# is interrupted (Kernel -> Interrupt).
!mlflow ui --host localhost --port 5002


[2023-10-23 09:27:58 +0200] [158179] [INFO] Starting gunicorn 21.2.0
[2023-10-23 09:27:58 +0200] [158179] [INFO] Listening at: http://127.0.0.1:5002 (158179)
[2023-10-23 09:27:58 +0200] [158179] [INFO] Using worker: sync
[2023-10-23 09:27:58 +0200] [158183] [INFO] Booting worker with pid: 158183
[2023-10-23 09:27:58 +0200] [158184] [INFO] Booting worker with pid: 158184
[2023-10-23 09:27:58 +0200] [158185] [INFO] Booting worker with pid: 158185
[2023-10-23 09:27:58 +0200] [158186] [INFO] Booting worker with pid: 158186
^C
[2023-10-23 09:29:12 +0200] [158179] [INFO] Handling signal: int
[2023-10-23 09:29:12 +0200] [158183] [INFO] Worker exiting (pid: 158183)
[2023-10-23 09:29:12 +0200] [158184] [INFO] Worker exiting (pid: 158184)
[2023-10-23 09:29:12 +0200] [158186] [INFO] Worker exiting (pid: 158186)
[2023-10-23 09:29:12 +0200] [158185] [INFO] Worker exiting (pid: 158185)
