# EDA (Exploratory Data Analysis) of the dataset

In this notebook, explore the Abalone dataset.

Add any relevant insight for future modelling.

# Imports

In [1]:
%load_ext autoreload
%autoreload 2

import pandas as pd
pd.set_option('display.max_columns', 500)
import numpy as np
import os


from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error

In [32]:
!pip install xgboost

Collecting xgboost
  Downloading xgboost-2.0.0-py3-none-win_amd64.whl.metadata (2.0 kB)
Downloading xgboost-2.0.0-py3-none-win_amd64.whl (99.7 MB)
   ---------------------------------------- 0.0/99.7 MB ? eta -:--:--
   ---------------------------------------- 0.0/99.7 MB ? eta -:--:--
   ---------------------------------------- 0.0/99.7 MB ? eta -:--:--
   ---------------------------------------- 0.0/99.7 MB ? eta -:--:--
   ---------------------------------------- 0.0/99.7 MB ? eta -:--:--
   ---------------------------------------- 0.0/99.7 MB ? eta -:--:--
   ---------------------------------------- 0.0/99.7 MB ? eta -:--:--
   ---------------------------------------- 0.0/99.7 MB ? eta -:--:--
   ---------------------------------------- 0.0/99.7 MB ? eta -:--:--
   ---------------------------------------- 0.0/99.7 MB ? eta -:--:--
   ---------------------------------------- 0.0/99.7 MB ? eta -:--:--
   ---------------------------------------- 0.0/99.7 MB ? eta -:--:--
   ----------

In [42]:
from sklearn.ensemble import RandomForestRegressor
from xgboost import XGBRegressor
from sklearn.neighbors import KNeighborsRegressor

from sklearn.metrics import roc_auc_score

# Data

In [2]:
# Folder holding the dataset, given as a relative path.
DATA_FOLDER = "../data"
# Read the Abalone CSV into the DataFrame used by the EDA cells below.
df_train = pd.read_csv(os.path.join(DATA_FOLDER, "abalone.csv"))

In [12]:
# Relative path to the Abalone CSV; passed to load_data() in the MLflow run cell.
data_path = "../data/abalone.csv"

# EDA

In [3]:
# Preview the first rows to check the column names and value types.
df_train.head()


Unnamed: 0,Sex,Length,Diameter,Height,Whole weight,Shucked weight,Viscera weight,Shell weight,Rings
0,M,0.455,0.365,0.095,0.514,0.2245,0.101,0.15,15
1,M,0.35,0.265,0.09,0.2255,0.0995,0.0485,0.07,7
2,F,0.53,0.42,0.135,0.677,0.2565,0.1415,0.21,9
3,M,0.44,0.365,0.125,0.516,0.2155,0.114,0.155,10
4,I,0.33,0.255,0.08,0.205,0.0895,0.0395,0.055,7


In [4]:
# Count missing values per column (the output below shows none).
df_train.isna().sum()

Sex               0
Length            0
Diameter          0
Height            0
Whole weight      0
Shucked weight    0
Viscera weight    0
Shell weight      0
Rings             0
dtype: int64

In [21]:
# Baseline model: one-hot encode the categorical column, pass every other
# column through unchanged, and fit a linear regression on an 80/20 split.

# Features are every column except 'Rings'; the target is 'Rings' shifted by 1.5
X = df_train.drop(columns='Rings')
y = df_train['Rings'] + 1.5

# Columns that need one-hot encoding
categorical_columns = ['Sex']

# Categories unseen during fit are ignored at predict time instead of raising
categorical_transformer = Pipeline([
    ('onehot', OneHotEncoder(handle_unknown='ignore')),
])

# Encode the categorical columns and let the remaining columns pass through
preprocessor = ColumnTransformer(
    [('cat', categorical_transformer, categorical_columns)],
    remainder='passthrough',
)

# Preprocessing followed by the regressor, chained in a single estimator
pipeline = Pipeline([
    ('preprocessor', preprocessor),
    ('regressor', LinearRegression()),
])

# Hold out 20% of the rows for testing, with a fixed seed
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Fit on the training split, then predict on the held-out split
pipeline.fit(X_train, y_train)
y_pred = pipeline.predict(X_test)

# Report the mean squared error on the held-out split
mse = mean_squared_error(y_test, y_pred)
print(f"Mean Squared Error: {mse}")


Mean Squared Error: 4.885673255441292


## Refactor the workflow into functions

In [46]:
DATA_FOLDER = "../data"

def load_data(path: str) -> pd.DataFrame:
    """Read a CSV file into a DataFrame.

    The file named by ``path`` is the one that is read. Previously the
    argument was ignored and ``DATA_FOLDER/abalone.csv`` was always loaded.

    Args:
        path: Location of the CSV file to read.

    Returns:
        The file contents as a pandas DataFrame.
    """
    return pd.read_csv(path)

def compute_target(df: pd.DataFrame):
    """Build the regression target from the 'Rings' column.

    Args:
        df: Frame containing a 'Rings' column.

    Returns:
        The 'Rings' values with 1.5 added to each (presumably the age
        estimate; confirm against the dataset description).
    """
    rings = df['Rings']
    return rings + 1.5

def extract_x_y(df):
    """Split a frame into model inputs and target.

    Args:
        df: Frame containing a 'Rings' column alongside the feature columns.

    Returns:
        A ``(features, target)`` tuple: ``features`` is ``df`` without the
        'Rings' column, ``target`` is the result of ``compute_target(df)``.
    """
    features = df.drop(columns='Rings')
    target = compute_target(df)
    return features, target

def define_pipeline(cat_cols: list, Regressor):
    """Assemble an unfitted preprocessing + regression pipeline.

    Args:
        cat_cols: Names of the columns to one-hot encode.
        Regressor: Estimator class (not an instance); it is instantiated
            here with no arguments.

    Returns:
        A Pipeline with a 'preprocessor' step followed by a 'regressor' step.
    """
    # Categories unseen during fit are ignored at predict time instead of raising
    encoder = Pipeline([
        ('onehot', OneHotEncoder(handle_unknown='ignore')),
    ])

    # Encode the listed columns; all remaining columns pass through unchanged
    column_prep = ColumnTransformer(
        [('cat', encoder, cat_cols)],
        remainder='passthrough',
    )

    # Preprocessing first, then a fresh instance of the requested regressor
    return Pipeline([
        ('preprocessor', column_prep),
        ('regressor', Regressor()),
    ])

def train_model(pipeline: Pipeline, x_train: pd.DataFrame, y_train: pd.DataFrame):
    """Fit the pipeline on the training data.

    Args:
        pipeline: Pipeline to fit.
        x_train: Training features.
        y_train: Training target values.

    Returns:
        Whatever ``pipeline.fit`` returns (for scikit-learn estimators, the
        fitted pipeline itself).
    """
    return pipeline.fit(x_train, y_train)


def predict_age(input_data: pd.DataFrame, model: Pipeline):
    """Run the fitted model on new feature rows.

    Args:
        input_data: Feature rows to predict on.
        model: Fitted pipeline exposing ``predict``.

    Returns:
        The output of ``model.predict(input_data)``.
    """
    predictions = model.predict(input_data)
    return predictions

def evaluate_model(y_true: np.ndarray, y_pred: np.ndarray):
    """Return the root mean squared error (RMSE) of the predictions.

    The square root is taken explicitly instead of calling
    ``mean_squared_error(..., squared=False)``: the ``squared`` keyword was
    deprecated in scikit-learn 1.4 and removed in 1.6, so the previous call
    fails on current releases. For one-dimensional targets, as used in this
    notebook, the result is the same value.

    Args:
        y_true: Ground-truth target values.
        y_pred: Predicted target values, same length as ``y_true``.

    Returns:
        The RMSE as a scalar float.
    """
    return np.sqrt(mean_squared_error(y_true, y_pred))

In [9]:
import mlflow

# Show where MLflow records runs; the output below is a local file:// mlruns folder.
print(f"tracking URI: '{mlflow.get_tracking_uri()}'")

tracking URI: 'file:///c:/Users/matha/OneDrive/Documents/HEC/Cours/ML_Ops/xhec-mlops-project-student/notebooks/mlruns'


In [10]:
from mlflow.tracking import MlflowClient

In [48]:
# Train, evaluate and register a model inside a single MLflow run.

# Set the experiment name
mlflow.set_experiment("Abalone_age_prediction")

# Artifact path of the logged model inside the run. The same value builds the
# registration URI, so the registered version points at the model that was
# actually logged (previously it was logged under "model.pkl" but registered
# from ".../models", a path that never existed).
model_artifact_path = "model"

# Start a run
with mlflow.start_run() as run:
    # The run id tells us where to look for the logged model
    run_id = run.info.run_id
    # TODO: set tags for the run
    # Load data
    df = load_data(data_path)
    # Train/test split
    train_df, test_df = train_test_split(df, test_size=0.2, random_state=42)
    # Extract X and y
    X_train, y_train = extract_x_y(train_df)
    # Define pipeline
    pipeline = define_pipeline(['Sex'], KNeighborsRegressor)
    # Train model
    model = train_model(pipeline, X_train, y_train)
    # Evaluate model on the training set
    prediction = predict_age(X_train, model)
    train_me = evaluate_model(y_train, prediction)
    # Evaluate model on the test set
    X_test, y_test = extract_x_y(test_df)
    y_pred_test = predict_age(X_test, model)
    test_me = evaluate_model(y_test, y_pred_test)
    # Log both metrics so train and test error can be compared across runs
    mlflow.log_metric('train_me', train_me)
    mlflow.log_metric('test_me', test_me)
    # Log the fitted model, then register that same artifact
    mlflow.sklearn.log_model(model, model_artifact_path)
    mlflow.register_model(f"runs:/{run_id}/{model_artifact_path}", "Abalone_age_prediction")

Registered model 'Abalone_age_prediction' already exists. Creating a new version of this model...
2023/10/23 13:54:07 INFO mlflow.tracking._model_registry.client: Waiting up to 300 seconds for model version to finish creation. Model name: Abalone_age_prediction, version 6
Created version '6' of model 'Abalone_age_prediction'.


In [29]:
# List the experiments known to the current tracking store.
client = MlflowClient()
experiments = client.search_experiments()
# A bare last expression displays the list as the cell output
experiments

[<Experiment: artifact_location='file:///c:/Users/matha/OneDrive/Documents/HEC/Cours/ML_Ops/xhec-mlops-project-student/notebooks/mlruns/562039787703191527', creation_time=1698058712420, experiment_id='562039787703191527', last_update_time=1698058712420, lifecycle_stage='active', name='Abalone_age_prediction', tags={}>,
 <Experiment: artifact_location='file:///c:/Users/matha/OneDrive/Documents/HEC/Cours/ML_Ops/xhec-mlops-project-student/notebooks/mlruns/0', creation_time=1698058712387, experiment_id='0', last_update_time=1698058712387, lifecycle_stage='active', name='Default', tags={}>]

In [30]:
# Launch the MLflow UI on port 5002. This call does not return until it is
# interrupted (see the ^C in the output), so later cells cannot run meanwhile.
# NOTE(review): --host 0.0.0.0 normally listens on all network interfaces —
# confirm that exposure is intended, or use 127.0.0.1 for local-only access.
!mlflow ui --host 0.0.0.0 --port 5002

^C
