# EDA (Exploratory Data Analysis) of the dataset

In this notebook, explore the Abalone dataset.

Add any relevant insight for future modelling.

# Imports

In [None]:
%load_ext autoreload
%autoreload 2

import pandas as pd
pd.set_option('display.max_columns', 500)
import numpy as np
import os


from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error

In [None]:
!pip install xgboost

In [None]:
from sklearn.ensemble import RandomForestRegressor
from xgboost import XGBRegressor
from sklearn.neighbors import KNeighborsRegressor

from sklearn.metrics import roc_auc_score

# Data

In [None]:
# Folder holding the CSV file, given relative to the notebook's working directory.
DATA_FOLDER = "../data"
# Read "abalone.csv" from that folder into the frame used by the EDA cells below.
df_train = pd.read_csv(os.path.join(DATA_FOLDER, "abalone.csv"))

In [None]:
# Path passed to load_data(); derived from DATA_FOLDER so the data location
# is defined in a single place instead of being repeated as a literal.
data_path = os.path.join(DATA_FOLDER, "abalone.csv")

# EDA

In [None]:
# Display the first rows of the frame to inspect its columns and values.
df_train.head()


In [None]:
# Count the missing values in each column.
df_train.isna().sum()

In [None]:
# Columns that must be one-hot encoded before modelling
categorical_columns = ['Sex']

# Features are every column except 'Rings'; the target is 'Rings' shifted by 1.5
X = df_train.drop(columns='Rings')
y = df_train['Rings'] + 1.5

# One-hot encoder for the categorical columns (categories unseen at fit time are ignored)
categorical_transformer = Pipeline([
    ('onehot', OneHotEncoder(handle_unknown='ignore')),
])

# Encode the categorical columns and forward every other column unchanged
preprocessor = ColumnTransformer(
    [('cat', categorical_transformer, categorical_columns)],
    remainder='passthrough',
)

# Baseline model: preprocessing followed by a linear regression
pipeline = Pipeline([
    ('preprocessor', preprocessor),
    ('regressor', LinearRegression()),
])

# Hold out 20% of the rows for evaluation, with a fixed seed
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Train on the training split, then predict on the held-out split
pipeline.fit(X_train, y_train)
y_pred = pipeline.predict(X_test)

# Report the mean squared error on the held-out split
mse = mean_squared_error(y_test, y_pred)
print(f"Mean Squared Error: {mse}")


## Wrap the workflow in functions

In [None]:
DATA_FOLDER = "../data"

def load_data(path: str) -> pd.DataFrame:
    """Read a CSV file into a DataFrame.

    Args:
        path: Location of the CSV file to read.

    Returns:
        The contents of the file as a pandas DataFrame.
    """
    # Honour the caller's path instead of always reading a fixed file
    # under DATA_FOLDER.
    return pd.read_csv(path)

def compute_target(df: pd.DataFrame):
    """Build the regression target from the 'Rings' column.

    Args:
        df: Frame containing a 'Rings' column.

    Returns:
        The 'Rings' values with 1.5 added to each of them.
    """
    rings = df['Rings']
    return rings + 1.5

def extract_x_y(df):
    """Split a frame into model features and target.

    Args:
        df: Frame containing a 'Rings' column.

    Returns:
        A tuple ``(X, y)`` where ``X`` is ``df`` without the 'Rings' column
        and ``y`` is the value returned by ``compute_target(df)``.
    """
    features = df.drop(columns='Rings')
    target = compute_target(df)
    return features, target

def define_pipeline(cat_cols : list, Regressor):
    """Assemble an unfitted preprocessing + regression pipeline.

    Args:
        cat_cols: Columns to one-hot encode; every other column is passed
            through unchanged.
        Regressor: Estimator class (not an instance). It is instantiated
            with no arguments and used as the final step.

    Returns:
        A Pipeline with a 'preprocessor' step followed by a 'regressor' step.
    """
    # Categories that were not seen during fit are ignored at transform time
    encoder = Pipeline([('onehot', OneHotEncoder(handle_unknown='ignore'))])

    column_encoder = ColumnTransformer(
        [('cat', encoder, cat_cols)],
        remainder='passthrough',  # forward the remaining columns as they are
    )

    return Pipeline([
        ('preprocessor', column_encoder),
        ('regressor', Regressor()),
    ])

def train_model(pipeline: Pipeline, x_train: pd.DataFrame, y_train: pd.DataFrame):
    """Fit the pipeline on the training data.

    Args:
        pipeline: Pipeline to fit.
        x_train: Training features.
        y_train: Training target.

    Returns:
        The value returned by ``pipeline.fit(x_train, y_train)``.
    """
    return pipeline.fit(x_train, y_train)


def predict_age(input_data: pd.DataFrame, model: Pipeline):
    """Run the model on the given features.

    Args:
        input_data: Features to predict on.
        model: Object whose ``predict`` method is called on ``input_data``.

    Returns:
        The value returned by ``model.predict(input_data)``.
    """
    predictions = model.predict(input_data)
    return predictions

def evaluate_model(y_true: np.ndarray, y_pred: np.ndarray):
    """Compute the root mean squared error between targets and predictions.

    Args:
        y_true: Observed target values.
        y_pred: Predicted target values.

    Returns:
        The square root of the mean squared error of ``y_pred`` against
        ``y_true``.
    """
    # Take the square root explicitly rather than passing squared=False:
    # that keyword was deprecated in scikit-learn 1.4 and removed in 1.6,
    # whereas this form works on every version.
    return np.sqrt(mean_squared_error(y_true, y_pred))

In [None]:
import mlflow

# Print the tracking URI that MLflow currently reports.
print(f"tracking URI: '{mlflow.get_tracking_uri()}'")

In [None]:
from mlflow.tracking import MlflowClient

In [None]:
# Set the experiment name
mlflow.set_experiment("Abalone_age_prediction")
# Artifact path the model is logged under. It is reused when building the
# registry URI so that registration points at the model that was logged.
model_artifact_path = "model"
# Start a run
with mlflow.start_run() as run:
    # The run id identifies where the logged model has to be looked up
    run_id = run.info.run_id
    # Set tags for the run
    ...
    # Load data
    df = load_data(data_path)
    # Train/test split (fixed seed so the split is the same on every run)
    train_df, test_df = train_test_split(df, test_size=0.2, random_state=42)
    # Extract X and y
    X_train, y_train = extract_x_y(train_df)
    # Define pipeline
    pipeline = define_pipeline(['Sex'], KNeighborsRegressor)
    # Train model
    model = train_model(pipeline, X_train, y_train)
    # Evaluate model on the training set
    prediction = predict_age(X_train, model)
    train_me = evaluate_model(y_train, prediction)
    # Evaluate model on test set
    X_test, y_test = extract_x_y(test_df)
    y_pred_test = predict_age(X_test, model)
    test_me = evaluate_model(y_test, y_pred_test)
    # Log both metrics so the train and test errors can be compared
    mlflow.log_metric('train_me', train_me)
    mlflow.log_metric('test_me', test_me)
    # Log the model, then register it from the very path it was logged under
    mlflow.sklearn.log_model(model, model_artifact_path)
    mlflow.register_model(
        f"runs:/{run_id}/{model_artifact_path}", "Abalone_age_prediction"
    )

In [None]:
# Create a client and fetch the experiments it can see.
client = MlflowClient()
experiments = client.search_experiments()
# Bare last expression so the notebook displays the result.
experiments

In [None]:
# Start the MLflow UI on port 5002.
# NOTE(review): this presumably keeps the cell busy until the process is
# interrupted, and --host 0.0.0.0 listens on every network interface —
# confirm that exposure is intended.
!mlflow ui --host 0.0.0.0 --port 5002