# Import Libraries

In [256]:
import keras
import pandas as pd
import category_encoders as ce
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.metrics import mean_squared_error, r2_score, root_mean_squared_error, mean_absolute_error

# Preprocessing Data

Source: https://www.kaggle.com/datasets/patelris/crop-yield-prediction-dataset?select=yield_df.csv

Dataset features: 
- Categorical features: "Area", "Item", "Year"
- Continuous features: "average_rain_fall_mm_per_year", "pesticides_tonnes", "avg_temp"
- Target: "hg/ha_yield"

## Import Data

In [257]:
# Column roles used by the rest of the notebook.
target_col = "hg/ha_yield"
cat_cols = ["Area", "Item"]
cont_cols = [
    "average_rain_fall_mm_per_year",
    "pesticides_tonnes",
    "avg_temp",
]

# Load the raw dataset from the working directory and show it.
df = pd.read_csv("data.csv")
df



## Data Cleaning

In [258]:
# Drop the columns that carry no meaning for the model.
# `columns=` already names the axis, so no separate `axis` argument is needed.
df.drop(
    columns=['Unnamed: 0', 'Year'],
    inplace=True,
)
df.head()



In [259]:
# Count the missing values in each column.
df.isna().sum()



## Splitting Training and Test Set

In [260]:
# Features: every column except the target.
X = df.drop(columns=target_col)
# Target: reshaped into a single-column 2-D array.
y = df[target_col].to_numpy().reshape(-1, 1)

In [261]:
# Hold out 20% of the rows for testing; the fixed seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=766,
)

## Encoding Categorical Features

In [262]:
# One-hot encode the "Item" column.
item_encoder = ce.OneHotEncoder(
    cols='Item',
    use_cat_names=True,
    return_df=True,
    handle_unknown='return_nan',
)
# Fit on the training split only, then apply the fitted encoder to both splits.
item_encoder.fit(X_train)
X_train = item_encoder.transform(X_train)
X_test = item_encoder.transform(X_test)

In [263]:
# Inspect the training features after encoding "Item".
X_train



In [264]:
# Base-3 encode the "Area" column.
area_encoder = ce.BaseNEncoder(
    cols='Area',
    base=3,
    return_df=True,
    handle_unknown='return_nan',
)
# Fit on the training split only, then apply the fitted encoder to both splits.
area_encoder.fit(X_train)
X_train = area_encoder.transform(X_train)
X_test = area_encoder.transform(X_test)

In [265]:
# Inspect the training features after encoding "Area".
X_train



In [266]:
# Dimensions (rows, columns) of the encoded training features.
X_train.shape



In [267]:
# Pairwise correlation between the encoded training feature columns.
X_train.corr()



# Model Selection

### TabNet Regressor Hyperparameter Tuning

In [None]:
from pytorch_tabnet.tab_model import TabNetRegressor
import torch
from sklearn.preprocessing import LabelEncoder
import numpy as np



In [None]:
import optuna

def objective(trial):
    """Train one TabNet regressor with sampled hyperparameters and score it.

    Args:
        trial: Optuna trial used to sample every hyperparameter.

    Returns:
        The R^2 score of the fitted model on ``X_test`` / ``y_test``.
    """
    # NOTE(review): the learning rate is sampled under the name
    # 'lambda_sparse', the same name used for the sparsity coefficient below.
    # Optuna hands back the already-suggested value when a name is reused
    # within a trial, so `lr` and `lambda_sparse` are always identical.  The
    # name is left unchanged because detailed_objective() and the persisted
    # study replay trials by these exact names; rename all of them together
    # if an independent learning rate is wanted.
    optimizer_params = dict(lr=trial.suggest_float('lambda_sparse', low=1e-6, high=1e-1))
    scheduler_params = {
        "step_size": trial.suggest_int('scheduler_step_size', low=3, high=40),
        "gamma": trial.suggest_float('scheduler_gamma', low=0.01, high=0.9),
    }
    scheduler_fn = torch.optim.lr_scheduler.StepLR
    optimizer_name = trial.suggest_categorical(
        'optimizer_fn',
        [
            "Adam",
            "AdamW",
            "Adamax",
            "Adadelta",
            "Adagrad",
            "SGD",
            "RMSprop",
            "Rprop"
        ]
    )
    # Resolve the sampled optimizer name to the torch.optim class.
    optimizer_fn = getattr(torch.optim, optimizer_name)

    n_d = trial.suggest_int('n_d', low=1, high=32)
    n_a = trial.suggest_int('n_a', low=1, high=32)
    n_steps = trial.suggest_int('n_steps', low=3, high=40)
    gamma = trial.suggest_float('gamma', low=0.01, high=0.9)
    lambda_sparse = trial.suggest_float('lambda_sparse', low=1e-6, high=1e-1)

    regressor = TabNetRegressor(
        optimizer_params=optimizer_params,
        scheduler_params=scheduler_params,
        scheduler_fn=scheduler_fn,
        optimizer_fn=optimizer_fn,
        n_d=n_d,
        n_a=n_a,
        n_steps=n_steps,
        gamma=gamma,
        lambda_sparse=lambda_sparse,
        verbose=1,
        device_name='cuda',
    )

    # pytorch-tabnet expects NumPy arrays, while the category encoders
    # (return_df=True) produce pandas DataFrames.  np.asarray is a no-op if
    # the features are already arrays.
    train_features = np.asarray(X_train)
    test_features = np.asarray(X_test)

    batch_size = trial.suggest_int('batch_size', low=256, high=1024, step=256)

    # NOTE(review): the test split is used both for early stopping and as the
    # tuning target, so the score is not an unbiased estimate; a separate
    # validation split would avoid that.
    regressor.fit(
        X_train=train_features,
        y_train=y_train,
        eval_set=[(test_features, y_test)],
        patience=15,
        max_epochs=100,
        batch_size=batch_size,
        eval_metric=['mse'],
    )
    return r2_score(y_test, regressor.predict(test_features))


In [None]:
import logging, sys
# Send Optuna's log records to stdout as well.
optuna.logging.get_logger("optuna").addHandler(
    logging.StreamHandler(sys.stdout)
)

# Create the study backed by a local SQLite file, or load the stored study of
# the same name if it exists; higher objective values are better.
study = optuna.create_study(
    study_name='TabNet Hyperparameter Optimization',
    storage='sqlite:///tabnet.db',
    load_if_exists=True,
    sampler=optuna.samplers.RandomSampler(),
    direction='maximize',
)





In [None]:
# Run 100 trials of `objective`, recording each one in `study`.
study.optimize(objective, n_trials=100)

# Results

In [279]:
# Hyperparameters of the best trial.
study.best_params



In [297]:
# Full record of the best trial.
study.best_trial



In [380]:
# Objective value of the best trial (the value returned by `objective`).
study.best_trial.value



In [381]:
def detailed_objective(trial):
    """Re-train TabNet with a trial's hyperparameters and report test metrics.

    Requests the same parameter names as ``objective`` so that a finished
    trial (e.g. ``study.best_trial``) can be replayed.

    Args:
        trial: Optuna trial (or frozen trial) supplying the hyperparameters.

    Returns:
        Tuple ``(r2, mse, rmse, mae)`` computed on ``X_test`` / ``y_test``.
    """
    # NOTE(review): the learning rate is read under the name 'lambda_sparse',
    # the same name as the sparsity coefficient below, so both always receive
    # the same value.  Kept as-is because stored trials only contain the
    # names that objective() sampled; rename both functions together.
    optimizer_params = dict(lr=trial.suggest_float('lambda_sparse', low=1e-6, high=1e-1))
    scheduler_params = {
        "step_size": trial.suggest_int('scheduler_step_size', low=3, high=40),
        "gamma": trial.suggest_float('scheduler_gamma', low=0.01, high=0.9),
    }
    scheduler_fn = torch.optim.lr_scheduler.StepLR
    optimizer_name = trial.suggest_categorical(
        'optimizer_fn',
        [
            "Adam",
            "AdamW",
            "Adamax",
            "Adadelta",
            "Adagrad",
            "SGD",
            "RMSprop",
            "Rprop"
        ]
    )
    # Resolve the optimizer name to the torch.optim class.
    optimizer_fn = getattr(torch.optim, optimizer_name)

    n_d = trial.suggest_int('n_d', low=1, high=32)
    n_a = trial.suggest_int('n_a', low=1, high=32)
    n_steps = trial.suggest_int('n_steps', low=3, high=40)
    gamma = trial.suggest_float('gamma', low=0.01, high=0.9)
    lambda_sparse = trial.suggest_float('lambda_sparse', low=1e-6, high=1e-1)

    regressor = TabNetRegressor(
        optimizer_params=optimizer_params,
        scheduler_params=scheduler_params,
        scheduler_fn=scheduler_fn,
        optimizer_fn=optimizer_fn,
        n_d=n_d,
        n_a=n_a,
        n_steps=n_steps,
        gamma=gamma,
        lambda_sparse=lambda_sparse,
        verbose=1,
        device_name='cuda',
    )

    # pytorch-tabnet expects NumPy arrays, while the category encoders
    # (return_df=True) produce pandas DataFrames.  np.asarray is a no-op if
    # the features are already arrays.
    train_features = np.asarray(X_train)
    test_features = np.asarray(X_test)

    batch_size = trial.suggest_int('batch_size', low=256, high=1024, step=256)

    regressor.fit(
        X_train=train_features,
        y_train=y_train,
        eval_set=[(test_features, y_test)],
        patience=15,
        max_epochs=100,
        batch_size=batch_size,
        eval_metric=['mse'],
    )

    # Predict once and reuse the result for every metric.
    predictions = regressor.predict(test_features)
    r2 = r2_score(y_test, predictions)
    mse = mean_squared_error(y_test, predictions)
    rmse = root_mean_squared_error(y_test, predictions)
    mae = mean_absolute_error(y_test, predictions)

    return r2, mse, rmse, mae

In [382]:
# Re-train with the best trial's hyperparameters; yields (r2, mse, rmse, mae).
detailed_objective(study.best_trial)









In [311]:
# Optimization history of the study.
# Stored as `plt_history` rather than `plt`, so the `matplotlib.pyplot as plt`
# alias imported at the top of the notebook is not overwritten.
plt_history = optuna.visualization.plot_optimization_history(study)
plt_history.update_layout(
    width=1000,
    height=1000,
    title_font_size=36,
    legend_font_size=24,
    font_size=18,
    font_family='Times New Roman'
)



In [None]:
# Parallel-coordinate view restricted to three hyperparameters.
plt_parallel = optuna.visualization.plot_parallel_coordinate(
    study,
    params=['optimizer_fn', 'scheduler_step_size', 'lambda_sparse'],
)
# Layout overrides, applied in the listed order.
plt_parallel.update_layout({
    'margin': {'b': 150},
    'width': 1100,
    'height': 750,
    'title_font_size': 36,
    'legend_font_size': 24,
    'font_size': 24,
    'font_family': 'Times New Roman',
    'title': 'Notable Parameter Parallel Coordinate',
})



In [284]:
# Slice plot of the parameter relationships in the study.
optuna.visualization.plot_slice(study)



In [326]:
# Contour plots for three selected hyperparameters.
plt_contour = optuna.visualization.plot_contour(
    study,
    params=['optimizer_fn', 'scheduler_step_size', 'lambda_sparse'],
)
# Layout overrides, applied in the listed order.
plt_contour.update_layout({
    'width': 2000,
    'height': 2000,
    'title_font_size': 36,
    'legend_font_size': 24,
    'font_size': 18,
    'font_family': 'Times New Roman',
})



In [325]:
# Contour plot of optimizer choice against scheduler step size.
plt_contour_optim_sched = optuna.visualization.plot_contour(
    study,
    params=['optimizer_fn', 'scheduler_step_size'],
)
# Layout overrides, applied in the listed order.
plt_contour_optim_sched.update_layout({
    'width': 1000,
    'height': 1000,
    'title_font_size': 36,
    'legend_font_size': 24,
    'font_size': 22,
    'font_family': 'Times New Roman',
    'title': 'Relationship between Optimizer Function and Scheduler Step Size',
})



In [346]:
from plotly.subplots import make_subplots

# Two stacked panels, one per contour plot.
fig = make_subplots(
    rows=2,
    cols=1,
    subplot_titles=(
        "Optimizer Function and Lambda Sparse",
        "Scheduler Step Size and Lambda Sparse",
    ),
)

fig_contour_optim = optuna.visualization.plot_contour(
    study, params=['optimizer_fn', 'lambda_sparse']
)
fig_contour_sched = optuna.visualization.plot_contour(
    study, params=['scheduler_step_size', 'lambda_sparse']
)

# Copy the traces of each contour figure into its own row.
for row_number, contour_fig in enumerate((fig_contour_optim, fig_contour_sched), start=1):
    for trace in contour_fig.data:
        fig.add_trace(trace, row=row_number, col=1)

# Figure-wide size, title and font settings, applied in the listed order.
fig.update_layout({
    'width': 1500,
    'height': 1200,
    'title': 'Relationship between Lambda Sparse and Other Notable Hyperparameters',
    'title_font_size': 46,
    'legend_font_size': 24,
    'font': {
        'size': 32,
        'family': 'Times New Roman',
    },
})

# The subplot titles are layout annotations; set their font size explicitly.
for annotation in fig.layout.annotations:
    annotation.font = dict(size=32)

# Label the axes of each panel.
for row_number, y_title in enumerate(("optimizer_fn", "scheduler_step_size"), start=1):
    fig.update_xaxes(title_text="lambda_sparse", row=row_number, col=1)
    fig.update_yaxes(title_text=y_title, row=row_number, col=1)

fig.show()




In [316]:
# Hyperparameter importance plot for the study.
plt_importance = optuna.visualization.plot_param_importances(study)
# Layout overrides, applied in the listed order.
plt_importance.update_layout({
    'width': 1000,
    'height': 1000,
    'title_font_size': 36,
    'font_size': 24,
    'font_family': 'Times New Roman',
})



In [None]:
# Rank plot of the study.  The original call passed an undefined name
# (`para`), which raises NameError; with no `params` argument Optuna plots
# every hyperparameter.
plt_rank = optuna.visualization.plot_rank(study)
plt_rank.update_layout(
    width=3000,
    height=3000,
    title_font_size=36,
    font_size=24,
    font_family='Times New Roman'
)



In [288]:
# Timeline of the study's trials.
optuna.visualization.plot_timeline(study)



In [295]:
# Empirical distribution function (EDF) plot of the study, drawn with
# Optuna's matplotlib backend.
optuna.visualization.matplotlib.plot_edf(study)





