In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

from time import time
from time import ctime

import warnings
warnings.filterwarnings("ignore")
from tqdm import tqdm_notebook
from tqdm import tqdm

import joblib
from joblib import Parallel, delayed
import multiprocessing
# Leave one core free for the rest of the system, but never drop below one
# worker: on a single-core machine cpu_count() - 1 would be 0, which joblib's
# Parallel rejects as an n_jobs value.
num_cores = max(1, multiprocessing.cpu_count() - 1)

from sklearn.metrics import mean_absolute_error
from sklearn.metrics import r2_score
from catboost import CatBoostRegressor, Pool
from sklearn.model_selection import KFold
import matplotlib.pyplot as plt
import optuna
# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
# Read the semicolon-separated train/test CSV files and use the
# 'Unnamed: 0' column of each as the frame index.
train = pd.read_csv(
    '/home/dominique/Projects/predict_volcanic_eruptions/tfresh/train.csv',
    sep=';',
).set_index('Unnamed: 0')
test = pd.read_csv(
    '/home/dominique/Projects/predict_volcanic_eruptions/tfresh/test.csv',
    sep=';',
).set_index('Unnamed: 0')

In [None]:
# Replace every missing value with 0 in both frames.
train = train.fillna(0)
test = test.fillna(0)

In [None]:
def objective(trial):
    """Optuna objective: mean K-fold validation MAE of a CatBoost regressor.

    Samples ``eta``, ``subsample`` and ``l2_leaf_reg`` from ``trial``, fits one
    ``CatBoostRegressor`` per fold of a shuffled 3-fold ``KFold`` over the
    module-level ``train`` frame, and averages the per-fold predictions made
    on the module-level ``test`` frame.

    Side effects:
        * Writes ``submission<trial.number>.csv`` to the current working
          directory with the columns ``segment_id`` (index of ``test``) and
          ``time_to_eruption`` (fold-averaged prediction).
        * Stores the mean validation R^2 on the trial as user attribute ``r2``.

    Args:
        trial: ``optuna.trial.Trial`` used to sample the hyper-parameters.

    Returns:
        The mean of the per-fold validation MAE values.
    """
    Y = train['time_to_eruption']
    X = train.drop(['time_to_eruption'], axis=1)

    n_fold = 3
    cv = KFold(n_splits=n_fold, shuffle=True, random_state=42)

    # The test features are identical for every fold, so wrap them only once.
    test_dataset = Pool(data=test)
    cat_prediction = np.zeros(len(test))
    mae, r2 = [], []

    PARAMS = {
        'random_seed': 42,
        'eval_metric': 'MAE',  # Also used as the eval metric for competition
        'iterations': 100,
        'eta': trial.suggest_float('eta', 0.03, 0.1),
        'subsample': trial.suggest_float('subsample', 0.7, 1.0),
        # The Optuna parameter name ('lambda_l2') differs from the CatBoost
        # key; it is kept so trials already stored under that name in the
        # study remain comparable.
        'l2_leaf_reg': trial.suggest_float("lambda_l2", 1e-3, 10.0, log=True),
    }

    for train_index, valid_index in cv.split(X):
        X_train, X_valid = X.iloc[train_index, :], X.iloc[valid_index, :]
        Y_train, Y_valid = Y.iloc[train_index], Y.iloc[valid_index]

        fold_model = CatBoostRegressor(**PARAMS, thread_count=-1)

        train_dataset = Pool(data=X_train, label=Y_train)
        eval_dataset = Pool(data=X_valid, label=Y_valid)

        # NOTE(review): use_best_model selects the iteration on the same fold
        # that is scored below, so the reported MAE is likely optimistic.
        fold_model.fit(train_dataset,
                       use_best_model=True,
                       verbose=False,
                       eval_set=eval_dataset)

        y_pred = fold_model.predict(Pool(data=X_valid))

        mae.append(mean_absolute_error(Y_valid, y_pred))
        r2.append(r2_score(Y_valid, y_pred))

        cat_prediction += fold_model.predict(test_dataset)

    cat_prediction /= n_fold

    # Keep the secondary metric with the trial instead of discarding it.
    trial.set_user_attr('r2', float(np.mean(r2)))

    submission = pd.DataFrame({
        'segment_id': test.index,
        'time_to_eruption': cat_prediction,
    })
    filename = 'submission' + str(trial.number) + '.csv'
    submission.to_csv(filename, header=True, index=False)

    return np.mean(mae)

In [None]:
# Create the SQLite-backed study (or reopen it when a study with this name
# already exists in the storage) and run the hyper-parameter search.
study_name = 'catboost_study'
study = optuna.create_study(
    study_name=study_name,
    storage='sqlite:///catboost_study.db',
    direction="minimize",
    load_if_exists=True,
)
study.optimize(objective, n_trials=1000)