In [None]:
import re
import sys
import glob
import math
import optuna
import numpy as np
import pandas as pd
import seaborn as sns
from time import ctime
import matplotlib.pyplot as plt
from sklearn.metrics import mean_absolute_error
from sklearn.metrics import r2_score
import lightgbm as lgb
from sklearn.model_selection import KFold
from pandas.plotting import register_matplotlib_converters
register_matplotlib_converters()
sns.set_style('darkgrid')
import warnings
warnings.filterwarnings("ignore")

In [None]:
train_frags = glob.glob("./train_fft/*")
test_frags = glob.glob("./test_fft/*")
train = pd.read_csv("train.csv")


def _segment_id(path):
    """Return the segment id encoded in a fragment file path.

    The id is the file name with its directory part and a trailing ``.csv``
    removed. Both ``/`` and the backslash are treated as separators, because
    ``glob`` joins matches with the OS-native separator and a fixed
    ``'./train_fft/'`` prefix search therefore fails on Windows.
    """
    file_name = path.replace("\\", "/").rsplit("/", 1)[-1]
    if file_name.endswith(".csv"):
        file_name = file_name[:-len(".csv")]
    return file_name


# Index the targets by segment id once, instead of re-scanning `train` for
# every fragment and converting a one-element array to a scalar.
t2e_by_segment = train.set_index("segment_id")["time_to_eruption"]
if not t2e_by_segment.index.is_unique:
    raise ValueError("train.csv contains duplicated segment_id values")

# Y[k] is the target of train_frags[k]; a fragment whose id is missing from
# train.csv raises KeyError here.
# NOTE(review): later cells pair Y with the rows of pca_30.csv by position, so
# the feature file is presumably written in this same glob order - confirm.
train_ids = [int(_segment_id(path)) for path in train_frags]
Y = pd.Series(
    t2e_by_segment.loc[train_ids].to_numpy().astype(np.int64),
    index=np.arange(len(train_frags)),
)

# Test ids stay strings (they are only written back out to the submission);
# building the Series in one go avoids assigning strings into an int64 Series.
test_id = pd.Series(
    [_segment_id(path) for path in test_frags],
    index=np.arange(len(test_frags)),
    dtype=object,
)


In [None]:
def objective(trial):
    """Optuna objective: mean 5-fold cross-validated MAE of a LightGBM regressor.

    Samples a LightGBM configuration from ``trial``, trains one model per
    fold on the features read from ``pca_30.csv`` and returns the mean
    validation MAE. As a side effect the fold-averaged test predictions are
    written to ``fold_submission<trial.number>.csv`` and the mean validation
    R^2 is stored on the trial as the user attribute ``"r2"``.

    Relies on the notebook-level names ``train_frags``, ``test_frags``, ``Y``
    and ``test_id``.

    Parameters
    ----------
    trial : optuna trial object
        Provides ``suggest_int`` / ``suggest_float``, ``number`` and
        ``set_user_attr``.

    Returns
    -------
    float
        Mean of the per-fold validation mean absolute errors.

    Raises
    ------
    ValueError
        If ``pca_30.csv`` has fewer rows than there are train plus test
        fragments, in which case the train and test slices would overlap.
    """
    # The suggest_* names ("nleaves", "bagfrac", ...) are the keys stored in
    # the study database, so they must stay exactly as they are.
    params = {
        "objective": "regression",
        "metric": "mae",
        "verbosity": -1,
        "boosting_type": "gbdt",
        "num_iterations": 1000,
        "early_stopping_round": 5,
        "n_jobs": -1,
        'num_leaves': trial.suggest_int("nleaves", 32, 256),
        'bagging_fraction': trial.suggest_float('bagfrac', 0.7, 1.0),
        'bagging_freq': trial.suggest_int("bagfreq", 1, 10),
        'feature_fraction': trial.suggest_float('featfrac', 0.7, 1.0),
        'lambda_l1': trial.suggest_float("lambda_l1", 1e-6, 10.0, log=True),
        'lambda_l2': trial.suggest_float("lambda_l2", 1e-6, 10.0, log=True),
        'min_child_samples': trial.suggest_int("minchild", 1, 100),
    }

    all_pca = pd.read_csv('pca_30.csv')

    n_train = len(train_frags)
    n_test = len(test_frags)
    if len(all_pca) < n_train + n_test:
        raise ValueError(
            "pca_30.csv has %d rows but %d train + %d test fragments were found"
            % (len(all_pca), n_train, n_test)
        )

    # NOTE(review): assumes the first rows of pca_30.csv are the train
    # fragments (in train_frags order) and the last rows the test fragments
    # (in test_frags order) - confirm against the code that wrote the file.
    X = all_pca.iloc[:n_train]
    # tail() rather than [-n_test:]: a negative-zero slice would select the
    # whole frame when there are no test fragments.
    X_test = all_pca.tail(n_test)

    # `verbose_eval` was removed from lgb.train in LightGBM 4.0; silence the
    # per-iteration log through a callback where one exists and fall back to
    # the legacy keyword on releases that predate `log_evaluation`.
    if hasattr(lgb, "log_evaluation"):
        train_kwargs = {"callbacks": [lgb.log_evaluation(period=0)]}
    else:
        train_kwargs = {"verbose_eval": False}

    n_fold = 5
    mae, r2 = [], []
    predicted_times = np.zeros(len(X_test))
    cv = KFold(n_splits=n_fold, shuffle=True, random_state=42)

    for train_index, valid_index in cv.split(X):
        X_train = X.iloc[train_index, :]
        X_valid = X.iloc[valid_index, :]

        Y_train = Y.iloc[train_index]
        Y_valid = Y.iloc[valid_index]

        dtrain = lgb.Dataset(X_train, label=Y_train)
        dval = lgb.Dataset(X_valid, label=Y_valid)

        model = lgb.train(params, dtrain, valid_sets=[dtrain, dval], **train_kwargs)

        y_pred = model.predict(X_valid)
        mae.append(mean_absolute_error(Y_valid, y_pred))
        r2.append(r2_score(Y_valid, y_pred))

        if len(X_test):
            predicted_times += model.predict(X_test)

    # Average the per-fold test predictions.
    predicted_times /= n_fold

    filename = 'fold_submission' + str(trial.number) + '.csv'
    submission = pd.DataFrame({
        "segment_id": test_id,
        "time_to_eruption": predicted_times,
    })
    submission.to_csv(filename, index=False)

    # Keep the secondary metric with the trial instead of discarding it.
    trial.set_user_attr("r2", float(np.mean(r2)))

    return np.mean(mae)



In [None]:
# Create the SQLite-backed study (an existing study with this name is reused
# because load_if_exists is set), then run the hyper-parameter search.
study_name = 'lgbm_study'
study = optuna.create_study(
    study_name=study_name,
    storage='sqlite:///lgbm_study.db',
    load_if_exists=True,
    direction="minimize",
)
study.optimize(objective, n_trials=1000)