In [4]:
import datetime
import gc
import lightgbm as lgb
import matplotlib.pyplot as plt
import numpy as np
import os
import pandas as pd
import seaborn as sns
import time
import warnings

from contextlib import contextmanager
from pandas.core.common import SettingWithCopyWarning
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import KFold, StratifiedKFold

warnings.simplefilter(action='ignore', category=SettingWithCopyWarning)
warnings.simplefilter(action='ignore', category=FutureWarning)


FEATS_EXCLUDED = [
    "ID","tradeMoney"
    ]
@contextmanager
def timer(title):
    t0 = time.time()
    yield
    print("{} - done in {:.0f}s".format(title, time.time() - t0))

# rmse
def rmse(y_true, y_pred):
    return np.sqrt(mean_squared_error(y_true, y_pred))

# Display/plot feature importance
def display_importances(feature_importance_df_):
    cols = feature_importance_df_[["feature", "importance"]].groupby("feature").mean().sort_values(by="importance", ascending=False)[:40].index
    best_features = feature_importance_df_.loc[feature_importance_df_.feature.isin(cols)]

    plt.figure(figsize=(8, 10))
    sns.barplot(x="importance", y="feature", data=best_features.sort_values(by="importance", ascending=False))
    plt.title('LightGBM Features (avg over folds)')
    plt.tight_layout()
    plt.savefig('lgbm_importances.png')

# reduce memory
def reduce_mem_usage(df, verbose=True):
    numerics = ['int16', 'int32', 'int64', 'float16', 'float32', 'float64']
    start_mem = df.memory_usage().sum() / 1024**2
    for col in df.columns:
        col_type = df[col].dtypes
        if col_type in numerics:
            c_min = df[col].min()
            c_max = df[col].max()
            if str(col_type)[:3] == 'int':
                if c_min > np.iinfo(np.int8).min and c_max < np.iinfo(np.int8).max:
                    df[col] = df[col].astype(np.int8)
                elif c_min > np.iinfo(np.int16).min and c_max < np.iinfo(np.int16).max:
                    df[col] = df[col].astype(np.int16)
                elif c_min > np.iinfo(np.int32).min and c_max < np.iinfo(np.int32).max:
                    df[col] = df[col].astype(np.int32)
                elif c_min > np.iinfo(np.int64).min and c_max < np.iinfo(np.int64).max:
                    df[col] = df[col].astype(np.int64)
            else:
                if c_min > np.finfo(np.float16).min and c_max < np.finfo(np.float16).max:
                    df[col] = df[col].astype(np.float16)
                elif c_min > np.finfo(np.float32).min and c_max < np.finfo(np.float32).max:
                    df[col] = df[col].astype(np.float32)
                else:
                    df[col] = df[col].astype(np.float64)

    end_mem = df.memory_usage().sum() / 1024**2
    print('Memory usage after optimization is: {:.2f} MB'.format(end_mem))
    print('Decreased by {:.1f}%'.format(100 * (start_mem - end_mem) / start_mem))

    return df
    




In [5]:
import lightgbm as lgb
import numpy as np
import sklearn.datasets
import sklearn.metrics
from sklearn.model_selection import train_test_split

import optuna

FEATS_EXCLUDED = [
    "ID","tradeMoney"
    ]


def rmse(y_true, y_pred):
    return np.sqrt(mean_squared_error(y_true, y_pred))

def objective(trial):
        num_folds = 11
        
        train_x, train_y = train_df[feats], train_df['tradeMoney']
        #data, target = sklearn.datasets.load_breast_cancer(return_X_y=True)
        '''dtrain = lgb.Dataset(train_x, label=train_y)'''
             
        lgbm_train = lgb.Dataset(train_x,
                                 train_y,
                                  free_raw_data=False
                                  )

        params = {'objective': 'regression',
                  'metric': 'rmse',
                  'verbosity': -1,
                  "learning_rate": trial.suggest_uniform('learning_rate', 0.001, 1),
                  
                  'device': 'gpu',
                  'gpu_platform_id': 1,
                  'gpu_device_id': 0,
                  'num_thread' : 1,
                  'sparse_threshold' : 1,
                  
                  'seed': 2779,
                  #'boosting_type': trial.suggest_categorical('boosting', ['gbdt',  'goss']),
                  'num_leaves': trial.suggest_int('num_leaves', 16, 200),
                  #'colsample_bytree': trial.suggest_uniform('colsample_bytree', 0.001, 1),
                  'subsample': trial.suggest_uniform('subsample', 0.001, 1),
                  'max_depth': trial.suggest_int('max_depth', 5, 20),
                  'reg_alpha': trial.suggest_uniform('reg_alpha', 0, 10),
                  # 'reg_lambda': trial.suggest_uniform('reg_lambda', 0, 10),
                  #'min_split_gain': trial.suggest_uniform('min_split_gain', 0, 10),
                  #'min_child_weight': trial.suggest_uniform('min_child_weight', 0, 45),
                  #'min_data_in_leaf': trial.suggest_int('min_data_in_leaf', 16, 64)
                  
                  'min_child_samples' : trial.suggest_int('min_child_samples', 1, 200),
                  #'num_iterations': trial.suggest_uniform('num_iterations', 1, 5000),
                  'feature_fraction' : trial.suggest_uniform('feature_fraction', 0.001, 1),
                  #'random_state': trial.suggest_int('random_state', 1, 5000),
                  #'max_bin' :  trial.suggest_int('random_state', 1, 256)
                  }

        '''if params['boosting_type'] == 'dart':
            params['drop_rate'] = trial.suggest_loguniform('drop_rate', 1e-8, 1.0)
            params['skip_drop'] = trial.suggest_loguniform('skip_drop', 1e-8, 1.0)
        if params['boosting_type'] == 'goss':
            params['top_rate'] = trial.suggest_uniform('top_rate', 0.0, 1.0)
            params['other_rate'] = trial.suggest_uniform('other_rate', 0.0, 1.0 - params['top_rate'])'''


        folds = StratifiedKFold(n_splits= num_folds, shuffle=True, random_state=47)
        #folds = KFold(n_splits= num_folds, shuffle=True, random_state=47)


        clf = lgb.cv(
                        params,
                        lgbm_train,
                        metrics=['rmse'],
                        nfold=num_folds,
                        folds=folds.split(train_df[feats], train_y),
                        num_boost_round=10000,
                        early_stopping_rounds= 500,
                        verbose_eval=100,
                        seed=47
            
                         )
        gc.collect()
        return clf['rmse-mean'][-1]

        
        '''gbm = lgb.train(params, dtrain)
        preds = gbm.predict(train_x)
        pred_labels = np.rint(preds)
        error =  rmse(train_y,preds)
        #accuracy = sklearn.metrics.accuracy_score(test_y, pred_labels)
        return error'''

In [None]:
with timer("split train & test"):
        train_df = reduce_mem_usage(pd.read_csv('train_clean4.csv',encoding='gbk'))
        test_df = reduce_mem_usage(pd.read_csv('test_clean4.csv',encoding='gbk'))
        
        train_df = train_df[train_df.tradeMoney < 100000]
        train_df = train_df[train_df.tradeMoney > 500]

        train_df = train_df[train_df.area < 2500]
       
        feature_importance_df = pd.DataFrame()
        feats = [f for f in train_df.columns if f not in FEATS_EXCLUDED]
        
        study = optuna.create_study()
        study.optimize(objective, n_trials=1000)

        print('Number of finished trials: {}'.format(len(study.trials)))

        print('Best trial:')
        trial = study.best_trial

        print('  Value: {}'.format(trial.value))

        print('  Params: ')
        for key, value in trial.params.items():
            print('    {}: {}'.format(key, value))
        
        hist_df = study.trials_dataframe()
        hist_df.to_csv("optuna_result_lgbm.csv")

        del df
        gc.collect()

Memory usage after optimization is: 8.02 MB
Decreased by 79.9%



invalid value encountered in less


invalid value encountered in less



Memory usage after optimization is: 0.49 MB
Decreased by 79.5%



The least populated class in y has only 1 members, which is too few. The minimum number of members in any class cannot be less than n_splits=11.



[100]	cv_agg's rmse: 1988.03 + 550.864
[200]	cv_agg's rmse: 2000.33 + 533.573
[300]	cv_agg's rmse: 2015.66 + 529.546
[400]	cv_agg's rmse: 2022.48 + 529.901
