In [1]:
import datetime
import gc
import matplotlib.pyplot as plt
import numpy as np
import os
import pandas as pd
import seaborn as sns
import time
import warnings

from contextlib import contextmanager
from pandas.core.common import SettingWithCopyWarning
from sklearn.metrics import mean_squared_error,r2_score
from sklearn.model_selection import KFold, StratifiedKFold

warnings.simplefilter(action='ignore', category=SettingWithCopyWarning)
warnings.simplefilter(action='ignore', category=FutureWarning)


FEATS_EXCLUDED = [
    "ID","tradeMoney"
    ]
@contextmanager
def timer(title):
    t0 = time.time()
    yield
    print("{} - done in {:.0f}s".format(title, time.time() - t0))

# rmse
def rmse(y_true, y_pred):
    return np.sqrt(mean_squared_error(y_true, y_pred))

# One-hot encoding for categorical columns with get_dummies
def one_hot_encoder(df, nan_as_category = True):
    original_columns = list(df.columns)
    categorical_columns = [col for col in df.columns if df[col].dtype == 'object']
    df = pd.get_dummies(df, columns= categorical_columns, dummy_na= nan_as_category)
    new_columns = [c for c in df.columns if c not in original_columns]
    return df, new_columns
    
# Display/plot feature importance
def display_importances(feature_importance_df_):
    cols = feature_importance_df_[["feature", "importance"]].groupby("feature").mean().sort_values(by="importance", ascending=False)[:40].index
    best_features = feature_importance_df_.loc[feature_importance_df_.feature.isin(cols)]

    plt.figure(figsize=(8, 10))
    sns.barplot(x="importance", y="feature", data=best_features.sort_values(by="importance", ascending=False))
    plt.title('LightGBM Features (avg over folds)')
    plt.tight_layout()
    plt.savefig('lgbm_importances.png')

# reduce memory
def reduce_mem_usage(df, verbose=True):
    numerics = ['int16', 'int32', 'int64', 'float16', 'float32', 'float64']
    start_mem = df.memory_usage().sum() / 1024**2
    for col in df.columns:
        col_type = df[col].dtypes
        if col_type in numerics:
            c_min = df[col].min()
            c_max = df[col].max()
            if str(col_type)[:3] == 'int':
                if c_min > np.iinfo(np.int8).min and c_max < np.iinfo(np.int8).max:
                    df[col] = df[col].astype(np.int8)
                elif c_min > np.iinfo(np.int16).min and c_max < np.iinfo(np.int16).max:
                    df[col] = df[col].astype(np.int16)
                elif c_min > np.iinfo(np.int32).min and c_max < np.iinfo(np.int32).max:
                    df[col] = df[col].astype(np.int32)
                elif c_min > np.iinfo(np.int64).min and c_max < np.iinfo(np.int64).max:
                    df[col] = df[col].astype(np.int64)
            else:
                if c_min > np.finfo(np.float16).min and c_max < np.finfo(np.float16).max:
                    df[col] = df[col].astype(np.float16)
                elif c_min > np.finfo(np.float32).min and c_max < np.finfo(np.float32).max:
                    df[col] = df[col].astype(np.float32)
                else:
                    df[col] = df[col].astype(np.float64)

    end_mem = df.memory_usage().sum() / 1024**2
    print('Memory usage after optimization is: {:.2f} MB'.format(end_mem))
    print('Decreased by {:.1f}%'.format(100 * (start_mem - end_mem) / start_mem))

    return df
    




In [2]:
%%time
train_df = reduce_mem_usage(pd.read_csv('train_clean4.csv',encoding='gbk'))
test_df = reduce_mem_usage(pd.read_csv('test_clean4.csv',encoding='gbk'))

train_df = train_df[train_df.tradeMoney < 100000]
train_df = train_df[train_df.tradeMoney > 500]

train_df = train_df[train_df.area < 2500]

Memory usage after optimization is: 8.02 MB
Decreased by 79.9%
Memory usage after optimization is: 0.49 MB
Decreased by 79.5%
Wall time: 1.88 s




In [11]:
import xgboost as xgb
import numpy as np
import sklearn.datasets
import sklearn.metrics
from sklearn.model_selection import train_test_split

import optuna

def rmse(y_true, y_pred):
    return np.sqrt(mean_squared_error(y_true, y_pred))

def objective(trial):
    
        data = train_df[feats]
        target = train_df['tradeMoney']
        train_x, test_x, train_y, test_y = train_test_split(data, target, test_size=0.25)
        dtrain = xgb.DMatrix(train_x, label=train_y)
        dtest = xgb.DMatrix(test_x, label=test_y)
        

        param = {'objective': 'reg:linear',
                  'eval_metric': 'rmse',
                  'verbosity': -1,
                  "learning_rate": trial.suggest_uniform('learning_rate', 0.001, 1),
                  
                  'silent': 1,
                  'booster': trial.suggest_categorical('booster', ['gbtree','dart']),
                  'lambda': trial.suggest_loguniform('lambda', 1e-8, 1.0),
                  'alpha': trial.suggest_loguniform('alpha', 1e-8, 1.0),
                  
                  'gpu_id': 0,
                  'tree_method': 'gpu_hist',
                  'max_bin': 16,
                  #'updater' : 'grow_gpu_hist',

                
                  'seed': 326,
                  'boosting_type': trial.suggest_categorical('boosting', ['gbdt',  'goss']),
                  'num_leaves': trial.suggest_int('num_leaves', 16, 64),
                  'colsample_bytree': trial.suggest_uniform('colsample_bytree', 0.001, 1),
                  'subsample': trial.suggest_uniform('subsample', 0.001, 1),
                  'max_depth': trial.suggest_int('max_depth', 5, 20),
                  #'reg_alpha': trial.suggest_uniform('reg_alpha', 0, 10),
                  'reg_lambda': trial.suggest_uniform('reg_lambda', 0, 10),
                  #'min_split_gain': trial.suggest_uniform('min_split_gain', 0, 10),
                  #'min_child_weight': trial.suggest_uniform('min_child_weight', 0, 45),
                  #'min_data_in_leaf': trial.suggest_int('min_data_in_leaf', 16, 64)
                  }


        
        
        if param['booster'] == 'gbtree' or param['booster'] == 'dart':
            param['max_depth'] = trial.suggest_int('max_depth', 1, 9)
            param['eta'] = trial.suggest_loguniform('eta', 1e-8, 1.0)
            param['gamma'] = trial.suggest_loguniform('gamma', 1e-8, 1.0)
            param['grow_policy'] = trial.suggest_categorical('grow_policy', ['depthwise', 'lossguide'])
        if param['booster'] == 'dart':
            param['sample_type'] = trial.suggest_categorical('sample_type', ['uniform', 'weighted'])
            param['normalize_type'] = trial.suggest_categorical('normalize_type', ['tree', 'forest'])
            param['rate_drop'] = trial.suggest_loguniform('rate_drop', 1e-8, 1.0)
            param['skip_drop'] = trial.suggest_loguniform('skip_drop', 1e-8, 1.0)

        gbm = xgb.train(param, dtrain)
        preds = gbm.predict(dtest)
        pred_labels = np.rint(preds)
        error =  rmse(test_y,pred_labels)
        #error =  1- r2_score(test_y,pred_labels)
        
        return error



In [None]:
with timer("split train & test"):
        '''train_df = reduce_mem_usage(pd.read_csv('train_clean.csv'))
        test_df = reduce_mem_usage(pd.read_csv('test_clean.csv'))'''
       
        feature_importance_df = pd.DataFrame()
        feats = [f for f in train_df.columns if f not in FEATS_EXCLUDED]
        
        study = optuna.create_study()
        study.optimize(objective, n_trials=1000)

        print('Number of finished trials: {}'.format(len(study.trials)))

        print('Best trial:')
        trial = study.best_trial

        print('  Value: {}'.format(trial.value))

        print('  Params: ')
        for key, value in trial.params.items():
            print('    {}: {}'.format(key, value))
        
        hist_df = study.trials_dataframe()
        hist_df.to_csv("optuna_result_xgbm.csv")

        del df
        gc.collect()

[I 2019-05-04 16:03:28,935] Finished a trial resulted in value: 4325.4375. Current best value is 4325.4375 with parameters: {'learning_rate': 0.06138479314724347, 'booster': 'dart', 'lambda': 8.449791040976869e-07, 'alpha': 0.006591511153106916, 'boosting': 'goss', 'num_leaves': 64, 'colsample_bytree': 0.17330989435610533, 'subsample': 0.9916028193873515, 'max_depth': 14, 'reg_lambda': 3.1842599432376204, 'eta': 0.29097853069728014, 'gamma': 0.29996202441182496, 'grow_policy': 'depthwise', 'sample_type': 'weighted', 'normalize_type': 'forest', 'rate_drop': 2.3019765108605859e-07, 'skip_drop': 0.002106456175938353}.
[I 2019-05-04 16:03:33,672] Finished a trial resulted in value: 2237.461181640625. Current best value is 2237.461181640625 with parameters: {'learning_rate': 0.3180224548421979, 'booster': 'gbtree', 'lambda': 0.7619224947308133, 'alpha': 0.332401416792678, 'boosting': 'gbdt', 'num_leaves': 18, 'colsample_bytree': 0.4887069720218272, 'subsample': 0.19164379410931862, 'max_dep

[I 2019-05-04 16:04:48,623] Finished a trial resulted in value: 2809.40380859375. Current best value is 2033.48583984375 with parameters: {'learning_rate': 0.5820147549415967, 'booster': 'gbtree', 'lambda': 0.014015602972770155, 'alpha': 2.105448880451939e-08, 'boosting': 'gbdt', 'num_leaves': 26, 'colsample_bytree': 0.8167610781202721, 'subsample': 0.2677891352781538, 'max_depth': 18, 'reg_lambda': 6.624365273294457, 'eta': 2.1947957911316236e-06, 'gamma': 1.1322523815144847e-08, 'grow_policy': 'lossguide'}.
[I 2019-05-04 16:04:49,435] Finished a trial resulted in value: 2484.859130859375. Current best value is 2033.48583984375 with parameters: {'learning_rate': 0.5820147549415967, 'booster': 'gbtree', 'lambda': 0.014015602972770155, 'alpha': 2.105448880451939e-08, 'boosting': 'gbdt', 'num_leaves': 26, 'colsample_bytree': 0.8167610781202721, 'subsample': 0.2677891352781538, 'max_depth': 18, 'reg_lambda': 6.624365273294457, 'eta': 2.1947957911316236e-06, 'gamma': 1.1322523815144847e-08

[I 2019-05-04 16:06:46,266] Finished a trial resulted in value: 2048.273681640625. Current best value is 1934.1112060546875 with parameters: {'learning_rate': 0.24189945854996434, 'booster': 'gbtree', 'lambda': 3.7709696657671657e-06, 'alpha': 5.8712464328072807e-08, 'boosting': 'gbdt', 'num_leaves': 16, 'colsample_bytree': 0.8527633227996427, 'subsample': 0.6463647524161322, 'max_depth': 19, 'reg_lambda': 7.322784829382216, 'eta': 1.341325246713775e-06, 'gamma': 0.19784495919025932, 'grow_policy': 'lossguide'}.
[I 2019-05-04 16:07:01,132] Finished a trial resulted in value: 2406.8837890625. Current best value is 1934.1112060546875 with parameters: {'learning_rate': 0.24189945854996434, 'booster': 'gbtree', 'lambda': 3.7709696657671657e-06, 'alpha': 5.8712464328072807e-08, 'boosting': 'gbdt', 'num_leaves': 16, 'colsample_bytree': 0.8527633227996427, 'subsample': 0.6463647524161322, 'max_depth': 19, 'reg_lambda': 7.322784829382216, 'eta': 1.341325246713775e-06, 'gamma': 0.19784495919025

[I 2019-05-04 16:08:12,230] Finished a trial resulted in value: 5192.07470703125. Current best value is 1839.9019775390625 with parameters: {'learning_rate': 0.3195593845405922, 'booster': 'dart', 'lambda': 0.0003380049383428135, 'alpha': 3.9686105020244555e-07, 'boosting': 'goss', 'num_leaves': 64, 'colsample_bytree': 0.5225571507004209, 'subsample': 0.8450476547813186, 'max_depth': 10, 'reg_lambda': 1.9663642196299582, 'eta': 0.3851408015617113, 'gamma': 0.003668834520741664, 'grow_policy': 'depthwise', 'sample_type': 'weighted', 'normalize_type': 'forest', 'rate_drop': 0.00018784608783659206, 'skip_drop': 0.5180619371939924}.
[I 2019-05-04 16:08:15,033] Finished a trial resulted in value: 3209.3798828125. Current best value is 1839.9019775390625 with parameters: {'learning_rate': 0.3195593845405922, 'booster': 'dart', 'lambda': 0.0003380049383428135, 'alpha': 3.9686105020244555e-07, 'boosting': 'goss', 'num_leaves': 64, 'colsample_bytree': 0.5225571507004209, 'subsample': 0.84504765

[I 2019-05-04 16:09:00,732] Finished a trial resulted in value: 2698.93603515625. Current best value is 1839.9019775390625 with parameters: {'learning_rate': 0.3195593845405922, 'booster': 'dart', 'lambda': 0.0003380049383428135, 'alpha': 3.9686105020244555e-07, 'boosting': 'goss', 'num_leaves': 64, 'colsample_bytree': 0.5225571507004209, 'subsample': 0.8450476547813186, 'max_depth': 10, 'reg_lambda': 1.9663642196299582, 'eta': 0.3851408015617113, 'gamma': 0.003668834520741664, 'grow_policy': 'depthwise', 'sample_type': 'weighted', 'normalize_type': 'forest', 'rate_drop': 0.00018784608783659206, 'skip_drop': 0.5180619371939924}.
[I 2019-05-04 16:09:01,630] Finished a trial resulted in value: 2497.462158203125. Current best value is 1839.9019775390625 with parameters: {'learning_rate': 0.3195593845405922, 'booster': 'dart', 'lambda': 0.0003380049383428135, 'alpha': 3.9686105020244555e-07, 'boosting': 'goss', 'num_leaves': 64, 'colsample_bytree': 0.5225571507004209, 'subsample': 0.845047

[I 2019-05-04 16:11:29,109] Finished a trial resulted in value: 2114.7412109375. Current best value is 1839.9019775390625 with parameters: {'learning_rate': 0.3195593845405922, 'booster': 'dart', 'lambda': 0.0003380049383428135, 'alpha': 3.9686105020244555e-07, 'boosting': 'goss', 'num_leaves': 64, 'colsample_bytree': 0.5225571507004209, 'subsample': 0.8450476547813186, 'max_depth': 10, 'reg_lambda': 1.9663642196299582, 'eta': 0.3851408015617113, 'gamma': 0.003668834520741664, 'grow_policy': 'depthwise', 'sample_type': 'weighted', 'normalize_type': 'forest', 'rate_drop': 0.00018784608783659206, 'skip_drop': 0.5180619371939924}.
[I 2019-05-04 16:11:55,610] Finished a trial resulted in value: 2318.7275390625. Current best value is 1839.9019775390625 with parameters: {'learning_rate': 0.3195593845405922, 'booster': 'dart', 'lambda': 0.0003380049383428135, 'alpha': 3.9686105020244555e-07, 'boosting': 'goss', 'num_leaves': 64, 'colsample_bytree': 0.5225571507004209, 'subsample': 0.845047654

[I 2019-05-04 16:13:47,140] Finished a trial resulted in value: 2192.646484375. Current best value is 1838.716552734375 with parameters: {'learning_rate': 0.3707045289149264, 'booster': 'gbtree', 'lambda': 0.001166561721462739, 'alpha': 1.3162678884445367e-07, 'boosting': 'goss', 'num_leaves': 44, 'colsample_bytree': 0.9903841363138319, 'subsample': 0.6923111350700597, 'max_depth': 20, 'reg_lambda': 7.789891275581384, 'eta': 1.8888546685721775e-08, 'gamma': 0.042663659605457524, 'grow_policy': 'lossguide'}.
[I 2019-05-04 16:13:48,223] Finished a trial resulted in value: 2090.467041015625. Current best value is 1838.716552734375 with parameters: {'learning_rate': 0.3707045289149264, 'booster': 'gbtree', 'lambda': 0.001166561721462739, 'alpha': 1.3162678884445367e-07, 'boosting': 'goss', 'num_leaves': 44, 'colsample_bytree': 0.9903841363138319, 'subsample': 0.6923111350700597, 'max_depth': 20, 'reg_lambda': 7.789891275581384, 'eta': 1.8888546685721775e-08, 'gamma': 0.042663659605457524, 

[I 2019-05-04 16:15:16,283] Finished a trial resulted in value: 2322.64013671875. Current best value is 1803.4058837890625 with parameters: {'learning_rate': 0.3381497766879947, 'booster': 'gbtree', 'lambda': 0.0020253017527964786, 'alpha': 0.014183788568303723, 'boosting': 'goss', 'num_leaves': 51, 'colsample_bytree': 0.47558146997448086, 'subsample': 0.8646875650923772, 'max_depth': 15, 'reg_lambda': 5.340070953505367, 'eta': 0.609849295307442, 'gamma': 0.0035463100337004313, 'grow_policy': 'lossguide'}.
[I 2019-05-04 16:15:19,020] Finished a trial resulted in value: 2129.177490234375. Current best value is 1803.4058837890625 with parameters: {'learning_rate': 0.3381497766879947, 'booster': 'gbtree', 'lambda': 0.0020253017527964786, 'alpha': 0.014183788568303723, 'boosting': 'goss', 'num_leaves': 51, 'colsample_bytree': 0.47558146997448086, 'subsample': 0.8646875650923772, 'max_depth': 15, 'reg_lambda': 5.340070953505367, 'eta': 0.609849295307442, 'gamma': 0.0035463100337004313, 'gro

[I 2019-05-04 16:16:55,386] Finished a trial resulted in value: 2070.983154296875. Current best value is 1803.4058837890625 with parameters: {'learning_rate': 0.3381497766879947, 'booster': 'gbtree', 'lambda': 0.0020253017527964786, 'alpha': 0.014183788568303723, 'boosting': 'goss', 'num_leaves': 51, 'colsample_bytree': 0.47558146997448086, 'subsample': 0.8646875650923772, 'max_depth': 15, 'reg_lambda': 5.340070953505367, 'eta': 0.609849295307442, 'gamma': 0.0035463100337004313, 'grow_policy': 'lossguide'}.
[I 2019-05-04 16:17:05,592] Finished a trial resulted in value: 2102.160400390625. Current best value is 1803.4058837890625 with parameters: {'learning_rate': 0.3381497766879947, 'booster': 'gbtree', 'lambda': 0.0020253017527964786, 'alpha': 0.014183788568303723, 'boosting': 'goss', 'num_leaves': 51, 'colsample_bytree': 0.47558146997448086, 'subsample': 0.8646875650923772, 'max_depth': 15, 'reg_lambda': 5.340070953505367, 'eta': 0.609849295307442, 'gamma': 0.0035463100337004313, 'gr

[I 2019-05-04 16:19:02,870] Finished a trial resulted in value: 2208.87939453125. Current best value is 1803.4058837890625 with parameters: {'learning_rate': 0.3381497766879947, 'booster': 'gbtree', 'lambda': 0.0020253017527964786, 'alpha': 0.014183788568303723, 'boosting': 'goss', 'num_leaves': 51, 'colsample_bytree': 0.47558146997448086, 'subsample': 0.8646875650923772, 'max_depth': 15, 'reg_lambda': 5.340070953505367, 'eta': 0.609849295307442, 'gamma': 0.0035463100337004313, 'grow_policy': 'lossguide'}.
[I 2019-05-04 16:19:11,928] Finished a trial resulted in value: 2069.279052734375. Current best value is 1803.4058837890625 with parameters: {'learning_rate': 0.3381497766879947, 'booster': 'gbtree', 'lambda': 0.0020253017527964786, 'alpha': 0.014183788568303723, 'boosting': 'goss', 'num_leaves': 51, 'colsample_bytree': 0.47558146997448086, 'subsample': 0.8646875650923772, 'max_depth': 15, 'reg_lambda': 5.340070953505367, 'eta': 0.609849295307442, 'gamma': 0.0035463100337004313, 'gro

In [None]:
#--------------------------------------