In [1]:
import datetime
import gc
import lightgbm as lgb
import matplotlib.pyplot as plt
import numpy as np
import os
import pandas as pd
import seaborn as sns
import time
import warnings

from datetime import datetime as dt
from contextlib import contextmanager
from pandas.core.common import SettingWithCopyWarning
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import KFold, StratifiedKFold,train_test_split

from sklearn.metrics import r2_score

warnings.simplefilter(action='ignore', category=SettingWithCopyWarning)
warnings.simplefilter(action='ignore', category=FutureWarning)

FEATS_EXCLUDED = ['tradeMoney','ID']

@contextmanager
def timer(title):
    t0 = time.time()
    yield
    print("{} - done in {:.0f}s".format(title, time.time() - t0))

# rmse
def rmse(y_true, y_pred):
    return np.sqrt(mean_squared_error(y_true, y_pred))

# One-hot encoding for categorical columns with get_dummies
def one_hot_encoder(df, nan_as_category = True):
    original_columns = list(df.columns)
    categorical_columns = [col for col in df.columns if df[col].dtype == 'object']
    df = pd.get_dummies(df, columns= categorical_columns, dummy_na= nan_as_category)
    new_columns = [c for c in df.columns if c not in original_columns]
    return df, new_columns
    
# Display/plot feature importance
def display_importances(feature_importance_df_):
    cols = feature_importance_df_[["feature", "importance"]].groupby("feature").mean().sort_values(by="importance", ascending=False)[:40].index
    best_features = feature_importance_df_.loc[feature_importance_df_.feature.isin(cols)]

    plt.figure(figsize=(8, 10))
    sns.barplot(x="importance", y="feature", data=best_features.sort_values(by="importance", ascending=False))
    plt.title('LightGBM Features (avg over folds)')
    plt.tight_layout()
    plt.savefig('lgbm_importances.png')

# reduce memory
def reduce_mem_usage(df, verbose=True):
    numerics = ['int16', 'int32', 'int64', 'float16', 'float32', 'float64']
    start_mem = df.memory_usage().sum() / 1024**2
    for col in df.columns:
        col_type = df[col].dtypes
        if col_type in numerics:
            c_min = df[col].min()
            c_max = df[col].max()
            if str(col_type)[:3] == 'int':
                if c_min > np.iinfo(np.int8).min and c_max < np.iinfo(np.int8).max:
                    df[col] = df[col].astype(np.int8)
                elif c_min > np.iinfo(np.int16).min and c_max < np.iinfo(np.int16).max:
                    df[col] = df[col].astype(np.int16)
                elif c_min > np.iinfo(np.int32).min and c_max < np.iinfo(np.int32).max:
                    df[col] = df[col].astype(np.int32)
                elif c_min > np.iinfo(np.int64).min and c_max < np.iinfo(np.int64).max:
                    df[col] = df[col].astype(np.int64)
            else:
                if c_min > np.finfo(np.float16).min and c_max < np.finfo(np.float16).max:
                    df[col] = df[col].astype(np.float16)
                elif c_min > np.finfo(np.float32).min and c_max < np.finfo(np.float32).max:
                    df[col] = df[col].astype(np.float32)
                else:
                    df[col] = df[col].astype(np.float64)

    end_mem = df.memory_usage().sum() / 1024**2
    print('Memory usage after optimization is: {:.2f} MB'.format(end_mem))
    print('Decreased by {:.1f}%'.format(100 * (start_mem - end_mem) / start_mem))

    return df
    

In [2]:
import lightgbm as lgb
import numpy as np
import sklearn.datasets
import sklearn.metrics
from sklearn.model_selection import train_test_split

import optuna

# evaluation metric


def the_metric(y_pred, y):
    y_true = y.get_label()
    return 'r2', r2_score(y_true, y_pred)

def objective(trial):
    
        #train_df['tradeMoney'] = np.log1p(train_df['tradeMoney'])
        train_x, train_y = train_df[feats], train_df['tradeMoney']
        train_y = np.log1p(train_y)
        dtrain = lgb.Dataset(train_x, label=train_y.ravel())
        
        params = {'objective': 'regression',
                  #'metric': 'rmse',
                  'verbosity': -1,
                  "learning_rate": trial.suggest_uniform('learning_rate', 0.001, 1),
                  
                  'device': 'gpu',
                  'gpu_platform_id': 1,
                  'gpu_device_id': 0,
                  'num_thread' : 1,
                  'sparse_threshold' : 1,
                  
                  'seed': 2019,
                  #'boosting_type': trial.suggest_categorical('boosting', ['gbdt',  'goss']),
                  'num_leaves': trial.suggest_int('num_leaves', 16, 200),
                  #'colsample_bytree': trial.suggest_uniform('colsample_bytree', 0.001, 1),
                  'subsample': trial.suggest_uniform('subsample', 0.001, 1),
                  'max_depth': trial.suggest_int('max_depth', 5, 20),
                  'reg_alpha': trial.suggest_uniform('reg_alpha', 0, 10),
                  # 'reg_lambda': trial.suggest_uniform('reg_lambda', 0, 10),
                  #'min_split_gain': trial.suggest_uniform('min_split_gain', 0, 10),
                  #'min_child_weight': trial.suggest_uniform('min_child_weight', 0, 45),
                  #'min_data_in_leaf': trial.suggest_int('min_data_in_leaf', 16, 64)
                  
                  'min_child_samples' : trial.suggest_int('min_child_samples', 1, 200),
                  #'num_iterations': trial.suggest_uniform('num_iterations', 1, 5000),
                  'feature_fraction' : trial.suggest_uniform('feature_fraction', 0.001, 1),
                  #'random_state': trial.suggest_int('random_state', 1, 5000),
                  #'max_bin' :  trial.suggest_int('random_state', 1, 256)
                  }
        
        gbm = lgb.train(params, dtrain , feval= the_metric)
        preds = gbm.predict(train_x)
        pred_labels = np.rint(preds)
        accuracy = r2_score(np.expm1(train_y),np.expm1(preds))
        
        gc.collect()
        return 1 - accuracy


Using TensorFlow backend.


In [3]:
with timer("split train & test"):
        train_df = reduce_mem_usage(pd.read_csv('train_clean1.csv',encoding= "gbk"))
        test_df = reduce_mem_usage(pd.read_csv('test_clean1.csv',encoding= "gbk"))
        

        feats = [f for f in train_df.columns if f not in FEATS_EXCLUDED]
        
        '''study = optuna.create_study()
        study.optimize(objective, n_trials=30)

        print('Number of finished trials: {}'.format(len(study.trials)))

        print('Best trial:')
        trial = study.best_trial

        print('  Value: {}'.format(trial.value))

        print('  Params: ')
        for key, value in trial.params.items():
            print('    {}: {}'.format(key, value))
        
        hist_df = study.trials_dataframe()
        hist_df.to_csv("optuna_result_lgbm.csv")

        #del df
        gc.collect()'''

Memory usage after optimization is: 9.68 MB
Decreased by 77.5%



invalid value encountered in less


invalid value encountered in less



Memory usage after optimization is: 0.59 MB
Decreased by 77.1%
split train & test - done in 2s


In [4]:
best_params = {
            'device': 'gpu',
            'gpu_platform_id': 1,
            'gpu_device_id': 0,
            'objective': 'regression_l2', 
            'boosting_type': 'gbdt', 
            'n_jobs': 4, 'max_depth': 7, 
            'n_estimators': 2000, 
            'subsample_freq': 2, 
            'subsample_for_bin': 200000, 
            'min_data_per_group': 100, 
            'max_cat_to_onehot': 4, 
            'cat_l2': 10.0, 
            'cat_smooth': 10.0, 
            'max_cat_threshold': 32, 
            'metric_freq': 10, 
            'verbosity': -1, 
            'metric': 'rmse', 
            #'colsample_bytree': 0.5, 
            'learning_rate': 0.6538894827626766, 
            'min_child_samples': 85, 
            'min_child_weight': 100.0, 
            'min_split_gain': 1e-06, 
            'num_leaves': 124, 
            'reg_alpha': 9.729326680590926, 
            'reg_lambda': 10.0, 
            'subsample': 0.5955628212118322,
            'max_depth': 7,
            #'feature_fraction': 0.6520882094013536
}





In [6]:
train = pd.read_csv('train_clean1.csv',encoding= "gbk")
test = pd.read_csv('test_clean1.csv',encoding= "gbk")

In [8]:
Xtrain = train.drop('tradeMoney',axis=1)
Xtrain = train.drop('ID',axis=1)
ytrain = np.log1p(train.tradeMoney.values)

In [9]:
def lgb_r2_score(preds, dtrain):
    labels = dtrain.get_label()
    return 'r2', r2_score(labels, preds), 'name'

X_tr, X_te, y_tr, y_te = train_test_split(Xtrain, ytrain, test_size=0.2, random_state=4)
X_train, X_test, y_train, y_test = train_test_split(X_tr, y_tr, test_size=0.2, random_state=4)
lgb_params = {}
lgb_params['boost'] = 'gbdt'
lgb_params['objective'] = 'regression_l2'
lgb_params['num_leaves'] = 128
lgb_params['sub_feature'] = 0.8 
lgb_params['max_depth'] = 9
lgb_params['feature_fraction'] = 0.7
lgb_params['bagging_fraction'] = 0.7
lgb_params['bagging_freq'] = 50
lgb_params['learning_rate'] = 0.01
lgb_params['num_iterations'] = 1500
lgb_params['early_stopping_round'] = 50
lgb_params['verbose'] = 2


ytra = y_train.ravel()
yte = y_test.ravel()
lgb_train = lgb.Dataset(X_train, label=ytra)
lgb_test = lgb.Dataset(X_test, label=yte)
lightgbm = lgb.train(lgb_params, lgb_train, num_boost_round=1500, verbose_eval=100, feval = lgb_r2_score,
                     valid_sets=[lgb_train,lgb_test])
print('LGB Model R2 Score: ', r2_score(np.expm1(lightgbm.predict(X_te)), np.expm1(y_te)))


Found `num_iterations` in params. Will use it instead of argument


Found `early_stopping_round` in params. Will use it instead of argument



Training until validation scores don't improve for 50 rounds.
[100]	training's l2: 0.0754371	training's r2: 0.8104	valid_1's l2: 0.0738694	valid_1's r2: 0.811488
[200]	training's l2: 0.0243513	training's r2: 0.938797	valid_1's l2: 0.0228304	valid_1's r2: 0.941738
[300]	training's l2: 0.0142932	training's r2: 0.964076	valid_1's l2: 0.0135326	valid_1's r2: 0.965465
[400]	training's l2: 0.0118225	training's r2: 0.970286	valid_1's l2: 0.0118699	valid_1's r2: 0.969708
[500]	training's l2: 0.0109778	training's r2: 0.972409	valid_1's l2: 0.0111637	valid_1's r2: 0.971511
[600]	training's l2: 0.00972917	training's r2: 0.975547	valid_1's l2: 0.0100399	valid_1's r2: 0.974379
[700]	training's l2: 0.00885919	training's r2: 0.977734	valid_1's l2: 0.00961174	valid_1's r2: 0.975471
[800]	training's l2: 0.00830913	training's r2: 0.979116	valid_1's l2: 0.00926377	valid_1's r2: 0.976359
Early stopping, best iteration is:
[804]	training's l2: 0.00829533	training's r2: 0.979151	valid_1's l2: 0.00925803	val

In [65]:
def modeling_lgbm_cross_validation(params, X, y, nr_folds=5, verbose=0):
    clfs = list()
    oof_preds = np.zeros(X.shape[0])
    # Split data with kfold
    # kfolds = TimeSeriesSplit(n_splits=nr_folds)
    kfolds = StratifiedKFold(n_splits=nr_folds, shuffle=True, random_state=42)

    kfolds = KFold(n_splits=nr_folds, shuffle=True, random_state=42)
    for n_fold, (trn_idx, val_idx) in enumerate(kfolds.split(X, y)):
        if verbose:
            print('no {} of {} folds'.format(n_fold, nr_folds))

        X_train, y_train = X.iloc[trn_idx], y.iloc[trn_idx]
        X_valid, y_valid = X.iloc[val_idx], y.iloc[val_idx]
        
        y_train,y_valid = np.log1p(y_train).values , np.log1p(y_valid).values

        model = lgb.LGBMRegressor(**params)
        model.fit(
            X_train, y_train,
            # eval_set=[(X_train, y_train), (X_valid, y_valid)],
            eval_set=[(X_valid, y_valid)],
            verbose=verbose, eval_metric='rmse',
            early_stopping_rounds=500
        )

        clfs.append(model)
        oof_preds[val_idx] = model.predict(X_valid, num_iteration=model.best_iteration_)

        del X_train, y_train, X_valid, y_valid
        gc.collect()
    np.savetxt("lgb_oof_preds.csv", oof_preds, delimiter=",")    
    
    score = r2_score(y, np.expm1(oof_preds))
    np.savetxt("y.csv", y, delimiter=",")
    np.savetxt("oof_preds.csv", np.expm1(oof_preds), delimiter=",")
    return clfs, score


In [67]:
def predict_cross_validation(test, clfs, ntree_limit=None):
    sub_preds = np.zeros(test.shape[0])
    for i, model in enumerate(clfs, 1):

        num_tree = 10000
        if not ntree_limit:
            ntree_limit = num_tree

        if isinstance(model, lgb.sklearn.LGBMRegressor):
            if model.best_iteration_:
                num_tree = min(ntree_limit, model.best_iteration_)

            test_preds = model.predict(test, raw_score=True, num_iteration=num_tree)

        sub_preds += test_preds

    sub_preds = sub_preds / len(clfs)
    sub_preds = np.expm1(sub_preds)
    np.savetxt("lgb_sub_preds.csv", sub_preds, delimiter=",")
    ret = pd.Series(sub_preds, index=test.index)
    ret.index.name = test.index.name
    return ret

In [68]:
nr_folds = 2
clfs, score = modeling_lgbm_cross_validation(best_params,
                                                    train_df[feats],
                                                    train_df['tradeMoney'],
                                                    nr_folds,
                                                    verbose=50)
        # save to
file_template = '{score:.6f}_{model_key}_cv{fold}_{timestamp}'
file_stem = file_template.format(
    score=score,
    model_key='LGBM',
    fold=nr_folds,
    timestamp=dt.now().strftime('%Y-%m-%d-%H-%M'))

filename = 'subm_{}.csv'.format(file_stem)
print('save to {}'.format(filename))
subm = predict_cross_validation(test_df[feats], clfs)
subm = subm.to_frame('tradeMoney')
subm.to_csv(filename, index=True)

no 0 of 2 folds
Training until validation scores don't improve for 500 rounds.
[50]	valid_0's rmse: 0.30084
[100]	valid_0's rmse: 0.299395
[150]	valid_0's rmse: 0.29905
[200]	valid_0's rmse: 0.29999
[250]	valid_0's rmse: 0.300577
[300]	valid_0's rmse: 0.301165
[350]	valid_0's rmse: 0.301707
[400]	valid_0's rmse: 0.301886
[450]	valid_0's rmse: 0.301942
[500]	valid_0's rmse: 0.302599
[550]	valid_0's rmse: 0.302872
[600]	valid_0's rmse: 0.303402
Early stopping, best iteration is:
[120]	valid_0's rmse: 0.298423
no 1 of 2 folds
Training until validation scores don't improve for 500 rounds.
[50]	valid_0's rmse: 0.288469
[100]	valid_0's rmse: 0.288976
[150]	valid_0's rmse: 0.289651
[200]	valid_0's rmse: 0.290631
[250]	valid_0's rmse: 0.291566
[300]	valid_0's rmse: 0.292191
[350]	valid_0's rmse: 0.292983
[400]	valid_0's rmse: 0.293187
[450]	valid_0's rmse: 0.294216
[500]	valid_0's rmse: 0.295047
[550]	valid_0's rmse: 0.295465
Early stopping, best iteration is:
[88]	valid_0's rmse: 0.28784
save