In [31]:
import logging
import os
import gc
import time
from datetime import datetime as dt

import numpy as np
import pandas as pd
from pandas.core.common import SettingWithCopyWarning

import warnings
warnings.simplefilter(action='ignore', category=SettingWithCopyWarning)
warnings.simplefilter(action='ignore', category=FutureWarning)

from sklearn.metrics import mean_squared_error,r2_score

from sklearn.model_selection import KFold, StratifiedKFold

import lightgbm as lgb
import xgboost as xgb

import matplotlib.pyplot as plt
import seaborn as sns

from contextlib import contextmanager

# Module-level logger. basicConfig attaches a root handler at DEBUG level whose
# records carry level, timestamp, file name and line number.
logger = logging.getLogger(__name__)
logging.basicConfig(
    level=logging.DEBUG,
    datefmt='%Y-%m-%d:%H:%M:%S',
    format='[%(levelname)s] %(asctime)s %(filename)s: %(lineno)d: %(message)s',
)

# Fixed reference date (not referenced by the code visible in this file).
DATE_TODAY = dt(year=2019, month=1, day=26)

# Columns that must never be used as model features: the row identifier and
# the regression target.
FEATS_EXCLUDED = ["ID", "tradeMoney"]


@contextmanager
def timer(title):
    """Log how long the body of a ``with timer(title):`` block took.

    When the body finishes normally, an INFO record of the form
    ``"<title> - done in <seconds>s"`` is emitted through the module logger.
    The ``yield`` is not wrapped in ``try``/``finally``, so nothing is logged
    if the body raises.
    """
    started_at = time.time()
    yield
    elapsed = time.time() - started_at
    logger.info("{} - done in {:.0f}s".format(title, elapsed))


# Display/plot feature importance
def display_importances(feature_importance_df_):
    """Plot the 40 features with the highest mean importance and save the figure.

    ``feature_importance_df_`` must have ``feature`` and ``importance``
    columns. Importances are averaged per feature to pick the top 40; every
    row belonging to one of those features is then drawn as a horizontal bar
    plot, which is written to ``lgbm_importances.png``.
    """
    mean_importance = (
        feature_importance_df_[["feature", "importance"]]
        .groupby("feature")
        .mean()
        .sort_values(by="importance", ascending=False)
    )
    top_features = mean_importance[:40].index

    # Keep all rows of the selected features, not just their means.
    is_top = feature_importance_df_.feature.isin(top_features)
    best_features = feature_importance_df_.loc[is_top]
    ranked = best_features.sort_values(by="importance", ascending=False)

    plt.figure(figsize=(8, 10))
    sns.barplot(x="importance", y="feature", data=ranked)
    plt.title('LightGBM Features (avg over folds)')
    plt.tight_layout()
    plt.savefig('lgbm_importances.png')


# reduce memory
def reduce_mem_usage(df, verbose=True):
    """Downcast numeric columns of ``df`` in place to the smallest fitting dtype.

    Only columns whose dtype is one of int16/int32/int64/float16/float32/
    float64 are considered. Integer columns are tried against int8, int16,
    int32 and int64; other numeric columns against float16 and float32, with
    float64 as the fallback. A dtype is chosen only when the column's min and
    max lie strictly inside that dtype's representable range.

    NOTE(review): the range check says nothing about precision, so a float
    column that "fits" float16 may still lose significant digits.

    Args:
        df: DataFrame to shrink; it is modified in place.
        verbose: When true, print the resulting memory usage and the
            percentage saved. When false, nothing is printed.

    Returns:
        The same ``df`` object, for call chaining.
    """
    start_mem = df.memory_usage().sum() / 1024**2
    numerics = ['int16', 'int32', 'int64', 'float16', 'float32', 'float64']
    int_candidates = (np.int8, np.int16, np.int32, np.int64)
    float_candidates = (np.float16, np.float32)

    for col in df.columns:
        col_type = df[col].dtypes
        if col_type not in numerics:
            continue

        c_min = df[col].min()
        c_max = df[col].max()
        if str(col_type)[:3] == 'int':
            # Smallest integer type whose open range contains [c_min, c_max];
            # the column is left untouched if none qualifies.
            for candidate in int_candidates:
                limits = np.iinfo(candidate)
                if c_min > limits.min and c_max < limits.max:
                    df[col] = df[col].astype(candidate)
                    break
        else:
            for candidate in float_candidates:
                limits = np.finfo(candidate)
                if c_min > limits.min and c_max < limits.max:
                    df[col] = df[col].astype(candidate)
                    break
            else:
                # Neither float16 nor float32 fits (or the column is all-NaN).
                df[col] = df[col].astype(np.float64)

    if verbose:
        end_mem = df.memory_usage().sum() / 1024**2
        print('Memory usage after optimization is: {:.2f} MB'.format(end_mem))
        print('Decreased by {:.1f}%'.format(100 * (start_mem - end_mem) / start_mem))
    return df


# rmse
def rmse(y_true, y_pred):
    """Return the square root of ``mean_squared_error(y_true, y_pred)``."""
    mse = mean_squared_error(y_true, y_pred)
    return np.sqrt(mse)


# One-hot encoding for categorical columns with get_dummies
def one_hot_encoder(df, nan_as_category=True):
    """One-hot encode every object-dtype column of ``df`` with ``pd.get_dummies``.

    Args:
        df: Input DataFrame; it is not modified.
        nan_as_category: Passed to ``get_dummies`` as ``dummy_na``; when true
            an extra indicator column is created for missing values.

    Returns:
        A ``(encoded_df, new_columns)`` tuple, where ``new_columns`` lists, in
        order, the columns of ``encoded_df`` that were not present in ``df``.
    """
    original_columns = set(df.columns)

    # Select column *names* by dtype. Iterating over ``df.dtypes`` alone yields
    # dtype objects, which are not valid ``columns=`` labels for get_dummies.
    categorical_columns = [
        col for col, col_dtype in df.dtypes.items() if col_dtype == 'object'
    ]
    df = pd.get_dummies(df, columns=categorical_columns, dummy_na=nan_as_category)

    new_columns = [c for c in df.columns if c not in original_columns]
    return df, new_columns





def modeling_lgbm_cross_validation(params, X, y, nr_folds=5, verbose=0):
    """Train one XGBoost regressor per K-fold split on a log1p-transformed target.

    Despite the name, the models fitted here are ``xgb.XGBRegressor``
    instances. Each fold trains on the remaining folds, uses the held-out fold
    as the early-stopping evaluation set (200 rounds without RMSE
    improvement), and fills that fold's slice of the out-of-fold predictions.

    Side effect: the out-of-fold predictions, mapped back to the original
    scale with ``expm1``, are written to ``xgb_oof_preds.csv``.

    Args:
        params: Keyword arguments forwarded to ``xgb.XGBRegressor``.
        X: Feature table; must support ``.iloc`` and ``.shape``.
        y: Target values on the original scale; ``np.log1p(y)`` must support
            ``.iloc`` (e.g. a pandas Series).
        nr_folds: Number of folds for the shuffled ``KFold`` (seed 42).
        verbose: When truthy, a line is printed per fold; the value is also
            forwarded to ``fit`` as its ``verbose`` argument.

    Returns:
        ``(clfs, score)``: the list of fitted models in fold order, and the
        R^2 between ``log1p(y)`` and the out-of-fold predictions (both in log
        space).
    """
    clfs = []
    oof_preds = np.zeros(X.shape[0])
    kfolds = KFold(n_splits=nr_folds, shuffle=True, random_state=42)

    # Models are trained and scored in log space; predictions are mapped back
    # with expm1 only when written to disk.
    y = np.log1p(y)

    for n_fold, (trn_idx, val_idx) in enumerate(kfolds.split(X, y)):
        if verbose:
            # n_fold is zero-based.
            print('no {} of {} folds'.format(n_fold, nr_folds))

        X_train, y_train = X.iloc[trn_idx], y.iloc[trn_idx]
        X_valid, y_valid = X.iloc[val_idx], y.iloc[val_idx]

        model = xgb.XGBRegressor(**params)
        model.fit(
            X_train, y_train,
            eval_set=[(X_valid, y_valid)],
            verbose=verbose, eval_metric='rmse',
            early_stopping_rounds=200
        )

        clfs.append(model)
        # Predict with the best iteration found by early stopping.
        oof_preds[val_idx] = model.predict(X_valid, ntree_limit=model.best_ntree_limit)

        # Release the fold slices before the next split is materialised.
        del X_train, y_train, X_valid, y_valid
        gc.collect()

    np.savetxt("xgb_oof_preds.csv", np.expm1(oof_preds), delimiter=",")

    score = r2_score(y, oof_preds)
    return clfs, score


def predict_cross_validation(test, clfs, ntree_limit=None):
    """Average the predictions of the fold models and return them on the original scale.

    Each model predicts in log space; the per-model predictions are averaged
    and then transformed with ``expm1``.

    Side effect: the final predictions are also written to
    ``xgb_sub_preds.csv``.

    Args:
        test: Feature table to predict on; must expose ``.shape`` and
            ``.index``.
        clfs: Non-empty list of fitted ``LGBMRegressor`` / ``XGBRegressor``
            models.
        ntree_limit: Upper bound on the number of trees used per model. A
            falsy value (``None`` or ``0``) means a cap of 10000.

    Returns:
        A pandas Series of predictions indexed like ``test``.

    Raises:
        ValueError: If ``clfs`` is empty (the average would be undefined).
        TypeError: If a model is neither an LGBMRegressor nor an XGBRegressor.
    """
    if len(clfs) == 0:
        raise ValueError("clfs must contain at least one fitted model")

    default_tree_cap = 10000
    tree_cap = ntree_limit if ntree_limit else default_tree_cap

    sub_preds = np.zeros(test.shape[0])
    for model in clfs:
        if isinstance(model, lgb.sklearn.LGBMRegressor):
            # Without a recorded best iteration, fall back to the default cap.
            num_tree = default_tree_cap
            if model.best_iteration_:
                num_tree = min(tree_cap, model.best_iteration_)
            test_preds = model.predict(test, raw_score=True, num_iteration=num_tree)
        elif isinstance(model, xgb.sklearn.XGBRegressor):
            num_tree = min(tree_cap, model.best_ntree_limit)
            test_preds = model.predict(test, ntree_limit=num_tree)
        else:
            # Previously this path left test_preds unbound (or stale from the
            # previous model), which silently corrupted the average.
            raise TypeError(
                "unsupported model type: {}".format(type(model).__name__))

        sub_preds += test_preds

    sub_preds = sub_preds / len(clfs)
    sub_preds = np.expm1(sub_preds)
    np.savetxt("xgb_sub_preds.csv", sub_preds, delimiter=",")
    ret = pd.Series(sub_preds, index=test.index)
    ret.index.name = test.index.name
    return ret


def write_to_parquet(filename, df, debug=False):
    """Write ``df`` to ``filename`` as a snappy-compressed parquet file.

    Before writing, every float16 column of ``df`` is upcast to float32 in
    place (presumably because the parquet engine cannot store float16 --
    confirm). With ``debug`` set, the file is read back and its shape and
    transposed head are printed.
    """
    print('write to {}: {}'.format(filename, df.shape))

    # safety check: collect the half-precision columns first, then widen them
    half_precision_cols = [
        name
        for name, col_dtype in df.dtypes.to_dict().items()
        if str(col_dtype).startswith('float16')
    ]
    for name in half_precision_cols:
        df[name] = df[name].astype(np.float32)

    df.to_parquet(filename, engine='auto', compression='snappy')
    if not debug:
        return

    reloaded = pd.read_parquet(filename)
    print('debug reload save file: {}\n{}'.format(reloaded.shape, reloaded.head().T))


def main(debug=False):
    """Train the cross-validated XGBoost model and write a submission CSV.

    Steps:
      1. Read ``train_clean4.csv`` and ``test_clean4.csv`` (GBK encoded).
      2. Drop training rows with ``tradeMoney`` outside (500, 100000) or
         ``area`` >= 2500.
      3. Fit one model per fold via ``modeling_lgbm_cross_validation``.
      4. Average the fold models' test predictions and save them to
         ``subm_<score>_XGB_cv<folds>_<timestamp>.csv`` in a ``target`` column.

    Args:
        debug: When true, use 2 folds instead of 11.
    """
    with timer("Run LightGBM with kfold"):
        train_df = pd.read_csv('train_clean4.csv', encoding='gbk')
        test_df = pd.read_csv('test_clean4.csv', encoding='gbk')

        # Outlier filtering on the target and on the area feature.
        train_df = train_df[train_df.tradeMoney < 100000]
        train_df = train_df[train_df.tradeMoney > 500]
        train_df = train_df[train_df.area < 2500]

        train_features = [c for c in train_df.columns if c not in FEATS_EXCLUDED]

        # NOTE(review): 'num_leaves' and 'silent' look like LightGBM / legacy
        # names rather than current XGBoost parameters ('max_leaves',
        # 'verbosity'), and 'reg:linear' is the older spelling of
        # 'reg:squarederror'. Left unchanged because renaming them would alter
        # the trained model -- confirm against the installed xgboost version.
        best_params = {
            'gpu_id': 0,
            #'n_gpus': 2,
            'objective': 'reg:linear',
            'eval_metric': 'rmse',
            'silent': True,
            'booster': 'gbtree',
            'n_jobs': 4,
            'n_estimators': 2500,
            #'tree_method': 'gpu_hist',
            'grow_policy': 'lossguide',
            'max_depth': 13,
            'seed': 538,
            'colsample_bylevel': 0.4577985063107066,
            'colsample_bytree': 0.8971621421463886,
            'gamma': 0.007219683251171169,
            'learning_rate': 0.006150886706231842,
            'num_leaves': 46,
            'max_bin': 16,
            'min_child_weight': 6.584851275015851,
            'reg_alpha': 1.476515526719819,
            'reg_lambda': 5.040088958844647,
            'subsample': 0.7792358657530063,
        }

        # modeling
        nr_folds = 2 if debug else 11

        # Very high tree cap: the training routine stops each fold early, so
        # this only acts as an upper bound.
        best_params['n_estimators'] = 200000

        clfs, score = modeling_lgbm_cross_validation(best_params,
                                                    train_df[train_features],
                                                    train_df['tradeMoney'],
                                                    nr_folds,
                                                    verbose=100)

        # save to
        file_template = '{score:.6f}_{model_key}_cv{fold}_{timestamp}'
        file_stem = file_template.format(
            score=score,
            model_key='XGB',
            fold=nr_folds,
            timestamp=dt.now().strftime('%Y-%m-%d-%H-%M'))

        filename = 'subm_{}.csv'.format(file_stem)
        print('save to {}'.format(filename))
        subm = predict_cross_validation(test_df[train_features], clfs)
        subm = subm.to_frame('target')
        subm.to_csv(filename, index=True)
        


if __name__ == "__main__":
    # Script entry point: run the full (non-debug) pipeline and log its
    # total wall-clock time.
    with timer("Full model run"):
        main()



no 0 of 11 folds
[0]	validation_0-rmse:7.75335
Will train until validation_0-rmse hasn't improved in 200 rounds.
[100]	validation_0-rmse:4.19812


KeyboardInterrupt: 