In [3]:
%matplotlib inline
import numpy as np
import matplotlib.pyplot as plt
import sklearn
import pandas as pd

from sklearn.metrics import make_scorer
from sklearn.model_selection import cross_val_score
import lightgbm as lgb

import warnings
warnings.filterwarnings('ignore')

data_dir = "../Data/"

def rmsle(val, pred):
    return np.sqrt(np.mean(np.power(np.log(val+1)-np.log(pred+1), 2)))

rmsle_score = make_scorer(rmsle, greater_is_better=False)

In [4]:
train_df = pd.read_csv(data_dir+"train.csv")
test_df = pd.read_csv(data_dir+"test.csv")

In [6]:
#https://www.kaggle.com/samratp/lightgbm-xgboost-catboost
X_train = train_df.drop(["ID", "target"], axis=1)
y_train = np.log(train_df["target"].values +1)

X_test = test_df.drop(["ID"], axis=1)

In [8]:
from sklearn.model_selection import train_test_split

In [9]:
dev_X, val_X, dev_y, val_y = train_test_split(X_train, y_train, test_size = 0.1, random_state = 42)

In [12]:
def run_lgb(train_X, train_y, val_X, val_y, test_X):
    params = {
        "objective" : "regression",
        "metric" : "rmse",
        "num_leaves" : 40,
        "learning_rate" : 0.0025,
        "bagging_fraction" : 0.7,
        "feature_fraction" : 0.5,
        "bagging_frequency" : 5,
        "bagging_seed" : 42,
        "verbosity" : -1,
        "seed": 42
    }
    
    lgtrain = lgb.Dataset(train_X, label=train_y)
    lgval = lgb.Dataset(val_X, label=val_y)
    evals_result = {}
    model = lgb.train(params, lgtrain, 10000, 
                      valid_sets=[lgval], 
                      early_stopping_rounds=500, 
                      verbose_eval=50, 
                      evals_result=evals_result)
    
    pred_test_y = np.expm1(model.predict(test_X, num_iteration=model.best_iteration))
    return pred_test_y, model, evals_result

In [13]:
red_test, model, evals_result = run_lgb(dev_X, dev_y, val_X, val_y, X_test)
print("LightGBM Training Completed...")

Training until validation scores don't improve for 500 rounds.
[50]	valid_0's rmse: 1.64311
[100]	valid_0's rmse: 1.60888
[150]	valid_0's rmse: 1.57975
[200]	valid_0's rmse: 1.5555
[250]	valid_0's rmse: 1.53453
[300]	valid_0's rmse: 1.51733
[350]	valid_0's rmse: 1.50249
[400]	valid_0's rmse: 1.48962
[450]	valid_0's rmse: 1.47907
[500]	valid_0's rmse: 1.47034
[550]	valid_0's rmse: 1.46215
[600]	valid_0's rmse: 1.4558
[650]	valid_0's rmse: 1.44974
[700]	valid_0's rmse: 1.44496
[750]	valid_0's rmse: 1.44039
[800]	valid_0's rmse: 1.4373
[850]	valid_0's rmse: 1.4342
[900]	valid_0's rmse: 1.43152
[950]	valid_0's rmse: 1.42964
[1000]	valid_0's rmse: 1.42802
[1050]	valid_0's rmse: 1.42682
[1100]	valid_0's rmse: 1.42581
[1150]	valid_0's rmse: 1.42443
[1200]	valid_0's rmse: 1.42365
[1250]	valid_0's rmse: 1.42309
[1300]	valid_0's rmse: 1.42267
[1350]	valid_0's rmse: 1.42178
[1400]	valid_0's rmse: 1.4211
[1450]	valid_0's rmse: 1.42085
[1500]	valid_0's rmse: 1.42053
[1550]	valid_0's rmse: 1.42055
[

In [14]:
from sklearn.cross_validation import KFold

In [16]:
#https://www.kaggle.com/rooshroosh/lightgbm-baseline-1-49-lb
train_df = pd.read_csv(data_dir+"train.csv")
test_df = pd.read_csv(data_dir+"test.csv")

In [17]:
train=train_df
test=test_df

In [18]:
Y = np.log(train.target+1)

train.drop(['target'], axis=1, inplace=True)

test_ID = test.ID
test.drop(['ID'], axis=1, inplace=True)

train_ID = train.ID
train.drop(['ID'], axis=1, inplace=True)

In [19]:
lgbm_params =  {
    'task': 'train',
    'boosting_type': 'gbdt',
    'objective': 'regression',
    'metric': 'rmse',
    'max_depth': 12,
    'num_leaves': 63,  #  127, 255
    'feature_fraction': 0.8,   # 0.1, 0.01
    'bagging_fraction': 0.8,
    'learning_rate': 0.0025,
    'verbose': 1
}

In [21]:
import time

In [22]:
Y_target = []
for fold_id,(train_idx, val_idx) in enumerate(KFold(n=train.shape[0], n_folds=10, random_state=42)):
    print('FOLD:',fold_id)
    X_train = train.values[train_idx]
    y_train = Y.values[train_idx]
    X_valid = train.values[val_idx]
    y_valid =  Y.values[val_idx]
    
    
    lgtrain = lgb.Dataset(X_train, y_train,
                feature_name=train.columns.tolist(),
    #             categorical_feature = categorical
                         )

    lgvalid = lgb.Dataset(X_valid, y_valid,
                feature_name=train.columns.tolist(),
    #             categorical_feature = categorical
                         )

    modelstart = time.time()
    lgb_clf = lgb.train(
        lgbm_params,
        lgtrain,
        num_boost_round=30000,
        valid_sets=[lgtrain, lgvalid],
        valid_names=['train','valid'],
        early_stopping_rounds=100,
        verbose_eval=100
    )
    
    test_pred = lgb_clf.predict(test.values)
    Y_target.append(np.exp(test_pred)-1)
    print('fold finish after', time.time()-modelstart)

FOLD: 0
Training until validation scores don't improve for 100 rounds.
[100]	train's rmse: 1.63075	valid's rmse: 1.61858
[200]	train's rmse: 1.52952	valid's rmse: 1.56752
[300]	train's rmse: 1.44766	valid's rmse: 1.52678
[400]	train's rmse: 1.38077	valid's rmse: 1.49704
[500]	train's rmse: 1.32384	valid's rmse: 1.47317
[600]	train's rmse: 1.27541	valid's rmse: 1.45664
[700]	train's rmse: 1.23466	valid's rmse: 1.44541
[800]	train's rmse: 1.20297	valid's rmse: 1.43717
[900]	train's rmse: 1.17394	valid's rmse: 1.43045
[1000]	train's rmse: 1.14989	valid's rmse: 1.42596
[1100]	train's rmse: 1.12874	valid's rmse: 1.42342
[1200]	train's rmse: 1.11097	valid's rmse: 1.4208
[1300]	train's rmse: 1.09689	valid's rmse: 1.41854
[1400]	train's rmse: 1.08352	valid's rmse: 1.4157
[1500]	train's rmse: 1.07161	valid's rmse: 1.41345
[1600]	train's rmse: 1.06085	valid's rmse: 1.41177
[1700]	train's rmse: 1.05035	valid's rmse: 1.40958
[1800]	train's rmse: 1.04073	valid's rmse: 1.40791
[1900]	train's rmse: 1

[1400]	train's rmse: 1.08804	valid's rmse: 1.36551
[1500]	train's rmse: 1.07644	valid's rmse: 1.36325
[1600]	train's rmse: 1.06474	valid's rmse: 1.3614
[1700]	train's rmse: 1.05552	valid's rmse: 1.36002
[1800]	train's rmse: 1.04604	valid's rmse: 1.35825
[1900]	train's rmse: 1.03721	valid's rmse: 1.35754
[2000]	train's rmse: 1.02836	valid's rmse: 1.35723
[2100]	train's rmse: 1.02077	valid's rmse: 1.35716
[2200]	train's rmse: 1.01316	valid's rmse: 1.35705
[2300]	train's rmse: 1.00585	valid's rmse: 1.35691
[2400]	train's rmse: 0.999138	valid's rmse: 1.35687
[2500]	train's rmse: 0.992781	valid's rmse: 1.35693
Early stopping, best iteration is:
[2413]	train's rmse: 0.998381	valid's rmse: 1.3567
fold finish after 103.0801510810852
FOLD: 6
Training until validation scores don't improve for 100 rounds.
[100]	train's rmse: 1.62741	valid's rmse: 1.65445
[200]	train's rmse: 1.5284	valid's rmse: 1.59198
[300]	train's rmse: 1.44731	valid's rmse: 1.54628
[400]	train's rmse: 1.38107	valid's rmse: 1.5

In [24]:
Y_target = np.array(Y_target)

sub = pd.read_csv(data_dir+'/sample_submission.csv')
sub['target'] = Y_target.mean(axis=0)
sub.to_csv('sub_lgb_test.csv', index=False)