In [None]:
import pandas as pd
import numpy as np
from keras.models import Sequential
from keras.layers import Dense
from keras.layers import Dropout
from keras import backend as K
from keras.optimizers import Adam
from keras.callbacks import ModelCheckpoint,ReduceLROnPlateau,EarlyStopping
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.linear_model import ElasticNet
from sklearn.model_selection import GridSearchCV
import lightgbm as lgb
from sklearn.metrics import mean_squared_error

In [9]:
train=pd.read_csv('train.csv')
test=pd.read_csv('test.csv')

def drop_vars(df):
    tmp=df.shape[1]
    df = df[df.columns[[True]+list((df.var()!=0))]]
    print('0 var:',tmp-df.shape[1])
    
    corr_matrix = df[df.columns[2:]].corr().abs()
    upper = corr_matrix.where(np.triu(np.ones(corr_matrix.shape), k=1).astype(np.bool))
    to_drop = [column for column in upper.columns if any(upper[column] > 0.95)]
    tmp=df.shape[1]
    df=df.drop(to_drop, axis=1)
    print('Corr>0.95:',tmp-df.shape[1],'Now:',)
    
    corrs = dict()
    for i in range(df.shape[1]-2):
        corrs[df.columns[2+i]] = np.corrcoef(df['target'],df[df.columns[2+i]])[0,1]
    s = [k for k in corrs if abs(corrs[k])<0.1]
    tmp=df.shape[1]
    df=df.drop(s, axis=1)
    print('Corr Target <0.1:',tmp-df.shape[1])
    
    return df
train = drop_vars(train)

0 var: 256
Corr>0.95: 139 Now:
Corr Target <0.1: 4206


## Test Improvement

In [23]:
X=train[train.columns[2:]]
y=np.log1p(train['target'])

x_train,x_test,y_train,y_test = train_test_split(X,y,test_size=0.2,random_state=203)

sc=StandardScaler()
x_train=sc.fit_transform(x_train)
x_test=sc.transform(x_test)

def root_mean_squared_error(y_true, y_pred):
        return abs(np.sqrt(np.mean((y_pred - y_true)**2))) 

lgb_train = lgb.Dataset(x_train, y_train)
lgb_eval = lgb.Dataset(x_test, y_test, reference=lgb_train)

# specify your configurations as a dict
params = {
    'task': 'train',
    'boosting_type': 'gbdt',
    'objective': 'gamma',
    'metric': {'rmse'},
    'num_leaves': 50,
    'learning_rate': 0.008,
    'feature_fraction': 0.5,
    'bagging_fraction': 0.5,
    'bagging_freq': 4,
    'max_depth': -1,
    'reg_alpha': 0.3,
    'reg_lambda': 0.1,
    'min_child_weight': 10,
    'zero_as_missing': True,
    'verbose': 1
}

print('Start training...')
# train
gbm = lgb.train(params,
                lgb_train,
                num_boost_round=1000,
                valid_sets=lgb_eval,
                early_stopping_rounds=5)

print('Start predicting...')
# predict
y_pred = gbm.predict(x_test, num_iteration=gbm.best_iteration)
# eval
print('The rmse of prediction is:', root_mean_squared_error(y_test, y_pred))

Start training...
[1]	valid_0's rmse: 1.73905
Training until validation scores don't improve for 5 rounds.
[2]	valid_0's rmse: 1.73622
[3]	valid_0's rmse: 1.7331
[4]	valid_0's rmse: 1.73015
[5]	valid_0's rmse: 1.72709
[6]	valid_0's rmse: 1.72404
[7]	valid_0's rmse: 1.72075
[8]	valid_0's rmse: 1.71815
[9]	valid_0's rmse: 1.71527
[10]	valid_0's rmse: 1.71258
[11]	valid_0's rmse: 1.71013
[12]	valid_0's rmse: 1.70768
[13]	valid_0's rmse: 1.70481
[14]	valid_0's rmse: 1.70174
[15]	valid_0's rmse: 1.69912
[16]	valid_0's rmse: 1.69655
[17]	valid_0's rmse: 1.69368
[18]	valid_0's rmse: 1.69111
[19]	valid_0's rmse: 1.68873
[20]	valid_0's rmse: 1.68614
[21]	valid_0's rmse: 1.68322
[22]	valid_0's rmse: 1.68106
[23]	valid_0's rmse: 1.67854
[24]	valid_0's rmse: 1.67562
[25]	valid_0's rmse: 1.67322
[26]	valid_0's rmse: 1.67093
[27]	valid_0's rmse: 1.66864
[28]	valid_0's rmse: 1.66628
[29]	valid_0's rmse: 1.66408
[30]	valid_0's rmse: 1.66164
[31]	valid_0's rmse: 1.65921
[32]	valid_0's rmse: 1.6569
[33]

[277]	valid_0's rmse: 1.43525
[278]	valid_0's rmse: 1.43531
[279]	valid_0's rmse: 1.43532
[280]	valid_0's rmse: 1.43521
[281]	valid_0's rmse: 1.4347
[282]	valid_0's rmse: 1.43434
[283]	valid_0's rmse: 1.43388
[284]	valid_0's rmse: 1.43373
[285]	valid_0's rmse: 1.43341
[286]	valid_0's rmse: 1.43306
[287]	valid_0's rmse: 1.43251
[288]	valid_0's rmse: 1.43233
[289]	valid_0's rmse: 1.43217
[290]	valid_0's rmse: 1.43191
[291]	valid_0's rmse: 1.43167
[292]	valid_0's rmse: 1.4315
[293]	valid_0's rmse: 1.43156
[294]	valid_0's rmse: 1.43136
[295]	valid_0's rmse: 1.43105
[296]	valid_0's rmse: 1.43092
[297]	valid_0's rmse: 1.43081
[298]	valid_0's rmse: 1.4306
[299]	valid_0's rmse: 1.43025
[300]	valid_0's rmse: 1.43022
[301]	valid_0's rmse: 1.43001
[302]	valid_0's rmse: 1.42978
[303]	valid_0's rmse: 1.42938
[304]	valid_0's rmse: 1.42922
[305]	valid_0's rmse: 1.42901
[306]	valid_0's rmse: 1.42899
[307]	valid_0's rmse: 1.4288
[308]	valid_0's rmse: 1.42851
[309]	valid_0's rmse: 1.42823
[310]	valid_0'

From one of the successful ker

In [19]:
import time
learning_rates = [0.012,0.008]
for param in learning_rates:
    print("Learning Rate: ", param)
    modelstart= time.time()
    params["learning_rate"] = param
    # Find Optimal Parameters / Boosting Rounds
    lgb_cv = lgb.cv(
        params = params,
        train_set = lgb_train,
        num_boost_round=10000,
        stratified=False,
        nfold = 5,
        verbose_eval=200,
        seed = 23,
        early_stopping_rounds=75)

    optimal_rounds = np.argmin(lgb_cv['rmse-mean'])
    best_cv_score = min(lgb_cv['rmse-mean'])

    print("Optimal Round: {}\nOptimal Score: {} + {}".format(
        optimal_rounds,best_cv_score,lgb_cv['rmse-stdv'][optimal_rounds]))
    print("###########################################################################################")

Learning Rate:  0.1
Optimal Round: 48
Optimal Score: 1.4591752720018802 + 0.04680836418661446
###########################################################################################
Learning Rate:  0.01
[200]	cv_agg's rmse: 1.46691 + 0.0411651
[400]	cv_agg's rmse: 1.43651 + 0.0468959
[600]	cv_agg's rmse: 1.4336 + 0.0501861
Optimal Round: 551
Optimal Score: 1.4333652915263275 + 0.04992601520324436
###########################################################################################


In [25]:
x_train=train[train.columns[2:]]
y_train=np.log1p(train['target'])

x_test=test[x_train.columns]

sc=StandardScaler()
x_train=sc.fit_transform(x_train)
x_test=sc.transform(x_test)

def root_mean_squared_error(y_true, y_pred):
        return abs(K.sqrt(K.mean(K.square(y_pred - y_true), axis=-1)))

lgb_train = lgb.Dataset(x_train, y_train)

# specify your configurations as a dict
params = {
    'task': 'train',
    'boosting_type': 'gbdt',
    'objective': 'gamma',
    'metric': {'rmse'},
    'num_leaves': 50,
    'learning_rate': 0.008,
    'feature_fraction': 0.5,
    'bagging_fraction': 0.5,
    'bagging_freq': 4,
    'max_depth': -1,
    'reg_alpha': 0.3,
    'reg_lambda': 0.1,
    'min_child_weight': 10,
    'zero_as_missing': True,
    'verbose': 0
}

# train
gbm = lgb.train(params,
                lgb_train,
                num_boost_round=10000)

y_pred = gbm.predict(x_test, num_iteration=gbm.best_iteration)

In [26]:
predictions=pd.DataFrame({'ID':test['ID'],'target':np.expm1(y_pred)})
print(predictions.head())
predictions.to_csv('pred_pure_lightgbm.csv',index=False)

          ID        target
0  000137c73  1.627427e+06
1  00021489f  1.977776e+06
2  0004d7953  3.926302e+06
3  00056a333  7.194788e+06
4  00056d8eb  1.977776e+06
