In [1]:
import pandas as pd
import numpy as np
from keras.models import Sequential
from keras.layers import Dense
from keras.layers import Dropout
from keras import backend as K
from keras.optimizers import Adam
from keras.callbacks import ModelCheckpoint,ReduceLROnPlateau,EarlyStopping
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.linear_model import ElasticNet
from sklearn.model_selection import GridSearchCV
import lightgbm as lgb
from sklearn.metrics import mean_squared_error

  from ._conv import register_converters as _register_converters
Using TensorFlow backend.


In [None]:
#https://www.kaggle.com/johnfarrell/breaking-lb-fresh-start-with-lag-selection/output
train_base=pd.read_csv('train.csv')
test_base=pd.read_csv('test.csv')

train_leak=pd.read_csv('train_leak.csv')
test_leak=pd.read_csv('test_leak.csv')

trainleak = train_leak['compiled_leak'].values
trainlogleak = np.log1p(train_leak['compiled_leak'].values)
testleak = test_leak['compiled_leak'].values
testlogleak = np.log1p(test_leak['compiled_leak'].values)

In [None]:
def drop_vars(df):
    tmp=df.shape[1]
    df = df[df.columns[[True]+list((df.var()!=0))]]
    print('0 var:',tmp-df.shape[1])
    
    corr_matrix = df[df.columns[2:]].corr().abs()
    upper = corr_matrix.where(np.triu(np.ones(corr_matrix.shape), k=1).astype(np.bool))
    to_drop = [column for column in upper.columns if any(upper[column] > 0.95)]
    tmp=df.shape[1]
    df=df.drop(to_drop, axis=1)
    print('Corr>0.95:',tmp-df.shape[1])
    
    corrs = dict()
    for i in range(df.shape[1]-2):
        corrs[df.columns[2+i]] = np.corrcoef(df['target'],df[df.columns[2+i]])[0,1]
    s = [k for k in corrs if abs(corrs[k])<0.1]
    tmp=df.shape[1]
    df=df.drop(s, axis=1)
    print('Corr Target <0.1:',tmp-df.shape[1])
    
    return df
tmp=drop_vars(train_base)

In [None]:
train = tmp.copy()
test = test_base.copy()

train['leak'] = trainleak
train['log_leak'] = trainlogleak
test['leak'] = testleak
test['log_leak'] = testlogleak

train = train[np.isfinite(train['leak'])]
test = test[np.isfinite(test['leak'])]

#https://www.kaggle.com/ogrellier/feature-scoring-vs-zeros/notebook
features = [f for f in train if f not in ['ID', 'leak', 'log_leak', 'target']]
train.replace(0, np.nan, inplace=True)
train['log_of_mean'] = np.log1p(train[features].replace(0, np.nan).mean(axis=1))
train['mean_of_log'] = np.log1p(train[features]).replace(0, np.nan).mean(axis=1)
train['log_of_median'] = np.log1p(train[features].replace(0, np.nan).median(axis=1))
train['nnans'] = train[features].isnull().sum(axis=1)
train['sum'] = np.log1p(train[features].sum(axis=1))
train['std'] = train[features].std(axis=1)
train['kurtosis'] = train[features].kurtosis(axis=1)

test.replace(0, np.nan, inplace=True)
test['log_of_mean'] = np.log1p(test[features].replace(0, np.nan).mean(axis=1))
test['mean_of_log'] = np.log1p(test[features]).replace(0, np.nan).mean(axis=1)
test['log_of_median'] = np.log1p(test[features].replace(0, np.nan).median(axis=1))
test['nnans'] = test[features].isnull().sum(axis=1)
test['sum'] = np.log1p(test[features].sum(axis=1))
test['std'] = test[features].std(axis=1)
test['kurtosis'] = test[features].kurtosis(axis=1)

In [None]:
train.replace(np.nan,0, inplace=True)
test.replace(np.nan,0, inplace=True)

## Test Improvement

In [None]:
X=train[train.columns[2:]]
y=np.log1p(train['target'])
x_train,x_test,y_train,y_test = train_test_split(X,y,test_size=0.2,random_state=203)

sc=StandardScaler()
x_train=sc.fit_transform(x_train)
x_test=sc.transform(x_test)

def root_mean_squared_error(y_true, y_pred):
        return abs(np.sqrt(np.mean((y_pred - y_true)**2))) 

lgb_train = lgb.Dataset(x_train, y_train)
lgb_eval = lgb.Dataset(x_test, y_test, reference=lgb_train)

# specify your configurations as a dict
params = {
    'task': 'train',
    'boosting_type': 'gbdt',
    'objective': 'gamma',
    'metric': {'rmse'},
    'num_leaves': 50,
    'learning_rate': 0.01,
    'feature_fraction': 0.5,
    'bagging_fraction': 0.5,
    'bagging_freq': 4,
    'max_depth': -1,
    'reg_alpha': 0.3,
    'reg_lambda': 0.1,
    'min_child_weight': 10,
    'zero_as_missing': True,
    'verbose': 1
}

print('Start training...')
# train
gbm = lgb.train(params,
                lgb_train,
                num_boost_round=1000,
                valid_sets=lgb_eval,
                early_stopping_rounds=5)

print('Start predicting...')
# predict
y_pred = gbm.predict(x_test, num_iteration=gbm.best_iteration)
# eval
print('The rmse of prediction is:', root_mean_squared_error(y_test, y_pred))

From one of the successful kernels:

In [30]:
import time
learning_rates = [0.015,0.01]
for param in learning_rates:
    print("Learning Rate: ", param)
    modelstart= time.time()
    params["learning_rate"] = param
    # Find Optimal Parameters / Boosting Rounds
    lgb_cv = lgb.cv(
        params = params,
        train_set = lgb_train,
        num_boost_round=1000,
        stratified=False,
        nfold = 5,
        verbose_eval=200,
        seed = 23,
        early_stopping_rounds=75)

    optimal_rounds = np.argmin(lgb_cv['rmse-mean'])
    best_cv_score = min(lgb_cv['rmse-mean'])

    print("Optimal Round: {}\nOptimal Score: {} + {}".format(
        optimal_rounds,best_cv_score,lgb_cv['rmse-stdv'][optimal_rounds]))
    print("###########################################################################################")

Learning Rate:  0.015
[200]	cv_agg's rmse: 0.73944 + 0.0362683
[400]	cv_agg's rmse: 0.717533 + 0.0390627
Optimal Round: 498
Optimal Score: 0.7159211643356415 + 0.03957621054593416
###########################################################################################
Learning Rate:  0.01
[200]	cv_agg's rmse: 0.789428 + 0.0344841
[400]	cv_agg's rmse: 0.721741 + 0.0397164
[600]	cv_agg's rmse: 0.714193 + 0.0402347
[800]	cv_agg's rmse: 0.713488 + 0.0402673
Optimal Round: 739
Optimal Score: 0.713152119703968 + 0.040588511405705394
###########################################################################################


In [33]:
%%time
x_train=train[train.columns[2:]]
y_train=np.log1p(train['target'])

x_test=test[x_train.columns]

sc=StandardScaler()
x_train=sc.fit_transform(x_train)
x_test=sc.transform(x_test)

def root_mean_squared_error(y_true, y_pred):
        return abs(K.sqrt(K.mean(K.square(y_pred - y_true), axis=-1)))

lgb_train = lgb.Dataset(x_train, y_train)

# specify your configurations as a dict
params = {
    'task': 'train',
    'boosting_type': 'gbdt',
    'objective': 'gamma',
    'metric': {'rmse'},
    'num_leaves': 50,
    'learning_rate': 0.01,
    'feature_fraction': 0.5,
    'bagging_fraction': 0.5,
    'bagging_freq': 4,
    'max_depth': -1,
    'reg_alpha': 0.3,
    'reg_lambda': 0.1,
    'min_child_weight': 10,
    'zero_as_missing': True,
    'verbose': 0
}

# train
gbm = lgb.train(params,
                lgb_train,
                num_boost_round=10000)

y_pred = gbm.predict(x_test, num_iteration=gbm.best_iteration)

Wall time: 1min 26s


In [34]:
predictions=pd.DataFrame({'ID':test['ID'],'target':np.expm1(y_pred)})
print(predictions.head())
predictions.to_csv('pred_lgbmleak.csv',index=False)

          ID        target
0  000137c73  1.576021e+06
1  00021489f  2.404527e+06
2  0004d7953  2.769994e+06
3  00056a333  8.087247e+06
4  00056d8eb  2.404527e+06
