In [25]:
import numpy as np
import pickle

with open('cache/traintest.pkl', 'rb') as f:
    df_train, df_test, train_y = pickle.load(f)
    
with open('cache/traintest_decomposition2_50.pkl', 'rb') as f:
    out50_train, out50_test = pickle.load(f)
        
with open('cache/non_zero_stats.pkl', 'rb') as f:
    agg_train, agg_test = pickle.load(f)        

In [26]:
from sklearn.feature_selection import VarianceThreshold    
import pandas as pd
from sklearn import preprocessing, model_selection, metrics
import lightgbm as lgb


In [27]:
df_train.shape, out50_train.shape, agg_train.shape

((4459, 4735), (4459, 100), (4459, 18))

In [28]:


def low_variance(df_train):
    s = VarianceThreshold(threshold=.0)
    s.fit(df_train)
    idxs = s.get_support()
    to_remove = [c for c, x in zip(df_train.columns, idxs) if not x]
    return to_remove

def nunique(df_train):
    _, index = np.unique(df_train.values, return_index=True, axis=1)
    return [c for idx, c in enumerate(df_train.columns) if idx not in index]

def suspicious(df_train):
    # drop zero fraction
    thr = 0.98
    to_ignore = []
    for column in df_train.columns:
        counts = df_train[column].value_counts()
        value_fraction = counts[0] / len(df_train)
        if value_fraction >= thr:
            to_ignore.append(column)
    return to_ignore



In [29]:
to_ignore = set()
to_ignore |= set(low_variance(df_train))
to_ignore |= set(nunique(df_train))
to_ignore |= set(suspicious(df_train))

In [30]:
len(to_ignore)

2614

In [31]:
features = [c for c in df_train if c not in to_ignore]

In [32]:
len(features)

2121

In [33]:
train_X = pd.concat([df_train[features], out50_train, agg_train], axis=1)

In [39]:
train_X = pd.concat([out50_train, agg_train], axis=1)

In [40]:
train_X.shape

(4459, 118)

In [35]:

def run_cv1(train_X, train_y):
    params = {
        "objective" : "regression",
        "metric" : "rmse",
        "num_leaves" : 180,
        "max_depth": -1,
        "learning_rate" : 0.01,
        'min_child_weight': 10,
        'reg_lambda': 0.1,
        'subsample': 0.5,
        'scale_pos_weight': 1,
        'reg_alpha': 0.0,
        'colsample_bytree': 0.5,
        'max_bin': 255,
        'subsample_freq': 4,
        'min_child_samples': 1,
        
        
        #'min_data_in_leaf': 100,
        #"bagging_fraction" : 0.7,
        #"feature_fraction" : 0.1,
        #"bagging_frequency" : 1,
        #"bagging_seed" : 2018,
        "verbosity" : -1
    }
    
    lgtrain = lgb.Dataset(train_X, 
                          label=train_y)
    hist = lgb.cv(params, 
                  lgtrain, 
                  10000, 
                  nfold=5, 
                  early_stopping_rounds=100, 
                  stratified=False,
                  verbose_eval=50)


In [93]:
def run_cv2(train_X, train_y):
    params = {
        "objective" : "regression",
        "metric" : "rmse",
        "num_leaves" : 127,
        "max_depth": 6,
        "learning_rate" : 0.005,
        'min_child_weight': 10,
        'reg_lambda': 0.1,
        'subsample': 0.5,
        'scale_pos_weight': 1,
        'reg_alpha': 0.0,
        #'colsample_bytree': 0.5,
        'max_bin': 255,
        # 'subsample_freq': 4,
        #'min_child_samples': 1,
                
        #'min_data_in_leaf': 100,
        "bagging_fraction" : 0.2,
        "feature_fraction" : 0.15,
        "bagging_frequency" : 10,
        #"bagging_seed" : 2018,
        "verbosity" : -1
    }
    
    lgtrain = lgb.Dataset(train_X, 
                          label=train_y)
    hist = lgb.cv(params, 
                  lgtrain, 
                  10000, 
                  nfold=5, 
                  early_stopping_rounds=100, 
                  stratified=False,
                  verbose_eval=50)

In [94]:
#run_cv(train_X, train_y)
run_cv2(train_X, train_y)

[50]	cv_agg's rmse: 1.63862 + 0.0339141
[100]	cv_agg's rmse: 1.55571 + 0.0288839
[150]	cv_agg's rmse: 1.49253 + 0.0243665
[200]	cv_agg's rmse: 1.44848 + 0.0208897
[250]	cv_agg's rmse: 1.41825 + 0.0185473
[300]	cv_agg's rmse: 1.39655 + 0.0164074
[350]	cv_agg's rmse: 1.38128 + 0.0154288
[400]	cv_agg's rmse: 1.36972 + 0.0147235
[450]	cv_agg's rmse: 1.36155 + 0.0143164
[500]	cv_agg's rmse: 1.35585 + 0.0141555
[550]	cv_agg's rmse: 1.35141 + 0.0141661
[600]	cv_agg's rmse: 1.34777 + 0.0141796
[650]	cv_agg's rmse: 1.34514 + 0.0143395
[700]	cv_agg's rmse: 1.34308 + 0.0145328
[750]	cv_agg's rmse: 1.34174 + 0.0147824
[800]	cv_agg's rmse: 1.34091 + 0.014901
[850]	cv_agg's rmse: 1.34013 + 0.0153245
[900]	cv_agg's rmse: 1.33949 + 0.0153963
[950]	cv_agg's rmse: 1.33898 + 0.0153247
[1000]	cv_agg's rmse: 1.33889 + 0.0155649
[1050]	cv_agg's rmse: 1.33858 + 0.015684
[1100]	cv_agg's rmse: 1.3384 + 0.0158184
[1150]	cv_agg's rmse: 1.33821 + 0.0160593
[1200]	cv_agg's rmse: 1.33807 + 0.0162804
[1250]	cv_agg's

[50]	cv_agg's rmse: 1.5136 + 0.0274603
[100]	cv_agg's rmse: 1.41094 + 0.0193085
[150]	cv_agg's rmse: 1.36727 + 0.0179535
[200]	cv_agg's rmse: 1.34997 + 0.0174103
[250]	cv_agg's rmse: 1.3438 + 0.0174458
[300]	cv_agg's rmse: 1.34152 + 0.0189359
[350]	cv_agg's rmse: 1.34045 + 0.0191162
[400]	cv_agg's rmse: 1.34092 + 0.0192083


In [None]:
"""
[50]	cv_agg's rmse: 1.517 + 0.0271728
[100]	cv_agg's rmse: 1.41489 + 0.0165783
[150]	cv_agg's rmse: 1.37202 + 0.013006
[200]	cv_agg's rmse: 1.35475 + 0.0114454
[250]	cv_agg's rmse: 1.34908 + 0.0120152
[300]	cv_agg's rmse: 1.34677 + 0.012814
[350]	cv_agg's rmse: 1.34612 + 0.0133199
[400]	cv_agg's rmse: 1.34716 + 0.0137825
[450]	cv_agg's rmse: 1.34801 + 0.0132945

"""

In [95]:
def run_lgb(train_X, train_y, val_X, val_y, test_X):
    params = {
        "objective" : "regression",
        "metric" : "rmse",
        "num_leaves" : 127,
        "max_depth": 6,
        "learning_rate" : 0.005,
        'min_child_weight': 10,
        'reg_lambda': 0.1,
        'subsample': 0.5,
        'scale_pos_weight': 1,
        'reg_alpha': 0.0,
        #'colsample_bytree': 0.5,
        'max_bin': 255,
        # 'subsample_freq': 4,
        #'min_child_samples': 1,
                
        #'min_data_in_leaf': 100,
        "bagging_fraction" : 0.2,
        "feature_fraction" : 0.15,
        "bagging_frequency" : 10,
        #"bagging_seed" : 2018,
        "verbosity" : -1
    }
    
    lgtrain = lgb.Dataset(train_X, label=train_y)
    lgval = lgb.Dataset(val_X, label=val_y)
    evals_result = {}
    model = lgb.train(params, 
                      lgtrain, 
                      num_boost_round=10000, 
                      valid_sets=[lgtrain, lgval], 
                      early_stopping_rounds=100, 
                      verbose_eval=200, 
                      evals_result=evals_result)
    
    #pred_test_y = model.predict(test_X, num_iteration=model.best_iteration)
    return model, evals_result

In [96]:
#test_X = pd.concat([df_test[features], out100_test, agg_test], axis=1)
test_X = pd.concat([out50_test, agg_test], axis=1)

                    
folds = 3
kf = model_selection.KFold(n_splits=folds, shuffle=True, random_state=0)

sub_preds = np.zeros(test_X.shape[0])
oof_preds = np.zeros(train_X.shape[0])

for fold_no, (dev_index, val_index) in enumerate(kf.split(train_X)):
    dev_X, val_X = train_X.loc[dev_index,:], train_X.loc[val_index,:]
    dev_y, val_y = train_y[dev_index], train_y[val_index]
    model, evals_result = run_lgb(dev_X, dev_y, val_X, val_y, test_X)
    
    oof_preds[val_index] = model.predict(val_X, num_iteration=model.best_iteration)
    fold_rmse = np.sqrt(metrics.mean_squared_error(val_y, oof_preds[val_index]))
    print(f"fold {fold_no}: ", fold_rmse)    
    
    pred_test = model.predict(test_X, num_iteration=model.best_iteration)    
    sub_preds += np.expm1(pred_test) / folds

Training until validation scores don't improve for 100 rounds.
[200]	training's rmse: 1.34303	valid_1's rmse: 1.45576
[400]	training's rmse: 1.17747	valid_1's rmse: 1.3735
[600]	training's rmse: 1.08174	valid_1's rmse: 1.34897
[800]	training's rmse: 1.01719	valid_1's rmse: 1.34243
[1000]	training's rmse: 0.969818	valid_1's rmse: 1.33968
[1200]	training's rmse: 0.927572	valid_1's rmse: 1.33838
[1400]	training's rmse: 0.88945	valid_1's rmse: 1.33817
Early stopping, best iteration is:
[1364]	training's rmse: 0.896116	valid_1's rmse: 1.33791
fold 0:  1.337908326377923
Training until validation scores don't improve for 100 rounds.
[200]	training's rmse: 1.34709	valid_1's rmse: 1.45252
[400]	training's rmse: 1.17712	valid_1's rmse: 1.36703
[600]	training's rmse: 1.07979	valid_1's rmse: 1.34392
[800]	training's rmse: 1.00923	valid_1's rmse: 1.33792
[1000]	training's rmse: 0.955367	valid_1's rmse: 1.33627
[1200]	training's rmse: 0.913423	valid_1's rmse: 1.33629
Early stopping, best iteration i

Training until validation scores don't improve for 100 rounds.
[200]	training's rmse: 0.834297	valid_1's rmse: 1.26202
[400]	training's rmse: 0.515362	valid_1's rmse: 1.23768
Early stopping, best iteration is:
[495]	training's rmse: 0.421357	valid_1's rmse: 1.23653
fold 0:  1.236528004279091
Training until validation scores don't improve for 100 rounds.
[200]	training's rmse: 0.828484	valid_1's rmse: 1.43361
Early stopping, best iteration is:
[214]	training's rmse: 0.797659	valid_1's rmse: 1.43121
fold 1:  1.4312126457898486
Training until validation scores don't improve for 100 rounds.
[200]	training's rmse: 0.825663	valid_1's rmse: 1.31121
[400]	training's rmse: 0.509487	valid_1's rmse: 1.29656
Early stopping, best iteration is:
[449]	training's rmse: 0.457818	valid_1's rmse: 1.29514
fold 2:  1.2951411121611769
Training until validation scores don't improve for 100 rounds.
[200]	training's rmse: 0.829188	valid_1's rmse: 1.36447
Early stopping, best iteration is:
[287]	training's rmse

In [97]:
sample = pd.read_csv('input/test.csv', usecols=['ID'])
sample["target"] = sub_preds
sample.to_csv("submissions/subm_012b.csv", index=False)

In [98]:
subm_000 = pd.read_csv('submissions/subm_000.csv')
subm_010 = pd.read_csv('submissions/subm_010.csv')
subm_011 = pd.read_csv('submissions/subm_011.csv')
subm_012a = pd.read_csv('submissions/subm_012a.csv')
subm_012b = pd.read_csv('submissions/subm_012b.csv')


In [99]:
subm_012a['subm_000'] = subm_000.target.values
subm_012a['subm_010'] = subm_010.target.values
subm_012a['subm_011'] = subm_011.target.values
subm_012a['subm_012b'] = subm_012b.target.values

In [102]:
subm_012b.target = subm_012b.target * 0.6 + subm_000.target * 0.2 + subm_010.target * 0.2
subm_012b.to_csv("submissions/subm_012.csv", index=False)