In [1]:
import pandas as pd
import numpy as np
from sklearn import metrics, linear_model
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from lightgbm import LGBMRegressor

lgb = pd.read_csv('../lgCV_2505.csv.gz', compression='gzip')
rnn = pd.read_csv('../rnnCV_2805.csv.gz', compression='gzip')
mlp = pd.read_csv('../mlpCV_2505.csv.gz', compression='gzip')
truth = pd.read_csv('../../train.csv.zip', compression='zip')
y = truth['deal_probability'].values
truth.drop('deal_probability', 1)
test =  pd.read_csv('../../test.csv.zip', compression='zip')
test['deal_probability']=float('NAN') 
truth = pd.concat([truth,test[truth.columns]],axis=0)

lgb.rename(columns={'deal_probability': 'lgb_preds' }, inplace=True)
mlp.rename(columns={'deal_probability': 'mlp_preds' }, inplace=True)
preds_df = lgb.merge(rnn, on='item_id').merge(mlp, on='item_id').merge(truth, on='item_id',how='left')


In [2]:
preds_df['difference_l_r']=preds_df['lgb_preds'] - preds_df['rnn_preds']
preds_df['price'].fillna(-1,inplace=True)

In [52]:
cols = ['lgb_preds','rnn_preds','mlp_preds','price', 'difference_l_r', 'region', 'param_1', 'parent_category_name', 'category_name']
for col in cols[5:]:
    preds_df[col] = LabelEncoder().fit_transform(preds_df[col].fillna("0"))
train_df = preds_df[~preds_df['deal_probability'].isnull()]
test_df = preds_df[preds_df['deal_probability'].isnull()]


full=True
if full:
    n_estimators = 500
    train_X = train_df[cols]
    train_y = y
    eval_set = [(train_X,train_y)]
else:
    n_estimators = 4000
    train_X, valid_X, train_y, valid_y = train_test_split(train_df[cols], y, train_size=.8, random_state=12345)
    eval_set = [(train_X,train_y),(valid_X,valid_y)]


In [53]:
len(train_X)

1503424

In [54]:
clf = LGBMRegressor(n_estimators=n_estimators, max_depth=-1, num_leaves=16, learning_rate=.05, device='gpu')

In [55]:
%%time
clf.fit(train_X, train_y, early_stopping_rounds=40, eval_set=eval_set, eval_metric='rmse', verbose=10, categorical_feature=['param_1','region','parent_category_name','category_name'])



Training until validation scores don't improve for 40 rounds.
[10]	training's rmse: 0.232567
[20]	training's rmse: 0.221564
[30]	training's rmse: 0.217291
[40]	training's rmse: 0.215604
[50]	training's rmse: 0.214908
[60]	training's rmse: 0.214597
[70]	training's rmse: 0.21444
[80]	training's rmse: 0.214349
[90]	training's rmse: 0.214284
[100]	training's rmse: 0.214236
[110]	training's rmse: 0.214192
[120]	training's rmse: 0.214148
[130]	training's rmse: 0.214111
[140]	training's rmse: 0.214078
[150]	training's rmse: 0.214047
[160]	training's rmse: 0.214014
[170]	training's rmse: 0.213985
[180]	training's rmse: 0.213955
[190]	training's rmse: 0.213928
[200]	training's rmse: 0.213896
[210]	training's rmse: 0.21387
[220]	training's rmse: 0.213837
[230]	training's rmse: 0.213807
[240]	training's rmse: 0.213782
[250]	training's rmse: 0.213754
[260]	training's rmse: 0.213729
[270]	training's rmse: 0.213706
[280]	training's rmse: 0.213684
[290]	training's rmse: 0.213659
[300]	training's rmse

LGBMRegressor(boosting_type='gbdt', class_weight=None, colsample_bytree=1.0,
       device='gpu', learning_rate=0.05, max_depth=-1,
       min_child_samples=20, min_child_weight=0.001, min_split_gain=0.0,
       n_estimators=500, n_jobs=-1, num_leaves=16, objective=None,
       random_state=None, reg_alpha=0.0, reg_lambda=0.0, silent=True,
       subsample=1.0, subsample_for_bin=200000, subsample_freq=1)

In [58]:
test_df['deal_probability'] = clf.predict(test_df[cols] )
test_df['deal_probability'] = np.clip(test_df['deal_probability'], .0001, .9999)
test_df[['item_id', 'deal_probability']].to_csv('../lgbbsub_3005L2.csv.gz', compression='gzip', index=False, header=True)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """Entry point for launching an IPython kernel.
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  


In [57]:
test_df[['item_id', 'deal_probability']].head()

Unnamed: 0,item_id,deal_probability
1503424,6544e41a8817,0.403892
1503425,65b9484d670f,0.160688
1503426,8bab230b2ecd,0.136144
1503427,8e348601fefc,0.061317
1503428,8bd2fe400b89,0.28688
