In [1]:
import pandas as pd
import numpy as np
from sklearn import metrics, linear_model
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from lightgbm import LGBMRegressor

data_path = 'data/'

#lgb25 = pd.read_csv('../lgCV_2505.csv.gz', compression='gzip')
lgb27 = pd.read_csv('../lgCV_2705B.csv.gz', compression='gzip')
rnn = pd.read_csv('../rnnCV_2805.csv.gz', compression='gzip')
mlp = pd.read_csv('../mlpCV_2505.csv.gz', compression='gzip')
truth = pd.read_csv('../../%strain.csv.zip'%(data_path), compression='zip')
y = truth['deal_probability'].values
truth.drop('deal_probability', 1)
test =  pd.read_csv('../../%stest.csv.zip'%(data_path), compression='zip')
test['deal_probability']=float('NAN') 
truth = pd.concat([truth,test[truth.columns]],axis=0)

In [2]:
#lgb25.rename(columns={'deal_probability': 'lgb25_preds' }, inplace=True)
lgb27.rename(columns={'deal_probability': 'lgb27_preds' }, inplace=True)
mlp.rename(columns={'deal_probability': 'mlp_preds' }, inplace=True)
preds_df = lgb27.merge(rnn, on='item_id')\
                .merge(mlp, on='item_id')\
                .merge(truth, on='item_id',how='left')

In [3]:
preds_df['difference_l27_r']=preds_df['lgb27_preds'] - preds_df['rnn_preds']
preds_df['difference_l27_m']=preds_df['lgb27_preds'] - preds_df['mlp_preds']
#preds_df['difference_l27_l25']=preds_df['lgb27_preds'] - preds_df['lgb25_preds']
preds_df['price'].fillna(-1,inplace=True)

In [4]:
idx = preds_df['deal_probability']==preds_df['deal_probability']
print(idx.value_counts())
for col in [c for c in preds_df.columns if '_preds' in c]:
    print('RMSE %s: '%(col), np.sqrt(metrics.mean_squared_error(preds_df['deal_probability'][idx].values, preds_df[col][idx].values)))

True     1503424
False     508438
Name: deal_probability, dtype: int64
RMSE lgb27_preds:  0.21681527582821059
RMSE rnn_preds:  0.21771476573780568
RMSE mlp_preds:  0.2187544351068867


In [6]:
cols = ['lgb27_preds', 'rnn_preds','mlp_preds','price', \
        'difference_l27_r', 'difference_l27_m', \
        'region', 'param_1', 'parent_category_name', 'category_name' ]
for col in ['region', 'param_1', 'parent_category_name', 'category_name']:
    preds_df[col] = LabelEncoder().fit_transform(preds_df[col].fillna("0"))
train_df = preds_df[~preds_df['deal_probability'].isnull()]
test_df = preds_df[preds_df['deal_probability'].isnull()]

In [14]:
full=True
if full:
    n_estimators = 1326
    train_X = train_df[cols]
    train_y = y
    eval_set = [(train_X,train_y)]
else:
    n_estimators = 4000
    train_X, valid_X, train_y, valid_y = train_test_split(train_df[cols], y, train_size=.8, random_state=12345)
    eval_set = [(train_X,train_y),(valid_X,valid_y)]

In [15]:
len(train_X)

1503424

In [16]:
clf = LGBMRegressor(n_estimators=n_estimators, 
                    max_depth=-1, 
                    feature_fraction= 0.5,
                    num_leaves=32, 
                    learning_rate=.02)#, device='gpu')

In [17]:
%%time
clf.fit(train_X, train_y, early_stopping_rounds=80, 
        eval_set=eval_set, eval_metric='rmse', verbose=10, 
        categorical_feature=['param_1','region','parent_category_name','category_name'])



Training until validation scores don't improve for 80 rounds.
[10]	training's rmse: 0.246228
[20]	training's rmse: 0.236839
[30]	training's rmse: 0.230417
[40]	training's rmse: 0.225403
[50]	training's rmse: 0.221917
[60]	training's rmse: 0.219639
[70]	training's rmse: 0.217957
[80]	training's rmse: 0.216843
[90]	training's rmse: 0.216015
[100]	training's rmse: 0.215493
[110]	training's rmse: 0.215092
[120]	training's rmse: 0.214796
[130]	training's rmse: 0.214576
[140]	training's rmse: 0.214408
[150]	training's rmse: 0.214283
[160]	training's rmse: 0.214189
[170]	training's rmse: 0.214111
[180]	training's rmse: 0.214055
[190]	training's rmse: 0.214
[200]	training's rmse: 0.21396
[210]	training's rmse: 0.21392
[220]	training's rmse: 0.213885
[230]	training's rmse: 0.213855
[240]	training's rmse: 0.213828
[250]	training's rmse: 0.213798
[260]	training's rmse: 0.213774
[270]	training's rmse: 0.213751
[280]	training's rmse: 0.213728
[290]	training's rmse: 0.213708
[300]	training's rmse: 0

LGBMRegressor(boosting_type='gbdt', class_weight=None, colsample_bytree=1.0,
       feature_fraction=0.5, learning_rate=0.02, max_depth=-1,
       min_child_samples=20, min_child_weight=0.001, min_split_gain=0.0,
       n_estimators=1326, n_jobs=-1, num_leaves=32, objective=None,
       random_state=None, reg_alpha=0.0, reg_lambda=0.0, silent=True,
       subsample=1.0, subsample_for_bin=200000, subsample_freq=1)

In [18]:
test_df['deal_probability'] = clf.predict(test_df[cols] )
test_df['deal_probability'] = np.clip(test_df['deal_probability'], .0001, .9999)
test_df[['item_id', 'deal_probability']].to_csv('../lgbbsub_3105AL2.csv.gz', compression='gzip', index=False, header=True)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """Entry point for launching an IPython kernel.
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  


In [None]:
test_df[['item_id', 'deal_probability']].head()