In [1]:
import numpy as np
import pandas as pd
import datetime
import gc
import matplotlib.pyplot as plt

import lightgbm as lgb
from sklearn.model_selection import train_test_split, GridSearchCV, KFold, StratifiedKFold, cross_val_score
from hyperopt import fmin, tpe, hp, STATUS_OK, Trials
from sklearn.metrics import mean_squared_error
import warnings
warnings.filterwarnings('ignore')


In [2]:
# Read the four CSV inputs in a fixed order and bind each to its own frame.
df_train, df_test, df_hist_trans, df_new_merchant_trans = (
    pd.read_csv(csv_path)
    for csv_path in (
        '/root/data/train.csv',
        '/root/data/test.csv',
        '/root/data/historical_transactions.csv',
        '/root/data/new_merchant_transactions.csv',
    )
)

In [3]:
# Fill missing values in three columns of both transaction tables.
# The filled column is assigned back to the frame instead of calling
# `df[col].fillna(..., inplace=True)`: that chained in-place form operates on
# a temporary Series, triggers a FutureWarning on recent pandas and becomes a
# silent no-op once Copy-on-Write is enabled.
fill_values = {
    'category_2': 1.0,
    'category_3': 'A',
    'merchant_id': 'M_ID_00a6ca8a8a',
}
for df in [df_hist_trans, df_new_merchant_trans]:
    for col, value in fill_values.items():
        df[col] = df[col].fillna(value)

In [4]:
def get_new_columns(name,aggs):
    """Build flat column names for a dict-style aggregation spec.

    For every key ``k`` of ``aggs`` (in iteration order) and every entry
    ``agg`` of ``aggs[k]`` this yields ``name + '_' + k + '_' + agg``.

    Parameters
    ----------
    name : str
        Prefix placed in front of every generated name.
    aggs : dict
        Maps a column name to a list of aggregation names.

    Returns
    -------
    list of str
        One name per (column, aggregation) pair, column-major.
    """
    columns = []
    for key, agg_names in aggs.items():
        for agg_name in agg_names:
            columns.append('_'.join([name, key, agg_name]))
    return columns

In [5]:
# Derive calendar features and 0/1 flags on both transaction tables.
# "Now" is taken once so both tables are measured against the same instant.
# NOTE(review): month_diff depends on the date the notebook is run.
reference_time = datetime.datetime.today()
for df in [df_hist_trans,df_new_merchant_trans]:
    df['purchase_date'] = pd.to_datetime(df['purchase_date'])
    purchase_dt = df['purchase_date'].dt
    df['year'] = purchase_dt.year
    # `Series.dt.weekofyear` was removed in pandas 2.0. Use the ISO calendar
    # week where available and fall back to the old accessor otherwise.
    if hasattr(purchase_dt, 'isocalendar'):
        iso_week = purchase_dt.isocalendar().week
        # Mirror the old accessor's dtypes: int64 when every date is present,
        # float64 (NaN) when some dates are missing.
        df['weekofyear'] = iso_week.astype('float64' if iso_week.isna().any() else 'int64')
    else:
        df['weekofyear'] = purchase_dt.weekofyear
    df['month'] = purchase_dt.month
    df['dayofweek'] = purchase_dt.dayofweek
    # weekday 5 and 6 are flagged as weekend
    df['weekend'] = (purchase_dt.weekday >=5).astype(int)
    df['hour'] = purchase_dt.hour
    # 'Y'/'N' -> 1/0; any other value becomes NaN
    df['authorized_flag'] = df['authorized_flag'].map({'Y':1, 'N':0})
    df['category_1'] = df['category_1'].map({'Y':1, 'N':0})
    #https://www.kaggle.com/c/elo-merchant-category-recommendation/discussion/73244
    # Whole 30-day periods between the purchase and now, shifted by month_lag.
    df['month_diff'] = ((reference_time - df['purchase_date']).dt.days)//30
    df['month_diff'] += df['month_lag']

In [6]:
# Aggregation spec for the historical transactions: column -> aggregations
# computed per card_id. Key order matters because get_new_columns() derives
# the flat column names from it.
aggs = {
    col: ['nunique']
    for col in ['month','hour','weekofyear','dayofweek','year','subsector_id','merchant_id','merchant_category_id']
}
aggs.update({
    'purchase_amount': ['sum','max','min','mean','var'],
    'installments': ['sum','max','min','mean','var'],
    'purchase_date': ['max','min'],
    'month_lag': ['max','min','mean','var'],
    'month_diff': ['mean'],
    'authorized_flag': ['sum', 'mean'],
    'weekend': ['sum', 'mean'],
    'category_1': ['sum', 'mean'],
    'card_id': ['size'],
})

# Mean purchase_amount of each category level, broadcast back to the rows,
# then averaged per card by the aggregation below.
for col in ['category_2','category_3']:
    mean_col = col+'_mean'
    df_hist_trans[mean_col] = df_hist_trans.groupby([col])['purchase_amount'].transform('mean')
    aggs[mean_col] = ['mean']

new_columns = get_new_columns('hist',aggs)
df_hist_trans_group = df_hist_trans.groupby('card_id').agg(aggs)
df_hist_trans_group.columns = new_columns
df_hist_trans_group = df_hist_trans_group.reset_index(drop=False)

# Span between first and last purchase, span per transaction, and days since
# the last purchase (relative to the run date).
purchase_max = df_hist_trans_group['hist_purchase_date_max']
purchase_min = df_hist_trans_group['hist_purchase_date_min']
df_hist_trans_group['hist_purchase_date_diff'] = (purchase_max - purchase_min).dt.days
df_hist_trans_group['hist_purchase_date_average'] = df_hist_trans_group['hist_purchase_date_diff'].div(df_hist_trans_group['hist_card_id_size'])
df_hist_trans_group['hist_purchase_date_uptonow'] = (datetime.datetime.today() - purchase_max).dt.days

# Attach the per-card aggregates to both train and test.
df_train = df_train.merge(df_hist_trans_group,on='card_id',how='left')
df_test = df_test.merge(df_hist_trans_group,on='card_id',how='left')
del df_hist_trans_group, purchase_max, purchase_min
gc.collect()

42

In [7]:


# Aggregation spec for the new-merchant transactions: column -> aggregations
# computed per card_id. Key order matters because get_new_columns() derives
# the flat column names from it.
aggs = {
    col: ['nunique']
    for col in ['month','hour','weekofyear','dayofweek','year','subsector_id','merchant_id','merchant_category_id']
}
aggs.update({
    'purchase_amount': ['sum','max','min','mean','var'],
    'installments': ['sum','max','min','mean','var'],
    'purchase_date': ['max','min'],
    'month_lag': ['max','min','mean','var'],
    'month_diff': ['mean'],
    'weekend': ['sum', 'mean'],
    'category_1': ['sum', 'mean'],
    'card_id': ['size'],
})

# Mean purchase_amount of each category level, broadcast back to the rows,
# then averaged per card by the aggregation below.
for col in ['category_2','category_3']:
    mean_col = col+'_mean'
    df_new_merchant_trans[mean_col] = df_new_merchant_trans.groupby([col])['purchase_amount'].transform('mean')
    aggs[mean_col] = ['mean']

new_columns = get_new_columns('new_hist',aggs)
df_hist_trans_group = df_new_merchant_trans.groupby('card_id').agg(aggs)
df_hist_trans_group.columns = new_columns
df_hist_trans_group = df_hist_trans_group.reset_index(drop=False)

# Span between first and last purchase, span per transaction, and days since
# the last purchase (relative to the run date).
purchase_max = df_hist_trans_group['new_hist_purchase_date_max']
purchase_min = df_hist_trans_group['new_hist_purchase_date_min']
df_hist_trans_group['new_hist_purchase_date_diff'] = (purchase_max - purchase_min).dt.days
df_hist_trans_group['new_hist_purchase_date_average'] = df_hist_trans_group['new_hist_purchase_date_diff'].div(df_hist_trans_group['new_hist_card_id_size'])
df_hist_trans_group['new_hist_purchase_date_uptonow'] = (datetime.datetime.today() - purchase_max).dt.days

# Attach the per-card aggregates to both train and test.
df_train = df_train.merge(df_hist_trans_group,on='card_id',how='left')
df_test = df_test.merge(df_hist_trans_group,on='card_id',how='left')
del df_hist_trans_group, purchase_max, purchase_min
gc.collect()



28

In [8]:
# Drop both transaction tables to release memory, then preview the first
# five rows of the training frame.
del df_hist_trans, df_new_merchant_trans
gc.collect()
df_train.head()

Unnamed: 0,first_active_month,card_id,feature_1,feature_2,feature_3,target,hist_month_nunique,hist_hour_nunique,hist_weekofyear_nunique,hist_dayofweek_nunique,...,new_hist_weekend_sum,new_hist_weekend_mean,new_hist_category_1_sum,new_hist_category_1_mean,new_hist_card_id_size,new_hist_category_2_mean_mean,new_hist_category_3_mean_mean,new_hist_purchase_date_diff,new_hist_purchase_date_average,new_hist_purchase_date_uptonow
0,2017-06,C_ID_92a2005557,5,2,1,-0.820283,9,23,35,7,...,6.0,0.26087,0.0,0.0,23.0,-0.55016,-0.592993,54.0,2.347826,249.0
1,2017-01,C_ID_3d0044924f,4,1,0,0.392913,12,24,50,7,...,0.0,0.0,0.0,0.0,6.0,-0.55016,-0.606486,56.0,9.333333,279.0
2,2016-08,C_ID_d639edf6cd,2,2,0,0.688056,10,14,22,7,...,1.0,1.0,0.0,0.0,1.0,-0.549015,-0.592993,0.0,0.0,250.0
3,2017-09,C_ID_186d6a6901,4,3,0,0.142495,6,16,20,7,...,3.0,0.428571,1.0,0.142857,7.0,-0.556518,-0.604559,41.0,5.857143,260.0
4,2017-11,C_ID_cdbd2c0db2,1,3,0,-0.159749,4,22,17,7,...,12.0,0.333333,2.0,0.055556,36.0,-0.555446,-0.588217,57.0,1.583333,250.0


In [9]:
# Flag rows whose target is below -30 as outliers (1); every other row,
# including rows with a missing target, gets 0. Then show the class counts.
df_train['outliers'] = (df_train['target'] < -30).astype('int64')
df_train['outliers'].value_counts()

0    199710
1      2207
Name: outliers, dtype: int64

In [10]:
# Card-level features computed identically on train and test.
# "Now" is taken once so both frames are measured against the same instant.
# NOTE(review): elapsed_time depends on the date the notebook is run.
reference_time = datetime.datetime.today()
unix_epoch = pd.Timestamp('1970-01-01')
for df in [df_train,df_test]:
    df['first_active_month'] = pd.to_datetime(df['first_active_month'])
    first_active_dt = df['first_active_month'].dt
    df['dayofweek'] = first_active_dt.dayofweek
    # `Series.dt.weekofyear` was removed in pandas 2.0. Use the ISO calendar
    # week where available and fall back to the old accessor otherwise.
    if hasattr(first_active_dt, 'isocalendar'):
        iso_week = first_active_dt.isocalendar().week
        # Mirror the old accessor's dtypes: int64 when every date is present,
        # float64 (NaN) when some dates are missing.
        df['weekofyear'] = iso_week.astype('float64' if iso_week.isna().any() else 'int64')
    else:
        df['weekofyear'] = first_active_dt.weekofyear
    df['month'] = first_active_dt.month
    df['elapsed_time'] = (reference_time - df['first_active_month']).dt.days
    # Days between card activation and the first purchase in each table.
    df['hist_first_buy'] = (df['hist_purchase_date_min'] - df['first_active_month']).dt.days
    df['new_hist_first_buy'] = (df['new_hist_purchase_date_min'] - df['first_active_month']).dt.days
    # Convert the min/max purchase timestamps to seconds since the Unix epoch.
    # Going through a timedelta keeps missing dates (cards without matching
    # transactions after the left merge) as NaN. The previous
    # `astype(np.int64) * 1e-9` turned NaT into the int64-minimum sentinel,
    # or raised, depending on the pandas version.
    for date_col in ['hist_purchase_date_max','hist_purchase_date_min','new_hist_purchase_date_max',\
                     'new_hist_purchase_date_min']:
        df[date_col] = (df[date_col] - unix_epoch).dt.total_seconds()
    df['card_id_total'] = df['new_hist_card_id_size']+df['hist_card_id_size']
    df['purchase_amount_total'] = df['new_hist_purchase_amount_sum']+df['hist_purchase_amount_sum']

# Replace each categorical feature level by the outlier rate observed for that
# level in the training set; the same mapping is applied to test.
# The loop variable is deliberately not called `f`, so re-running this cell
# cannot overwrite the hyperopt objective function `f` defined further down.
for feature_name in ['feature_1','feature_2','feature_3']:
    order_label = df_train.groupby([feature_name])['outliers'].mean()
    df_train[feature_name] = df_train[feature_name].map(order_label)
    df_test[feature_name] = df_test[feature_name].map(order_label)



***

In [37]:
# Model inputs: every training column except the id, the activation date,
# the label and the outlier flag (original column order is kept).
excluded_columns = ['card_id', 'first_active_month','target','outliers']
df_train_columns = [c for c in df_train.columns if c not in excluded_columns]

In [59]:
def regression_cv(param, data, cv):
    """Cross-validate a LightGBM parameter set and return its final mean RMSE.

    Parameters
    ----------
    param : dict
        LightGBM parameters forwarded to ``lgb.cv``.
    data : lgb.Dataset
        Dataset to cross-validate on.
    cv : int
        Number of folds (passed as ``nfold``).

    Returns
    -------
    float
        Last entry of the ``'rmse-mean'`` history returned by ``lgb.cv``.
    """
    cv_options = dict(
        nfold=cv,
        stratified=False,
        shuffle=True,
        metrics='rmse',
        early_stopping_rounds=50,
        verbose_eval=False,
        show_stdv=False,
    )
    history = lgb.cv(param, data, **cv_options)
    rmse_means = history['rmse-mean']
    return rmse_means[-1]


def hyperopt_train_test(params):
    """Score one sampled hyper-parameter set with 5-fold LightGBM CV.

    Reads the module-level ``df_train`` frame, builds a LightGBM dataset from
    all columns except the id, activation date, label and outlier flag, and
    returns the value produced by ``regression_cv``.

    Parameters
    ----------
    params : dict
        Must contain ``max_depth``, ``num_leaves``, ``min_data_in_leaf``
        (cast to int here), ``reg_alpha``, ``reg_lambda`` and
        ``feature_fraction`` (passed through unchanged).
    """
    param = {'objective':'regression', "boosting": "gbdt"}
    # Tree-structure parameters must be integers.
    for key in ('max_depth', 'num_leaves', 'min_data_in_leaf'):
        param[key] = int(params[key])
    for key in ('reg_alpha', 'reg_lambda', 'feature_fraction'):
        param[key] = params[key]

    non_features = ['card_id', 'first_active_month','target','outliers']
    feature_cols = [c for c in df_train.columns if c not in non_features]
    data_train = lgb.Dataset(df_train[feature_cols], label=df_train['target'])
    return regression_cv(param, data_train, cv=5)


def f(params):
    """Hyperopt objective: evaluate ``params`` and report the loss.

    Tracks the best result so far in the module-level ``best`` (stored as the
    negated score) and prints a line whenever it improves.

    Returns
    -------
    dict
        ``{'loss': <score>, 'status': STATUS_OK}``.
    """
    global best
    loss = hyperopt_train_test(params)
    negated_loss = -loss
    if negated_loss > best:
        best = negated_loss
        print('new best:', -best, params)
    return {'loss': loss, 'status': STATUS_OK}

In [46]:
# Search space for the hyperopt run. The quniform entries are sampled as
# floats and cast to int inside hyperopt_train_test.
space4rf = {
    'max_depth': hp.quniform('max_depth', 5, 13, 1),
    'num_leaves':hp.quniform('num_leaves', 10, 350, 1),
    'min_data_in_leaf':hp.quniform('min_data_in_leaf', 10, 350, 1),
    'reg_alpha': hp.uniform('reg_alpha', 0, 0.5),   
    'reg_lambda': hp.uniform('reg_lambda', 0, 0.5),
    'feature_fraction' : hp.uniform('feature_fraction', 0.5, 1)}

# Running best (negated) score; the objective `f` reads and updates this
# global on every evaluation.
best = -15
trials = Trials()
# Keep fmin's result under its own name. Rebinding `best` to the returned
# dict would make any later call to `f` (for example resuming the search with
# the same Trials object) compare a float against a dict and raise TypeError.
best_params = fmin(f, space4rf, algo=tpe.suggest, max_evals=900, trials=trials)

print('best:')
print(best_params)

new best: 3.659868101823613 {'feature_fraction': 0.7801719990248783, 'max_depth': 12.0, 'min_data_in_leaf': 308.0, 'num_leaves': 79.0, 'reg_alpha': 0.002616237891337081, 'reg_lambda': 0.44173487967164776}
new best: 3.658105651811879 {'feature_fraction': 0.9903078304451878, 'max_depth': 7.0, 'min_data_in_leaf': 226.0, 'num_leaves': 178.0, 'reg_alpha': 0.3357829881806267, 'reg_lambda': 0.2943760133337211}
new best: 3.6576589270306386 {'feature_fraction': 0.8186974015821651, 'max_depth': 6.0, 'min_data_in_leaf': 294.0, 'num_leaves': 195.0, 'reg_alpha': 0.031306762618277195, 'reg_lambda': 0.2917369661750133}
new best: 3.657123425067369 {'feature_fraction': 0.9563058211301799, 'max_depth': 6.0, 'min_data_in_leaf': 293.0, 'num_leaves': 277.0, 'reg_alpha': 0.14544787152042116, 'reg_lambda': 0.1801851271059528}
new best: 3.6570764530426474 {'feature_fraction': 0.8854330611486116, 'max_depth': 7.0, 'min_data_in_leaf': 309.0, 'num_leaves': 206.0, 'reg_alpha': 0.022647609388158026, 'reg_lambda': 

In [26]:
# LightGBM parameters for the 10-fold run, with a small learning rate.
param1 = {
    'feature_fraction': 0.5775349282300533, 
    'max_depth': 6,
    'min_data_in_leaf': 350,
    'num_leaves': 243,
    'reg_alpha': 0.08335604635874802,
    "metric": 'rmse',  
    "boosting": "gbdt",
    'objective':'regression',
    'learning_rate': 0.001,
    'reg_lambda': 0.14749148707496232
}

# NOTE(review): x_train, x_test, y_train and feature_col are not defined in
# any cell shown in this notebook - confirm where they come from.
# A fixed random_state makes the shuffled fold assignment, and therefore the
# out-of-fold score, reproducible between runs.
folds = StratifiedKFold(n_splits=10, shuffle=True, random_state=42)
oof = np.zeros(x_train.shape[0])
predictions = np.zeros(x_test.shape[0])
# Per-fold importance frames are collected here and concatenated once after
# the loop instead of growing a DataFrame from an empty one.
fold_importances = []
# Folds are stratified on the outlier flag (target below -30).
outlier = np.where(np.asarray(y_train) < -30, 1, 0)

for fold_, (trn_idx, val_idx) in enumerate(folds.split(x_train, outlier)):
    print("fold {}".format(fold_))
    trn_data = lgb.Dataset(x_train[trn_idx], label=y_train[trn_idx])
    val_data = lgb.Dataset(x_train[val_idx], label=y_train[val_idx])

    num_round = 30000
    clf = lgb.train(param1, 
                    trn_data,
                    num_round, 
                    valid_sets = [trn_data, val_data],
                    verbose_eval=1000,
                    early_stopping_rounds = 1000)
    
    # Out-of-fold predictions for the rows held out in this fold.
    oof[val_idx] = clf.predict(x_train[val_idx], num_iteration=clf.best_iteration)
    
    fold_importance_df = pd.DataFrame()
    fold_importance_df["Feature"] = feature_col
    fold_importance_df["importance"] = clf.feature_importance()
    fold_importance_df["fold"] = fold_ + 1
    fold_importances.append(fold_importance_df)
    
    # Test prediction is the average over the fold models.
    predictions += clf.predict(x_test, num_iteration=clf.best_iteration) / folds.n_splits

feature_importance_df = pd.concat(fold_importances, axis=0)

# Out-of-fold RMSE, arguments in (y_true, y_pred) order.
np.sqrt(mean_squared_error(y_train, oof))

fold 0
Training until validation scores don't improve for 1000 rounds.
[1000]	training's rmse: 3.69047	valid_1's rmse: 3.69715
[2000]	training's rmse: 3.64254	valid_1's rmse: 3.65733
[3000]	training's rmse: 3.61914	valid_1's rmse: 3.64188
[4000]	training's rmse: 3.60414	valid_1's rmse: 3.63665
[5000]	training's rmse: 3.5936	valid_1's rmse: 3.63315
[6000]	training's rmse: 3.58409	valid_1's rmse: 3.63119
[7000]	training's rmse: 3.57718	valid_1's rmse: 3.62971
[8000]	training's rmse: 3.57079	valid_1's rmse: 3.6285
[9000]	training's rmse: 3.56477	valid_1's rmse: 3.62748
[10000]	training's rmse: 3.5586	valid_1's rmse: 3.62673
[11000]	training's rmse: 3.55243	valid_1's rmse: 3.62611
[12000]	training's rmse: 3.54686	valid_1's rmse: 3.6256
[13000]	training's rmse: 3.54119	valid_1's rmse: 3.62528
[14000]	training's rmse: 3.53565	valid_1's rmse: 3.62496
[15000]	training's rmse: 3.52986	valid_1's rmse: 3.62487
[16000]	training's rmse: 3.52443	valid_1's rmse: 3.62496
Early stopping, best iteration

[6000]	training's rmse: 3.58271	valid_1's rmse: 3.64267
[7000]	training's rmse: 3.57476	valid_1's rmse: 3.64227
[8000]	training's rmse: 3.56737	valid_1's rmse: 3.64257
Early stopping, best iteration is:
[7070]	training's rmse: 3.57424	valid_1's rmse: 3.64227
fold 9
Training until validation scores don't improve for 1000 rounds.
[1000]	training's rmse: 3.68837	valid_1's rmse: 3.70526
[2000]	training's rmse: 3.63881	valid_1's rmse: 3.66927
[3000]	training's rmse: 3.61408	valid_1's rmse: 3.65565
[4000]	training's rmse: 3.59859	valid_1's rmse: 3.65043
[5000]	training's rmse: 3.58734	valid_1's rmse: 3.64735
[6000]	training's rmse: 3.57868	valid_1's rmse: 3.64522
[7000]	training's rmse: 3.57164	valid_1's rmse: 3.64392
[8000]	training's rmse: 3.56478	valid_1's rmse: 3.64324
[9000]	training's rmse: 3.55838	valid_1's rmse: 3.643
[10000]	training's rmse: 3.55206	valid_1's rmse: 3.64283
[11000]	training's rmse: 3.5463	valid_1's rmse: 3.64291
Early stopping, best iteration is:
[10253]	training's r

3.6503367743055093

In [61]:
# Single-candidate grid: every entry lists exactly one value, so the search
# below evaluates one configuration over 10 folds.
param = {
    'feature_fraction': [0.5775349282300533],
    'max_depth': [6],
    'min_data_in_leaf': [350],
    'num_leaves': [243],
    'reg_alpha': [0.08335604635874802],
    'learning_rate': [0.001],
    'reg_lambda': [0.14749148707496232],
    'num_iterations': [20000],
}

# Base estimator; the grid above overrides the overlapping settings.
base_settings = {
    'objective': 'regression',
    'min_data_in_leaf': 20,
    'num_leaves': 50,
    'num_iterations': 100,
    'learning_rate': 0.1,
    'max_depth': 8,
    'num_threads': 8,
    'feature_fraction': 0.8,
}
lgb_model1 = lgb.LGBMRegressor(**base_settings)

search_settings = {
    'estimator': lgb_model1,
    'param_grid': param,
    'scoring': 'neg_mean_squared_error',
    'cv': 10,
    'verbose': 1,
    'n_jobs': 1,
}
gsearch11 = GridSearchCV(**search_settings)

gs11 = gsearch11.fit(df_train[df_train_columns], df_train['target'])

Fitting 10 folds for each of 1 candidates, totalling 10 fits


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done  10 out of  10 | elapsed: 57.1min finished


In [64]:
# Predict on the test set using the same feature columns as training,
# passed to the fitted search object as a plain array.
y_test = gs11.predict(df_test.loc[:, df_train_columns].values)

In [66]:
# Two-column submission (card_id, target) written without the index.
submission = pd.DataFrame({'card_id':df_test['card_id'], 'target': y_test})
submission.to_csv('/root/tempfile/moban_sklean10.csv',index=False)

***
***
***
***
***

In [11]:
# Feature list first (it already excludes the label), then move the label
# column out of df_train into its own Series.
non_feature_columns = ['card_id', 'first_active_month','target','outliers']
df_train_columns = [c for c in df_train.columns if c not in non_feature_columns]
target = df_train.pop('target')

In [15]:

# LightGBM parameters for the 5-fold run.
# `min_child_samples` was previously set alongside `min_data_in_leaf`; it is
# an alias of the same LightGBM parameter, so only the primary name is kept.
param = {'num_leaves': 31,
         'min_data_in_leaf': 30, 
         'objective':'regression',
         'max_depth': -1,
         'learning_rate': 0.01,
         "boosting": "gbdt",
         "feature_fraction": 0.9,
         "bagging_freq": 1,
         "bagging_fraction": 0.9 ,
         "metric": 'rmse',
         "lambda_l1": 0.1,
         "verbosity": -1,
         "nthread": 4}
# A fixed random_state makes the shuffled fold assignment, and therefore the
# out-of-fold score, reproducible between runs.
folds = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
oof = np.zeros(len(df_train))
predictions = np.zeros(len(df_test))
# Per-fold importance frames are collected here and concatenated once after
# the loop instead of growing a DataFrame from an empty one.
fold_importances = []

# Folds are stratified on the outlier flag.
for fold_, (trn_idx, val_idx) in enumerate(folds.split(df_train,df_train['outliers'].values)):
    print("fold {}".format(fold_))
    trn_data = lgb.Dataset(df_train.iloc[trn_idx][df_train_columns], label=target.iloc[trn_idx])#, categorical_feature=categorical_feats)
    val_data = lgb.Dataset(df_train.iloc[val_idx][df_train_columns], label=target.iloc[val_idx])#, categorical_feature=categorical_feats)

    num_round = 10000
    clf = lgb.train(param, trn_data, num_round, valid_sets = [trn_data, val_data], verbose_eval=100, early_stopping_rounds = 100)
    # Out-of-fold predictions for the rows held out in this fold.
    oof[val_idx] = clf.predict(df_train.iloc[val_idx][df_train_columns], num_iteration=clf.best_iteration)
    
    fold_importance_df = pd.DataFrame()
    fold_importance_df["Feature"] = df_train_columns
    fold_importance_df["importance"] = clf.feature_importance()
    fold_importance_df["fold"] = fold_ + 1
    fold_importances.append(fold_importance_df)
    
    # Test prediction is the average over the fold models.
    predictions += clf.predict(df_test[df_train_columns], num_iteration=clf.best_iteration) / folds.n_splits

feature_importance_df = pd.concat(fold_importances, axis=0)

# Out-of-fold RMSE, arguments in (y_true, y_pred) order.
np.sqrt(mean_squared_error(target, oof))



fold 0
Training until validation scores don't improve for 100 rounds.
[100]	training's rmse: 3.6627	valid_1's rmse: 3.73334
[200]	training's rmse: 3.58448	valid_1's rmse: 3.70258
[300]	training's rmse: 3.53791	valid_1's rmse: 3.68952
[400]	training's rmse: 3.502	valid_1's rmse: 3.68205
[500]	training's rmse: 3.47195	valid_1's rmse: 3.67658
[600]	training's rmse: 3.44625	valid_1's rmse: 3.67335
[700]	training's rmse: 3.42374	valid_1's rmse: 3.67124
[800]	training's rmse: 3.40368	valid_1's rmse: 3.66988
[900]	training's rmse: 3.38548	valid_1's rmse: 3.66849
[1000]	training's rmse: 3.36909	valid_1's rmse: 3.66806
[1100]	training's rmse: 3.35212	valid_1's rmse: 3.6684
Early stopping, best iteration is:
[1026]	training's rmse: 3.36479	valid_1's rmse: 3.66783
fold 1
Training until validation scores don't improve for 100 rounds.
[100]	training's rmse: 3.67146	valid_1's rmse: 3.70281
[200]	training's rmse: 3.59637	valid_1's rmse: 3.66845
[300]	training's rmse: 3.55039	valid_1's rmse: 3.65472
[

3.6539629368389783

In [None]:
# `predictions` was accumulated row-for-row over df_test, so the ids are taken
# from df_test directly. The previously used name `testindex` is not defined
# anywhere in this notebook.
submission = pd.DataFrame({'card_id':df_test['card_id'], 'target':predictions})
submission.to_csv('/root/tempfile/submission19121633.csv',index=False)