## P.S. The main idea behind this notebook is inspired from FabienDaniel Kernel Elo_world.
https://www.kaggle.com/fabiendaniel/elo-world

In [1]:
import numpy as np 
import pandas as pd 
import matplotlib.pyplot as plt
import seaborn as sns
import lightgbm as lgb
from sklearn.model_selection import KFold
import warnings
import time
import sys
import datetime
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.metrics import mean_squared_error
from sklearn.linear_model import BayesianRidge
warnings.simplefilter(action='ignore', category=FutureWarning)
import gc
import pickle
from sklearn.base import BaseEstimator
from sklearn.preprocessing import RobustScaler
from sklearn.metrics import log_loss
from sklearn.model_selection import RepeatedKFold
pd.set_option('display.max_rows', 500)
pd.set_option('display.max_columns', 500)

In [None]:
def reduce_mem_usage(df, verbose=True):
    numerics = ['int16', 'int32', 'int64', 'float16', 'float32', 'float64']
    start_mem = df.memory_usage().sum() / 1024**2    
    for col in df.columns:
        col_type = df[col].dtypes
        if col_type in numerics:
            c_min = df[col].min()
            c_max = df[col].max()
            if str(col_type)[:3] == 'int':
                if c_min > np.iinfo(np.int8).min and c_max < np.iinfo(np.int8).max:
                    df[col] = df[col].astype(np.int8)
                elif c_min > np.iinfo(np.int16).min and c_max < np.iinfo(np.int16).max:
                    df[col] = df[col].astype(np.int16)
                elif c_min > np.iinfo(np.int32).min and c_max < np.iinfo(np.int32).max:
                    df[col] = df[col].astype(np.int32)
                elif c_min > np.iinfo(np.int64).min and c_max < np.iinfo(np.int64).max:
                    df[col] = df[col].astype(np.int64)  
            else:
                if c_min > np.finfo(np.float16).min and c_max < np.finfo(np.float16).max:
                    df[col] = df[col].astype(np.float16)
                elif c_min > np.finfo(np.float32).min and c_max < np.finfo(np.float32).max:
                    df[col] = df[col].astype(np.float32)
                else:
                    df[col] = df[col].astype(np.float64)    
    end_mem = df.memory_usage().sum() / 1024**2
    if verbose: print('Mem. usage decreased to {:5.2f} Mb ({:.1f}% reduction)'.format(end_mem, 100 * (start_mem - end_mem) / start_mem))
    return df

In [None]:
# new_transactions = pd.read_csv('../input/elo-merchant-category-recommendation/new_merchant_transactions.csv', parse_dates=['purchase_date'])
# historical_transactions = pd.read_csv('../input/elo-merchant-category-recommendation/historical_transactions.csv', parse_dates=['purchase_date'])

historical_transactions = pd.read_parquet('../input/hist_trans_df.parquet.gzip')
new_transactions = pd.read_parquet('../input/new_trans_df.parquet.gzip')

def binarize(df):
    for col in ['authorized_flag', 'category_1']:
        df[col] = df[col].map({'Y':1, 'N':0})
    return df

historical_transactions = binarize(historical_transactions)
new_transactions = binarize(new_transactions)

In [None]:
%%time
# def read_data(input_file):
#     df = pd.read_csv(input_file)
#     df['first_active_month'] = pd.to_datetime(df['first_active_month'])
#     df['elapsed_time'] = (datetime.date(2018, 2, 1) - df['first_active_month'].dt.date).dt.days
#     return df

# train = read_data('../input/elo-merchant-category-recommendation/train.csv')
# test = read_data('../input/elo-merchant-category-recommendation/test.csv')

def read_data_v2(input_file):
    df = pd.read_parquet(input_file)
    df['first_active_month'] = pd.to_datetime(df['first_active_month'])
    df['elapsed_time'] = (datetime.date(2018, 2, 1) - df['first_active_month'].dt.date).dt.days
    return df

train = read_data_v2('../input/train_df.parquet.gzip')
test = read_data_v2('../input/test_df.parquet.gzip')

target = train['target']
del train['target']
gc.collect()

## **Feature Engineering**

In [None]:
plot_opt = 0

In [None]:
historical_transactions["category_1"] += 1
historical_transactions["category_1"].unique()

In [None]:
new_transactions["category_1"] += 1
new_transactions["category_1"].unique()

In [None]:
historical_transactions["category_2"].unique()

In [None]:
historical_transactions["category_2"] = historical_transactions["category_2"].fillna(0)
historical_transactions["category_2"].unique()

In [None]:
new_transactions["category_2"].unique()

In [None]:
new_transactions["category_2"] = new_transactions["category_2"].fillna(0)
new_transactions["category_2"].unique()

In [None]:
historical_transactions["category_3"].unique()

In [None]:
historical_transactions["category_3"] = historical_transactions["category_3"].replace({'A': 1, 'B': 2, 'C': 3, None: 0})
historical_transactions["category_3"].unique()

In [None]:
new_transactions["category_3"].unique()

In [None]:
new_transactions["category_3"] = new_transactions["category_3"].replace({'A': 1, 'B': 2, 'C': 3, None: 0})
new_transactions["category_3"].unique()

In [None]:
# historical_transactions["category_1_2_cross"] = historical_transactions["category_1"]*2 + historical_transactions["category_2"]
# np.sort(historical_transactions["category_1_2_cross"].unique())

In [None]:
# %%time
# if plot_opt:
#     plt.scatter(new_transactions["category_1"], new_transactions["category_2"])
#     plt.xlabel("category_1")
#     plt.ylabel("category_2")
#     plt.show()

In [None]:
# new_transactions["category_1_2_cross"] = new_transactions["category_1"]*2 + new_transactions["category_2"]
# np.sort(new_transactions["category_1_2_cross"].unique())

In [None]:
# %%time
# if plot_opt:
#     plt.scatter(historical_transactions["category_1"]+2, historical_transactions["category_3"])
#     plt.xlabel("category_1")
#     plt.ylabel("category_3")
#     plt.show()

In [None]:
# historical_transactions["category_1_3_cross"] = (historical_transactions["category_1"]+2) * historical_transactions["category_3"]
# np.sort(historical_transactions["category_1_3_cross"].unique())

In [None]:
# %%time
# if plot_opt:
#     plt.scatter(new_transactions["category_1"], new_transactions["category_3"])
#     plt.xlabel("category_1")
#     plt.ylabel("category_3")
#     plt.show()

In [None]:
# new_transactions["category_1_3_cross"] = (new_transactions["category_1"]+2) * new_transactions["category_3"]
# np.sort(new_transactions["category_1_3_cross"].unique())

In [None]:
# %%time
# if plot_opt:
#     plt.scatter(historical_transactions["category_2"]+6, historical_transactions["category_3"])
#     plt.xlabel("category_2")
#     plt.ylabel("category_3")
#     plt.show()

In [None]:
# historical_transactions["category_2_3_cross"] = (historical_transactions["category_2"]+6) * historical_transactions["category_3"]
# np.sort(historical_transactions["category_2_3_cross"].unique())

In [None]:
# %%time
# if plot_opt:
#     plt.scatter(new_transactions["category_2"]+6, new_transactions["category_3"])
#     plt.xlabel("category_2")
#     plt.ylabel("category_3")
#     plt.show()

In [None]:
# new_transactions["category_2_3_cross"] = (new_transactions["category_2"]+6) * new_transactions["category_3"]
# np.sort(new_transactions["category_2_3_cross"].unique())

In [None]:
%%time

def category_make_cross_feat(df):
    df["category_1_3_cross"] = (df["category_1"]+2) * df["category_3"]
    df["category_2_3_cross"] = (df["category_2"]+6) * df["category_3"]
    return df

historical_transactions = category_make_cross_feat(historical_transactions)
new_transactions = category_make_cross_feat(new_transactions)

In [None]:
%%time
historical_transactions = pd.get_dummies(historical_transactions, columns=['category_2', 
                                                                           'category_3',
                                                                           'category_1_3_cross',
                                                                           'category_2_3_cross'])
new_transactions = pd.get_dummies(new_transactions, columns=['category_2', 
                                                             'category_3',
                                                             'category_1_3_cross',
                                                             'category_2_3_cross'])

historical_transactions = reduce_mem_usage(historical_transactions)
new_transactions = reduce_mem_usage(new_transactions)

agg_fun = {'authorized_flag': ['sum', 'mean', 'min', 'std', 'count']} # max is all 1's, useless
auth_mean = historical_transactions.groupby(['card_id']).agg(agg_fun)
auth_mean.columns = ['_'.join(col).strip() for col in auth_mean.columns.values]
auth_mean.reset_index(inplace=True)

authorized_transactions = historical_transactions[historical_transactions['authorized_flag'] == 1]
historical_transactions = historical_transactions[historical_transactions['authorized_flag'] == 0]
gc.collect()

In [None]:
auth_mean.head()

In [None]:
%%time
historical_transactions['purchase_month'] = historical_transactions['purchase_date'].dt.month
authorized_transactions['purchase_month'] = authorized_transactions['purchase_date'].dt.month
new_transactions['purchase_month'] = new_transactions['purchase_date'].dt.month
gc.collect()

In [None]:
new_transactions.head()

In [None]:
%%time
def aggregate_transactions(history):
    
    history.loc[:, 'purchase_date'] = pd.DatetimeIndex(history['purchase_date']).\
                                      astype(np.int64) * 1e-9
    
    agg_func = {
        'category_1': ['sum', 'mean'],
        'category_2_1.0': ['mean'],
        'category_2_2.0': ['mean'],
        'category_2_3.0': ['mean'],
        'category_2_4.0': ['mean'],
        'category_2_5.0': ['mean'],
        'category_3_1': ['mean'],
        'category_3_2': ['mean'],
        'category_3_3': ['mean'],
        'category_1_3_cross_3': ['mean'],
        'category_1_3_cross_4': ['mean'],
        'category_1_3_cross_6': ['mean'],
        'category_1_3_cross_8': ['mean'],
        'category_1_3_cross_9': ['mean'],
        'category_1_3_cross_12': ['mean'],
        'category_2_3_cross_7.0': ['mean'],
        'category_2_3_cross_8.0': ['mean'],
        'category_2_3_cross_9.0': ['mean'],
        'category_2_3_cross_10.0': ['mean'],
        'category_2_3_cross_11.0': ['mean'],
        'category_2_3_cross_12.0': ['mean'],
        'category_2_3_cross_14.0': ['mean'],
        'category_2_3_cross_16.0': ['mean'],
        'category_2_3_cross_20.0': ['mean'],
        'category_2_3_cross_21.0': ['mean'],
        'category_2_3_cross_22.0': ['mean'],
        'category_2_3_cross_24.0': ['mean'],
        'category_2_3_cross_27.0': ['mean'],
        'category_2_3_cross_30.0': ['mean'],
        'category_2_3_cross_33.0': ['mean'],
        'merchant_id': ['nunique'],
        'merchant_category_id': ['nunique'],
        'state_id': ['nunique'],
        'city_id': ['nunique'],
        'subsector_id': ['nunique'],
        'purchase_amount': ['sum', 'mean', 'max', 'min', 'std', 'count'], #one count is enough, others are just the same
        'installments': ['sum', 'mean', 'max', 'min', 'std'],
        'purchase_month': ['mean', 'max', 'min', 'std'],
        'purchase_date': [np.ptp, 'min', 'max'],
        'month_lag': ['min', 'max']
        }
    
    agg_history = history.groupby(['card_id']).agg(agg_func)
    agg_history.columns = ['_'.join(col).strip() for col in agg_history.columns.values]
    agg_history.reset_index(inplace=True)
    
    df = (history.groupby('card_id')
          .size()
          .reset_index(name='transactions_count'))
    
    agg_history = pd.merge(df, agg_history, on='card_id', how='left')
    
    return agg_history

gc.collect()

In [None]:
%%time
history = aggregate_transactions(historical_transactions)
history.columns = ['hist_' + c if c != 'card_id' else c for c in history.columns]
history[:5]
gc.collect()

In [None]:
# # history[["hist_purchase_amount_count", "hist_installments_count", "hist_purchase_month_count"]].head(500)
# history.head(500)

In [None]:
%%time
authorized = aggregate_transactions(authorized_transactions)
authorized.columns = ['auth_' + c if c != 'card_id' else c for c in authorized.columns]
authorized[:5]
gc.collect()

In [None]:
%%time
new = aggregate_transactions(new_transactions)
new.columns = ['new_' + c if c != 'card_id' else c for c in new.columns]
new[:5]
gc.collect()

In [None]:
%%time
def aggregate_per_month(history):
    grouped = history.groupby(['card_id', 'month_lag'])

    agg_func = {
#             'purchase_amount': ['count', 'sum', 'mean', 'min', 'max', 'std'],
#             'installments': ['count', 'sum', 'mean', 'min', 'max', 'std'],
            'purchase_amount': ['count', 'sum'],
            'installments': ['count', 'sum'],
            }

    intermediate_group = grouped.agg(agg_func)
    intermediate_group.columns = ['_'.join(col).strip() for col in intermediate_group.columns.values]
    intermediate_group.reset_index(inplace=True)

    final_group = intermediate_group.groupby('card_id').agg(['mean', 'std'])
    final_group.columns = ['_'.join(col).strip() for col in final_group.columns.values]
    final_group.reset_index(inplace=True)
    
    return final_group
#___________________________________________________________
final_group =  aggregate_per_month(historical_transactions) 
final_group[:10]
gc.collect()

In [None]:
%%time
train = pd.merge(train, history, on='card_id', how='left')
test = pd.merge(test, history, on='card_id', how='left')

train = pd.merge(train, authorized, on='card_id', how='left')
test = pd.merge(test, authorized, on='card_id', how='left')

train = pd.merge(train, new, on='card_id', how='left')
test = pd.merge(test, new, on='card_id', how='left')

train = pd.merge(train, final_group, on='card_id', how='left')
test = pd.merge(test, final_group, on='card_id', how='left')

train = pd.merge(train, auth_mean, on='card_id', how='left')
test = pd.merge(test, auth_mean, on='card_id', how='left')

print("Train Shape:", train.shape)
print("Test Shape:", test.shape)
gc.collect()

In [None]:
def generate_feat(df):
    # Feature crosses
    df["feature_1_2_cross"] = df["feature_1"] + (df["feature_2"]-1)*5
    df["feature_1_3_cross"] = df["feature_1"] + df["feature_3"]*3
    df["feature_2_3_cross"] = df["feature_2"] + df["feature_3"]*3
    df = pd.get_dummies(df, columns=["feature_1_2_cross", "feature_1_3_cross", "feature_2_3_cross"])
    
    return df

train = generate_feat(train)
test = generate_feat(test)

In [None]:
def power_2_3_feat(df, feat_list):
    for feat in feat_list:
        df[feat+"_power2"] = df[feat]**2
#         df[feat+"_power3"] = df[feat]**3
    return df

feat_list = ["elapsed_time", "hist_purchase_date_ptp"]
train = power_2_3_feat(train, feat_list)
test = power_2_3_feat(test, feat_list)

In [None]:
def log_feat(df, feat_list):
    for feat in feat_list:
        df[feat+"_log"] = np.log(df[feat])
#         df[feat+"_power3"] = df[feat]**3
    return df

feat_list = ["elapsed_time"]
train = log_feat(train, feat_list)
test = log_feat(test, feat_list)

In [None]:
corrmat = train.corr()

In [None]:
f, ax = plt.subplots(figsize=(100, 100))
sns.heatmap(corrmat, vmax=1.0, square=True)

In [None]:
f.savefig("../img/corr.png")

In [None]:
remove_feat_list = ['authorized_flag_min', 
                    'hist_purchase_amount_sum',
                    'hist_purchase_amount_max',
                    'hist_purchase_amount_min',
                    'hist_purchase_amount_std',
                    'hist_installments_sum',
                    'hist_installments_max',
                    'hist_installments_min',
                    'hist_installments_std',
                   ]
train = train.drop(remove_feat_list, axis=1)
test = test.drop(remove_feat_list, axis=1)

In [None]:
corrmat = train.corr()

In [None]:
f, ax = plt.subplots(figsize=(100, 100))
sns.heatmap(corrmat, vmax=1.0, square=True)

In [None]:
f.savefig("../img/corr_after_feat_removal.png")

In [None]:
# with open('../input/feat_list.pkl', 'rb') as f:
#     feat_list = pickle.load(f)

# train = train[feat_list]
# test = test[feat_list]

#### save train, test

In [None]:
with open('../input/train_test_target.pkl', 'wb') as f:
    pickle.dump([train, target, test], f)

In [None]:
with open('../input/train_test_target.pkl', 'rb') as f:
    [train, target, test] = pickle.load(f)

In [None]:
# https://www.kaggle.com/pmarcelino/comprehensive-data-exploration-with-python
# saleprice correlation matrix

# k = train.shape[1] #number of variables for heatmap
# cols = corrmat.nlargest(k, 'SalePrice')['SalePrice'].index
# cm = np.corrcoef(df_train[cols].values.T)
# sns.set(font_scale=1.25)
# hm = sns.heatmap(cm, cbar=True, annot=True, square=True, fmt='.2f', annot_kws={'size': 10}, yticklabels=cols.values, xticklabels=cols.values)
# plt.show()

In [None]:
# #scatterplot
# sns.set()
# # cols = ['SalePrice', 'OverallQual', 'GrLivArea', 'GarageCars', 'TotalBsmtSF', 'FullBath', 'YearBuilt']
# # sns.pairplot(df_train[cols], size = 2.5)
# sns.pairplot(train, size = 2.5)
# plt.show();

In [None]:
features = [c for c in train.columns if c not in ['card_id', 'first_active_month']]
categorical_feats = [c for c in features if 'feature_' in c]

In [None]:
train = train.fillna(0)
test = test.fillna(0)

### Outlier removal

In [None]:
plt.figure(figsize=(12,160))
sns.distplot(target.values, bins=50, kde=False, color='blue')
plt.title('Histogram of Loyalty Score before removal')
plt.xlabel('Loyalty score', fontsize=12)
plt.show()

In [None]:
from scipy.stats import norm
from scipy import stats
sns.distplot(target, fit=norm)
fig = plt.figure()
res = stats.probplot(target, plot=plt)

In [None]:
#skewness and kurtosis
print("Skewness: %f" % pd.DataFrame(target).skew())
print("Kurtosis: %f" % pd.DataFrame(target).kurt())

In [None]:
min_target = min(value for value in target if value > -20)
min_target

In [None]:
idx_between_20_30 = [value for value in target if value >= -30 and value <=-20]
len(idx_between_20_30)

In [None]:
idx_lessThan_30 = [value for value in target if value < -30]
len(idx_lessThan_30)

In [None]:
# https://www.kaggle.com/nottold/naive-ensemble-model-ridge-lasso
class OutlierDetection(BaseEstimator):
    def __init__(self, alpha, dims, std, mean, median):
        self.alpha = alpha
        self.dims = dims
        self.std = std
        self.mean = mean
        self.median = median
    def fit(self, X):
        # std, mean, median = X.std(), X.mean(), X.median()
        X["outliers"] = 0
        for col in X.columns:
#             print(col)
            if not col == "outliers":
                # outlier_idx = (abs(X[col]) > (self.alpha * std[col] + mean[col]))
                outlier_idx = (np.abs(X[col]) > (self.alpha * self.std[col] + self.mean[col]))
                X.set_value(outlier_idx, "outliers", X[outlier_idx]["outliers"] + 1)
        outliers = X[X["outliers"] > self.dims]
        X.drop("outliers", axis=1, inplace=True)
        outlier_idx = outliers.index.tolist()
        # return outliers.index
        return set(list(range(X.shape[0]))) - set(outlier_idx), outlier_idx

In [None]:
target_df = pd.DataFrame(target)

outlier_removal = OutlierDetection(alpha=3, 
                                   dims=0, 
                                   std=target_df.std().astype('float'), 
                                   mean=target_df.mean().astype('float'), 
                                   median=target_df.median().astype('float'))
normal_idx, outlier_idx = outlier_removal.fit(target_df)
# samples = target_df.shape[0] - len(outlier)
# xtrain = xtrain.drop(outlier_index).reset_index(drop=True)
# y = y.drop(outlier_index).reset_index(drop=True)

In [None]:
train["outliers"] = 0
train.at[outlier_idx, "outliers"] = 1

In [None]:
train["outliers"].unique()

In [None]:
with open('../input/train_test_target_with_target.pkl', 'wb') as f:
    pickle.dump([train, target, test, normal_idx, outlier_idx], f)

In [2]:
with open('../input/train_test_target_with_target.pkl', 'rb') as f:
    [train, target, test, normal_idx, outlier_idx] = pickle.load(f)

## Step 1: Training Model Without Outliers for 5 fold (n_repeats = 1)

In [3]:
df_train = train[train['outliers'] == 0]
df_target = target[normal_idx]
features = [c for c in df_train.columns if c not in ['card_id', 'first_active_month','outliers']]
categorical_feats = [c for c in features if 'feature_' in c]

In [4]:
df_train.shape, df_target.shape

((199644, 212), (199644,))

In [5]:
def lgbm_regression_train(train, target, test, param, features, categorical_feats):
    folds = KFold(n_splits=5, shuffle=True, random_state=15)
    oof = np.zeros(len(train))
    predictions = np.zeros(len(test))
    start = time.time()
    feature_importance_df = pd.DataFrame()

    for fold_, (trn_idx, val_idx) in enumerate(folds.split(train.values, target.values)):
        print("fold n°{}".format(fold_))
        trn_data = lgb.Dataset(train.iloc[trn_idx][features], label=target.iloc[trn_idx], categorical_feature=categorical_feats)
        val_data = lgb.Dataset(train.iloc[val_idx][features], label=target.iloc[val_idx], categorical_feature=categorical_feats)

        num_round = 10000
        clf = lgb.train(param, trn_data, num_round, valid_sets = [trn_data, val_data], verbose_eval=100, early_stopping_rounds = 200)
        oof[val_idx] = clf.predict(train.iloc[val_idx][features], num_iteration=clf.best_iteration)

        fold_importance_df = pd.DataFrame()
        fold_importance_df["feature"] = features
        fold_importance_df["importance"] = clf.feature_importance()
        fold_importance_df["fold"] = fold_ + 1
        feature_importance_df = pd.concat([feature_importance_df, fold_importance_df], axis=0)

        predictions += clf.predict(test[features], num_iteration=clf.best_iteration) / folds.n_splits

    print("CV score: {:<8.5f}".format(mean_squared_error(oof, target)**0.5))
    return predictions, oof

In [6]:
param = {'num_leaves': 31,
         'min_data_in_leaf': 32, 
         'objective':'regression',
         'max_depth': -1,
         'learning_rate': 0.01, #default: 0.005 (3.66940)   /   0.005(3.67032), 0.01 (3.67152), 0.05 ()
         "min_child_samples": 20,
         "boosting": "gbdt",
         "feature_fraction": 0.9,
         "bagging_freq": 1,
         "bagging_fraction": 0.9 ,
         "bagging_seed": 11,
         "metric": 'rmse',
         "lambda_l1": 0.1,
         "nthread": -1,
         "verbosity": -1}

normal_predictions, oof = lgbm_regression_train(df_train, 
                                                df_target, 
                                                test, 
                                                param, 
                                                features, 
                                                categorical_feats)

fold n°0




Training until validation scores don't improve for 200 rounds.
[100]	training's rmse: 1.5885	valid_1's rmse: 1.6088
[200]	training's rmse: 1.55768	valid_1's rmse: 1.58216
[300]	training's rmse: 1.54258	valid_1's rmse: 1.57098
[400]	training's rmse: 1.53261	valid_1's rmse: 1.56486
[500]	training's rmse: 1.52505	valid_1's rmse: 1.56149
[600]	training's rmse: 1.51892	valid_1's rmse: 1.55916
[700]	training's rmse: 1.51377	valid_1's rmse: 1.5577
[800]	training's rmse: 1.50921	valid_1's rmse: 1.55672
[900]	training's rmse: 1.50498	valid_1's rmse: 1.55607
[1000]	training's rmse: 1.50102	valid_1's rmse: 1.55558
[1100]	training's rmse: 1.49723	valid_1's rmse: 1.55517
[1200]	training's rmse: 1.49361	valid_1's rmse: 1.5549
[1300]	training's rmse: 1.49009	valid_1's rmse: 1.55471
[1400]	training's rmse: 1.48662	valid_1's rmse: 1.55447
[1500]	training's rmse: 1.48328	valid_1's rmse: 1.55427
[1600]	training's rmse: 1.47998	valid_1's rmse: 1.55411
[1700]	training's rmse: 1.47671	valid_1's rmse: 1.5540

[2100]	training's rmse: 1.47022	valid_1's rmse: 1.52895
[2200]	training's rmse: 1.46725	valid_1's rmse: 1.529
[2300]	training's rmse: 1.46424	valid_1's rmse: 1.52885
[2400]	training's rmse: 1.46125	valid_1's rmse: 1.52874
[2500]	training's rmse: 1.45826	valid_1's rmse: 1.52871
[2600]	training's rmse: 1.45523	valid_1's rmse: 1.52875
[2700]	training's rmse: 1.45225	valid_1's rmse: 1.52875
Early stopping, best iteration is:
[2522]	training's rmse: 1.45755	valid_1's rmse: 1.5287
CV score: 1.54092 


In [7]:
model_without_outliers = pd.DataFrame({"card_id":test["card_id"].values})
model_without_outliers["target"] = normal_predictions

## Part 2: Training Model For Outliers Classification for 5 fold (n_repeats = 1)

In [58]:
def lgbm_classification_train(df_train, target, df_test, param, features, categorical_feats):
    folds = KFold(n_splits=5, shuffle=True, random_state=15)
    oof = np.zeros(len(df_train))
    predictions = np.zeros(len(df_test))
    feature_importance_df = pd.DataFrame()

    start = time.time()
    for fold_, (trn_idx, val_idx) in enumerate(folds.split(df_train.values, target.values)):
        print("fold n°{}".format(fold_))
        trn_data = lgb.Dataset(df_train.iloc[trn_idx][features], label=target.iloc[trn_idx], categorical_feature=categorical_feats)
        val_data = lgb.Dataset(df_train.iloc[val_idx][features], label=target.iloc[val_idx], categorical_feature=categorical_feats)

        num_round = 10000
        clf = lgb.train(param, trn_data, num_round, valid_sets = [trn_data, val_data], verbose_eval=100, early_stopping_rounds = 200)
        oof[val_idx] = clf.predict(df_train.iloc[val_idx][features], num_iteration=clf.best_iteration)

        fold_importance_df = pd.DataFrame()
        fold_importance_df["feature"] = features
        fold_importance_df["importance"] = clf.feature_importance()
        fold_importance_df["fold"] = fold_ + 1
        feature_importance_df = pd.concat([feature_importance_df, fold_importance_df], axis=0)

        predictions += clf.predict(df_test[features], num_iteration=clf.best_iteration) / folds.n_splits
        
    print("CV score: {:<8.5f}".format(log_loss(target, oof)))

    return predictions, oof

In [59]:
param = {'num_leaves': 31,
         'min_data_in_leaf': 30, 
         'objective':'binary',
         'max_depth': 6,
         'learning_rate': 0.01,
         "boosting": "rf",
         "feature_fraction": 0.9,
         "bagging_freq": 1,
         "bagging_fraction": 0.9 ,
         "bagging_seed": 11,
         "metric": 'binary_logloss',
         "lambda_l1": 0.1,
         "verbosity": -1,
         "random_state": 2333}

with open('../input/train_test_target_with_target.pkl', 'rb') as f:
    [train, target, test, normal_idx, outlier_idx] = pickle.load(f)

target = train['outliers']
del train['outliers']
features = [c for c in train.columns if c not in ['card_id', 'first_active_month']]
categorical_feats = [c for c in features if 'feature_' in c]
outlier_label, oof_class = lgbm_classification_train(train, target, test, param, features, categorical_feats)

fold n°0




Training until validation scores don't improve for 200 rounds.
[100]	training's binary_logloss: 0.0462539	valid_1's binary_logloss: 0.0483859
[200]	training's binary_logloss: 0.0462506	valid_1's binary_logloss: 0.0483834
Early stopping, best iteration is:
[47]	training's binary_logloss: 0.0462377	valid_1's binary_logloss: 0.0482944
fold n°1
Training until validation scores don't improve for 200 rounds.
[100]	training's binary_logloss: 0.0466662	valid_1's binary_logloss: 0.0470919
[200]	training's binary_logloss: 0.0466763	valid_1's binary_logloss: 0.047128
Early stopping, best iteration is:
[47]	training's binary_logloss: 0.0466228	valid_1's binary_logloss: 0.0470608
fold n°2
Training until validation scores don't improve for 200 rounds.
[100]	training's binary_logloss: 0.0465256	valid_1's binary_logloss: 0.0466832
[200]	training's binary_logloss: 0.0464995	valid_1's binary_logloss: 0.0466729
[300]	training's binary_logloss: 0.0465142	valid_1's binary_logloss: 0.0466902
Early stopping,

In [10]:
df_outlier_prob = pd.DataFrame({"card_id":test["card_id"].values})
df_outlier_prob["target"] = outlier_label
df_outlier_prob.head()

Unnamed: 0,card_id,target
0,C_ID_0ab67a22ab,0.022946
1,C_ID_130fd0cbdd,0.001735
2,C_ID_b709037bc5,0.010461
3,C_ID_d27d835a9f,0.001735
4,C_ID_2b5e3df5c2,0.001735


## Part 3: Combining Submission for 5 fold (n_repeats = 1)

In [11]:
outlier_id = pd.DataFrame(\
                          df_outlier_prob.sort_values(by='target',
                                                      ascending = False)
                          .head(25000)['card_id'])

In [12]:
best_submission = pd.read_csv('../result/Blend2_v2.csv')

In [13]:
print(best_submission.shape[0])
best_submission.head()

123623


Unnamed: 0,card_id,target
0,C_ID_0ab67a22ab,-2.346967
1,C_ID_130fd0cbdd,-0.35402
2,C_ID_b709037bc5,-0.932773
3,C_ID_d27d835a9f,-0.148607
4,C_ID_2b5e3df5c2,-1.090599


In [14]:
print(outlier_id.shape[0])
most_likely_liers = best_submission.merge(outlier_id,how='right')
most_likely_liers.head()

25000


Unnamed: 0,card_id,target
0,C_ID_0ab67a22ab,-2.346967
1,C_ID_6d8dba8475,-0.881375
2,C_ID_4859ac9ed5,-0.641561
3,C_ID_7f1041e8e1,-5.193301
4,C_ID_22e4a47c72,0.341024


In [15]:
%%time
for card_id in most_likely_liers['card_id']:
    model_without_outliers.loc[model_without_outliers['card_id']==card_id,'target']\
    = most_likely_liers.loc[most_likely_liers['card_id']==card_id,'target'].values

CPU times: user 4min 16s, sys: 20.2 ms, total: 4min 16s
Wall time: 4min 16s


In [16]:
# model_without_outliers.to_csv("../result/Blend2_v3.csv", index=False)

In [17]:
# print(normal_predictions.shape)
# normal_predictions[0:20]

In [18]:
# print(model_without_outliers.shape)
# model_without_outliers[0:20]

## Step 4: Training Model Without Outliers for 5 fold (n_repeats = 2)

In [21]:
def lgbm_regression_train_n_repeats_2(train, target, test, param, features, categorical_feats):
    folds = RepeatedKFold(n_splits=5, n_repeats=2, random_state=4520)
    oof_lgb = np.zeros(len(train))
    predictions_lgb = np.zeros(len(test))
    start = time.time()
    feature_importance_df = pd.DataFrame()

    for fold_, (trn_idx, val_idx) in enumerate(folds.split(train.values, target.values)):
        print("fold n°{}".format(fold_))
        trn_data = lgb.Dataset(train.iloc[trn_idx][features], label=target.iloc[trn_idx], categorical_feature=categorical_feats)
        val_data = lgb.Dataset(train.iloc[val_idx][features], label=target.iloc[val_idx], categorical_feature=categorical_feats)

        num_round = 11000
        clf = lgb.train(lgbparam, trn_data, num_round, valid_sets = [trn_data, val_data], verbose_eval=100, early_stopping_rounds = 100)
        oof_lgb[val_idx] = clf.predict(train.iloc[val_idx][features], num_iteration=clf.best_iteration)

        fold_importance_df = pd.DataFrame()
        fold_importance_df["feature"] = features
        fold_importance_df["importance"] = clf.feature_importance()
        fold_importance_df["fold"] = fold_ + 1
        feature_importance_df = pd.concat([feature_importance_df, fold_importance_df], axis=0)

        predictions_lgb += clf.predict(test[features], num_iteration=clf.best_iteration) / (5 * 2)

    print("CV score: {:<8.5f}".format(mean_squared_error(oof_lgb, target)**0.5))
    return predictions_lgb, oof_lgb

In [22]:
lgbparam = {'num_leaves': 31,
            'boosting_type': 'rf',
             'min_data_in_leaf': 30, 
             'objective':'regression',
             'max_depth': -1,
             'learning_rate': 0.005,
             "min_child_samples": 20,
             "boosting": "gbdt",
             "feature_fraction": 0.9,
             "bagging_freq": 1,
             "bagging_fraction": 0.9 ,
             "bagging_seed": 11,
             "metric": 'rmse',
             "lambda_l1": 0.1,
             "verbosity": -1,
             "nthread": 4,
             "random_state": 4590}

with open('../input/train_test_target_with_target.pkl', 'rb') as f:
    [train, target, test, normal_idx, outlier_idx] = pickle.load(f)

df_train = train[train['outliers'] == 0]
df_target = target[normal_idx]
features = [c for c in df_train.columns if c not in ['card_id', 'first_active_month','outliers']]
categorical_feats = [c for c in features if 'feature_' in c]
    
normal_predictions_n_repeats_2, oof_n_repeats_2 = lgbm_regression_train_n_repeats_2(df_train, 
                                                                                    df_target, 
                                                                                    test, 
                                                                                    lgbparam, 
                                                                                    features, 
                                                                                    categorical_feats)

fold n°0




Training until validation scores don't improve for 100 rounds.
[100]	training's rmse: 1.62477	valid_1's rmse: 1.63771
[200]	training's rmse: 1.5903	valid_1's rmse: 1.60438
[300]	training's rmse: 1.57167	valid_1's rmse: 1.58714
[400]	training's rmse: 1.55985	valid_1's rmse: 1.57689
[500]	training's rmse: 1.55142	valid_1's rmse: 1.56994
[600]	training's rmse: 1.5449	valid_1's rmse: 1.56478
[700]	training's rmse: 1.53956	valid_1's rmse: 1.56081
[800]	training's rmse: 1.5351	valid_1's rmse: 1.55798
[900]	training's rmse: 1.53117	valid_1's rmse: 1.55579
[1000]	training's rmse: 1.52765	valid_1's rmse: 1.55407
[1100]	training's rmse: 1.52442	valid_1's rmse: 1.55264
[1200]	training's rmse: 1.52149	valid_1's rmse: 1.55151
[1300]	training's rmse: 1.51883	valid_1's rmse: 1.55073
[1400]	training's rmse: 1.51629	valid_1's rmse: 1.55009
[1500]	training's rmse: 1.51387	valid_1's rmse: 1.54955
[1600]	training's rmse: 1.5116	valid_1's rmse: 1.54909
[1700]	training's rmse: 1.50942	valid_1's rmse: 1.5487

[400]	training's rmse: 1.56235	valid_1's rmse: 1.56446
[500]	training's rmse: 1.55383	valid_1's rmse: 1.55786
[600]	training's rmse: 1.54713	valid_1's rmse: 1.55296
[700]	training's rmse: 1.54173	valid_1's rmse: 1.54953
[800]	training's rmse: 1.53719	valid_1's rmse: 1.54705
[900]	training's rmse: 1.53323	valid_1's rmse: 1.54513
[1000]	training's rmse: 1.52971	valid_1's rmse: 1.54358
[1100]	training's rmse: 1.52651	valid_1's rmse: 1.54236
[1200]	training's rmse: 1.52358	valid_1's rmse: 1.54143
[1300]	training's rmse: 1.52085	valid_1's rmse: 1.54068
[1400]	training's rmse: 1.5183	valid_1's rmse: 1.54009
[1500]	training's rmse: 1.51589	valid_1's rmse: 1.53961
[1600]	training's rmse: 1.51362	valid_1's rmse: 1.53919
[1700]	training's rmse: 1.51143	valid_1's rmse: 1.53883
[1800]	training's rmse: 1.50937	valid_1's rmse: 1.5386
[1900]	training's rmse: 1.50736	valid_1's rmse: 1.53841
[2000]	training's rmse: 1.50537	valid_1's rmse: 1.53815
[2100]	training's rmse: 1.50346	valid_1's rmse: 1.53801


[2600]	training's rmse: 1.49359	valid_1's rmse: 1.54037
[2700]	training's rmse: 1.49172	valid_1's rmse: 1.54028
[2800]	training's rmse: 1.48998	valid_1's rmse: 1.54018
[2900]	training's rmse: 1.48825	valid_1's rmse: 1.54008
[3000]	training's rmse: 1.48658	valid_1's rmse: 1.53999
[3100]	training's rmse: 1.48491	valid_1's rmse: 1.53993
[3200]	training's rmse: 1.48325	valid_1's rmse: 1.53986
[3300]	training's rmse: 1.48162	valid_1's rmse: 1.53978
[3400]	training's rmse: 1.48	valid_1's rmse: 1.53975
[3500]	training's rmse: 1.47836	valid_1's rmse: 1.5397
[3600]	training's rmse: 1.4767	valid_1's rmse: 1.53967
[3700]	training's rmse: 1.47507	valid_1's rmse: 1.53963
[3800]	training's rmse: 1.47346	valid_1's rmse: 1.53962
[3900]	training's rmse: 1.47187	valid_1's rmse: 1.53959
[4000]	training's rmse: 1.4703	valid_1's rmse: 1.53954
[4100]	training's rmse: 1.46875	valid_1's rmse: 1.53948
[4200]	training's rmse: 1.46718	valid_1's rmse: 1.53942
[4300]	training's rmse: 1.46572	valid_1's rmse: 1.5393

[3400]	training's rmse: 1.47967	valid_1's rmse: 1.53786
[3500]	training's rmse: 1.47795	valid_1's rmse: 1.53778
[3600]	training's rmse: 1.47633	valid_1's rmse: 1.53773
[3700]	training's rmse: 1.47471	valid_1's rmse: 1.53769
[3800]	training's rmse: 1.47312	valid_1's rmse: 1.53763
[3900]	training's rmse: 1.47153	valid_1's rmse: 1.53759
[4000]	training's rmse: 1.4699	valid_1's rmse: 1.53752
[4100]	training's rmse: 1.46832	valid_1's rmse: 1.53748
Early stopping, best iteration is:
[4091]	training's rmse: 1.46847	valid_1's rmse: 1.53747
CV score: 1.54074 


In [23]:
model_without_outliers_n_repeats_2 = pd.DataFrame({"card_id":test["card_id"].values})
model_without_outliers_n_repeats_2["target"] = normal_predictions_n_repeats_2

## Part 5: Training Model For Outliers Classification for 5 fold (n_repeats = 2)

In [100]:
def lgbm_classification_train_n_repeats_2(df_train, target, df_test, param, features, categorical_feats):
    folds = RepeatedKFold(n_splits=5, n_repeats=2, random_state=15)
    oof = np.zeros(len(df_train))
    predictions = np.zeros(len(df_test))
    feature_importance_df = pd.DataFrame()

    start = time.time()
    for fold_, (trn_idx, val_idx) in enumerate(folds.split(df_train.values, target.values)):
        print("fold n°{}".format(fold_))
        trn_data = lgb.Dataset(df_train.iloc[trn_idx][features], label=target.iloc[trn_idx], categorical_feature=categorical_feats)
        val_data = lgb.Dataset(df_train.iloc[val_idx][features], label=target.iloc[val_idx], categorical_feature=categorical_feats)

        num_round = 10000
        clf = lgb.train(param, trn_data, num_round, valid_sets = [trn_data, val_data], verbose_eval=100, early_stopping_rounds = 200)
        oof[val_idx] = clf.predict(df_train.iloc[val_idx][features], num_iteration=clf.best_iteration)

        fold_importance_df = pd.DataFrame()
        fold_importance_df["feature"] = features
        fold_importance_df["importance"] = clf.feature_importance()
        fold_importance_df["fold"] = fold_ + 1
        feature_importance_df = pd.concat([feature_importance_df, fold_importance_df], axis=0)

        predictions += clf.predict(df_test[features], num_iteration=clf.best_iteration) / (5 * 2)
        
    print("CV score: {:<8.5f}".format(log_loss(target, oof)))

    return predictions, oof

In [101]:
param = {'num_leaves': 31,
         'min_data_in_leaf': 30, 
         'objective':'binary',
         'max_depth': 6,
         'learning_rate': 0.01,
         "boosting": "rf",
         "feature_fraction": 0.9,
         "bagging_freq": 1,
         "bagging_fraction": 0.9 ,
         "bagging_seed": 11,
         "metric": 'binary_logloss',
         "lambda_l1": 0.1,
         "verbosity": -1,
         "random_state": 2333}

with open('../input/train_test_target_with_target.pkl', 'rb') as f:
    [train, target, test, normal_idx, outlier_idx] = pickle.load(f)

target = train['outliers']
del train['outliers']
features = [c for c in train.columns if c not in ['card_id', 'first_active_month']]
categorical_feats = [c for c in features if 'feature_' in c]
outlier_label_n_repeats_2, oof_class_n_repeats_2 = lgbm_classification_train_n_repeats_2(train, 
                                                                                         target, 
                                                                                         test, 
                                                                                         param, 
                                                                                         features, 
                                                                                         categorical_feats)

fold n°0




Training until validation scores don't improve for 200 rounds.
[100]	training's binary_logloss: 0.0462539	valid_1's binary_logloss: 0.0483859
[200]	training's binary_logloss: 0.0462506	valid_1's binary_logloss: 0.0483834
Early stopping, best iteration is:
[47]	training's binary_logloss: 0.0462377	valid_1's binary_logloss: 0.0482944
fold n°1
Training until validation scores don't improve for 200 rounds.
[100]	training's binary_logloss: 0.0466662	valid_1's binary_logloss: 0.0470919
[200]	training's binary_logloss: 0.0466763	valid_1's binary_logloss: 0.047128
Early stopping, best iteration is:
[47]	training's binary_logloss: 0.0466228	valid_1's binary_logloss: 0.0470608
fold n°2
Training until validation scores don't improve for 200 rounds.
[100]	training's binary_logloss: 0.0465256	valid_1's binary_logloss: 0.0466832
[200]	training's binary_logloss: 0.0464995	valid_1's binary_logloss: 0.0466729
[300]	training's binary_logloss: 0.0465142	valid_1's binary_logloss: 0.0466902
Early stopping,

In [32]:
df_outlier_prob_n_repeats_2 = pd.DataFrame({"card_id":test["card_id"].values})
df_outlier_prob_n_repeats_2["target"] = outlier_label_n_repeats_2
df_outlier_prob_n_repeats_2.head()

Unnamed: 0,card_id,target
0,C_ID_0ab67a22ab,0.024092
1,C_ID_130fd0cbdd,0.001818
2,C_ID_b709037bc5,0.011282
3,C_ID_d27d835a9f,0.001818
4,C_ID_2b5e3df5c2,0.001818


## Part 6: Combining Submission for 5 fold (n_repeats = 2)


In [33]:
outlier_id_n_repeats_2 = pd.DataFrame(\
                                      df_outlier_prob_n_repeats_2.sort_values(by='target',
                                                                              ascending = False)
                                      .head(25000)['card_id'])

In [38]:
best_submission = pd.read_csv('../result/Blend2_v2.csv')

In [39]:
print(best_submission.shape[0])
best_submission.head()

123623


Unnamed: 0,card_id,target
0,C_ID_0ab67a22ab,-2.346967
1,C_ID_130fd0cbdd,-0.35402
2,C_ID_b709037bc5,-0.932773
3,C_ID_d27d835a9f,-0.148607
4,C_ID_2b5e3df5c2,-1.090599


In [40]:
print(outlier_id.shape[0])
most_likely_liers_n_repeats_2 = best_submission.merge(outlier_id_n_repeats_2,how='right')
most_likely_liers_n_repeats_2.head()

25000


Unnamed: 0,card_id,target
0,C_ID_0ab67a22ab,-2.346967
1,C_ID_6d8dba8475,-0.881375
2,C_ID_4859ac9ed5,-0.641561
3,C_ID_7f1041e8e1,-5.193301
4,C_ID_22e4a47c72,0.341024


In [41]:
%%time
for card_id in most_likely_liers_n_repeats_2['card_id']:
    model_without_outliers_n_repeats_2.loc[model_without_outliers_n_repeats_2['card_id']==card_id,'target']\
    = most_likely_liers_n_repeats_2.loc[most_likely_liers_n_repeats_2['card_id']==card_id,'target'].values

CPU times: user 4min 16s, sys: 44 ms, total: 4min 16s
Wall time: 4min 16s


In [56]:
model_without_outliers_n_repeats_2.to_csv("../result/Blend2_v6_n_repeats_2.csv", index=False)

In [47]:
len(model_without_outliers['target'].values.tolist())

123623

In [48]:
len(model_without_outliers_n_repeats_2['target'].values.tolist())

123623

In [51]:
oof.shape

(199644,)

In [52]:
oof_n_repeats_2.shape

(199644,)

In [55]:
target.shape, len(normal_idx), len(outlier_idx)

((201917,), 199644, 2273)

## Part 7: Stacking

In [91]:
print(type(target))
print(type(oof))
print(type(oof_normal_final))

<class 'pandas.core.series.Series'>
<class 'numpy.ndarray'>
<class 'pandas.core.series.Series'>


In [104]:
len(outlier_idx), len(outlier_idx)

(2273, 2273)

In [95]:
# oof_normal_final

In [105]:
with open('../input/train_test_target_with_target.pkl', 'rb') as f:
    [train, target, test, normal_idx, outlier_idx] = pickle.load(f)
    
oof_normal_final = pd.Series(np.zeros(len(target)))
oof_normal_final[normal_idx] = oof
oof_normal_final[outlier_idx] = outlier_idx

oof_n_repeats_2_final = pd.Series(np.zeros(len(target)))
oof_n_repeats_2_final[normal_idx] = oof_n_repeats_2
oof_n_repeats_2_final[outlier_idx] = outlier_idx

In [114]:
train_stack = np.vstack([oof_normal_final,oof_n_repeats_2_final]).transpose()
test_stack = np.vstack([model_without_outliers['target'].values.tolist(),
                        model_without_outliers_n_repeats_2['target'].values.tolist()]).transpose()

folds = RepeatedKFold(n_splits=5,n_repeats=1,random_state=4520)
oof_stack = np.zeros(train_stack.shape[0])
predictions_stack = np.zeros(test_stack.shape[0])

for fold_, (trn_idx, val_idx) in enumerate(folds.split(train_stack, target)):
    print("fold n°{}".format(fold_))
    trn_data, trn_y = train_stack[trn_idx], target.iloc[trn_idx].values
    val_data, val_y = train_stack[val_idx], target.iloc[val_idx].values

    print("-" * 10 + "Stacking " + str(fold_) + "-" * 10)
#     cb_model = CatBoostRegressor(iterations=3000, learning_rate=0.1, depth=8, l2_leaf_reg=20, bootstrap_type='Bernoulli',  eval_metric='RMSE', metric_period=50, od_type='Iter', od_wait=45, random_seed=17, allow_writing_files=False)
#     cb_model.fit(trn_data, trn_y, eval_set=(val_data, val_y), cat_features=[], use_best_model=True, verbose=True)
    clf = BayesianRidge()
    clf.fit(trn_data, trn_y)
    
    oof_stack[val_idx] = clf.predict(val_data)
    predictions_stack += clf.predict(test_stack) / 5


np.sqrt(mean_squared_error(target.values, oof_stack))

fold n°0
----------Stacking 0----------
fold n°1
----------Stacking 1----------
fold n°2
----------Stacking 2----------
fold n°3
----------Stacking 3----------
fold n°4
----------Stacking 4----------


2.4648553532909525

In [115]:
sample_submission = pd.read_csv('../input/sample_submission.csv')
sample_submission['target'] = predictions_stack
sample_submission.to_csv('../result/Bayesian_Ridge_Stacking.csv', index=False)

In [116]:
sample_submission = pd.read_csv('../input/sample_submission.csv')
sample1 = pd.read_csv("../result/3.695.csv")
sample2 = pd.read_csv("../result/combining_submission (1).csv")
sample_submission['target'] = model_without_outliers['target'] * 0.5 + model_without_outliers_n_repeats_2['target'] * 0.5
sample_submission.to_csv("../result/Blend1.csv", index = False)
sample_submission['target'] = sample_submission['target'] * 0.2 + sample1['target'] * 0.2 + sample2['target'] * 0.6
sample_submission.to_csv('../result/Blend2_v6.csv', index=False)

`Blend2_v6.csv` got the best submission score so far - 3.691.