# Single Model

In [1]:
import datetime
import gc
import lightgbm as lgb
import xgboost as xgb
import matplotlib.pyplot as plt
import numpy as np
import os
import pandas as pd
import seaborn as sns
import time
import warnings
import xlearn as xl
import scipy.sparse as sp
import catboost as cb

from contextlib import contextmanager
from pandas.core.common import SettingWithCopyWarning
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import KFold, StratifiedKFold, RepeatedStratifiedKFold
import numba
import pickle

import h2o
from h2o.automl import H2OAutoML


warnings.simplefilter(action='ignore', category=SettingWithCopyWarning)
warnings.simplefilter(action='ignore', category=FutureWarning)

pd.set_option('display.max_rows', 1000)
pd.set_option('display.max_columns', 1000)

In [2]:
@contextmanager
def timer(title):
    t0 = time.time()
    yield
    print("***{} - done in {:.0f}s".format(title, time.time() - t0))

# rmse
@numba.jit
def rmse(y_true, y_pred):
    return np.sqrt(mean_squared_error(y_true, y_pred))

# One-hot encoding for categorical columns with get_dummies
def one_hot_encoder(df, nan_as_category = True):
    original_columns = list(df.columns)
    categorical_columns = [col for col in df.columns if df[col].dtype == 'object']
    df = pd.get_dummies(df, columns= categorical_columns, dummy_na= nan_as_category)
    new_columns = [c for c in df.columns if c not in original_columns]
    return df, new_columns
    
# Display/plot feature importance
def display_importances(feature_importance_df_, model, straified_opt, figsize=(16, 50)):
    # cols = feature_importance_df_[["feature", "importance"]].groupby("feature").mean().sort_values(by="importance", ascending=False)[:40].index
    cols = feature_importance_df_[["feature", "importance"]].groupby("feature").mean().sort_values(by="importance", ascending=False).index
    best_features = feature_importance_df_.loc[feature_importance_df_.feature.isin(cols)]

    plt.figure(figsize=figsize)
    sns.barplot(x="importance", y="feature", data=best_features.sort_values(by="importance", ascending=False))
    if model == 'lgb':
        model_name = 'Lightgbm'
    elif model == 'xgb':
        model_name = 'Xgboost'
    elif model == 'cat':
        model_name = 'Catboost'
    plt.title(model_name + ' Features (avg over folds)')
    plt.tight_layout()
    if straified_opt:
        plt.savefig('../img/single_model_v'+str(write_ver)+'_importances_'+model+'_straified.png')
    else:
        plt.savefig('../img/single_model_v'+str(write_ver)+'_importances_'+model+'.png')

# reduce memory
def reduce_mem_usage(df, verbose=True):
    numerics = ['int16', 'int32', 'int64', 'float16', 'float32', 'float64']
    start_mem = df.memory_usage().sum() / 1024**2
    for col in df.columns:
        col_type = df[col].dtypes
        if col_type in numerics:
            c_min = df[col].min()
            c_max = df[col].max()
            if str(col_type)[:3] == 'int':
                if c_min > np.iinfo(np.int8).min and c_max < np.iinfo(np.int8).max:
                    df[col] = df[col].astype(np.int8)
                elif c_min > np.iinfo(np.int16).min and c_max < np.iinfo(np.int16).max:
                    df[col] = df[col].astype(np.int16)
                elif c_min > np.iinfo(np.int32).min and c_max < np.iinfo(np.int32).max:
                    df[col] = df[col].astype(np.int32)
                elif c_min > np.iinfo(np.int64).min and c_max < np.iinfo(np.int64).max:
                    df[col] = df[col].astype(np.int64)
            else:
                if c_min > np.finfo(np.float16).min and c_max < np.finfo(np.float16).max:
                    df[col] = df[col].astype(np.float16)
                elif c_min > np.finfo(np.float32).min and c_max < np.finfo(np.float32).max:
                    df[col] = df[col].astype(np.float32)
                else:
                    df[col] = df[col].astype(np.float64)

    end_mem = df.memory_usage().sum() / 1024**2
    print('Memory usage after optimization is: {:.2f} MB'.format(end_mem))
    print('Decreased by {:.1f}%'.format(100 * (start_mem - end_mem) / start_mem))

    return df


def cleaning(train_df, test_df):
    train_df = train_df.replace([np.inf, -np.inf], np.nan)
    test_df = test_df.replace([np.inf, -np.inf], np.nan)

    train_df = train_df.fillna(0)
    test_df = test_df.fillna(0)
    return train_df, test_df

In [3]:
# preprocessing train & test
def train_test(num_rows=None, debug=False):

    # load csv
    train_df = pd.read_csv('../input/train.csv', index_col=['card_id'], nrows=num_rows)
    test_df = pd.read_csv('../input/test.csv', index_col=['card_id'], nrows=num_rows)

    print("Train samples: {}, test samples: {}".format(len(train_df), len(test_df)))

    # outlier
    train_df['outliers'] = 0
    train_df.loc[train_df['target'] < -30, 'outliers'] = 1

    # set target as nan
    test_df['target'] = np.nan

    # merge
    df = train_df.append(test_df)

    del train_df, test_df
    gc.collect()

    feat_list = ['feature_1', 'feature_2', 'feature_3']
    for feat in feat_list:
        df[feat + '_orig'] = df[feat]
    
    df['feature_1_2_cross'] = df['feature_1_orig'].astype(str).add('-').add(df['feature_2_orig'].astype(str))
    df['feature_1_3_cross'] = df['feature_1_orig'].astype(str).add('-').add(df['feature_3_orig'].astype(str))
    df['feature_2_3_cross'] = df['feature_2_orig'].astype(str).add('-').add(df['feature_3_orig'].astype(str))
    
    if debug:
        print_train_test_feat_cross(df)
    
    df['feature_1_2_cross'] = df['feature_1_2_cross'].map({'1-1': 0, '1-2': 1, '1-3': 2,
                                                           '2-1': 3, '2-2': 4, '2-3': 5, 
                                                           '3-1': 6, '3-2': 7, '3-3': 8, 
                                                           '4-1': 9, '4-2': 10, '4-3': 11,
                                                           '5-1': 12, '5-2': 13
                                                          }).astype(int)
    df['feature_1_3_cross'] = df['feature_1_3_cross'].map({'1-0': 0, '2-0': 1, '3-1': 2, '4-0': 3, '5-1': 4
                                                          }).astype(int)
    df['feature_2_3_cross'] = df['feature_2_3_cross'].map({'1-0': 0, '1-1': 1, '2-0': 2, '2-1': 3, '3-0': 4, '3-1': 5
                                                          }).astype(int)
    
    df = pd.get_dummies(df, columns=['feature_1', 'feature_2', 'feature_3', 
                                     'feature_1_2_cross', 'feature_1_3_cross', 'feature_2_3_cross'])
    for feat in feat_list:
        df[feat] = df[feat + '_orig'] 
    
    # to datetime
    df['first_active_month'] = pd.to_datetime(df['first_active_month'])

    # datetime features
    # df['quarter'] = df['first_active_month'].dt.quarter
    df['elapsed_time'] = (datetime.datetime.today() - df['first_active_month']).dt.days

    df['days_feature1'] = df['elapsed_time'] * df['feature_1']
    df['days_feature2'] = df['elapsed_time'] * df['feature_2']
    df['days_feature3'] = df['elapsed_time'] * df['feature_3']

    df['days_feature1_ratio'] = df['feature_1'] / df['elapsed_time']
    df['days_feature2_ratio'] = df['feature_2'] / df['elapsed_time']
    df['days_feature3_ratio'] = df['feature_3'] / df['elapsed_time']

    # one hot encoding
    df, cols = one_hot_encoder(df, nan_as_category=False)

    for f in ['feature_1','feature_2','feature_3']:
        order_label = df.groupby([f])['outliers'].mean()
        df[f] = df[f].map(order_label)

    df['feature_sum'] = df['feature_1'] + df['feature_2'] + df['feature_3']
    df['feature_mean'] = df['feature_sum']/3
    df['feature_max'] = df[['feature_1', 'feature_2', 'feature_3']].max(axis=1)
    df['feature_min'] = df[['feature_1', 'feature_2', 'feature_3']].min(axis=1)
    df['feature_var'] = df[['feature_1', 'feature_2', 'feature_3']].std(axis=1)
    
    return df

In [4]:
def fill_na(df):
    df['category_2'].fillna(1.0,inplace=True)
    df["category_2"] = df["category_2"].astype(int)
    df['category_3'].fillna('A',inplace=True)
    df['merchant_id'].fillna('M_ID_00a6ca8a8a',inplace=True)
    df['installments'].replace(-1, np.nan,inplace=True)
    df['installments'].replace(999, np.nan,inplace=True)
    
    return df

def encode_to_numeric(df):
    df['authorized_flag'] = df['authorized_flag'].map({'Y': 1, 'N': 0}).astype(int)
    df['category_1'] = df['category_1'].map({'Y': 1, 'N': 0}).astype(int)
    df['category_3'] = df['category_3'].map({'A':0, 'B':1, 'C':2})
    # df['category_4'] = df['category_4'].map({'Y': 2, 'N': 1, 'NaN':0}).astype(int)
    
    return df

def gen_datetime(df):
    df['purchase_date'] = pd.to_datetime(df['purchase_date'])
    df['month'] = df['purchase_date'].dt.month
    df['day'] = df['purchase_date'].dt.day
    df['hour'] = df['purchase_date'].dt.hour
    df['weekofyear'] = df['purchase_date'].dt.weekofyear
    df['weekday'] = df['purchase_date'].dt.weekday
    df['weekend'] = (df['purchase_date'].dt.weekday >=5).astype(int)
    
    return df


def convert_categorical(df):
    
    feat_list = ['category_1', 'category_2', 'category_3', 'subsector_id']
    
    df['category_1_2_cross'] = df['category_1'].astype(str).add('_').add(df['category_2'].astype(str))
    df['category_1_3_cross'] = df['category_1'].astype(str).add('_').add(df['category_3'].astype(str))
    df['category_2_3_cross'] = df['category_2'].astype(str).add('_').add(df['category_3'].astype(str))
    # df['category_1_2_3_cross'] = df['category_1'].astype(str)\
    #                                .add('_').add(df['category_2'].astype(str))\
    #                                .add('_').add(df['category_3'].astype(str))
    
    for feat in feat_list:
        df[feat + '_orig'] = df[feat]
        
    df = pd.get_dummies(df, columns=['category_1', 'category_2', 'category_3', 
                                     'category_1_2_cross', 'category_1_3_cross', 'category_2_3_cross',
                                     'subsector_id'])
    
    drop_feat_list = ['category_2_0', 'category_3_0', 'category_4_0', 
                      'category_1_2_cross_0_0', 'category_1_2_cross_1_0', 'category_1_3_cross_0_0', 
                      'category_1_3_cross_1_0', 'category_2_3_cross_0_0', 'category_2_3_cross_0_1', 
                      'category_2_3_cross_0_2', 'category_2_3_cross_0_3', 'category_2_3_cross_1_0', 
                      'category_2_3_cross_2_0', 'category_2_3_cross_3_0', 'category_2_3_cross_4_0', 
                      'category_2_3_cross_5_0', 
                      'subsector_id_-1']
    for feat in drop_feat_list:
        if feat in df.columns:
            df = df.drop([feat], axis=1)
    
    for feat in feat_list:
        df = df.rename(index=str, columns={feat + '_orig': feat})
    
    for i in range(1, 42):
        if i != 6:
            df['subsector_id_'+str(i)+'_purchase_amount'] = df['subsector_id_'+str(i)] * df['purchase_amount']
            
    return df


def gen_other_feat(df):
    df['price'] = df['purchase_amount'] / df['installments']

    
    day_list = [7]
    for day in day_list:
#         #2017
#         #New year
#         df['new_year_2017_'+str(day)]=(pd.to_datetime('2017-01-01')-df['purchase_date']).dt.days.apply(lambda x: x if x >= 0 and x <= day else 0)
#         #Tiradentes Day: April 21, 2017
#         df['Tiradentes_2017_'+str(day)]=(pd.to_datetime('2017-04-21')-df['purchase_date']).dt.days.apply(lambda x: x if x >= 0 and x < day else 0)
#         #Labor Day: May 1, 2017
#         df['labor_2017_'+str(day)]=(pd.to_datetime('2017-05-01')-df['purchase_date']).dt.days.apply(lambda x: x if x >= 0 and x < day else 0)
#         #Mothers Day: May 14 2017
#         df['Mothers_Day_2017_'+str(day)]=(pd.to_datetime('2017-06-04')-df['purchase_date']).dt.days.apply(lambda x: x if x >= 0 and x < day else 0)
#         #Valentine's Day : 12th June, 2017
#         df['Valentine_Day_2017_'+str(day)]=(pd.to_datetime('2017-06-12')-df['purchase_date']).dt.days.apply(lambda x: x if x >= 0 and x < day else 0)
#         #fathers day: August 13 2017
#         df['fathers_day_2017_'+str(day)]=(pd.to_datetime('2017-08-13')-df['purchase_date']).dt.days.apply(lambda x: x if x >= 0 and x < day else 0)
#         #Christmas : December 25 2017
#         df['Christmas_Day_2017_'+str(day)]=(pd.to_datetime('2017-12-25')-df['purchase_date']).dt.days.apply(lambda x: x if x >= 0 and x <= day else 0)
#         #Black Friday : 24th November 2017
#         df['Black_Friday_2017_'+str(day)]=(pd.to_datetime('2017-11-24') - df['purchase_date']).dt.days.apply(lambda x: x if x >= 0 and x < day else 0)
#         #Childrens day: October 12 2017
#         df['Children_day_2017_'+str(day)]=(pd.to_datetime('2017-10-12')-df['purchase_date']).dt.days.apply(lambda x: x if x >= 0 and x < day else 0)
#         #2018
#         #New year
#         df['new_year_2018_'+str(day)]=(pd.to_datetime('2018-01-01')-df['purchase_date']).dt.days.apply(lambda x: x if x >= 0 and x <= day else 0)
#         #Mothers Day: May 13 2018
#         df['Mothers_Day_2018_'+str(day)]=(pd.to_datetime('2018-05-13')-df['purchase_date']).dt.days.apply(lambda x: x if x > 0 and x < day else 0)
        
        #2017
        #New year
        df['new_year_2017_'+str(day)+'_amt']=(pd.to_datetime('2017-01-01')-df['purchase_date']).dt.days.apply(lambda x: 1 if x >= 0 and x <= day else 0) * df['purchase_amount']
        #Valentine's Day : 12th June, 2017
        df['Valentine_Day_2017_'+str(day)+'_amt']=(pd.to_datetime('2017-02-14')-df['purchase_date']).dt.days.apply(lambda x: 1 if x >= 0 and x < day else 0) * df['purchase_amount']
        #Tiradentes Day: April 21, 2017
        df['Tiradentes_2017_'+str(day)+'_amt']=(pd.to_datetime('2017-04-21')-df['purchase_date']).dt.days.apply(lambda x: 1 if x >= 0 and x < day else 0) * df['purchase_amount']
        #Labor Day: May 1, 2017
        df['labor_2017_'+str(day)+'_amt']=(pd.to_datetime('2017-05-01')-df['purchase_date']).dt.days.apply(lambda x: 1 if x >= 0 and x < day else 0) * df['purchase_amount']
        #Mothers Day: May 14 2017
        df['Mothers_Day_2017_'+str(day)+'_amt']=(pd.to_datetime('2017-05-14')-df['purchase_date']).dt.days.apply(lambda x: 1 if x >= 0 and x < day else 0) * df['purchase_amount']
        #Corpus Christi
        df['Corpus_christi_2017_'+str(day)+'_amt']=(pd.to_datetime('2017-06-20')-df['purchase_date']).dt.days.apply(lambda x: 1 if x >= 0 and x < day else 0) * df['purchase_amount']
        #fathers day: August 13 2017
        df['fathers_day_2017_'+str(day)+'_amt']=(pd.to_datetime('2017-08-13')-df['purchase_date']).dt.days.apply(lambda x: 1 if x >= 0 and x < day else 0) * df['purchase_amount']
        #Independence: Sep 7 2017
        df['independence_2017_'+str(day)+'_amt']=(pd.to_datetime('2017-09-07')-df['purchase_date']).dt.days.apply(lambda x: 1 if x >= 0 and x < day else 0) * df['purchase_amount']
        #Childrens day: October 12 2017
        df['Children_day_2017_'+str(day)+'_amt']=(pd.to_datetime('2017-10-12')-df['purchase_date']).dt.days.apply(lambda x: 1 if x >= 0 and x < day else 0) * df['purchase_amount']
        #All soul's : 2th November 2017
        df['All_souls_2017_'+str(day)+'_amt']=(pd.to_datetime('2017-11-02') - df['purchase_date']).dt.days.apply(lambda x: 1 if x >= 0 and x < day else 0) * df['purchase_amount']
        #Republic day's : 2th November 2017
        df['Republic_day_2017_'+str(day)+'_amt']=(pd.to_datetime('2017-11-15') - df['purchase_date']).dt.days.apply(lambda x: 1 if x >= 0 and x < day else 0) * df['purchase_amount']
        #Black Friday : 24th November 2017
        df['Black_Friday_2017_'+str(day)+'_amt']=(pd.to_datetime('2017-11-24') - df['purchase_date']).dt.days.apply(lambda x: 1 if x >= 0 and x < day else 0) * df['purchase_amount']
        #Christmas : December 25 2017
        df['Christmas_Day_2017_'+str(day)+'_amt']=(pd.to_datetime('2017-12-25')-df['purchase_date']).dt.days.apply(lambda x: 1 if x >= 0 and x <= day else 0) * df['purchase_amount']
        #2018
        #New year
        df['new_year_2018_'+str(day)+'_amt']=(pd.to_datetime('2018-01-01')-df['purchase_date']).dt.days.apply(lambda x: 1 if x >= 0 and x <= day else 0) * df['purchase_amount']
        #Valentine's Day : 12th June, 2017
        df['Valentine_Day_2018_'+str(day)+'_amt']=(pd.to_datetime('2018-02-14')-df['purchase_date']).dt.days.apply(lambda x: 1 if x >= 0 and x < day else 0) * df['purchase_amount']
        #Mothers Day: May 13 2018
        df['Mothers_Day_2018_'+str(day)+'_amt']=(pd.to_datetime('2018-05-13')-df['purchase_date']).dt.days.apply(lambda x: 1 if x > 0 and x < day else 0) * df['purchase_amount']
        
    df['month_diff'] = ((datetime.datetime.today() - df['purchase_date']).dt.days)//30
    df['month_diff'] += df['month_lag']
    
    df['category_1_0_month_diff'] = df['month_diff'] * df['category_1_0']
    df['category_1_1_month_diff'] = df['month_diff'] * df['category_1_1']
    df['category_2_1_month_diff'] = df['month_diff'] * df['category_2_1']
    df['category_2_2_month_diff'] = df['month_diff'] * df['category_2_2']
    df['category_2_3_month_diff'] = df['month_diff'] * df['category_2_3']
    df['category_2_4_month_diff'] = df['month_diff'] * df['category_2_4']
    df['category_2_5_month_diff'] = df['month_diff'] * df['category_2_5']
    df['category_3_1_month_diff'] = df['month_diff'] * df['category_3_1']
    df['category_3_2_month_diff'] = df['month_diff'] * df['category_3_2']

    # additional features
    df['duration'] = df['purchase_amount']*df['month_diff']
    df['amount_month_ratio'] = df['purchase_amount']/df['month_diff']
    
    # duration by category
    df['category_1_0_duration'] = df['duration'] * df['category_1_0']
    df['category_1_1_duration'] = df['duration'] * df['category_1_1']
    df['category_2_1_duration'] = df['duration'] * df['category_2_1']
    df['category_2_2_duration'] = df['duration'] * df['category_2_2']
    df['category_2_3_duration'] = df['duration'] * df['category_2_3']
    df['category_2_4_duration'] = df['duration'] * df['category_2_4']
    df['category_2_5_duration'] = df['duration'] * df['category_2_5']
    df['category_3_1_duration'] = df['duration'] * df['category_3_1']
    df['category_3_2_duration'] = df['duration'] * df['category_3_2']
    
    # installments by category
    df['category_1_0_installments'] = df['installments'] * df['category_1_0']
    df['category_1_1_installments'] = df['installments'] * df['category_1_1']
    df['category_2_1_installments'] = df['installments'] * df['category_2_1']
    df['category_2_2_installments'] = df['installments'] * df['category_2_2']
    df['category_2_3_installments'] = df['installments'] * df['category_2_3']
    df['category_2_4_installments'] = df['installments'] * df['category_2_4']
    df['category_2_5_installments'] = df['installments'] * df['category_2_5']
    df['category_3_1_installments'] = df['installments'] * df['category_3_1']
    df['category_3_2_installments'] = df['installments'] * df['category_3_2']
    
    return df


def feat_agg(df):
    col_unique =['subsector_id', 'merchant_id', 'merchant_category_id']
    col_seas = ['month', 'hour', 'weekofyear', 'weekday', 'day']
    col_cat = [feat for feat in list(df) if 'state_id_' in feat or 
                                                 'subsector_id_' in feat]
    # col_cat = [feat for feat in list(df) if 'state_id_' in feat or 
    #                                             'subsector_id_' in feat or
    #                                             'most_recent_sales_range_' in feat or
    #                                             'most_recent_purchases_range_' in feat]

    aggs = {}
    for col in col_unique:
        aggs[col] = ['nunique']

    for col in col_seas:
        aggs[col] = ['nunique', 'mean', 'min', 'max']
        
    for col in col_cat:
        aggs[col] = ['nunique', 'mean']

    aggs['purchase_amount'] = ['sum','max','min','mean','var','skew']
    aggs['installments'] = ['sum','max','mean','var','skew']
    aggs['category_1_0_installments'] = ['sum','max','mean','var']
    aggs['category_1_1_installments'] = ['sum','max','mean','var']
    aggs['category_2_1_installments'] = ['sum','max','mean','var']
    aggs['category_2_2_installments'] = ['sum','max','mean','var']
    aggs['category_2_3_installments'] = ['sum','max','mean','var']
    aggs['category_2_4_installments'] = ['sum','max','mean','var']
    aggs['category_2_5_installments'] = ['sum','max','mean','var']
    aggs['category_3_1_installments'] = ['sum','max','mean','var']
    aggs['category_3_2_installments'] = ['sum','max','mean','var']
    aggs['purchase_date'] = ['max','min']
    aggs['month_lag'] = ['max','min','mean','var','skew']
    aggs['month_diff'] = ['max','min','mean','var','skew']
    aggs['category_1_0_month_diff'] = ['max','min','mean','var']
    aggs['category_1_1_month_diff'] = ['max','min','mean','var']
    aggs['category_2_1_month_diff'] = ['max','min','mean','var']
    aggs['category_2_2_month_diff'] = ['max','min','mean','var']
    aggs['category_2_3_month_diff'] = ['max','min','mean','var']
    aggs['category_2_4_month_diff'] = ['max','min','mean','var']
    aggs['category_2_5_month_diff'] = ['max','min','mean','var']
    aggs['category_3_1_month_diff'] = ['max','min','mean','var']
    aggs['category_3_2_month_diff'] = ['max','min','mean','var']
    aggs['authorized_flag'] = ['mean']
    aggs['weekend'] = ['mean'] # overwrite
    aggs['weekday'] = ['mean'] # overwrite
    aggs['day'] = ['nunique', 'mean', 'min'] # overwrite
    aggs['category_1'] = ['mean']
    aggs['category_2'] = ['mean']
    aggs['category_3'] = ['mean']
    aggs['card_id'] = ['size','count']
    aggs['price'] = ['sum','mean','max','min','var']
    
    day_list = [7]
    for day in day_list:
#         aggs['new_year_2017_'+str(day)] = ['mean']
#         aggs['Tiradentes_2017_'+str(day)] = ['mean']
#         aggs['labor_2017_'+str(day)] = ['mean']
#         aggs['Mothers_Day_2017_'+str(day)] = ['mean']
#         aggs['Valentine_Day_2017_'+str(day)] = ['mean']
#         aggs['fathers_day_2017_'+str(day)] = ['mean']
#         aggs['Children_day_2017_'+str(day)] = ['mean']
#         aggs['Black_Friday_2017_'+str(day)] = ['mean']
#         aggs['Christmas_Day_2017_'+str(day)] = ['mean']
#         aggs['new_year_2018_'+str(day)] = ['mean']
#         aggs['Mothers_Day_2018_'+str(day)] = ['mean']
#        
        aggs['new_year_2017_'+str(day)+'_amt'] = ['mean', 'min', 'max', 'sum']
        aggs['Valentine_Day_2017_'+str(day)+'_amt'] = ['mean', 'min', 'max', 'sum']
        aggs['Tiradentes_2017_'+str(day)+'_amt'] = ['mean', 'min', 'max', 'sum']
        aggs['labor_2017_'+str(day)+'_amt'] = ['mean', 'min', 'max', 'sum']
        aggs['Mothers_Day_2017_'+str(day)+'_amt'] = ['mean', 'min', 'max', 'sum']
        aggs['Corpus_christi_2017_'+str(day)+'_amt'] = ['mean', 'min', 'max', 'sum']
        aggs['fathers_day_2017_'+str(day)+'_amt'] = ['mean', 'min', 'max', 'sum']
        aggs['independence_2017_'+str(day)+'_amt'] = ['mean', 'min', 'max', 'sum']
        aggs['Children_day_2017_'+str(day)+'_amt'] = ['mean', 'min', 'max', 'sum']
        aggs['All_souls_2017_'+str(day)+'_amt'] = ['mean', 'min', 'max', 'sum']
        aggs['Republic_day_2017_'+str(day)+'_amt'] = ['mean', 'min', 'max', 'sum']
        aggs['Black_Friday_2017_'+str(day)+'_amt'] = ['mean', 'min', 'max', 'sum']
        aggs['Christmas_Day_2017_'+str(day)+'_amt'] = ['mean', 'min', 'max', 'sum']
        aggs['new_year_2018_'+str(day)+'_amt'] = ['mean', 'min', 'max', 'sum']
        aggs['Valentine_Day_2018_'+str(day)+'_amt'] = ['mean', 'min', 'max', 'sum']
        aggs['Mothers_Day_2018_'+str(day)+'_amt'] = ['mean', 'min', 'max', 'sum']
        
    aggs['duration']=['mean','min','max','var','skew']
    aggs['category_1_0_duration']=['mean','min','max','var']
    aggs['category_1_1_duration']=['mean','min','max','var']
    aggs['category_2_1_duration']=['mean','min','max','var']
    aggs['category_2_2_duration']=['mean','min','max','var']
    aggs['category_2_3_duration']=['mean','min','max','var']
    aggs['category_2_4_duration']=['mean','min','max','var']
    aggs['category_2_5_duration']=['mean','min','max','var']
    aggs['category_3_1_duration']=['mean','min','max','var']
    aggs['category_3_2_duration']=['mean','min','max','var']
    aggs['amount_month_ratio']=['mean','min','max','var']
    
    for i in range(1, 42):
        if i != 6:
            aggs['subsector_id_'+str(i)+'_purchase_amount'] = ['mean', 'min', 'max']

    for col in ['category_2','category_3']:
        df[col+'_mean'] = df.groupby([col])['purchase_amount'].transform('mean')
        df[col+'_min'] = df.groupby([col])['purchase_amount'].transform('min')
        df[col+'_max'] = df.groupby([col])['purchase_amount'].transform('max')
        df[col+'_sum'] = df.groupby([col])['purchase_amount'].transform('sum')
        aggs[col+'_mean'] = ['mean']

    df = df.reset_index().groupby('card_id').agg(aggs)
    
    df.columns = pd.Index([e[0] + "_" + e[1] for e in df.columns.tolist()])
    
    df['purchase_date_diff'] = (df['purchase_date_max']-df['purchase_date_min']).dt.days
    df['purchase_date_average'] = df['purchase_date_diff']/df['card_id_size']
    df['purchase_date_uptonow'] = (datetime.datetime.today()-df['purchase_date_max']).dt.days
    df['purchase_date_uptomin'] = (datetime.datetime.today()-df['purchase_date_min']).dt.days
    
    return df

In [5]:
# df = pd.read_csv('../input/historical_transactions.csv', nrows=None)
# train_df = pd.read_csv('../input/train.csv', index_col=['card_id'], nrows=None)
# df = pd.merge(df, train_df, on=['card_id'], how='left')


In [6]:
# list(df)

In [7]:
# outlier_cutoff = -15
# outliers = df.loc[df['target'] < outlier_cutoff]
# non_outliers = df.loc[df['target'] >= outlier_cutoff]
# print('{:d} outliers found (target < {:d})'.format(outliers.shape[0], outlier_cutoff))

# plt.figure(figsize=[10,5])
# plt.suptitle('Outlier vs. non-outlier feature distributions', fontsize=20, y=1.1)

# for num, col in enumerate(['feature_1', 'feature_2', 'feature_3', 
#                            'category_1', 'category_2', 'category_3', 
#                            'installments', 'target']):
#     if col is not 'target':
#         plt.subplot(3, 3, num+1)
#         v_c = non_outliers[col].value_counts() / non_outliers.shape[0]
#         plt.bar(v_c.index, v_c, label=('non-outliers'), align='edge', width=-0.3, edgecolor=[0.2]*3)
#         v_c = outliers[col].value_counts() / outliers.shape[0]
#         plt.bar(v_c.index, v_c, label=('outliers'), align='edge', width=0.3, edgecolor=[0.2]*3)
#         plt.title(col)
# #         plt.legend()

# plt.tight_layout()
# plt.show()

In [8]:
# preprocessing historical transactions
def transactions(source, num_rows=None):
    # load csv
    if source == 'hist':
        df = pd.read_csv('../input/historical_transactions.csv', nrows=num_rows)
    elif source == 'new':
        df = pd.read_csv('../input/new_merchant_transactions.csv', nrows=num_rows)
    # merchant_df = pd.read_csv('../input/merchants.csv', nrows=None)
    # df = pd.merge(df, merchant_df.drop(['category_1', 'category_2', 'city_id', 
    #                                               'merchant_category_id', 'subsector_id',
    #                                               'state_id'], axis=1), on='merchant_id', how='left')
    # del merchant_df
    # gc.collect()
    
    # fillna
    df = fill_na(df)
    
    # purchase_amount processing (https://www.kaggle.com/raddar/towards-de-anonymizing-the-data-some-insights)
    # df['purchase_amount'] = df['purchase_amount'].apply(lambda x: min(x, 0.8))
    df['purchase_amount'] = np.round(df['purchase_amount'] / 0.00150265118 + 497.06,8)

    # Y/N to 1/0
    df = encode_to_numeric(df)

    # datetime features
    df = gen_datetime(df)

    # subsector_id and state_id features
    df = convert_categorical(df)
    
    # additional features
    df = gen_other_feat(df)

    # reduce memory usage
    df = reduce_mem_usage(df)

    # agg features
    df = feat_agg(df)

    # change column name
    if source == 'hist':
        df.columns = ['hist_'+ c for c in df.columns]
    elif source == 'new':
        df.columns = ['new_'+ c for c in df.columns]

    # reduce memory usage
    df = reduce_mem_usage(df)

    return df

In [9]:
# additional features
def additional_features(df):
    df['hist_first_buy'] = (df['hist_purchase_date_min'] - df['first_active_month']).dt.days
    df['hist_last_buy'] = (df['hist_purchase_date_max'] - df['first_active_month']).dt.days
    df['new_first_buy'] = (df['new_purchase_date_min'] - df['first_active_month']).dt.days
    df['new_last_buy'] = (df['new_purchase_date_max'] - df['first_active_month']).dt.days

    date_features=['hist_purchase_date_max','hist_purchase_date_min',
                   'new_purchase_date_max', 'new_purchase_date_min']

    for f in date_features:
        df[f] = df[f].astype(np.int64) * 1e-9

    df['card_id_total'] = df['new_card_id_size']+df['hist_card_id_size']
    df['card_id_cnt_total'] = df['new_card_id_count']+df['hist_card_id_count']
    df['card_id_cnt_ratio'] = df['new_card_id_count']/df['hist_card_id_count']
    df['purchase_amount_total'] = df['new_purchase_amount_sum']+df['hist_purchase_amount_sum']
    df['purchase_amount_mean'] = df['new_purchase_amount_mean']+df['hist_purchase_amount_mean']
    df['purchase_amount_max'] = df['new_purchase_amount_max']+df['hist_purchase_amount_max']
    df['purchase_amount_min'] = df['new_purchase_amount_min']+df['hist_purchase_amount_min']
    df['purchase_amount_ratio'] = df['new_purchase_amount_sum']/df['hist_purchase_amount_sum']
    df['month_diff_mean'] = df['new_month_diff_mean']+df['hist_month_diff_mean']
    df['month_diff_ratio'] = df['new_month_diff_mean']/df['hist_month_diff_mean']
    df['month_lag_mean'] = df['new_month_lag_mean']+df['hist_month_lag_mean']
    df['month_lag_max'] = df['new_month_lag_max']+df['hist_month_lag_max']
    df['month_lag_min'] = df['new_month_lag_min']+df['hist_month_lag_min']
    df['category_1_mean'] = df['new_category_1_mean']+df['hist_category_1_mean']
    df['installments_total'] = df['new_installments_sum']+df['hist_installments_sum']
    df['installments_mean'] = df['new_installments_mean']+df['hist_installments_mean']
    df['installments_max'] = df['new_installments_max']+df['hist_installments_max']
    df['installments_ratio'] = df['new_installments_sum']/df['hist_installments_sum']
    df['price_total'] = df['purchase_amount_total'] / df['installments_total']
    df['price_mean'] = df['purchase_amount_mean'] / df['installments_mean']
    df['price_max'] = df['purchase_amount_max'] / df['installments_max']
    df['duration_mean'] = df['new_duration_mean']+df['hist_duration_mean']
    df['duration_min'] = df['new_duration_min']+df['hist_duration_min']
    df['duration_max'] = df['new_duration_max']+df['hist_duration_max']
    df['amount_month_ratio_mean']=df['new_amount_month_ratio_mean']+df['hist_amount_month_ratio_mean']
    df['amount_month_ratio_min']=df['new_amount_month_ratio_min']+df['hist_amount_month_ratio_min']
    df['amount_month_ratio_max']=df['new_amount_month_ratio_max']+df['hist_amount_month_ratio_max']
    df['new_CLV'] = df['new_card_id_count'] * df['new_purchase_amount_sum'] / df['new_month_diff_mean']
    df['hist_CLV'] = df['hist_card_id_count'] * df['hist_purchase_amount_sum'] / df['hist_month_diff_mean']
    df['CLV_ratio'] = df['new_CLV'] / df['hist_CLV']

    return df

In [10]:
def kfold_model(train_df, test_df, num_folds, model,
                stratified=False, debug=False, saveOpt=True,
                feat_sorted=[], num_feat=None):
    if model == 'lgb':
        print("Starting LightGBM. Train shape: {}, test shape: {}".format(train_df.shape, test_df.shape))
    elif model == 'xgb':
        print("Starting Xgboost. Train shape: {}, test shape: {}".format(train_df.shape, test_df.shape))

    # Cross validation model
    if stratified:
        n_repeats = 2
        folds = RepeatedStratifiedKFold(n_splits=num_folds, n_repeats=n_repeats, random_state=326)
        no_fold = num_folds * n_repeats 
    else:
        folds = KFold(n_splits=num_folds, shuffle=True, random_state=326)
        no_fold = num_folds

    # Create arrays and dataframes to store results
    val_preds = np.zeros(train_df.shape[0])
    oof_preds = np.zeros(train_df.shape[0])
    sub_preds = np.zeros(test_df.shape[0])
    feature_importance_df = pd.DataFrame()
    if not feat_sorted:
        # print("feat_sorted is empty")
        feats = [f for f in train_df.columns if f not in FEATS_EXCLUDED]
    else:
        if num_feat == None:
            # print("feat_sorted is empty, num_feat is none")
            feats = [f for f in train_df.columns if f not in FEATS_EXCLUDED and f in feat_sorted]
        else:
            # print("feat_sorted is empty, num_feat is not none")
            feats = [f for f in train_df.columns if f not in FEATS_EXCLUDED and f in feat_sorted[:num_feat]]
    
    print("Final feat size: " + str(len(feats)))
    # print(feats)
    
    # k-fold
    for n_fold, (train_idx, valid_idx) in enumerate(folds.split(train_df[feats], train_df['outliers'])):
        train_x, train_y = train_df[feats].iloc[train_idx], train_df['target'].iloc[train_idx]
        valid_x, valid_y = train_df[feats].iloc[valid_idx], train_df['target'].iloc[valid_idx]
        
        if model == 'lgb':
            # set data structure
            lgb_train = lgb.Dataset(train_x, label=train_y, free_raw_data=False)
            lgb_test = lgb.Dataset(valid_x, label=valid_y, free_raw_data=False)

            # params optimized by optuna
            params ={
                    'task': 'train',
                    'boosting': 'goss',
                    'objective': 'regression',
                    'metric': 'rmse',
                    'learning_rate': 0.01,
                    'subsample': 0.9855232997390695,
                    'max_depth': 7,
                    'top_rate': 0.9064148448434349,
                    'num_leaves': 63,
                    'min_child_weight': 41.9612869171337,
                    'other_rate': 0.0721768246018207,
                    'reg_alpha': 9.677537745007898,
                    'colsample_bytree': 0.5665320670155495,
                    'min_split_gain': 9.820197773625843,
                    'reg_lambda': 8.2532317400459,
                    'min_data_in_leaf': 21,
                    'verbose': -1,
                    'seed': int(2**n_fold),
                    'bagging_seed': int(2**n_fold),
                    'drop_seed': int(2**n_fold)
#                     'seed': 12, #int(2**n_fold),
#                     'bagging_seed': 34, # int(2**n_fold),
#                     'drop_seed': 56 #int(2**n_fold)
                    }

            reg = lgb.train(
                            params,
                            lgb_train,
                            valid_sets=[lgb_train, lgb_test],
                            valid_names=['train', 'test'],
                            num_boost_round=10000,
                            early_stopping_rounds= 200,
                            verbose_eval=100
                            )
            best_iteration = reg.best_iteration
            val_preds[valid_idx] = valid_y
            oof_preds[valid_idx] = reg.predict(valid_x, num_iteration=best_iteration)
            sub_preds += reg.predict(test_df[feats], num_iteration=best_iteration) / no_fold
            
            fold_importance_df = pd.DataFrame()
            fold_importance_df["feature"] = feats
            fold_importance_df["importance"] = np.log1p(reg.feature_importance(importance_type='gain', 
                                                                               iteration=best_iteration))
            
        elif model == 'cat':
            cat_train = cb.Pool(train_x, train_y)
            cat_test = cb.Pool(valid_x, valid_y)
            
            params = {'gpu_id': 0, 
                      #'n_gpus': 2, 
                      'objective': 'reg:linear', 
                      'eval_metric': 'rmse', 
                      'silent': True, 
                      'booster': 'gbtree', 
                      'n_jobs': -1, 
                      'n_estimators': 2500, 
                      'tree_method': 'gpu_hist', 
                      'grow_policy': 'lossguide', 
                      'max_depth': 12, 
                      'seed': 538, 
                      'colsample_bylevel': 0.9, 
                      'colsample_bytree': 0.8, 
                      'gamma': 0.0001, 
                      'learning_rate': 0.006150886706231842, 
                      'max_bin': 128, 
                      'max_leaves': 47, 
                      'min_child_weight': 40, 
                      'reg_alpha': 10.0, 
                      'reg_lambda': 10.0, 
                      'subsample': 0.9}
            
            
            num_round = 10
            reg = cb.CatBoostRegressor(max_depth=11,
                                       learning_rate=0.005, 
                                       eval_metric='RMSE', 
                                       iterations=num_round, 
                                       early_stopping_rounds=200)
            reg.fit(cat_train, verbose_eval = 200, eval_set = cat_test)
            val_preds[valid_idx] = valid_y
            oof_preds[valid_idx] = reg.predict(cat_test)
            sub_preds += reg.predict(test_df[feats]) / no_fold
            
            fold_importance_df = pd.DataFrame()
            fold_importance_df = pd.DataFrame(list(zip(train_x.dtypes.index, 
                                                       reg.get_feature_importance(cb.Pool(train_x, 
                                                                                          label=train_y)))),
                                              columns=['feature','importance'])

        fold_importance_df["fold"] = n_fold + 1
        feature_importance_df = pd.concat([feature_importance_df, fold_importance_df], axis=0)
        print('Fold %2d RMSE : %.6f' % (n_fold + 1, rmse(valid_y, oof_preds[valid_idx])))
        del reg, train_x, train_y, valid_x, valid_y
        gc.collect()
    
    final_rmse = rmse(val_preds, oof_preds)
    print('Final RMSE : %.6f \n\n\n' % final_rmse)
    # display importances
    #display_importances(feature_importance_df)

    if not debug and saveOpt:
        # save submission file
        test_df.loc[:,'target'] = sub_preds
        test_df = test_df.reset_index()
        if num_feat == None:
            if stratified:
                test_df[['card_id', 'target']].to_csv('../result/'+submission_file_name+"_"+ model +"_"+ str(num_folds) +"_fold_stratified_all_feat_"+str(round(final_rmse, 4))+".csv", index=False)
            else:
                test_df[['card_id', 'target']].to_csv('../result/'+submission_file_name+"_"+ model +"_"+ str(num_folds) +"_fold_all_feat_"+str(round(final_rmse, 4))+".csv", index=False)
        else:
            if stratified:
                test_df[['card_id', 'target']].to_csv('../result/'+submission_file_name+"_"+ model +"_"+ str(num_folds) +"_fold_stratified_"+str(num_feat)+"_feat_"+str(round(final_rmse, 4))+".csv", index=False)
            else:
                test_df[['card_id', 'target']].to_csv('../result/'+submission_file_name+"_"+ model +"_"+ str(num_folds) +"_fold_"+str(num_feat)+"_feat_"+str(round(final_rmse, 4))+".csv", index=False)
    
    feat_sel_summary = feature_importance_df[["feature", "importance"]].groupby("feature").mean().sort_values(by="importance", ascending=False).reset_index()
    feat_sorted = feat_sel_summary["feature"].tolist()
    
    return feature_importance_df, sub_preds, feat_sorted, oof_preds

In [11]:
def combine_solution(test_df, sub_preds_list, submission_file_name):
    sub_preds = [0.0 for _ in range(len(sub_preds_list[0]))]
    for sol in sub_preds_list:
        sub_preds += sol/len(sub_preds_list)
    test_df.loc[:,'target'] = sub_preds
    test_df = test_df.reset_index()
    test_df[['card_id', 'target']].to_csv('../result/'+submission_file_name+'_final.csv', index=False)

In [13]:
def stacking(oof, predictions):
    train_stack = np.vstack(oof).transpose()
    test_stack = np.vstack(predictions).transpose()

    folds = StratifiedKFold(n_splits=5, shuffle=True, random_state=15)
    oof = np.zeros(train_stack.shape[0])
    predictions = np.zeros(test_stack.shape[0])

    for fold_, (trn_idx, val_idx) in enumerate(folds.split(train_stack, train['outliers'].values)):
        print("fold n°{}".format(fold_))
        trn_data, trn_y = train_stack[trn_idx], target.iloc[trn_idx].values
        val_data, val_y = train_stack[val_idx], target.iloc[val_idx].values

        clf = Ridge(alpha=1)
        clf.fit(trn_data, trn_y)

        oof[val_idx] = clf.predict(val_data)
        predictions += clf.predict(test_stack) / folds.n_splits


    print(np.sqrt(mean_squared_error(target.values, oof)))

#### Default case

In [16]:
%%time
load_ver = 24
write_ver = 25
feat_gen_opt = False
debug = False
FEATS_EXCLUDED = ['first_active_month', 'target', 'card_id', 'outliers',
                  'hist_purchase_date_max', 'hist_purchase_date_min', 'hist_card_id_size',
                  'new_purchase_date_max', 'new_purchase_date_min', 'new_card_id_size',
                  'OOF_PRED', 'month_0',
                  'feature_1_2_cross_10','feature_1_2_cross_13','feature_2_2','feature_1_2_cross_7','feature_1_2_cross_1','feature_2_3_cross_3',
                 ]

submission_file_name = "single_model_v"+str(write_ver)
with timer("Full model run"):
    # main(debug=False)
    num_rows = 1000000 if debug else None
    if feat_gen_opt:
        with timer("train & test"):
            df = train_test(num_rows)
            hist = transactions('hist', num_rows)
            new = transactions('new', num_rows)
        with open('../input/single_model_v'+str(write_ver)+'_data_part1.pkl', 'wb') as f:
            pickle.dump([df, hist, new], f)
        with timer("historical transactions"):
            df = pd.merge(df, hist, on='card_id', how='outer')
        with timer("new merchants"):
            df = pd.merge(df, new, on='card_id', how='outer')
#         with timer("additional features"):
#             df = additional_features(df)
        with timer("split train & test"):
            train_df = df[df['target'].notnull()]
            test_df = df[df['target'].isnull()]
            del df
            gc.collect()
        with open('../input/single_model_v'+str(write_ver)+'_data_part2.pkl', 'wb') as f:
            pickle.dump([train_df, test_df], f)
        with timer("cleaning"):
            train_df_cleaned, test_df_cleaned = cleaning(train_df, test_df)
        with open('../input/single_model_v'+str(write_ver)+'_data_part3.pkl', 'wb') as f:
            pickle.dump([train_df_cleaned, test_df_cleaned], f)
    else:
        with open('../input/single_model_v'+str(load_ver)+'_data_part3.pkl', 'rb') as f:
            [train_df_cleaned, test_df_cleaned] = pickle.load(f)
            print("train_df_cleaned size: " + str(train_df_cleaned.shape))
            print("test_df_cleaned size: " + str(test_df_cleaned.shape))
    
        with open('../input/single_model_v'+str(load_ver)+'_data_lgb_feat_sorted.pkl', 'rb') as f:
            feat_sorted = pickle.load(f)
    
    with timer("Run LightGBM with kfold"):
#         feature_importance_df, sub_preds, feat_sorted = kfold_model(train_df_cleaned, test_df_cleaned, 
#                                                                     num_folds=10, model='lgb',
#                                                                     stratified=False, debug=debug, saveOpt=True,
#                                                                     feat_sorted=[], num_feat=None)
#         with open('../input/single_model_v'+str(load_ver)+'_data_lgb_feat_sorted.pkl', 'wb') as f:
#             pickle.dump(feat_sorted, f)

        feature_importance_df_1, sub_preds_1, \
            feat_sorted_1, oof_preds_1 = kfold_model(train_df_cleaned, test_df_cleaned, 
                                                     num_folds=10, model='lgb',
                                                     stratified=False, debug=debug, saveOpt=True,
                                                     feat_sorted=feat_sorted, num_feat=100)
        feature_importance_df_2, sub_preds_2, \
            feat_sorted_2, oof_pred_2 = kfold_model(train_df_cleaned, test_df_cleaned, 
                                                    num_folds=10, model='lgb',
                                                    stratified=True, debug=debug, saveOpt=True,
                                                    feat_sorted=feat_sorted, num_feat=200)
        feature_importance_df_3, sub_preds_3, \
#             feat_sorted_3, oof_preds_3 = kfold_model(train_df_cleaned, test_df_cleaned, 
#                                                      num_folds=10, model='cat',
#                                                      stratified=False, debug=debug, saveOpt=True,
#                                                      feat_sorted=feat_sorted, num_feat=100)
#         feature_importance_df_4, sub_preds_4, feat_sorted_4 = kfold_model(train_df_cleaned, test_df_cleaned,
#                                                                           num_folds=10, model='xgb',
#                                                                           stratified=True, debug=debug, saveOpt=True,
#                                                                           feat_sorted=feat_sorted, num_feat=200)
#     with timer("combine solutions"):
#         # sub_preds_list = [sub_preds_1, sub_preds_2, sub_preds_3, sub_preds_4]
#         sub_preds_list = [sub_preds_1, sub_preds_2]
#         combine_solution(test_df_cleaned, sub_preds_list, submission_file_name)
    with timer("stacking"):
        # stacking([oof_preds_1, oof_pred_2, oof_pred_3], [sub_preds_1, sub_preds_2, sub_preds_3])
        stacking([oof_preds_1, oof_pred_2], [sub_preds_1, sub_preds_2])
    # with open('../input/single_model_v'+str(write_ver)+'_data_part4.pkl', 'wb') as f:
    #    feat_summary_1 = feature_importance_df_1[["feature", "importance"]].groupby("feature").mean().sort_values(by="importance", ascending=False).reset_index()
    #    feat_summary_2 = feature_importance_df_2[["feature", "importance"]].groupby("feature").mean().sort_values(by="importance", ascending=False).reset_index()
    #    pickle.dump([feat_summary_1, feat_summary_2], f)
    

train_df_cleaned size: (201917, 938)
test_df_cleaned size: (123623, 938)
Starting LightGBM. Train shape: (201917, 938), test shape: (123623, 938)
Final feat size: 100
Training until validation scores don't improve for 200 rounds.
[100]	train's rmse: 3.65076	test's rmse: 3.75282
[200]	train's rmse: 3.57083	test's rmse: 3.71365
[300]	train's rmse: 3.51802	test's rmse: 3.70019
[400]	train's rmse: 3.48145	test's rmse: 3.69447
[500]	train's rmse: 3.45408	test's rmse: 3.69174
[600]	train's rmse: 3.43032	test's rmse: 3.69008
[700]	train's rmse: 3.40871	test's rmse: 3.68953
[800]	train's rmse: 3.38885	test's rmse: 3.6896
[900]	train's rmse: 3.37142	test's rmse: 3.6899
[1000]	train's rmse: 3.35409	test's rmse: 3.69046
Early stopping, best iteration is:
[820]	train's rmse: 3.38513	test's rmse: 3.68912
Fold  1 RMSE : 3.689118
Training until validation scores don't improve for 200 rounds.
[100]	train's rmse: 3.63903	test's rmse: 3.83059
[200]	train's rmse: 3.55972	test's rmse: 3.79922
[300]	train'

[1200]	train's rmse: 3.30053	test's rmse: 3.65318
[1300]	train's rmse: 3.28434	test's rmse: 3.65356
Early stopping, best iteration is:
[1121]	train's rmse: 3.3131	test's rmse: 3.65286
Fold  1 RMSE : 3.652864
Training until validation scores don't improve for 200 rounds.
[100]	train's rmse: 3.65354	test's rmse: 3.68102
[200]	train's rmse: 3.56927	test's rmse: 3.64002
[300]	train's rmse: 3.51377	test's rmse: 3.62547
[400]	train's rmse: 3.47458	test's rmse: 3.61909
[500]	train's rmse: 3.44407	test's rmse: 3.6161
[600]	train's rmse: 3.41875	test's rmse: 3.61383
[700]	train's rmse: 3.39547	test's rmse: 3.6124
[800]	train's rmse: 3.37447	test's rmse: 3.61076
[900]	train's rmse: 3.35572	test's rmse: 3.61014
[1000]	train's rmse: 3.33768	test's rmse: 3.60959
[1100]	train's rmse: 3.32083	test's rmse: 3.61019
[1200]	train's rmse: 3.30527	test's rmse: 3.61055
Early stopping, best iteration is:
[1000]	train's rmse: 3.33768	test's rmse: 3.60959
Fold  2 RMSE : 3.609591
Training until validation score

NameError: name 'feature_importance_df_3' is not defined

In [None]:
display_importances(feature_importance_df, 'lgb', False, figsize=(16, 200))
# display_importances(feature_importance_df_1, 'cat', False, figsize=(16, 200))

- `single_model_v19_xgb_100_feat.csv` got 3.695 on LB.
- `single_model_v19_lgb_10_fold_stratified_100_feat_3.6425.csv` got 3.695 on LB.
- `single_model_v13_final.csv` got 3.694 on LB.
- `single_model_v19_lgb_10_fold_stratified_50_feat_3.6501.csv` got 3.699 on LB.
- `single_model_v19_lgb_10_fold_100_feat_3.6436.csv` got 3.695 on LB.
- `single_model_v20_lgb_10_fold_100_feat_3.6436.csv` got 3.694 on LB.
- `single_model_v22_lgb_10_fold_100_feat_3.6406.csv` got 3.694 on LB.
- `Blend2_v10.csv` got 3.691 on LB.

In [None]:
feature_importance_df_1, sub_preds_1, feat_sorted_1 = kfold_model(train_df_cleaned, test_df_cleaned, 
                                                                  num_folds=10, model='lgb',
                                                                  stratified=False, debug=debug, saveOpt=True,
                                                                  feat_sorted=feat_sorted, num_feat=80)

In [None]:
feature_importance_df_1, sub_preds_1, feat_sorted_1 = kfold_model(train_df_cleaned, test_df_cleaned, 
                                                                  num_folds=10, model='lgb',
                                                                  stratified=False, debug=debug, saveOpt=True,
                                                                  feat_sorted=feat_sorted, num_feat=120)

In [None]:
feat_sel_summary = feature_importance_df[["feature", "importance"]].groupby("feature").mean().sort_values(by="importance", ascending=False).reset_index()
feat_sel_summary

In [None]:
# temp = train_df_cleaned[feat_sorted[:200]].corrwith(train_df['target']).reset_index()
temp = train_df_cleaned.corrwith(train_df_cleaned['target']).reset_index()
temp = temp.rename(columns={'index': 'feat', 0: 'corr'}).sort_values(by='corr', ascending=False)
temp

In [None]:
train_df['new_duration_var'].unique()

In [None]:
train_df_cleaned, test_df_cleaned = cleaning(train_df, test_df)
train_df['new_duration_var'].unique()