In [1]:
import numpy as np 
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from datetime import datetime, date
from scipy.stats import chisquare
from pandas.plotting import scatter_matrix
from tqdm import tqdm
import os
import pickle
from sklearn.preprocessing import OneHotEncoder
from sklearn.feature_extraction.text import CountVectorizer
import gc
from collections import Counter
from datetime import datetime
from sklearn.model_selection import KFold, StratifiedKFold,cross_val_score,RandomizedSearchCV,GridSearchCV,RepeatedKFold
import lightgbm as lgb
from sklearn.metrics import mean_squared_error
import xgboost as xg 
from sklearn.linear_model import SGDRegressor
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import RobustScaler
from sklearn.metrics import make_scorer
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor,AdaBoostRegressor
from tabulate import tabulate

In [2]:
# https://www.mikulskibartosz.name/how-to-reduce-memory-usage-in-pandas/
# I copied code from above link and modified as per my requirement
# I have modified code and add unit in dataframe
# If int8 is range for -128 to 127 then uint8 is range from  0 to 255
# if thr is case if feature is range from 0 to 300 then we have to use int16 but insted of that we can use uint8 so we can save little bit of space
def reduce_memory(data):
    '''
     Reduce the memory usage by applying the smallest data type that can fit the range of values
     input:
      data: dataframe on which we want to perform reduce memory operation
     
     output:
      returns redused memory dataframe
    
    '''
    numerics = ['int16', 'int32', 'int64', 'float16', 'float32', 'float64'] # Define list of datatypes that we want to reduce
    start_mem = data.memory_usage().sum() / 1024**2 # calculate mempry before precessing
    
    # Loop over all columns of data frame
    for col in data.columns:
        col_type = data[col].dtypes
        if col_type in numerics:
            c_min = data[col].min()
            c_max = data[col].max()
            
            if 'int' in str(col_type):
                if c_min >= 0: # if cmin is greater than and equal to 0 then we can use uint datatype
                    #if feature has only positive values then we will use uint data type
                    
                    if c_min >= np.iinfo(np.uint8).min and c_max <= np.iinfo(np.uint8).max: # check if range range of min and max is in between unit8 or not
                        data[col] = data[col].astype(np.uint8) # if it is in range of uint then assign uint8 datatype to given feature

                    elif c_min >= np.iinfo(np.uint16).min and c_max <= np.iinfo(np.uint16).max: # check if range range of min and max is in between unit16 or not
                        data[col] = data[col].astype(np.uint16)# if it is in range of uint then assign uint16 datatype to given feature
                        
                    elif c_min >= np.iinfo(np.uint32).min and c_max <= np.iinfo(np.uint32).max:# check if range range of min and max is in between unit32 or not
                        data[col] = data[col].astype(np.uint32) # if it is in range of uint then assign uint32 datatype to given feature
                        
                    else:
                        data[col] = data[col].astype(np.uint64) # assign uint64 datatype to given feature
                        
                else: # means feature has negative values also so in that condition we will use int data type which supports negative values
                    if c_min >= np.iinfo(np.int8).min and c_max <= np.iinfo(np.int8).max:
                        data[col] = data[col].astype(np.int8)
                        
                    elif c_min >= np.iinfo(np.int16).min and c_max <= np.iinfo(np.int16).max:
                        data[col] = data[col].astype(np.int16)
                        
                    elif c_min >= np.iinfo(np.int32).min and c_max <= np.iinfo(np.int32).max:
                        data[col] = data[col].astype(np.int32)
                      
                    else:
                        data[col] = data[col].astype(np.int64)          
               
            else: # if columns datatype is float
                if c_min >= np.finfo(np.float16).min and c_max <= np.finfo(np.float16).max:
                    data[col] = data[col].astype(np.float16)
                    
                elif c_min >= np.finfo(np.float32).min and c_max <= np.finfo(np.float32).max:
                    data[col] = data[col].astype(np.float32)
                    
                else:
                    data[col] = data[col].astype(np.float64)
                   
               
           
    end_mem = data.memory_usage().sum() / 1024**2  
    print('Mem. usage decreased to {:5.2f} Mb ({:.1f}% reduction)'.format(end_mem, 100 * (start_mem - end_mem) / start_mem))
    return data 
#Exclude some non numeric columns like date format
FEATS_EXCLUDED = ['first_active_month', 'target', 'card_id', 'outliers','hist_purchase_date_max', 'hist_purchase_date_min', 'hist_card_id_size','new_purchase_date_max', 'new_purchase_date_min', 'new_card_id_size','OOF_PRED', 'month_0']
mode = lambda x: Counter(x).most_common(1)[0][0]

def aggregation_merchants(merchants):
    
    '''
      aggregation_merchants function will aggregate all features of merchants based on unique merchants id
      
      input:
       mearchants: merchants dataframe on which we want to perform aggregetion
       
      output:
       returns aggregated merchants dataframe
      
    '''
    agg_merchants=merchants.groupby('merchant_id',as_index=False).agg({
        "merchant_group_id": mode,
        "merchant_category_id": mode,
        "subsector_id": mode,
        "numerical_1": "mean",
        "numerical_2": "mean",
        "category_1": mode,
        "most_recent_sales_range": mode,
        "most_recent_purchases_range": mode,
        "avg_sales_lag3": "mean",
        "avg_purchases_lag3": "mean",
        "active_months_lag3": mode,
        "avg_sales_lag6": "mean",
        "avg_purchases_lag6": "mean",
        "active_months_lag6": mode,
        "avg_sales_lag12": "mean",
        "avg_purchases_lag12": "mean",
        "active_months_lag12": mode,
        "category_4": mode,
        "city_id": mode,
        "state_id": mode,
        "category_2": mode
    })
    return agg_merchants

def aggregation_transactions(transactions):
    '''
      aggregation_transactions function will aggregate all features of merchants based on unique merchants id
      
      input:
       transactions: transactions(new and historical) dataframe on which we want to perform aggregetion
       
      output:
       returns aggregated transactions dataframe
      
    '''
    group_transactions = transactions.groupby("card_id",as_index=False).agg({
        "authorized_flag": mode,
        "merchant_category_id": mode,
        "subsector_id": mode,
        "merchant_id":  mode,
        "category_1": mode,
        'month_lag': mode,
        "installments": mode,
        "purchase_amount": "mean",
        "city_id": mode,
        "state_id": mode,
        "category_2": mode,
        "purchase_date": mode
    })
    return group_transactions


def check_null(df,name):
    print('Null check for ',name)
    for col in df.columns:
        print(col , ' has ',df[col].isnull().sum())
       
    print('*'*50)
    
def rmse(y_true, y_pred):
    '''
    custom RMSE evalution metrics
    input:
     y_true : true labels
     y_pred : predicted labels
    
    output:
     return rmse score for given y_true and y_pred
    '''
    return np.sqrt(mean_squared_error(y_true, y_pred))    

def root_mean_squared_error(y_true, y_pred):
    """Root mean squared error regression loss"""
    return np.sqrt(np.mean(np.square(y_true-y_pred)))

rmse_scorer = make_scorer(root_mean_squared_error)

# Create agg object for transactional data
aggs = {
    'purchase_amount' : ['sum','max','min','mean','var','skew'],
    'installments' : ['sum','max','mean','var','skew'],
    'purchase_date' : ['max','min'],
    'month_lag' : ['max','min','mean','var','skew'],
    'month_diff':['max','min','mean','var','skew'],
    'weekend' : ['sum', 'mean'],
    'weekday' : ['sum', 'mean'],
    'authorized_flag': ['sum', 'mean'],
    'category_1': ['sum','mean', 'max','min'],
    'category_2': ['sum','mean', 'max','min'],
    'category_3': ['sum','mean', 'max','min'],
    'card_id' : ['size','count'],
    'month': ['nunique', 'mean', 'min', 'max'],
    'hour': ['nunique', 'mean', 'min', 'max'],
    'weekofyear': ['nunique', 'mean', 'min', 'max'],
    'day': ['nunique', 'mean', 'min', 'max'],
    'subsector_id': ['nunique',],
    'merchant_id': ['nunique',],
    'merchant_category_id' : ['nunique',],
    'price' :['sum','mean','max','min','var','skew'],
    'duration' : ['mean','min','max','var','skew'],
    'amount_month_ratio':['mean','min','max','var','skew'],
    'Christmas_Day_2017': ['mean'],
    'Mothers_Day_2017': ['mean'],
    'fathers_day_2017': ['mean'],
    'Children_day_2017': ['mean'],
    'Valentine_Day_2017': ['mean'],
    'Black_Friday_2017': ['mean'],
    'Mothers_Day_2018' : ['mean'],
#     "merchants_merchant_group_id": mode,
#     "merchants_merchant_category_id": mode,
#     "merchants_subsector_id": mode,
#     "merchants_numerical_1": "mean",
#     "merchants_numerical_2": "mean",
#     "merchants_category_1": mode,
#     "merchants_most_recent_sales_range": mode,
#     "merchants_most_recent_purchases_range": mode,
#     "merchants_avg_sales_lag3": "mean",
#     "merchants_avg_purchases_lag3": "mean",
#     "merchants_active_months_lag3": mode,
#     "merchants_avg_sales_lag6": "mean",
#     "merchants_avg_purchases_lag6": "mean",
#     "merchants_active_months_lag6": mode,
#     "merchants_avg_sales_lag12": "mean",
#     "merchants_avg_purchases_lag12": "mean",
#     "merchants_active_months_lag12": mode,
#     "merchants_category_4": mode,
#     "merchants_city_id": mode,
#     "merchants_state_id": mode,
#     "merchants_category_2": mode
}

def additional_features(df):

    df['card_id_total'] = df['new_card_id_size']+df['hist_card_id_size']
    df['card_id_cnt_total'] = df['new_card_id_count']+df['hist_card_id_count']
    df['card_id_cnt_ratio'] = df['new_card_id_count']/df['hist_card_id_count']
    df['purchase_amount_total'] = df['new_purchase_amount_sum']+df['hist_purchase_amount_sum']
    df['purchase_amount_mean'] = df['new_purchase_amount_mean']+df['hist_purchase_amount_mean']
    df['purchase_amount_max'] = df['new_purchase_amount_max']+df['hist_purchase_amount_max']
    df['purchase_amount_min'] = df['new_purchase_amount_min']+df['hist_purchase_amount_min']
    df['purchase_amount_ratio'] = df['new_purchase_amount_sum']/df['hist_purchase_amount_sum']
    df['month_diff_mean'] = df['new_month_diff_mean']+df['hist_month_diff_mean']
    df['month_diff_ratio'] = df['new_month_diff_mean']/df['hist_month_diff_mean']
    df['month_lag_mean'] = df['new_month_lag_mean']+df['hist_month_lag_mean']
    df['month_lag_max'] = df['new_month_lag_max']+df['hist_month_lag_max']
    df['month_lag_min'] = df['new_month_lag_min']+df['hist_month_lag_min']
    df['category_1_mean'] = df['new_category_1_mean']+df['hist_category_1_mean']
    df['installments_total'] = df['new_installments_sum']+df['hist_installments_sum']
    df['installments_mean'] = df['new_installments_mean']+df['hist_installments_mean']
    df['installments_max'] = df['new_installments_max']+df['hist_installments_max']
    df['installments_ratio'] = df['new_installments_sum']/df['hist_installments_sum']
    df['price_total'] = df['purchase_amount_total'] / df['installments_total']
    df['price_mean'] = df['purchase_amount_mean'] / df['installments_mean']
    df['price_max'] = df['purchase_amount_max'] / df['installments_max']
    df['duration_mean'] = df['new_duration_mean']+df['hist_duration_mean']
    df['duration_min'] = df['new_duration_min']+df['hist_duration_min']
    df['duration_max'] = df['new_duration_max']+df['hist_duration_max']
    df['amount_month_ratio_mean']=df['new_amount_month_ratio_mean']+df['hist_amount_month_ratio_mean']
    df['amount_month_ratio_min']=df['new_amount_month_ratio_min']+df['hist_amount_month_ratio_min']
    df['amount_month_ratio_max']=df['new_amount_month_ratio_max']+df['hist_amount_month_ratio_max']
    df['new_CLV'] = df['new_card_id_count'] * df['new_purchase_amount_sum'] / df['new_month_diff_mean']
    df['hist_CLV'] = df['hist_card_id_count'] * df['hist_purchase_amount_sum'] / df['hist_month_diff_mean']
    df['CLV_ratio'] = df['new_CLV'] / df['hist_CLV']

    return df

def display_importances(feature_importance_df_):
    cols = feature_importance_df_[["feature", "importance"]].groupby("feature").mean().sort_values(by="importance", ascending=False)[:40].index
    best_features = feature_importance_df_.loc[feature_importance_df_.feature.isin(cols)]

    plt.figure(figsize=(8, 10))
    sns.barplot(x="importance", y="feature", data=best_features.sort_values(by="importance", ascending=False))
    plt.title('LightGBM Features (avg over folds)')
    plt.tight_layout()
    plt.savefig('lgbm_importances.png')

### Load Train, Test, Merchants and Transactional data

In [3]:
def load_train_test():
    '''
    load_train_test will load train and test data from csv file and perform some transformation and returns train and test data frame
    
    '''
    train_df = pd.read_csv('data/train.csv',parse_dates=["first_active_month"],index_col=['card_id'])# Read the train file
    test_df = pd.read_csv('data/test.csv',parse_dates=["first_active_month"],index_col=['card_id'])# Read the test file
    test_df['target'] = np.nan
    train_df['outliers'] = 0
    train_df.loc[train_df['target'] <= -33, 'outliers'] = 1
    train_df['first_active_month'] = pd.to_datetime(train_df['first_active_month'])
    test_df['first_active_month'] = pd.to_datetime(test_df['first_active_month'])
    
    train_df['quarter'] = train_df['first_active_month'].dt.quarter
    train_df['elapsed_time'] = (datetime.today() - train_df['first_active_month']).dt.days    
    train_df['days_feature1'] = train_df['elapsed_time'] * train_df['feature_1']
    train_df['days_feature2'] = train_df['elapsed_time'] * train_df['feature_2']
    train_df['days_feature3'] = train_df['elapsed_time'] * train_df['feature_3']  

    test_df['quarter'] = test_df['first_active_month'].dt.quarter
    test_df['elapsed_time'] = (datetime.today() - test_df['first_active_month']).dt.days      
    test_df['days_feature1'] = test_df['elapsed_time'] * test_df['feature_1']
    test_df['days_feature2'] = test_df['elapsed_time'] * test_df['feature_2']
    test_df['days_feature3'] = test_df['elapsed_time'] * test_df['feature_3']
    
    for f in ['feature_1','feature_2','feature_3']:
        order_label = train_df.groupby([f])['outliers'].mean()
        train_df[f] = train_df[f].map(order_label)
    
    train_df['feature_sum'] = train_df['feature_1'] + train_df['feature_2'] + train_df['feature_3']
    train_df['feature_mean'] = train_df['feature_sum']/3
    train_df['feature_max'] = train_df[['feature_1', 'feature_2', 'feature_3']].max(axis=1)
    train_df['feature_min'] = train_df[['feature_1', 'feature_2', 'feature_3']].min(axis=1)
    train_df['feature_var'] = train_df[['feature_1', 'feature_2', 'feature_3']].std(axis=1)
    
    test_df['feature_sum'] = test_df['feature_1'] + test_df['feature_2'] + test_df['feature_3']
    test_df['feature_mean'] = test_df['feature_sum']/3
    test_df['feature_max'] = test_df[['feature_1', 'feature_2', 'feature_3']].max(axis=1)
    test_df['feature_min'] = test_df[['feature_1', 'feature_2', 'feature_3']].min(axis=1)
    test_df['feature_var'] = test_df[['feature_1', 'feature_2', 'feature_3']].std(axis=1)
    
    return reduce_memory(train_df),reduce_memory(test_df)

def load_transactions(path):
    '''
    load_transactions function will load new and hitorical transaction data from csv file and perform transformation and feature engineering
    input:
     path: path to csv file (for new or historical transactions)
    output:
     returns preprocessed transactions dataframe
     
    '''
    trans_df = pd.read_csv(path)
    trans_df['category_2'].fillna(1.0,inplace=True) # # category 2 we will create new categoory with 6 for null values
    trans_df['category_3'].fillna('A',inplace=True)# category 3 we will create new categoory with D for null values
    trans_df['merchant_id'].fillna('M_ID_00a6ca8a8a',inplace=True)# most used merchant id is 'M_ID_00a6ca8a8a' so we will fill nan with this id
    trans_df['installments'].replace(-1, np.nan,inplace=True)
    trans_df['installments'].replace(999, np.nan,inplace=True)
    trans_df['purchase_amount'] = trans_df['purchase_amount'].apply(lambda x: min(x, 0.8))
    trans_df['purchase_date'] = pd.to_datetime(trans_df['purchase_date'])
    trans_df['hour'] = trans_df['purchase_date'].dt.hour # hrs from start of purchase date . eg. if data is 2018-03-11 14:57:36 then hr will be 14
    trans_df['day'] = trans_df['purchase_date'].dt.day # day of month eg. if data is 2018-03-11 14:57:36 then day willbe 11
    trans_df['weekday'] = trans_df['purchase_date'].dt.weekday # day of week eg. if data is 2018-03-11 14:57:36 then day 6 (sunday). starts from monday
    trans_df['weekend'] = (trans_df['purchase_date'].dt.weekday >=5).astype(int) # if weekday is greater than 5(means sat or sunday) then set to 1
    trans_df['month'] = trans_df['purchase_date'].dt.month # month of year eg. if data is 2018-03-11 14:57:36 then month will be 3(march)
    trans_df['weekofyear'] = trans_df['purchase_date'].dt.weekofyear # week of year eg if data is 2018-03-11 14:57:36 then weekofyear will be 10
    trans_df['hour'] =trans_df['purchase_date'].dt.hour# hour from the purchase date 
    trans_df['month_diff'] = ((datetime.today() - trans_df['purchase_date']).dt.days)//30
    trans_df['month_diff'] += trans_df['month_lag']
    trans_df['duration'] = trans_df['purchase_amount']*trans_df['month_diff']
    trans_df['amount_month_ratio'] = trans_df['purchase_amount']/trans_df['month_diff']
    trans_df['price'] = trans_df['purchase_amount'] / trans_df['installments']
    
    #Christmas : December 25 2017
    trans_df['Christmas_Day_2017']=(pd.to_datetime('2017-12-25')-trans_df['purchase_date']).dt.days.apply(lambda x: x if x > 0 and x < 100 else 0)
    #Mothers Day: May 14 2017
    trans_df['Mothers_Day_2017']=(pd.to_datetime('2017-06-04')-trans_df['purchase_date']).dt.days.apply(lambda x: x if x > 0 and x < 100 else 0)
    #fathers day: August 13 2017
    trans_df['fathers_day_2017']=(pd.to_datetime('2017-08-13')-trans_df['purchase_date']).dt.days.apply(lambda x: x if x > 0 and x < 100 else 0)
    #Childrens day: October 12 2017
    trans_df['Children_day_2017']=(pd.to_datetime('2017-10-12')-trans_df['purchase_date']).dt.days.apply(lambda x: x if x > 0 and x < 100 else 0)
    #Valentine's Day : 12th June, 2017
    trans_df['Valentine_Day_2017']=(pd.to_datetime('2017-06-12')-trans_df['purchase_date']).dt.days.apply(lambda x: x if x > 0 and x < 100 else 0)
    #Black Friday : 24th November 2017
    trans_df['Black_Friday_2017']=(pd.to_datetime('2017-11-24') - trans_df['purchase_date']).dt.days.apply(lambda x: x if x > 0 and x < 100 else 0)

    #2018
    #Mothers Day: May 13 2018
    trans_df['Mothers_Day_2018']=(pd.to_datetime('2018-05-13')-trans_df['purchase_date']).dt.days.apply(lambda x: x if x > 0 and x < 100 else 0)

    
    authorized_flag = {'Y':1,'N':0}
    category_1 = {'Y':1,'N':0}
    category_3 = {'A': 0, 'B': 1, 'C': 2,'D': 3}
    
    trans_df['authorized_flag'] = trans_df['authorized_flag'].map(authorized_flag)
    trans_df['category_1'] = trans_df['category_1'].map(category_1)
    trans_df['category_3'] = trans_df['category_3'].map(category_3)
    
    return reduce_memory(trans_df)


def load_merchant():
    '''
     load_merchants fuction will load merchants data from csv file and perform preprocessing on it
    '''
    merchant_df = pd.read_csv('data/merchants.csv')
    merchant_df['category_2'].fillna(1.0,inplace=True) # category 2 we will create new categoory with 6.0 for null values
    merchant_df['avg_sales_lag3'].fillna(np.nanmedian(merchant_df['avg_sales_lag3']),inplace=True) # replace nan with median value
    merchant_df['avg_sales_lag6'].fillna(np.nanmedian(merchant_df['avg_sales_lag6']),inplace=True)# replace nan with median value
    merchant_df['avg_sales_lag12'].fillna(np.nanmedian(merchant_df['avg_sales_lag12']),inplace=True)# replace nan with median value
    
    merchant_df['avg_purchases_lag3'].fillna(np.nanmedian(merchant_df['avg_purchases_lag3']),inplace=True)# replace nan with median value
    merchant_df['avg_purchases_lag6'].fillna(np.nanmedian(merchant_df['avg_purchases_lag6']),inplace=True)# replace nan with median value
    merchant_df['avg_purchases_lag12'].fillna(np.nanmedian(merchant_df['avg_purchases_lag12']),inplace=True)# replace nan with median value
    
    category_1 = {'Y':1,'N':0}
    category_4 = {'Y':1,'N':0}
    most_recent_sales_range = {'A': 0, 'B': 1, 'C': 2, 'D': 3, 'E': 4}
    most_recent_purchases_range = {'A': 0, 'B': 1, 'C': 2, 'D': 3, 'E': 4}
    
    merchant_df['category_1'] = merchant_df['category_1'].map(category_1)
    merchant_df['category_4'] = merchant_df['category_4'].map(category_4)
    merchant_df['most_recent_sales_range'] = merchant_df['most_recent_sales_range'].map(most_recent_sales_range)
    merchant_df['most_recent_purchases_range'] = merchant_df['most_recent_purchases_range'].map(most_recent_purchases_range)
    
    return reduce_memory(merchant_df)


# Load train,test,merchnats, new transation and hstorcal transactions
train_df,test_df = load_train_test()
# new_trans_df = load_transactions('data/new_merchant_transactions.csv')
# hist_trans_df = load_transactions('data/historical_transactions.csv')
# merchant_df = load_merchant()

Mem. usage decreased to  8.47 Mb (67.6% reduction)
Mem. usage decreased to  5.19 Mb (65.6% reduction)




### Perform Aggregation on merchants data and merge with transactional data

In [4]:
#Perform Aggs om marchant
# we are saving aggregetion in pickle file for future use
# if os.path.isfile('merchant_df.pkl'):
#     merchant_df = pickle.load(open('merchant_df.pkl', 'rb'))   
# else:
#     merchant_df = load_merchant()
#     merchant_df = aggregation_merchants(merchant_df)
#     merchant_df.columns=['merchants_'+c if c!='merchant_id' else c for c in merchant_df.columns]#
#     pickle.dump((merchant_df),open('merchant_df.pkl','wb'))

    
# # Merge merchants data with new transactions and histrical transactions    
# new_trans_df = new_trans_df.merge(merchant_df,on='merchant_id',how='left')
# hist_trans_df = hist_trans_df.merge(merchant_df,on='merchant_id',how='left')

### Perform Aggregation on new Transactional data and store into pickle file

In [5]:
%%time
#Perform Aggs om new transactions data
# we are saving aggregetion in pickle file for future use
if os.path.isfile('new_transactions_df.pkl'):
    new_trans_df = pickle.load(open('new_transactions_df.pkl', 'rb'))   
else:
    new_trans_df = load_transactions('data/new_merchant_transactions.csv')
    new_trans_df = new_trans_df.reset_index().groupby('card_id').agg(aggs)
    new_trans_df.columns = pd.Index([e[0] + "_" + e[1] for e in new_trans_df.columns.tolist()])
    new_trans_df.columns = ['new_'+ c for c in new_trans_df.columns]
    pickle.dump(new_trans_df,open('new_transactions_df.pkl','wb'))

Wall time: 1.38 s


### Perform Aggregation on Hitorical Transactional data and store into pickle file

In [6]:
%%time
#Perform Aggs on historical transactions data
# we are saving aggregetion in pickle file for future use
if os.path.isfile('hist_transactions_df.pkl'):
    hist_trans_df = pickle.load(open('hist_transactions_df.pkl', 'rb'))   
else:
    hist_trans_df = load_transactions('data/historical_transactions.csv')
    hist_trans_df = hist_trans_df.reset_index().groupby('card_id').agg(aggs)
    hist_trans_df.columns = pd.Index([e[0] + "_" + e[1] for e in hist_trans_df.columns.tolist()])
    hist_trans_df.columns = ['hist_'+ c for c in hist_trans_df.columns]
    pickle.dump(hist_trans_df,open('hist_transactions_df.pkl','wb'))

Wall time: 1.17 s


### Merge transactions data with train and test dataframe

In [7]:
# merge train df with new transactions list
train_df = train_df.merge(new_trans_df,on='card_id',how='left')
# merge test df with new transactions list
test_df = test_df.merge(new_trans_df,on='card_id',how='left')

# merge train df with history transactions list
train_df = train_df.merge(hist_trans_df,on='card_id',how='left')
# merge test df with history transactions list
test_df = test_df.merge(hist_trans_df,on='card_id',how='left')


In [8]:
train_df.replace([-np.inf, np.inf], np.nan, inplace=True)
test_df.replace([-np.inf, np.inf], np.nan, inplace=True)

#Add additional features
train_df = additional_features(train_df)
test_df = additional_features(test_df)

# In column name it is adding < and > for custom functions so we will remove < and >. 
train_df.columns = train_df.columns.str.replace('[#,@,&,<,>]', '')
test_df.columns = test_df.columns.str.replace('[#,@,&,<,>]', '') 

In [9]:
y_train = train_df['target'].values
X_train = train_df.drop(columns=['target'])
feats = [f for f in train_df.columns if f not in FEATS_EXCLUDED]
X_train = X_train[feats]
X_train.replace([-np.inf, np.inf], np.nan, inplace=True)
X_train = X_train.fillna(X_train.median())
X_test = test_df[feats]
X_test.replace([-np.inf, np.inf], np.nan, inplace=True)
X_test = X_test.fillna(X_test.median())

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  method=method,


## Model-1 XGBoost with Stratified and KFold

In [10]:
def kfold_xgboost(train_df, test_df, num_folds, stratified = False, debug= False,save=False,save_path=False):
    print("Starting XGboost. Train shape: {}, test shape: {}".format(train_df.shape, test_df.shape))
    # Cross validation model
    if stratified:
        print('Stratified')
        folds = StratifiedKFold(n_splits= num_folds, shuffle=True, random_state=326)
    else:
        print('Kfold')
        folds = KFold(n_splits= num_folds, shuffle=True, random_state=326)

    # Create arrays and dataframes to store results
    oof_preds = np.zeros(train_df.shape[0])
    sub_preds = np.zeros(test_df.shape[0])
    feats = [f for f in train_df.columns if f not in FEATS_EXCLUDED] # exclude some non numeric columns like dat
    feature_importance_df = pd.DataFrame()
    # k-fold
    for n_fold, (train_idx, valid_idx) in enumerate(folds.split(train_df[feats], train_df['outliers'])):
        train_x, train_y = train_df[feats].iloc[train_idx], train_df['target'].iloc[train_idx]
        valid_x, valid_y = train_df[feats].iloc[valid_idx], train_df['target'].iloc[valid_idx]
        # set data structure
        xg_train = xg.DMatrix(train_x,label=train_y)
        xg_test = xg.DMatrix(valid_x,label=valid_y)
        params=  {
                'objective': 'reg:squarederror',
                'eval_metric': 'rmse',
                'booster': 'gbtree',
                'n_jobs': -1,
                'grow_policy': 'lossguide',
                'max_depth': 10,
                'seed': 538,
                'colsample_bylevel': 0.9,
                'colsample_bytree': 0.8,
                'gamma': 0.0001,
                'learning_rate': 0.01,
                'max_bin': 128,
                'max_leaves': 47,
                'min_child_weight': 40,
                'reg_alpha': 10.0,
                'reg_lambda': 10.0,
                'eta': 0.005,
                'subsample': 0.9
               }
        num_round = 10000
        watchlist = [(xg_train, 'train'), (xg_test, 'svalid')]
        reg = xg.train(params, xg_train, num_round, watchlist, early_stopping_rounds=500, verbose_eval=1000)
        print('train done')
        oof_preds[valid_idx] = reg.predict(xg.DMatrix(train_df.iloc[valid_idx][feats]), ntree_limit=reg.best_ntree_limit+50)
        sub_preds += reg.predict(xg.DMatrix(test_df[feats]), ntree_limit=reg.best_ntree_limit+50) / folds.n_splits

        fold_importance_df = pd.DataFrame()
        fi = reg.get_score(importance_type="gain")
        fold_importance_df["feature"] = list(fi.keys())
        fold_importance_df["importance"] = list(fi.values())
        fold_importance_df["fold"] = n_fold + 1
        feature_importance_df = pd.concat([feature_importance_df, fold_importance_df], axis=0)
        
        print('Fold %2d RMSE : %.6f' % (n_fold + 1, rmse(valid_y, oof_preds[valid_idx])))
        
        if save:
            if n_fold == (num_folds-1):
                print('reached last fold ',n_fold)
                test_df.loc[:,'target'] = sub_preds
                test_df = test_df.reset_index()
                test_df[['card_id', 'target']].to_csv(save_path, index=False)
                return feature_importance_df
        
        del reg, train_x, train_y, valid_x, valid_y
        gc.collect()

In [143]:
xgboost_stratify = kfold_xgboost(train_df, test_df, num_folds=5, stratified=True,save=True,save_path='submission/xgboost_stratify_submission.csv') # xgboost with stratified

Starting XGboost. Train shape: (201917, 200), test shape: (123623, 199)
Stratified
[0]	train-rmse:3.94483	svalid-rmse:3.95444
Multiple eval metrics have been passed: 'svalid-rmse' will be used for early stopping.

Will train until svalid-rmse hasn't improved in 500 rounds.
[1000]	train-rmse:2.98694	svalid-rmse:3.68271
Stopping. Best iteration:
[658]	train-rmse:3.09241	svalid-rmse:3.68103

train done
Fold  1 RMSE : 3.681433
[0]	train-rmse:3.94688	svalid-rmse:3.94592
Multiple eval metrics have been passed: 'svalid-rmse' will be used for early stopping.

Will train until svalid-rmse hasn't improved in 500 rounds.
Stopping. Best iteration:
[487]	train-rmse:3.17004	svalid-rmse:3.63762

train done
Fold  2 RMSE : 3.638357
[0]	train-rmse:3.94563	svalid-rmse:3.95052
Multiple eval metrics have been passed: 'svalid-rmse' will be used for early stopping.

Will train until svalid-rmse hasn't improved in 500 rounds.
Stopping. Best iteration:
[464]	train-rmse:3.16003	svalid-rmse:3.67847

train done
F

<img src='https://i.imgur.com/6X3yGoB.png'>

In [1]:
# cols = list(xgboost_stratify[["feature", "importance"]].groupby("feature").mean().sort_values(by="importance", ascending=False)[:150].index)
# cols.append('outliers')
# cols.append('target')
# xgboost_stratify_feature = kfold_xgboost(train_df[cols], test_df[cols[:150]], num_folds=7, stratified=True,save=True,save_path='submission/xgboost_stratify_feature_submission.csv')

In [11]:
xgboost_kfold = kfold_xgboost(train_df, test_df, num_folds=5, stratified=False,save=True,save_path='submission/xgboost_kfold_submission.csv') # xgboost with KFold

Starting XGboost. Train shape: (201917, 200), test shape: (123623, 199)
Kfold
[0]	train-rmse:3.92324	svalid-rmse:4.03902
Multiple eval metrics have been passed: 'svalid-rmse' will be used for early stopping.

Will train until svalid-rmse hasn't improved in 500 rounds.
Stopping. Best iteration:
[415]	train-rmse:3.17928	svalid-rmse:3.74441

train done
Fold  1 RMSE : 3.744851
[0]	train-rmse:3.93793	svalid-rmse:3.98143
Multiple eval metrics have been passed: 'svalid-rmse' will be used for early stopping.

Will train until svalid-rmse hasn't improved in 500 rounds.
[1000]	train-rmse:2.96991	svalid-rmse:3.71221
Stopping. Best iteration:
[542]	train-rmse:3.11836	svalid-rmse:3.70955

train done
Fold  2 RMSE : 3.709997
[0]	train-rmse:3.96415	svalid-rmse:3.87567
Multiple eval metrics have been passed: 'svalid-rmse' will be used for early stopping.

Will train until svalid-rmse hasn't improved in 500 rounds.
[1000]	train-rmse:3.02565	svalid-rmse:3.57194
Stopping. Best iteration:
[608]	train-rmse:

<img src='https://i.imgur.com/qeUXiT9.png'>

In [None]:
# cols = list(xgboost_kfold[["feature", "importance"]].groupby("feature").mean().sort_values(by="importance", ascending=False)[:150].index)
# cols.append('outliers')
# cols.append('target')
# xgboost_kfold_feature = kfold_xgboost(train_df[cols], test_df[cols[:150]], num_folds=7, stratified=False,save=True,save_path='submission/xgboost_kfold_feature_submission.csv')

## Model-2 Decision Tree with  Stratified and KFold 

### Perform Hyperparameter Tuning for decision Tree

In [12]:
hyperparameter = dict(max_depth=[3,5,7,10],min_samples_split=[0.1,0.2,0.3,0.4,0.5,0.6,0.7,0.8,0.9,1.0],min_samples_leaf=[4,5,6,7,8,9,10,11,12,15,17,20],max_leaf_nodes=[2,3,4,5,6])
dtreg = DecisionTreeRegressor()
clf = RandomizedSearchCV(dtreg, hyperparameter,random_state=1)
search = clf.fit(X_train, y_train)
search.best_params_

{'min_samples_split': 0.1,
 'min_samples_leaf': 4,
 'max_leaf_nodes': 5,
 'max_depth': 5}

In [12]:
def decisonTree(train_df, test_df, num_folds, stratified = False,save=False,save_path=False):
    print("Starting Decision Tree. Train shape: {}, test shape: {}".format(train_df.shape, test_df.shape))
    train_df.replace([-np.inf, np.inf], np.nan, inplace=True)
    train_df = train_df.fillna(train_df.median())
    
    test_df.replace([-np.inf, np.inf], np.nan, inplace=True)
    test_df = test_df.fillna(test_df.median())
    feature_importance_df = pd.DataFrame()
    # Cross validation model
    if stratified:
        print('Stratified')
        folds = StratifiedKFold(n_splits= num_folds, shuffle=True, random_state=326)
    else:
        print('Kfold')
        folds = KFold(n_splits= num_folds, shuffle=True, random_state=326)

    # Create arrays and dataframes to store results
    oof_preds = np.zeros(train_df.shape[0])
    sub_preds = np.zeros(test_df.shape[0])
    feats = [f for f in train_df.columns if f not in FEATS_EXCLUDED] # exclude some non numeric columns like dat

    # k-fold
    for n_fold, (train_idx, valid_idx) in enumerate(folds.split(train_df[feats], train_df['outliers'])):
        train_x, train_y = train_df[feats].iloc[train_idx], train_df['target'].iloc[train_idx]
        valid_x, valid_y = train_df[feats].iloc[valid_idx], train_df['target'].iloc[valid_idx]
        
        reg = DecisionTreeRegressor(min_samples_split= 0.2,min_samples_leaf= 12,max_leaf_nodes= 6,max_depth= 10)
        reg.fit(train_x,train_y)

        oof_preds[valid_idx] = reg.predict(train_df.iloc[valid_idx][feats])
        sub_preds += reg.predict(test_df[feats]) / folds.n_splits
        
        fold_importance_df = pd.DataFrame()
        fi = reg.feature_importances_
        fold_importance_df["feature"] = feats
        fold_importance_df["importance"] = list(fi)
        fold_importance_df["fold"] = n_fold + 1
        feature_importance_df = pd.concat([feature_importance_df, fold_importance_df], axis=0)
        
        print('Fold %2d RMSE : %.6f' % (n_fold + 1, rmse(valid_y, oof_preds[valid_idx])))
        if save:
            if n_fold == (num_folds-1):
                print('reached last fold ',n_fold)

                test_df.loc[:,'target'] = sub_preds
                test_df = test_df.reset_index()
                test_df[['card_id', 'target']].to_csv(save_path, index=False)
                return feature_importance_df
        
        del reg, train_x, train_y, valid_x, valid_y
        gc.collect()  

In [13]:
dt_stratify = decisonTree(train_df, test_df, num_folds=7, stratified=True,save=True,save_path='submission/dt_stratify_submission.csv') # with stratified sampling

Starting Decision Tree. Train shape: (201917, 200), test shape: (123623, 199)


  after removing the cwd from sys.path.
  import sys


Stratified
Fold  1 RMSE : 3.776049
Fold  2 RMSE : 3.770363
Fold  3 RMSE : 3.749504
Fold  4 RMSE : 3.759901
Fold  5 RMSE : 3.780103
Fold  6 RMSE : 3.761936
Fold  7 RMSE : 3.763148
reached last fold  6


<img src='https://i.imgur.com/jXcSfXX.png'>

In [None]:
dt_kfold = decisonTree(train_df, test_df, num_folds=7, stratified=False,save=True,save_path='submission/dt_kfold_submission.csv')# with Kfold sampling

<img src='https://i.imgur.com/GDO24Z2.png'>

## Model-3 Random Forest with  Stratified and KFold 

In [16]:
hyperparameter = dict(max_depth=[3,5,7,10],min_samples_split=[0.1,0.2,0.3,0.4,0.5,0.6,0.7,0.8,0.9,1.0],min_samples_leaf=[4,5,6,7,8,9,10,11,12,15,17,20],max_leaf_nodes=[2,3,4,5,6])
dtreg = RandomForestRegressor()
clf = RandomizedSearchCV(dtreg, hyperparameter,random_state=1)
search = clf.fit(X_train, y_train)
search.best_params_

{'min_samples_split': 0.1,
 'min_samples_leaf': 4,
 'max_leaf_nodes': 5,
 'max_depth': 5}

In [15]:
def RandomForest(train_df, test_df, num_folds, stratified = False,save=False,save_path=False):
    print("Starting Decision Tree. Train shape: {}, test shape: {}".format(train_df.shape, test_df.shape))
    train_df.replace([-np.inf, np.inf], np.nan, inplace=True)
    train_df = train_df.fillna(train_df.median())
    
    test_df.replace([-np.inf, np.inf], np.nan, inplace=True)
    test_df = test_df.fillna(test_df.median())
    feature_importance_df = pd.DataFrame()
    # Cross validation model
    if stratified:
        print('Stratified')
        folds = StratifiedKFold(n_splits= num_folds, shuffle=True, random_state=326)
    else:
        print('Kfold')
        folds = KFold(n_splits= num_folds, shuffle=True, random_state=326)

    # Create arrays and dataframes to store results
    oof_preds = np.zeros(train_df.shape[0])
    sub_preds = np.zeros(test_df.shape[0])
    feats = [f for f in train_df.columns if f not in FEATS_EXCLUDED] # exclude some non numeric columns like dat

    # k-fold
    for n_fold, (train_idx, valid_idx) in enumerate(folds.split(train_df[feats], train_df['outliers'])):
        train_x, train_y = train_df[feats].iloc[train_idx], train_df['target'].iloc[train_idx]
        valid_x, valid_y = train_df[feats].iloc[valid_idx], train_df['target'].iloc[valid_idx]
        
        reg = RandomForestRegressor(min_samples_split= 0.1,min_samples_leaf= 4,max_leaf_nodes= 5,max_depth= 5)
        reg.fit(train_x,train_y)

        oof_preds[valid_idx] = reg.predict(train_df.iloc[valid_idx][feats])
        sub_preds += reg.predict(test_df[feats]) / folds.n_splits
        
        fold_importance_df = pd.DataFrame()
        fi = reg.feature_importances_
        fold_importance_df["feature"] = feats
        fold_importance_df["importance"] = list(fi)
        fold_importance_df["fold"] = n_fold + 1
        feature_importance_df = pd.concat([feature_importance_df, fold_importance_df], axis=0)
        
        print('Fold %2d RMSE : %.6f' % (n_fold + 1, rmse(valid_y, oof_preds[valid_idx])))
        
        if save:
            if n_fold == (num_folds-1):
                print('reached last fold ',n_fold)

                test_df.loc[:,'target'] = sub_preds
                test_df = test_df.reset_index()
                test_df[['card_id', 'target']].to_csv(save_path, index=False)
                return feature_importance_df
        
        del reg, train_x, train_y, valid_x, valid_y
        gc.collect()  

In [16]:
rf_stratify = RandomForest(train_df, test_df, num_folds=7, stratified=True,save=True,save_path='submission/rf_stratify_submission.csv') # with stratified sampling    

Starting Decision Tree. Train shape: (201917, 200), test shape: (123623, 199)


  after removing the cwd from sys.path.
  import sys


Stratified
Fold  1 RMSE : 3.766411
Fold  2 RMSE : 3.758557
Fold  3 RMSE : 3.742081
Fold  4 RMSE : 3.745719
Fold  5 RMSE : 3.769138
Fold  6 RMSE : 3.750210
Fold  7 RMSE : 3.750265
reached last fold  6


<img src='https://i.imgur.com/GJ8IoYD.png'>

In [17]:
rf_kfold = RandomForest(train_df, test_df, num_folds=7, stratified=False,save=True,save_path='submission/rf_kfold_submission.csv') # with KFold sampling 

Starting Decision Tree. Train shape: (201917, 200), test shape: (123623, 199)


  after removing the cwd from sys.path.
  import sys


Kfold
Fold  1 RMSE : 3.787401
Fold  2 RMSE : 3.875102
Fold  3 RMSE : 3.707327
Fold  4 RMSE : 3.732898
Fold  5 RMSE : 3.696004
Fold  6 RMSE : 3.791186
Fold  7 RMSE : 3.693462
reached last fold  6


<img src='https://i.imgur.com/ChYw5oo.png'>

## Model-4 Adaboost with  Stratified and KFold 

In [9]:
hyperparameter = dict(n_estimators=[120,140,160,170,180,190,200],learning_rate=[3,4,5,6,7,8,9])
adareg = AdaBoostRegressor(loss='square',random_state=1)
clf = RandomizedSearchCV(adareg, hyperparameter,random_state=1)
search = clf.fit(X_train, y_train)
search.best_params_

{'n_estimators': 200, 'learning_rate': 6}

In [18]:
def AdaBoost(train_df, test_df, num_folds, stratified = False,save=False,save_path=False):
    print("Starting Adaboost. Train shape: {}, test shape: {}".format(train_df.shape, test_df.shape))
    train_df.replace([-np.inf, np.inf], np.nan, inplace=True)
    train_df = train_df.fillna(train_df.median())
    
    test_df.replace([-np.inf, np.inf], np.nan, inplace=True)
    test_df = test_df.fillna(test_df.median())
    
    # Cross validation model
    if stratified:
        print('Stratified')
        folds = StratifiedKFold(n_splits= num_folds, shuffle=True, random_state=326)
    else:
        print('Kfold')
        folds = KFold(n_splits= num_folds, shuffle=True, random_state=326)

    # Create arrays and dataframes to store results
    oof_preds = np.zeros(train_df.shape[0])
    sub_preds = np.zeros(test_df.shape[0])
    feats = [f for f in train_df.columns if f not in FEATS_EXCLUDED] # exclude some non numeric columns like dat

    # k-fold
    for n_fold, (train_idx, valid_idx) in enumerate(folds.split(train_df[feats], train_df['outliers'])):
        train_x, train_y = train_df[feats].iloc[train_idx], train_df['target'].iloc[train_idx]
        valid_x, valid_y = train_df[feats].iloc[valid_idx], train_df['target'].iloc[valid_idx]
        
        reg = AdaBoostRegressor(n_estimators=200,learning_rate=6,loss='square',random_state=1)
        reg.fit(train_x,train_y)

        oof_preds[valid_idx] = reg.predict(train_df.iloc[valid_idx][feats])
        sub_preds += reg.predict(test_df[feats]) / folds.n_splits

        print('Fold %2d RMSE : %.6f' % (n_fold + 1, rmse(valid_y, oof_preds[valid_idx])))
        
        if save:
            if n_fold == (num_folds-1):
                print('reached last fold ',n_fold)

                test_df.loc[:,'target'] = sub_preds
                test_df = test_df.reset_index()
                test_df[['card_id', 'target']].to_csv(save_path, index=False)
                return reg
        del reg, train_x, train_y, valid_x, valid_y
        gc.collect()  

In [19]:
ada_stratify = AdaBoost(train_df, test_df, num_folds=7, stratified=True,save=True,save_path='submission/ada_stratify_submission.csv') # with stratified sampling    

Starting Adaboost. Train shape: (201917, 200), test shape: (123623, 199)


  after removing the cwd from sys.path.
  import sys


Stratified
Fold  1 RMSE : 4.757287
Fold  2 RMSE : 33.055655
Fold  3 RMSE : 14.591346
Fold  4 RMSE : 8.220139
Fold  5 RMSE : 16.157891
Fold  6 RMSE : 16.039135
Fold  7 RMSE : 15.709615
reached last fold  6


<img src='https://i.imgur.com/EiFPMnp.png'>

In [20]:
ada_kfold = AdaBoost(train_df, test_df, num_folds=7, stratified=False,save=True,save_path='submission/ada_kfold_submission.csv') # with KFold sampling    

Starting Adaboost. Train shape: (201917, 200), test shape: (123623, 199)


  after removing the cwd from sys.path.
  import sys


Kfold
Fold  1 RMSE : 12.349964
Fold  2 RMSE : 12.114483
Fold  3 RMSE : 32.430585
Fold  4 RMSE : 13.405033
Fold  5 RMSE : 13.361230
Fold  6 RMSE : 10.398093
Fold  7 RMSE : 12.507939
reached last fold  6


<img src='https://i.imgur.com/89cesH8.png'>

## Model-5 LGBM with Stratified and KFold

In [21]:
#https://www.kaggle.com/mfjwr1/simple-lightgbm-without-blending/code
# perform some hyperparameter optimization 
def kfold_lightgbm(train_df, test_df,params, num_folds, stratified = False, debug= False,save=False,save_path=False):
    print("Starting LightGBM. Train shape: {}, test shape: {}".format(train_df.shape, test_df.shape))
    if stratified:
        print('Stratified')
        folds = StratifiedKFold(n_splits= num_folds, shuffle=True, random_state=326)
    else:
        print('Kfold')
        folds = KFold(n_splits= num_folds, shuffle=True, random_state=326)

    # Create arrays and dataframes to store re
    oof_preds = np.zeros(train_df.shape[0])
    sub_preds = np.zeros(test_df.shape[0])
    feats = [f for f in train_df.columns if f not in FEATS_EXCLUDED]
    feature_importance_df = pd.DataFrame()
    # k-fold
    for n_fold, (train_idx, valid_idx) in enumerate(folds.split(train_df[feats], train_df['outliers'])):
#         print(n_fold)
        train_x, train_y = train_df[feats].iloc[train_idx], train_df['target'].iloc[train_idx]
        valid_x, valid_y = train_df[feats].iloc[valid_idx], train_df['target'].iloc[valid_idx]
        # set data structure
        lgb_train = lgb.Dataset(train_x,label=train_y,free_raw_data=False)
        lgb_test = lgb.Dataset(valid_x,label=valid_y,free_raw_data=False)
        
        # Fold dependent params
        params['seed']=int(2**n_fold),
        params['bagging_seed']=int(2**n_fold),
        params['drop_seed']=int(2**n_fold)
        
        reg = lgb.train(
                        params,
                        lgb_train,
                        valid_sets=[lgb_train, lgb_test],
                        valid_names=['train', 'test'],
                        num_boost_round=10000,
                        early_stopping_rounds= 200,
                        verbose_eval=100
                        )

        oof_preds[valid_idx] = reg.predict(valid_x, num_iteration=reg.best_iteration)
        sub_preds += reg.predict(test_df[feats], num_iteration=reg.best_iteration) / folds.n_splits

        fold_importance_df = pd.DataFrame()
        fold_importance_df["feature"] = feats
        fold_importance_df["importance"] = np.log1p(reg.feature_importance(importance_type='gain', iteration=reg.best_iteration))
        fold_importance_df["fold"] = n_fold + 1
        feature_importance_df = pd.concat([feature_importance_df, fold_importance_df], axis=0)
        print('Fold %2d RMSE : %.6f' % (n_fold + 1, rmse(valid_y, oof_preds[valid_idx])))
        if save:
            if n_fold == (num_folds-1):
#                 display_importances(feature_importance_df)
                print('reached last fold ',n_fold)

                test_df.loc[:,'target'] = sub_preds
                test_df = test_df.reset_index()
                test_df[['card_id', 'target']].to_csv(save_path, index=False)
                return feature_importance_df
        del reg, train_x, train_y, valid_x, valid_y
        gc.collect()
                

In [20]:
# Stratified
# KFold
num_folds = 5
params ={
        'task': 'train',
        'boosting': 'goss',
        'objective': 'regression',
        'metric': 'rmse',
        'learning_rate': 0.002,
        'max_depth': 6,
        'min_data_in_leaf': 16,
        'verbose': -1,
        'num_leaves':40
        }
# kfold_lightgbm(train_df, test_df,params, num_folds=num_folds, stratified=False)
min_leaf = [20,22] # 16,18,20,22,24
max_depth = [8,10] # 6,7,8,10,11
learning_rate = [0.004,0.006] # 0.002,0.004,0.006,0.008
num_leaves=[55,65] # 40,50,60,70
for i in min_leaf:
    for j in max_depth:
        for k in learning_rate:
            for l in num_leaves:
                params['min_data_in_leaf'] =  i
                params['max_depth'] =  j
                params['learning_rate'] =  k
                params['num_leaves'] =  l
                print('For min_data_in_leaf {}, max_depth {} learning_rate {} num_leaves {}'.format(i,j,k,l))
                kfold_lightgbm(train_df, test_df,params, num_folds=num_folds, stratified=True)
                print('*'*50)



For min_data_in_leaf 20, max_depth 8 learning_rate 0.004 num_leaves 55
Starting LightGBM. Train shape: (201917, 191), test shape: (123623, 190)
Stratified
Training until validation scores don't improve for 200 rounds
[100]	train's rmse: 3.72662	test's rmse: 3.77447
[200]	train's rmse: 3.65115	test's rmse: 3.73653
[300]	train's rmse: 3.59572	test's rmse: 3.71551
[400]	train's rmse: 3.55222	test's rmse: 3.70376
[500]	train's rmse: 3.51767	test's rmse: 3.69736
[600]	train's rmse: 3.48998	test's rmse: 3.69298
[700]	train's rmse: 3.46631	test's rmse: 3.68988
[800]	train's rmse: 3.44564	test's rmse: 3.68812
[900]	train's rmse: 3.42752	test's rmse: 3.68671
[1000]	train's rmse: 3.41039	test's rmse: 3.68543
[1100]	train's rmse: 3.39429	test's rmse: 3.68441
[1200]	train's rmse: 3.37836	test's rmse: 3.68359
[1300]	train's rmse: 3.36269	test's rmse: 3.68327
[1400]	train's rmse: 3.34832	test's rmse: 3.68293
[1500]	train's rmse: 3.33459	test's rmse: 3.68266
[1600]	train's rmse: 3.3214	test's rmse: 3

[400]	train's rmse: 3.53407	test's rmse: 3.69852
[500]	train's rmse: 3.49744	test's rmse: 3.69193
[600]	train's rmse: 3.468	test's rmse: 3.68746
[700]	train's rmse: 3.44298	test's rmse: 3.6844
[800]	train's rmse: 3.42096	test's rmse: 3.68209
[900]	train's rmse: 3.4016	test's rmse: 3.68071
[1000]	train's rmse: 3.3837	test's rmse: 3.67948
[1100]	train's rmse: 3.36637	test's rmse: 3.6785
[1200]	train's rmse: 3.34992	test's rmse: 3.6776
[1300]	train's rmse: 3.33391	test's rmse: 3.67763
[1400]	train's rmse: 3.3178	test's rmse: 3.67705
[1500]	train's rmse: 3.3019	test's rmse: 3.67699
[1600]	train's rmse: 3.28713	test's rmse: 3.67698
[1700]	train's rmse: 3.27285	test's rmse: 3.67721
Early stopping, best iteration is:
[1548]	train's rmse: 3.29482	test's rmse: 3.6768
Fold  3 RMSE : 3.676801
Training until validation scores don't improve for 200 rounds
[100]	train's rmse: 3.72333	test's rmse: 3.76709
[200]	train's rmse: 3.64457	test's rmse: 3.72581
[300]	train's rmse: 3.5849	test's rmse: 3.70312

Fold  1 RMSE : 3.681834
Training until validation scores don't improve for 200 rounds
[100]	train's rmse: 3.68535	test's rmse: 3.72878
[200]	train's rmse: 3.59527	test's rmse: 3.68089
[300]	train's rmse: 3.53094	test's rmse: 3.66001
[400]	train's rmse: 3.48253	test's rmse: 3.65135
[500]	train's rmse: 3.44677	test's rmse: 3.64652
[600]	train's rmse: 3.41679	test's rmse: 3.64387
[700]	train's rmse: 3.38931	test's rmse: 3.64246
[800]	train's rmse: 3.36395	test's rmse: 3.64195
[900]	train's rmse: 3.33921	test's rmse: 3.64179
[1000]	train's rmse: 3.31642	test's rmse: 3.64126
[1100]	train's rmse: 3.29509	test's rmse: 3.64158
[1200]	train's rmse: 3.27256	test's rmse: 3.64148
Early stopping, best iteration is:
[1006]	train's rmse: 3.31508	test's rmse: 3.64118
Fold  2 RMSE : 3.641176
Training until validation scores don't improve for 200 rounds
[100]	train's rmse: 3.67754	test's rmse: 3.74576
[200]	train's rmse: 3.58112	test's rmse: 3.71033
[300]	train's rmse: 3.51522	test's rmse: 3.69563
[400]

[1600]	train's rmse: 3.266	test's rmse: 3.65076
[1700]	train's rmse: 3.25083	test's rmse: 3.65077
[1800]	train's rmse: 3.23589	test's rmse: 3.6506
[1900]	train's rmse: 3.22113	test's rmse: 3.6508
[2000]	train's rmse: 3.20659	test's rmse: 3.65134
Early stopping, best iteration is:
[1805]	train's rmse: 3.23513	test's rmse: 3.65057
Fold  5 RMSE : 3.650574
**************************************************
For min_data_in_leaf 20, max_depth 10 learning_rate 0.004 num_leaves 65
Starting LightGBM. Train shape: (201917, 191), test shape: (123623, 190)
Stratified
Training until validation scores don't improve for 200 rounds
[100]	train's rmse: 3.71342	test's rmse: 3.7765
[200]	train's rmse: 3.62651	test's rmse: 3.73966
[300]	train's rmse: 3.55842	test's rmse: 3.71929
[400]	train's rmse: 3.50396	test's rmse: 3.70812
[500]	train's rmse: 3.46241	test's rmse: 3.70185
[600]	train's rmse: 3.42928	test's rmse: 3.69778
[700]	train's rmse: 3.40016	test's rmse: 3.69501
[800]	train's rmse: 3.37425	test's

[1400]	train's rmse: 3.18938	test's rmse: 3.67954
[1500]	train's rmse: 3.16977	test's rmse: 3.67992
Early stopping, best iteration is:
[1348]	train's rmse: 3.19986	test's rmse: 3.67935
Fold  3 RMSE : 3.679348
Training until validation scores don't improve for 200 rounds
[100]	train's rmse: 3.67758	test's rmse: 3.74705
[200]	train's rmse: 3.57644	test's rmse: 3.70809
[300]	train's rmse: 3.5048	test's rmse: 3.69113
[400]	train's rmse: 3.45539	test's rmse: 3.6833
[500]	train's rmse: 3.41527	test's rmse: 3.67918
[600]	train's rmse: 3.38279	test's rmse: 3.67715
[700]	train's rmse: 3.35354	test's rmse: 3.67509
[800]	train's rmse: 3.32641	test's rmse: 3.67388
[900]	train's rmse: 3.30025	test's rmse: 3.6735
[1000]	train's rmse: 3.27558	test's rmse: 3.67325
[1100]	train's rmse: 3.25297	test's rmse: 3.67385
Early stopping, best iteration is:
[996]	train's rmse: 3.27655	test's rmse: 3.67323
Fold  4 RMSE : 3.673232
Training until validation scores don't improve for 200 rounds
[100]	train's rmse: 3

Early stopping, best iteration is:
[1562]	train's rmse: 3.34347	test's rmse: 3.64017
Fold  2 RMSE : 3.640175
Training until validation scores don't improve for 200 rounds
[100]	train's rmse: 3.72809	test's rmse: 3.76963
[200]	train's rmse: 3.6521	test's rmse: 3.73121
[300]	train's rmse: 3.59519	test's rmse: 3.71125
[400]	train's rmse: 3.5521	test's rmse: 3.6998
[500]	train's rmse: 3.51898	test's rmse: 3.69308
[600]	train's rmse: 3.4918	test's rmse: 3.68853
[700]	train's rmse: 3.46935	test's rmse: 3.68526
[800]	train's rmse: 3.44943	test's rmse: 3.68306
[900]	train's rmse: 3.43155	test's rmse: 3.68133
[1000]	train's rmse: 3.41468	test's rmse: 3.68045
[1100]	train's rmse: 3.3989	test's rmse: 3.6796
[1200]	train's rmse: 3.38346	test's rmse: 3.67903
[1300]	train's rmse: 3.36873	test's rmse: 3.67837
[1400]	train's rmse: 3.3546	test's rmse: 3.6782
[1500]	train's rmse: 3.3406	test's rmse: 3.67822
[1600]	train's rmse: 3.32723	test's rmse: 3.67834
[1700]	train's rmse: 3.31451	test's rmse: 3.678

[1000]	train's rmse: 3.39855	test's rmse: 3.65006
[1100]	train's rmse: 3.38185	test's rmse: 3.6493
[1200]	train's rmse: 3.36552	test's rmse: 3.6485
[1300]	train's rmse: 3.35033	test's rmse: 3.6482
[1400]	train's rmse: 3.33516	test's rmse: 3.64813
[1500]	train's rmse: 3.32	test's rmse: 3.64778
[1600]	train's rmse: 3.30528	test's rmse: 3.64762
[1700]	train's rmse: 3.29165	test's rmse: 3.64764
Early stopping, best iteration is:
[1587]	train's rmse: 3.30722	test's rmse: 3.64754
Fold  5 RMSE : 3.647543
**************************************************
For min_data_in_leaf 22, max_depth 8 learning_rate 0.006 num_leaves 55
Starting LightGBM. Train shape: (201917, 191), test shape: (123623, 190)
Stratified
Training until validation scores don't improve for 200 rounds
[100]	train's rmse: 3.68563	test's rmse: 3.75249
[200]	train's rmse: 3.59733	test's rmse: 3.71592
[300]	train's rmse: 3.5367	test's rmse: 3.70091
[400]	train's rmse: 3.49514	test's rmse: 3.69317
[500]	train's rmse: 3.46121	test's

[800]	train's rmse: 3.36458	test's rmse: 3.64964
[900]	train's rmse: 3.34084	test's rmse: 3.64917
[1000]	train's rmse: 3.31856	test's rmse: 3.64947
[1100]	train's rmse: 3.29744	test's rmse: 3.64944
Early stopping, best iteration is:
[907]	train's rmse: 3.33952	test's rmse: 3.64913
Fold  5 RMSE : 3.649130
**************************************************
For min_data_in_leaf 22, max_depth 10 learning_rate 0.004 num_leaves 55
Starting LightGBM. Train shape: (201917, 191), test shape: (123623, 190)
Stratified
Training until validation scores don't improve for 200 rounds
[100]	train's rmse: 3.72102	test's rmse: 3.77629
[200]	train's rmse: 3.63975	test's rmse: 3.73897
[300]	train's rmse: 3.57819	test's rmse: 3.71918
[400]	train's rmse: 3.52858	test's rmse: 3.70847
[500]	train's rmse: 3.49077	test's rmse: 3.7019
[600]	train's rmse: 3.46117	test's rmse: 3.69743
[700]	train's rmse: 3.43526	test's rmse: 3.69421
[800]	train's rmse: 3.4117	test's rmse: 3.69265
[900]	train's rmse: 3.39098	test's 

[700]	train's rmse: 3.41453	test's rmse: 3.65378
[800]	train's rmse: 3.38988	test's rmse: 3.65127
[900]	train's rmse: 3.36719	test's rmse: 3.64943
[1000]	train's rmse: 3.34681	test's rmse: 3.64807
[1100]	train's rmse: 3.32687	test's rmse: 3.64745
[1200]	train's rmse: 3.30701	test's rmse: 3.64675
[1300]	train's rmse: 3.28781	test's rmse: 3.64627
[1400]	train's rmse: 3.26948	test's rmse: 3.6458
[1500]	train's rmse: 3.25224	test's rmse: 3.64608
Early stopping, best iteration is:
[1383]	train's rmse: 3.27264	test's rmse: 3.64572
Fold  2 RMSE : 3.645716
Training until validation scores don't improve for 200 rounds
[100]	train's rmse: 3.7146	test's rmse: 3.77131
[200]	train's rmse: 3.62632	test's rmse: 3.73232
[300]	train's rmse: 3.55842	test's rmse: 3.71132
[400]	train's rmse: 3.50445	test's rmse: 3.69961
[500]	train's rmse: 3.46305	test's rmse: 3.69307
[600]	train's rmse: 3.43012	test's rmse: 3.68833
[700]	train's rmse: 3.40182	test's rmse: 3.6851
[800]	train's rmse: 3.37728	test's rmse: 3

[800]	train's rmse: 3.29757	test's rmse: 3.68717
[900]	train's rmse: 3.27091	test's rmse: 3.68646
[1000]	train's rmse: 3.24486	test's rmse: 3.68586
[1100]	train's rmse: 3.22169	test's rmse: 3.68531
[1200]	train's rmse: 3.19803	test's rmse: 3.68524
[1300]	train's rmse: 3.17565	test's rmse: 3.6853
Early stopping, best iteration is:
[1140]	train's rmse: 3.21199	test's rmse: 3.68499
Fold  1 RMSE : 3.684992
Training until validation scores don't improve for 200 rounds
[100]	train's rmse: 3.67351	test's rmse: 3.73336
[200]	train's rmse: 3.569	test's rmse: 3.68779
[300]	train's rmse: 3.494	test's rmse: 3.66698
[400]	train's rmse: 3.44267	test's rmse: 3.65867
[500]	train's rmse: 3.40224	test's rmse: 3.65399
[600]	train's rmse: 3.36817	test's rmse: 3.65132
[700]	train's rmse: 3.33732	test's rmse: 3.65017
[800]	train's rmse: 3.30896	test's rmse: 3.64914
[900]	train's rmse: 3.28104	test's rmse: 3.64819
[1000]	train's rmse: 3.25469	test's rmse: 3.64823
[1100]	train's rmse: 3.22951	test's rmse: 3.6

In [2]:
print('Hyperparameter and its RMSE value')
print(tabulate([[20,8,0.004,55,3.6492]
               ,[20,8,0.004,65,3.6471]
               ,[20,8,0.006,55,3.6507]
               ,[20,8,0.006,65,3.6482]
               ,[20,10,0.004,55,3.6506]
               ,[20,10,0.004,65,3.6530]
               ,[20,10,0.006,55,3.6520]
               ,[20,10,0.006,65,3.6510]
               ,[22,8,0.004,55,3.6478]
               ,[22,8,0.004,65,3.6475]
               ,[22,8,0.006,55,3.6493]
               ,[22,8,0.006,65,3.6491]
               ,[22,10,0.004,55,3.6490]
               ,[22,10,0.004,65,3.6505]
               ,[22,10,0.006,55,3.6458]
               ,[22,10,0.006,65,3.6506]], headers=['min_data_in_leaf','max_depth', 'learning_rate','num_leaves','RMSE score'], tablefmt='pretty'))

print('Best Hyperparameter for LGBM with Kfold is min_data_in_leaf=22, max_depth=10, learning_rate=0.006, num_leaves=55')

Hyperparameter and its RMSE value
+------------------+-----------+---------------+------------+------------+
| min_data_in_leaf | max_depth | learning_rate | num_leaves | RMSE score |
+------------------+-----------+---------------+------------+------------+
|        20        |     8     |     0.004     |     55     |   3.6492   |
|        20        |     8     |     0.004     |     65     |   3.6471   |
|        20        |     8     |     0.006     |     55     |   3.6507   |
|        20        |     8     |     0.006     |     65     |   3.6482   |
|        20        |    10     |     0.004     |     55     |   3.6506   |
|        20        |    10     |     0.004     |     65     |   3.653    |
|        20        |    10     |     0.006     |     55     |   3.652    |
|        20        |    10     |     0.006     |     65     |   3.651    |
|        22        |     8     |     0.004     |     55     |   3.6478   |
|        22        |     8     |     0.004     |     65     |   3.

In [12]:
num_folds = 11
params ={
        'task': 'train',
        'boosting': 'goss',
        'objective': 'regression',
        'metric': 'rmse',
        'learning_rate': 0.006,
        'max_depth': 10,
        'min_data_in_leaf': 22,
        'verbose': -1,
        'num_leaves':55
        }

lgbm_stratify = kfold_lightgbm(train_df, test_df,params, num_folds=num_folds, stratified=True,save=True,save_path='submission/lgbm_stratify.csv')

Starting LightGBM. Train shape: (201917, 200), test shape: (123623, 199)
Stratified
Training until validation scores don't improve for 200 rounds
[100]	train's rmse: 3.65625	test's rmse: 3.72848
[200]	train's rmse: 3.58072	test's rmse: 3.70508
[300]	train's rmse: 3.53501	test's rmse: 3.69719
[400]	train's rmse: 3.50385	test's rmse: 3.69153
[500]	train's rmse: 3.48029	test's rmse: 3.68892
[600]	train's rmse: 3.46102	test's rmse: 3.68687
[700]	train's rmse: 3.44184	test's rmse: 3.68566
[800]	train's rmse: 3.42473	test's rmse: 3.68501
[900]	train's rmse: 3.40913	test's rmse: 3.68462
[1000]	train's rmse: 3.39488	test's rmse: 3.68415
[1100]	train's rmse: 3.38073	test's rmse: 3.68408
[1200]	train's rmse: 3.36677	test's rmse: 3.68439
[1300]	train's rmse: 3.35185	test's rmse: 3.68418
Early stopping, best iteration is:
[1105]	train's rmse: 3.37981	test's rmse: 3.68396
Fold  1 RMSE : 3.683958
Training until validation scores don't improve for 200 rounds
[100]	train's rmse: 3.66178	test's rmse: 3

<img src='https://i.imgur.com/VSr59vd.png'>

In [13]:
# Try with top 170 features
cols = list(lgbm_stratify[["feature", "importance"]].groupby("feature").mean().sort_values(by="importance", ascending=False)[:170].index)
cols.append('outliers')
cols.append('target')
lgbm_stratify_feature = kfold_lightgbm(train_df[cols], test_df[cols[:170]],params, num_folds=num_folds, stratified=True,save=True,save_path='submission/lgbm_stratify_feature.csv')

Starting LightGBM. Train shape: (201917, 102), test shape: (123623, 100)
Stratified
Training until validation scores don't improve for 200 rounds
[100]	train's rmse: 3.65822	test's rmse: 3.7282
[200]	train's rmse: 3.58441	test's rmse: 3.70357
[300]	train's rmse: 3.53906	test's rmse: 3.69463
[400]	train's rmse: 3.50847	test's rmse: 3.68918
[500]	train's rmse: 3.4851	test's rmse: 3.68553
[600]	train's rmse: 3.46603	test's rmse: 3.6835
[700]	train's rmse: 3.44789	test's rmse: 3.68249
[800]	train's rmse: 3.43109	test's rmse: 3.68242
[900]	train's rmse: 3.4156	test's rmse: 3.68181
[1000]	train's rmse: 3.4018	test's rmse: 3.68127
[1100]	train's rmse: 3.38746	test's rmse: 3.68051
[1200]	train's rmse: 3.37341	test's rmse: 3.68044
[1300]	train's rmse: 3.35997	test's rmse: 3.68059
[1400]	train's rmse: 3.34612	test's rmse: 3.68019
[1500]	train's rmse: 3.33382	test's rmse: 3.68041
[1600]	train's rmse: 3.31993	test's rmse: 3.6807
Early stopping, best iteration is:
[1426]	train's rmse: 3.34273	test'

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self.obj[key] = _infer_fill_value(value)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  isetter(ilocs[0], value)


<img src='https://i.imgur.com/cL5Ao9t.png'>

In [22]:
# KFold
num_folds = 5
params ={
        'task': 'train',
        'boosting': 'goss',
        'objective': 'regression',
        'metric': 'rmse',
        'learning_rate': 0.003,
        'max_depth': 5,
        'min_data_in_leaf': 16,
        'verbose': -1,
        'num_leaves':40
        }
# kfold_lightgbm(train_df, test_df,params, num_folds=num_folds, stratified=False)
min_leaf = [20,22] # 16,18,20,22,24 # I have tried with commnetd hyperparameter values on different system and i can not perform on same system so i choose tope best values for each hyperparameter and shown here
max_depth = [4,5] # 3,4,5,6,7,8
learning_rate = [0.005,0.007] # 0.003,0.005,0.007
num_leaves=[55,65] # 45,55,65,75
for i in min_leaf:
    for j in max_depth:
        for k in learning_rate:
            for l in num_leaves:
                params['min_data_in_leaf'] =  i
                params['max_depth'] =  j
                params['learning_rate'] =  k
                params['num_leaves'] =  l
                print('For min_data_in_leaf {}, max_depth {} learning_rate {} num_leaves {}'.format(i,j,k,l))
                kfold_lightgbm(train_df, test_df,params, num_folds=num_folds, stratified=False)
                print('*'*50)

For min_data_in_leaf 20, max_depth 4 learning_rate 0.005 num_leaves 55
Starting LightGBM. Train shape: (201917, 200), test shape: (123623, 199)
Kfold
Training until validation scores don't improve for 200 rounds
[100]	train's rmse: 3.74058	test's rmse: 3.849
[200]	train's rmse: 3.6992	test's rmse: 3.80709
[300]	train's rmse: 3.67401	test's rmse: 3.7849
[400]	train's rmse: 3.65708	test's rmse: 3.77263
[500]	train's rmse: 3.645	test's rmse: 3.76461
[600]	train's rmse: 3.63551	test's rmse: 3.75909
[700]	train's rmse: 3.62775	test's rmse: 3.75544
[800]	train's rmse: 3.62077	test's rmse: 3.75255
[900]	train's rmse: 3.61439	test's rmse: 3.75044
[1000]	train's rmse: 3.60844	test's rmse: 3.74876
[1100]	train's rmse: 3.60295	test's rmse: 3.7473
[1200]	train's rmse: 3.59748	test's rmse: 3.74625
[1300]	train's rmse: 3.59237	test's rmse: 3.74535
[1400]	train's rmse: 3.58811	test's rmse: 3.74464
[1500]	train's rmse: 3.58369	test's rmse: 3.74392
[1600]	train's rmse: 3.57921	test's rmse: 3.74331
[170

[4000]	train's rmse: 3.50188	test's rmse: 3.71727
[4100]	train's rmse: 3.49893	test's rmse: 3.71727
[4200]	train's rmse: 3.49617	test's rmse: 3.71722
[4300]	train's rmse: 3.49329	test's rmse: 3.7172
[4400]	train's rmse: 3.49032	test's rmse: 3.71711
[4500]	train's rmse: 3.48729	test's rmse: 3.71703
[4600]	train's rmse: 3.48437	test's rmse: 3.71713
Early stopping, best iteration is:
[4426]	train's rmse: 3.48954	test's rmse: 3.71698
Fold  4 RMSE : 3.716984
Training until validation scores don't improve for 200 rounds
[100]	train's rmse: 3.78615	test's rmse: 3.66148
[200]	train's rmse: 3.74307	test's rmse: 3.62448
[300]	train's rmse: 3.71767	test's rmse: 3.60622
[400]	train's rmse: 3.70016	test's rmse: 3.59637
[500]	train's rmse: 3.68781	test's rmse: 3.59018
[600]	train's rmse: 3.67802	test's rmse: 3.58568
[700]	train's rmse: 3.67008	test's rmse: 3.58256
[800]	train's rmse: 3.66287	test's rmse: 3.58026
[900]	train's rmse: 3.65641	test's rmse: 3.57852
[1000]	train's rmse: 3.65057	test's rms

[4000]	train's rmse: 3.53864	test's rmse: 3.57392
[4100]	train's rmse: 3.53557	test's rmse: 3.57378
[4200]	train's rmse: 3.53268	test's rmse: 3.57376
[4300]	train's rmse: 3.52976	test's rmse: 3.57364
[4400]	train's rmse: 3.52685	test's rmse: 3.57356
[4500]	train's rmse: 3.52384	test's rmse: 3.57369
[4600]	train's rmse: 3.52119	test's rmse: 3.57361
Early stopping, best iteration is:
[4418]	train's rmse: 3.52636	test's rmse: 3.57353
Fold  3 RMSE : 3.573526
Training until validation scores don't improve for 200 rounds
[100]	train's rmse: 3.74781	test's rmse: 3.82087
[200]	train's rmse: 3.70462	test's rmse: 3.78277
[300]	train's rmse: 3.67971	test's rmse: 3.76306
[400]	train's rmse: 3.662	test's rmse: 3.75163
[500]	train's rmse: 3.6488	test's rmse: 3.74438
[600]	train's rmse: 3.63934	test's rmse: 3.7398
[700]	train's rmse: 3.63129	test's rmse: 3.73607
[800]	train's rmse: 3.62435	test's rmse: 3.73327
[900]	train's rmse: 3.61819	test's rmse: 3.73099
[1000]	train's rmse: 3.61229	test's rmse: 

[400]	train's rmse: 3.67936	test's rmse: 3.60166
[500]	train's rmse: 3.66841	test's rmse: 3.59577
[600]	train's rmse: 3.65894	test's rmse: 3.59161
[700]	train's rmse: 3.65095	test's rmse: 3.58886
[800]	train's rmse: 3.6438	test's rmse: 3.58676
[900]	train's rmse: 3.63728	test's rmse: 3.58443
[1000]	train's rmse: 3.63035	test's rmse: 3.58262
[1100]	train's rmse: 3.62428	test's rmse: 3.58131
[1200]	train's rmse: 3.61834	test's rmse: 3.58009
[1300]	train's rmse: 3.61284	test's rmse: 3.57923
[1400]	train's rmse: 3.60748	test's rmse: 3.57842
[1500]	train's rmse: 3.60216	test's rmse: 3.57771
[1600]	train's rmse: 3.59704	test's rmse: 3.57707
[1700]	train's rmse: 3.59204	test's rmse: 3.57631
[1800]	train's rmse: 3.58681	test's rmse: 3.57588
[1900]	train's rmse: 3.58189	test's rmse: 3.57533
[2000]	train's rmse: 3.57705	test's rmse: 3.57517
[2100]	train's rmse: 3.57228	test's rmse: 3.57468
[2200]	train's rmse: 3.56755	test's rmse: 3.57448
[2300]	train's rmse: 3.56307	test's rmse: 3.57431
[2400]	

[1300]	train's rmse: 3.61284	test's rmse: 3.57923
[1400]	train's rmse: 3.60748	test's rmse: 3.57842
[1500]	train's rmse: 3.60216	test's rmse: 3.57771
[1600]	train's rmse: 3.59704	test's rmse: 3.57707
[1700]	train's rmse: 3.59204	test's rmse: 3.57631
[1800]	train's rmse: 3.58681	test's rmse: 3.57588
[1900]	train's rmse: 3.58189	test's rmse: 3.57533
[2000]	train's rmse: 3.57705	test's rmse: 3.57517
[2100]	train's rmse: 3.57228	test's rmse: 3.57468
[2200]	train's rmse: 3.56755	test's rmse: 3.57448
[2300]	train's rmse: 3.56307	test's rmse: 3.57431
[2400]	train's rmse: 3.55865	test's rmse: 3.57391
[2500]	train's rmse: 3.55417	test's rmse: 3.57361
[2600]	train's rmse: 3.54978	test's rmse: 3.57345
[2700]	train's rmse: 3.54567	test's rmse: 3.57314
[2800]	train's rmse: 3.54142	test's rmse: 3.57297
[2900]	train's rmse: 3.53674	test's rmse: 3.57261
[3000]	train's rmse: 3.53281	test's rmse: 3.5725
[3100]	train's rmse: 3.52887	test's rmse: 3.57255
[3200]	train's rmse: 3.5249	test's rmse: 3.57272
Ea

Fold  3 RMSE : 3.567730
Training until validation scores don't improve for 200 rounds
[100]	train's rmse: 3.73202	test's rmse: 3.81237
[200]	train's rmse: 3.68051	test's rmse: 3.77215
[300]	train's rmse: 3.6475	test's rmse: 3.75221
[400]	train's rmse: 3.6234	test's rmse: 3.74037
[500]	train's rmse: 3.60547	test's rmse: 3.73383
[600]	train's rmse: 3.59169	test's rmse: 3.72941
[700]	train's rmse: 3.57983	test's rmse: 3.72649
[800]	train's rmse: 3.56926	test's rmse: 3.7243
[900]	train's rmse: 3.55952	test's rmse: 3.7224
[1000]	train's rmse: 3.55065	test's rmse: 3.72115
[1100]	train's rmse: 3.54281	test's rmse: 3.72022
[1200]	train's rmse: 3.53472	test's rmse: 3.71958
[1300]	train's rmse: 3.52728	test's rmse: 3.71886
[1400]	train's rmse: 3.52069	test's rmse: 3.71847
[1500]	train's rmse: 3.51359	test's rmse: 3.71794
[1600]	train's rmse: 3.50691	test's rmse: 3.7173
[1700]	train's rmse: 3.50052	test's rmse: 3.717
[1800]	train's rmse: 3.49445	test's rmse: 3.7168
[1900]	train's rmse: 3.48871	te

[1500]	train's rmse: 3.51359	test's rmse: 3.71794
[1600]	train's rmse: 3.50691	test's rmse: 3.7173
[1700]	train's rmse: 3.50052	test's rmse: 3.717
[1800]	train's rmse: 3.49445	test's rmse: 3.7168
[1900]	train's rmse: 3.48871	test's rmse: 3.71656
[2000]	train's rmse: 3.48241	test's rmse: 3.71643
[2100]	train's rmse: 3.47698	test's rmse: 3.71612
[2200]	train's rmse: 3.47108	test's rmse: 3.716
[2300]	train's rmse: 3.46552	test's rmse: 3.71603
[2400]	train's rmse: 3.46009	test's rmse: 3.71592
[2500]	train's rmse: 3.4542	test's rmse: 3.71595
[2600]	train's rmse: 3.44866	test's rmse: 3.71585
[2700]	train's rmse: 3.44328	test's rmse: 3.71571
[2800]	train's rmse: 3.43778	test's rmse: 3.71558
[2900]	train's rmse: 3.43269	test's rmse: 3.71554
[3000]	train's rmse: 3.42706	test's rmse: 3.71567
[3100]	train's rmse: 3.42196	test's rmse: 3.71559
Early stopping, best iteration is:
[2904]	train's rmse: 3.43251	test's rmse: 3.71549
Fold  4 RMSE : 3.715490
Training until validation scores don't improve f

[500]	train's rmse: 3.57712	test's rmse: 3.74684
[600]	train's rmse: 3.56275	test's rmse: 3.74455
[700]	train's rmse: 3.5506	test's rmse: 3.74285
[800]	train's rmse: 3.53898	test's rmse: 3.74178
[900]	train's rmse: 3.52881	test's rmse: 3.74149
[1000]	train's rmse: 3.519	test's rmse: 3.74091
[1100]	train's rmse: 3.50997	test's rmse: 3.74056
[1200]	train's rmse: 3.50069	test's rmse: 3.74015
[1300]	train's rmse: 3.49199	test's rmse: 3.74031
Early stopping, best iteration is:
[1190]	train's rmse: 3.50157	test's rmse: 3.74009
Fold  1 RMSE : 3.740094
Training until validation scores don't improve for 200 rounds
[100]	train's rmse: 3.71138	test's rmse: 3.77711
[200]	train's rmse: 3.65739	test's rmse: 3.7424
[300]	train's rmse: 3.62479	test's rmse: 3.72642
[400]	train's rmse: 3.60232	test's rmse: 3.71954
[500]	train's rmse: 3.5847	test's rmse: 3.71558
[600]	train's rmse: 3.57032	test's rmse: 3.71279
[700]	train's rmse: 3.55729	test's rmse: 3.71068
[800]	train's rmse: 3.54532	test's rmse: 3.709

[3700]	train's rmse: 3.51324	test's rmse: 3.70593
Early stopping, best iteration is:
[3530]	train's rmse: 3.51833	test's rmse: 3.70591
Fold  2 RMSE : 3.705910
Training until validation scores don't improve for 200 rounds
[100]	train's rmse: 3.78093	test's rmse: 3.68998
[200]	train's rmse: 3.73909	test's rmse: 3.64865
[300]	train's rmse: 3.71417	test's rmse: 3.62583
[400]	train's rmse: 3.69762	test's rmse: 3.6128
[500]	train's rmse: 3.68561	test's rmse: 3.60486
[600]	train's rmse: 3.67665	test's rmse: 3.59952
[700]	train's rmse: 3.66893	test's rmse: 3.59562
[800]	train's rmse: 3.66262	test's rmse: 3.5928
[900]	train's rmse: 3.65646	test's rmse: 3.59068
[1000]	train's rmse: 3.65102	test's rmse: 3.58869
[1100]	train's rmse: 3.64536	test's rmse: 3.58694
[1200]	train's rmse: 3.64055	test's rmse: 3.58548
[1300]	train's rmse: 3.63606	test's rmse: 3.5843
[1400]	train's rmse: 3.63179	test's rmse: 3.58296
[1500]	train's rmse: 3.62738	test's rmse: 3.582
[1600]	train's rmse: 3.62301	test's rmse: 3

[1900]	train's rmse: 3.56796	test's rmse: 3.74134
[2000]	train's rmse: 3.5641	test's rmse: 3.74095
[2100]	train's rmse: 3.56005	test's rmse: 3.74076
[2200]	train's rmse: 3.55641	test's rmse: 3.74051
[2300]	train's rmse: 3.55286	test's rmse: 3.74044
[2400]	train's rmse: 3.54917	test's rmse: 3.74051
Early stopping, best iteration is:
[2280]	train's rmse: 3.55345	test's rmse: 3.7404
Fold  1 RMSE : 3.740397
Training until validation scores don't improve for 200 rounds
[100]	train's rmse: 3.75102	test's rmse: 3.80306
[200]	train's rmse: 3.70774	test's rmse: 3.76798
[300]	train's rmse: 3.68234	test's rmse: 3.74899
[400]	train's rmse: 3.665	test's rmse: 3.73807
[500]	train's rmse: 3.65272	test's rmse: 3.73108
[600]	train's rmse: 3.64328	test's rmse: 3.72628
[700]	train's rmse: 3.63524	test's rmse: 3.72273
[800]	train's rmse: 3.62805	test's rmse: 3.71991
[900]	train's rmse: 3.62167	test's rmse: 3.71797
[1000]	train's rmse: 3.61591	test's rmse: 3.7162
[1100]	train's rmse: 3.61042	test's rmse: 3

[900]	train's rmse: 3.65807	test's rmse: 3.5783
[1000]	train's rmse: 3.65208	test's rmse: 3.57691
[1100]	train's rmse: 3.64615	test's rmse: 3.57566
[1200]	train's rmse: 3.64119	test's rmse: 3.57463
[1300]	train's rmse: 3.63644	test's rmse: 3.57371
[1400]	train's rmse: 3.632	test's rmse: 3.57305
[1500]	train's rmse: 3.62749	test's rmse: 3.57248
[1600]	train's rmse: 3.62308	test's rmse: 3.57196
[1700]	train's rmse: 3.61916	test's rmse: 3.57148
[1800]	train's rmse: 3.61528	test's rmse: 3.57122
[1900]	train's rmse: 3.61136	test's rmse: 3.57085
[2000]	train's rmse: 3.60766	test's rmse: 3.57054
[2100]	train's rmse: 3.60401	test's rmse: 3.5704
[2200]	train's rmse: 3.60024	test's rmse: 3.57005
[2300]	train's rmse: 3.59659	test's rmse: 3.56983
[2400]	train's rmse: 3.59278	test's rmse: 3.56974
[2500]	train's rmse: 3.58946	test's rmse: 3.56959
[2600]	train's rmse: 3.58631	test's rmse: 3.5697
[2700]	train's rmse: 3.58296	test's rmse: 3.56963
[2800]	train's rmse: 3.57981	test's rmse: 3.56968
Early 

[1200]	train's rmse: 3.61904	test's rmse: 3.57185
[1300]	train's rmse: 3.61337	test's rmse: 3.57131
[1400]	train's rmse: 3.60772	test's rmse: 3.5705
[1500]	train's rmse: 3.60229	test's rmse: 3.57017
[1600]	train's rmse: 3.59715	test's rmse: 3.56992
[1700]	train's rmse: 3.59215	test's rmse: 3.56994
[1800]	train's rmse: 3.58755	test's rmse: 3.56946
[1900]	train's rmse: 3.58265	test's rmse: 3.56945
[2000]	train's rmse: 3.57792	test's rmse: 3.56914
[2100]	train's rmse: 3.57351	test's rmse: 3.5693
[2200]	train's rmse: 3.5691	test's rmse: 3.5692
Early stopping, best iteration is:
[2000]	train's rmse: 3.57792	test's rmse: 3.56914
Fold  5 RMSE : 3.569137
**************************************************
For min_data_in_leaf 22, max_depth 4 learning_rate 0.007 num_leaves 65
Starting LightGBM. Train shape: (201917, 200), test shape: (123623, 199)
Kfold
Training until validation scores don't improve for 200 rounds
[100]	train's rmse: 3.72062	test's rmse: 3.82871
[200]	train's rmse: 3.67805	test'

[2100]	train's rmse: 3.57351	test's rmse: 3.5693
[2200]	train's rmse: 3.5691	test's rmse: 3.5692
Early stopping, best iteration is:
[2000]	train's rmse: 3.57792	test's rmse: 3.56914
Fold  5 RMSE : 3.569137
**************************************************
For min_data_in_leaf 22, max_depth 5 learning_rate 0.005 num_leaves 55
Starting LightGBM. Train shape: (201917, 200), test shape: (123623, 199)
Kfold
Training until validation scores don't improve for 200 rounds
[100]	train's rmse: 3.72525	test's rmse: 3.83999
[200]	train's rmse: 3.67431	test's rmse: 3.79587
[300]	train's rmse: 3.64214	test's rmse: 3.77351
[400]	train's rmse: 3.6193	test's rmse: 3.76114
[500]	train's rmse: 3.6023	test's rmse: 3.75372
[600]	train's rmse: 3.58844	test's rmse: 3.74883
[700]	train's rmse: 3.57717	test's rmse: 3.74602
[800]	train's rmse: 3.56741	test's rmse: 3.74368
[900]	train's rmse: 3.55817	test's rmse: 3.74251
[1000]	train's rmse: 3.54956	test's rmse: 3.74157
[1100]	train's rmse: 3.54177	test's rmse: 

Training until validation scores don't improve for 200 rounds
[100]	train's rmse: 3.73628	test's rmse: 3.79534
[200]	train's rmse: 3.68506	test's rmse: 3.7583
[300]	train's rmse: 3.65241	test's rmse: 3.7391
[400]	train's rmse: 3.62988	test's rmse: 3.72786
[500]	train's rmse: 3.61281	test's rmse: 3.72142
[600]	train's rmse: 3.59918	test's rmse: 3.71744
[700]	train's rmse: 3.58667	test's rmse: 3.71424
[800]	train's rmse: 3.5762	test's rmse: 3.71212
[900]	train's rmse: 3.56672	test's rmse: 3.71062
[1000]	train's rmse: 3.5584	test's rmse: 3.70967
[1100]	train's rmse: 3.55018	test's rmse: 3.70889
[1200]	train's rmse: 3.54188	test's rmse: 3.70777
[1300]	train's rmse: 3.53416	test's rmse: 3.70723
[1400]	train's rmse: 3.5269	test's rmse: 3.70662
[1500]	train's rmse: 3.52036	test's rmse: 3.70636
[1600]	train's rmse: 3.51328	test's rmse: 3.70596
[1700]	train's rmse: 3.50692	test's rmse: 3.70587
[1800]	train's rmse: 3.50003	test's rmse: 3.70564
[1900]	train's rmse: 3.49363	test's rmse: 3.70526
[2

[1400]	train's rmse: 3.5283	test's rmse: 3.56937
[1500]	train's rmse: 3.52055	test's rmse: 3.56911
[1600]	train's rmse: 3.5127	test's rmse: 3.56894
[1700]	train's rmse: 3.50445	test's rmse: 3.56854
[1800]	train's rmse: 3.49739	test's rmse: 3.56858
[1900]	train's rmse: 3.48932	test's rmse: 3.56825
[2000]	train's rmse: 3.48193	test's rmse: 3.56794
[2100]	train's rmse: 3.47433	test's rmse: 3.56803
[2200]	train's rmse: 3.46681	test's rmse: 3.56804
[2300]	train's rmse: 3.45955	test's rmse: 3.56785
[2400]	train's rmse: 3.45247	test's rmse: 3.56798
Early stopping, best iteration is:
[2279]	train's rmse: 3.46112	test's rmse: 3.56778
Fold  3 RMSE : 3.567776
Training until validation scores don't improve for 200 rounds
[100]	train's rmse: 3.70785	test's rmse: 3.79221
[200]	train's rmse: 3.65336	test's rmse: 3.75496
[300]	train's rmse: 3.62023	test's rmse: 3.73864
[400]	train's rmse: 3.59847	test's rmse: 3.73067
[500]	train's rmse: 3.58136	test's rmse: 3.72613
[600]	train's rmse: 3.56753	test's r

In [3]:
print('Hyperparameter and its RMSE value For LGBM with Kfold')
print(tabulate([[20,4,0.005,55,3.5699]
               ,[20,4,0.005,65,3.5699]
               ,[20,4,0.007,55,3.5688]
               ,[20,4,0.007,65,3.5688]
               ,[20,5,0.005,55,3.5655]
               ,[20,5,0.005,65,3.5655]
               ,[20,5,0.007,55,3.5649]
               ,[20,5,0.007,65,3.5649]
                
               ,[22,4,0.005,55,3.5691]
               ,[22,4,0.005,65,3.5691]
               ,[22,4,0.007,55,3.5688]
               ,[22,4,0.007,65,3.5688]
               ,[22,5,0.005,55,3.5645]
               ,[22,5,0.005,65,3.5645]
               ,[22,5,0.007,55,3.5649]
               ,[22,5,0.007,65,3.5649]], headers=['min_data_in_leaf','max_depth', 'learning_rate','num_leaves','RMSE score'], tablefmt='pretty'))

print('Best Hyperparameter for LGBM with Kfold is min_data_in_leaf=22, max_depth=5, learning_rate=0.005, num_leaves=55')

Hyperparameter and its RMSE value For LGBM with Kfold
+------------------+-----------+---------------+------------+------------+
| min_data_in_leaf | max_depth | learning_rate | num_leaves | RMSE score |
+------------------+-----------+---------------+------------+------------+
|        20        |     4     |     0.005     |     55     |   3.5699   |
|        20        |     4     |     0.005     |     65     |   3.5699   |
|        20        |     4     |     0.007     |     55     |   3.5688   |
|        20        |     4     |     0.007     |     65     |   3.5688   |
|        20        |     5     |     0.005     |     55     |   3.5655   |
|        20        |     5     |     0.005     |     65     |   3.5655   |
|        20        |     5     |     0.007     |     55     |   3.5649   |
|        20        |     5     |     0.007     |     65     |   3.5649   |
|        22        |     4     |     0.005     |     55     |   3.5691   |
|        22        |     4     |     0.005    

In [17]:
num_folds = 11
params ={
        'task': 'train',
        'boosting': 'goss',
        'objective': 'regression',
        'metric': 'rmse',
        'learning_rate': 0.005,
        'max_depth': 5,
        'min_data_in_leaf': 22,
        'verbose': -1,
        'num_leaves':55
        }

lgbm_kfold_model = kfold_lightgbm(train_df, test_df,params, num_folds=num_folds, stratified=False,save=True,save_path='submission/lgbm_kfold.csv')

Starting LightGBM. Train shape: (201917, 200), test shape: (123623, 199)
Kfold
Training until validation scores don't improve for 200 rounds
[100]	train's rmse: 3.65734	test's rmse: 3.73545
[200]	train's rmse: 3.5807	test's rmse: 3.70041
[300]	train's rmse: 3.53455	test's rmse: 3.68677
[400]	train's rmse: 3.5024	test's rmse: 3.68202
[500]	train's rmse: 3.47748	test's rmse: 3.68017
[600]	train's rmse: 3.45692	test's rmse: 3.67835
[700]	train's rmse: 3.43808	test's rmse: 3.67749
[800]	train's rmse: 3.42057	test's rmse: 3.67715
[900]	train's rmse: 3.40346	test's rmse: 3.67651
[1000]	train's rmse: 3.38762	test's rmse: 3.67652
Early stopping, best iteration is:
[893]	train's rmse: 3.40449	test's rmse: 3.67638
Fold  1 RMSE : 3.676377
Training until validation scores don't improve for 200 rounds
[100]	train's rmse: 3.65316	test's rmse: 3.78586
[200]	train's rmse: 3.57758	test's rmse: 3.7513
[300]	train's rmse: 3.53143	test's rmse: 3.7411
[400]	train's rmse: 3.49952	test's rmse: 3.73662
[500]	

<img src='https://i.imgur.com/2lJPFVf.png'>

In [2]:
# Try with top 170 features
cols = list(lgbm_kfold_model[["feature", "importance"]].groupby("feature").mean().sort_values(by="importance", ascending=False)[:170].index)
cols.append('outliers')
cols.append('target')
lgbm_kfold = kfold_lightgbm(train_df[cols], test_df[cols[:170]],params, num_folds=num_folds, stratified=False,save=True,save_path='submission/lgbm_kfold_feature_submission.csv')

<img src='https://i.imgur.com/YF0BRCd.png'>

# Deep Learning

In [1]:
from sklearn.model_selection import train_test_split
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers

In [177]:
train, cv = train_test_split(train_df,test_size=0.25, random_state=100)

In [178]:
y_train = train['target'].values
train = train.drop(columns=['target'])
feats = [f for f in train.columns if f not in FEATS_EXCLUDED]
X_train = train[feats]
X_train.replace([-np.inf, np.inf], np.nan, inplace=True)
X_train = X_train.fillna(X_train.median())

y_cv = cv['target'].values
cv = cv.drop(columns=['target'])
X_cv = cv[feats]
X_cv.replace([-np.inf, np.inf], np.nan, inplace=True)
X_cv = X_cv.fillna(X_cv.median())

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  method=method,


In [179]:
X_train.shape,X_cv.shape,y_train.shape,y_cv.shape

((151437, 191), (50480, 191), (151437,), (50480,))

In [180]:
activation = 'relu'
batch_size = 256
epochs = 100
initilizer = tf.keras.initializers.HeNormal(seed = 100)
optimizer = tf.keras.optimizers.Adam(0.01)
kernel_regularizer = tf.keras.regularizers.L2(0.001)

In [181]:
model = keras.Sequential()
model.add(layers.Input(shape=(191,)))
model.add(layers.Dense(1024,kernel_initializer=initilizer, activation=activation))
model.add(layers.Dense(512,kernel_initializer=initilizer, activation=activation))
model.add(layers.Dense(512,kernel_initializer=initilizer, activation=activation))
model.add(layers.Dense(256,kernel_initializer=initilizer, activation=activation))
model.add(layers.Dense(256,kernel_initializer=initilizer, activation=activation))
model.add(layers.Dense(128,kernel_initializer=initilizer, activation=activation))
model.add(layers.Dense(64,kernel_initializer=initilizer, activation=activation))
model.add(layers.Dense(32,kernel_initializer=initilizer, activation=activation))
model.add(layers.Dense(16,kernel_initializer=initilizer, activation=activation))
model.add(layers.Dense(8,kernel_initializer=initilizer, activation=activation))
model.add(layers.Dense(4,kernel_initializer=initilizer, activation=activation))
model.add(layers.Dense(1, kernel_initializer=initilizer))
# Compile model
model.compile(loss='mean_squared_error',metrics=tf.keras.metrics.RootMeanSquaredError(), optimizer=optimizer)

In [182]:
model.fit(X_train,y_train,epochs=epochs,
          validation_data=(X_cv,y_cv),
          batch_size=batch_size,verbose=1,)

Epoch 1/30
Epoch 2/30
Epoch 3/30
Epoch 4/30
Epoch 5/30
Epoch 6/30
Epoch 7/30
Epoch 8/30
Epoch 9/30
Epoch 10/30
Epoch 11/30
Epoch 12/30
Epoch 13/30
Epoch 14/30
Epoch 15/30
Epoch 16/30
Epoch 17/30
Epoch 18/30
Epoch 19/30
Epoch 20/30
Epoch 21/30
Epoch 22/30
Epoch 23/30
Epoch 24/30
Epoch 25/30
Epoch 26/30
Epoch 27/30
Epoch 28/30
Epoch 29/30
Epoch 30/30


<tensorflow.python.keras.callbacks.History at 0x22b1d779448>

In [199]:
feats = [f for f in X_test.columns if f not in FEATS_EXCLUDED] # exclude some non numeric columns like dat
prediction = model.predict(X_test[feats])
prediction = np.reshape(prediction,(-1, ))
rows_dic = {'card_id': X_test.index.values, 'target': prediction}
df = pd.DataFrame(data=rows_dic)
df.to_csv('submission/deep_learning_submission.csv', index=False)

<img src='https://i.imgur.com/TSbudfH.png'>

# Conclusion:

### As we can see in table , XGBoost with Kfold  and LGBM with Kfold gives lowest  RMSE score 

In [5]:
print(tabulate([['XGBoost with stratify',3.650606]
               ,['XGBoost with Kfold',3.558708]
               ,['Decision Tree with stratify',3.762398]
               ,['Decision Tree with Kfold',3.702938]
               ,['Random Forest with Stratified',3.750282]
               ,['Random Forest with Kfold',3.695731]
               ,['Adaboost with Stratified',15.709615]
               ,['Adaboost with Kfold',16.433731]
               ,['LGBM with Stratified',3.645814]
               ,['LGBM with Kfold',3.564482]], headers=['Model','RMSE Score'], tablefmt='pretty'))

+-------------------------------+------------+
|             Model             | RMSE Score |
+-------------------------------+------------+
|     XGBoost with stratify     |  3.650606  |
|      XGBoost with Kfold       |  3.558708  |
|  Decision Tree with stratify  |  3.762398  |
|   Decision Tree with Kfold    |  3.702938  |
| Random Forest with Stratified |  3.750282  |
|   Random Forest with Kfold    |  3.695731  |
|   Adaboost with Stratified    | 15.709615  |
|      Adaboost with Kfold      | 16.433731  |
|     LGBM with Stratified      |  3.645814  |
|        LGBM with Kfold        |  3.564482  |
+-------------------------------+------------+
