## Kaggle Entry on ELO

See the previous notebook for exploratory data analysis (EDA) on the same dataset.

This notebook covers feature engineering and modelling.




In [15]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import matplotlib.pyplot as plt
import seaborn as sns
import lightgbm as lgb
from sklearn.linear_model import Ridge
import time
from sklearn import preprocessing
import warnings
import datetime
warnings.filterwarnings("ignore")
import gc
from tqdm import tqdm

from scipy.stats import describe
%matplotlib inline

from sklearn.model_selection import KFold
from sklearn.metrics import mean_squared_error
import xgboost as xgb
# Any results you write to the current directory are saved as output

In [16]:
def reduce_mem_usage(df, verbose=True):
    """Downcast numeric columns of ``df`` to the narrowest dtype covering their range.

    Columns whose dtype is one of int16/int32/int64/float16/float32/float64 are
    inspected; every other column (strings, datetimes, int8, unsigned ints, ...)
    is left untouched. The frame is modified in place and also returned.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to shrink.
    verbose : bool, default True
        When true, print the final memory footprint and the percentage saved.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object, with downcast columns.

    Notes
    -----
    Only the min/max range is checked, so casting floats to float16/float32
    can reduce precision.
    """
    numerics = ['int16', 'int32', 'int64', 'float16', 'float32', 'float64']
    int_candidates = (np.int8, np.int16, np.int32, np.int64)
    float_candidates = (np.float16, np.float32)

    start_mem = df.memory_usage().sum() / 1024**2
    for col in df.columns:
        col_type = df[col].dtypes
        if col_type not in numerics:
            continue

        c_min = df[col].min()
        c_max = df[col].max()
        if str(col_type)[:3] == 'int':
            for candidate in int_candidates:
                limits = np.iinfo(candidate)
                # Inclusive bounds: a value equal to the dtype's min/max still fits.
                if limits.min <= c_min and c_max <= limits.max:
                    df[col] = df[col].astype(candidate)
                    break
        else:
            for candidate in float_candidates:
                limits = np.finfo(candidate)
                if limits.min <= c_min and c_max <= limits.max:
                    df[col] = df[col].astype(candidate)
                    break
            else:
                # Out of float32 range, or min/max is NaN/inf: keep full precision.
                df[col] = df[col].astype(np.float64)

    end_mem = df.memory_usage().sum() / 1024**2
    if verbose:
        # Guard the ratio so a zero-byte baseline cannot raise ZeroDivisionError.
        reduction = 100 * (start_mem - end_mem) / start_mem if start_mem else 0.0
        print('Mem. usage decreased to {:5.2f} Mb ({:.1f}% reduction)'.format(end_mem, reduction))
    return df

In [17]:
#Loading Train and Test Data
# A forward slash is used as the directory separator: it is accepted on
# Windows as well as Linux/macOS, whereas a hard-coded backslash only works
# on Windows.
path = "data/"
train = pd.read_csv(path+"train.csv", parse_dates=["first_active_month"])
test = pd.read_csv(path+"test.csv", parse_dates=["first_active_month"])
print("{} observations and {} features in train set.".format(train.shape[0],train.shape[1]))
print("{} observations and {} features in test set.".format(test.shape[0],test.shape[1]))

201917 observations and 6 features in train set.
123623 observations and 5 features in test set.


### Feature Engineering

In [18]:
# Make sure first_active_month is a datetime column in both frames.
train['first_active_month'] = pd.to_datetime(train['first_active_month'])
test['first_active_month'] = pd.to_datetime(test['first_active_month'])

# Days between each card's first active month and a fixed reference date.
# The subtraction stays in datetime64/timedelta64 space, so there is no
# detour through Python `date` objects and `.dt.days` always sees a real
# timedelta column. Missing first_active_month values yield NaN.
reference_date = pd.Timestamp(2018, 2, 1)
train['elapsed_time'] = (reference_date - train['first_active_month']).dt.days
test['elapsed_time'] = (reference_date - test['first_active_month']).dt.days

# Split the label off so `train` only holds ids and features from here on.
target = train['target']
del train['target']

train.head()

Unnamed: 0,first_active_month,card_id,feature_1,feature_2,feature_3,elapsed_time
0,2017-06-01,C_ID_92a2005557,5,2,1,245
1,2017-01-01,C_ID_3d0044924f,4,1,0,396
2,2016-08-01,C_ID_d639edf6cd,2,2,0,549
3,2017-09-01,C_ID_186d6a6901,4,3,0,153
4,2017-11-01,C_ID_cdbd2c0db2,1,3,0,92


In [19]:
# Show the dtype of every column in the train frame.
train.dtypes

first_active_month    datetime64[ns]
card_id                       object
feature_1                      int64
feature_2                      int64
feature_3                      int64
elapsed_time                   int64
dtype: object

In [20]:
# Read both transaction tables, parsing purchase_date as a datetime.
new_transactions, historical_transactions = (
    pd.read_csv(path + csv_name, parse_dates=['purchase_date'])
    for csv_name in ('new_merchant_transactions.csv', 'historical_transactions.csv')
)

In [21]:
# Preview the first rows of the new-merchant transactions table.
new_transactions.head()

Unnamed: 0,authorized_flag,card_id,city_id,category_1,installments,category_3,merchant_category_id,merchant_id,month_lag,purchase_amount,purchase_date,category_2,state_id,subsector_id
0,Y,C_ID_415bb3a509,107,N,1,B,307,M_ID_b0c793002c,1,-0.557574,2018-03-11 14:57:36,1.0,9,19
1,Y,C_ID_415bb3a509,140,N,1,B,307,M_ID_88920c89e8,1,-0.56958,2018-03-19 18:53:37,1.0,9,19
2,Y,C_ID_415bb3a509,330,N,1,B,507,M_ID_ad5237ef6b,2,-0.551037,2018-04-26 14:08:44,1.0,9,14
3,Y,C_ID_415bb3a509,-1,Y,1,B,661,M_ID_9e84cda3b1,1,-0.671925,2018-03-07 09:43:21,,-1,8
4,Y,C_ID_ef55cf8d4b,-1,Y,1,B,166,M_ID_3c86fa3831,1,-0.659904,2018-03-22 21:07:53,,-1,29


In [22]:
#turn Y/N in these two columns into 1/0
def binarize(df):
    """Encode the 'Y'/'N' flags in ``authorized_flag`` and ``category_1`` as 1/0.

    The frame is modified in place and returned. A column that is already
    numeric is skipped, so calling the function again on an encoded frame
    (for example when a notebook cell is re-run) leaves it unchanged instead
    of mapping the 1/0 values to NaN.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the columns ``authorized_flag`` and ``category_1``.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object.
    """
    flag_map = {'Y': 1, 'N': 0}
    for col in ['authorized_flag', 'category_1']:
        if pd.api.types.is_numeric_dtype(df[col]):
            # Already encoded; mapping again would turn every value into NaN.
            continue
        df[col] = df[col].map(flag_map)
    return df

In [23]:
# Encode the Y/N flag columns of both transaction tables as 1/0.
historical_transactions, new_transactions = map(
    binarize, (historical_transactions, new_transactions)
)

In [24]:
# Preview the first rows of the historical transactions table.
historical_transactions.head()

Unnamed: 0,authorized_flag,card_id,city_id,category_1,installments,category_3,merchant_category_id,merchant_id,month_lag,purchase_amount,purchase_date,category_2,state_id,subsector_id
0,1,C_ID_4e6213e9bc,88,0,0,A,80,M_ID_e020e9b302,-8,-0.703331,2017-06-25 15:33:07,1.0,16,37
1,1,C_ID_4e6213e9bc,88,0,0,A,367,M_ID_86ec983688,-7,-0.733128,2017-07-15 12:10:45,1.0,16,16
2,1,C_ID_4e6213e9bc,88,0,0,A,80,M_ID_979ed661fc,-6,-0.720386,2017-08-09 22:04:29,1.0,16,37
3,1,C_ID_4e6213e9bc,88,0,0,A,560,M_ID_e6d5ae8ea6,-5,-0.735352,2017-09-02 10:06:26,1.0,16,34
4,1,C_ID_4e6213e9bc,88,0,0,A,80,M_ID_e020e9b302,-11,-0.722865,2017-03-10 01:14:19,1.0,16,37


In [25]:
# Show the dtype of every column in the historical transactions table.
historical_transactions.dtypes

authorized_flag                  int64
card_id                         object
city_id                          int64
category_1                       int64
installments                     int64
category_3                      object
merchant_category_id             int64
merchant_id                     object
month_lag                        int64
purchase_amount                float64
purchase_date           datetime64[ns]
category_2                     float64
state_id                         int64
subsector_id                     int64
dtype: object

In [26]:
# One-hot encode category_2 / category_3, then downcast the numeric columns
# of each transaction table to save memory.
onehot_cols = ['category_2', 'category_3']

historical_transactions = reduce_mem_usage(
    pd.get_dummies(historical_transactions, columns=onehot_cols)
)
new_transactions = reduce_mem_usage(
    pd.get_dummies(new_transactions, columns=onehot_cols)
)

Mem. usage decreased to 1304.89 Mb (54.8% reduction)
Mem. usage decreased to 84.24 Mb (56.7% reduction)


In [27]:
# Show the column dtypes of the historical transactions table again.
historical_transactions.dtypes

authorized_flag                   int8
card_id                         object
city_id                          int16
category_1                        int8
installments                     int16
merchant_category_id             int16
merchant_id                     object
month_lag                         int8
purchase_amount                float32
purchase_date           datetime64[ns]
state_id                          int8
subsector_id                      int8
category_2_1.0                   uint8
category_2_2.0                   uint8
category_2_3.0                   uint8
category_2_4.0                   uint8
category_2_5.0                   uint8
category_3_A                     uint8
category_3_B                     uint8
category_3_C                     uint8
dtype: object

In [28]:
# Preview the first rows of the historical transactions table again.
historical_transactions.head()

Unnamed: 0,authorized_flag,card_id,city_id,category_1,installments,merchant_category_id,merchant_id,month_lag,purchase_amount,purchase_date,state_id,subsector_id,category_2_1.0,category_2_2.0,category_2_3.0,category_2_4.0,category_2_5.0,category_3_A,category_3_B,category_3_C
0,1,C_ID_4e6213e9bc,88,0,0,80,M_ID_e020e9b302,-8,-0.703331,2017-06-25 15:33:07,16,37,1,0,0,0,0,1,0,0
1,1,C_ID_4e6213e9bc,88,0,0,367,M_ID_86ec983688,-7,-0.733128,2017-07-15 12:10:45,16,16,1,0,0,0,0,1,0,0
2,1,C_ID_4e6213e9bc,88,0,0,80,M_ID_979ed661fc,-6,-0.720386,2017-08-09 22:04:29,16,37,1,0,0,0,0,1,0,0
3,1,C_ID_4e6213e9bc,88,0,0,560,M_ID_e6d5ae8ea6,-5,-0.735352,2017-09-02 10:06:26,16,34,1,0,0,0,0,1,0,0
4,1,C_ID_4e6213e9bc,88,0,0,80,M_ID_e020e9b302,-11,-0.722865,2017-03-10 01:14:19,16,37,1,0,0,0,0,1,0,0


In [29]:
# Per card: number (sum) and share (mean) of transactions with authorized_flag set.
agg_fun = {'authorized_flag': ['sum', 'mean']}
auth_mean = (
    historical_transactions
    .groupby(['card_id'])
    .agg(agg_fun)
)

In [30]:
#rename the columns
# Flatten the two-level (column, statistic) header into "column_statistic"
# names and move card_id out of the index. The MultiIndex check makes the
# cell safe to re-run: on an already flattened frame the join would operate
# on the characters of each name and reset_index would add an extra column.
if isinstance(auth_mean.columns, pd.MultiIndex):
    auth_mean.columns = ['_'.join(col).strip() for col in auth_mean.columns.values]
    auth_mean.reset_index(inplace=True)

In [31]:
# Split the history by authorization status. Note that after this cell
# `historical_transactions` only holds the rows with authorized_flag == 0;
# the rows with authorized_flag == 1 live in `authorized_transactions`.
auth_flag = historical_transactions['authorized_flag']
authorized_transactions = historical_transactions[auth_flag == 1]
historical_transactions = historical_transactions[auth_flag == 0]

In [32]:
# Extract the calendar month of each purchase as a separate column.
for transactions in (historical_transactions, authorized_transactions, new_transactions):
    transactions['purchase_month'] = transactions['purchase_date'].dt.month

In [33]:
def aggregate_transactions(history):
    """Aggregate a transaction table to one row of summary statistics per card.

    ``purchase_date`` is converted to seconds since the Unix epoch (float) so
    that range/min/max can be computed on it numerically. The conversion is
    applied to a shallow copy, so the caller's frame keeps its datetime
    column and the function can be called repeatedly on the same input.

    Parameters
    ----------
    history : pandas.DataFrame
        Transactions with a ``card_id`` column, a datetime ``purchase_date``
        column and every column named in ``agg_func`` below.

    Returns
    -------
    pandas.DataFrame
        One row per ``card_id`` with ``transactions_count`` followed by the
        flattened ``<column>_<statistic>`` aggregates.
    """
    # Shallow copy: column data is shared, but assigning a new purchase_date
    # below only affects this local frame.
    history = history.copy(deep=False)

    # Normalise to nanosecond resolution before taking the integer view so
    # that the 1e-9 factor really yields seconds.
    purchase_ns = (pd.DatetimeIndex(history['purchase_date'])
                   .astype('datetime64[ns]')
                   .astype(np.int64))
    history['purchase_date'] = np.asarray(purchase_ns) * 1e-9

    agg_func = {
        'category_1': ['sum', 'mean'],
        'category_2_1.0': ['mean'],
        'category_2_2.0': ['mean'],
        'category_2_3.0': ['mean'],
        'category_2_4.0': ['mean'],
        'category_2_5.0': ['mean'],
        'category_3_A': ['mean'],
        'category_3_B': ['mean'],
        'category_3_C': ['mean'],
        'merchant_id': ['nunique'],
        'merchant_category_id': ['nunique'],
        'state_id': ['nunique'],
        'city_id': ['nunique'],
        'subsector_id': ['nunique'],
        'purchase_amount': ['sum', 'mean', 'max', 'min', 'std'],
        'installments': ['sum', 'mean', 'max', 'min', 'std'],
        'purchase_month': ['mean', 'max', 'min', 'std'],
        'purchase_date': [np.ptp, 'min', 'max'],
        'month_lag': ['min', 'max']
        }

    agg_history = history.groupby(['card_id']).agg(agg_func)
    # Flatten the (column, statistic) header into "column_statistic".
    agg_history.columns = ['_'.join(col).strip() for col in agg_history.columns.values]
    agg_history.reset_index(inplace=True)

    # Number of transactions per card, placed in front of the other aggregates.
    df = (history.groupby('card_id')
          .size()
          .reset_index(name='transactions_count'))

    agg_history = pd.merge(df, agg_history, on='card_id', how='left')

    return agg_history

In [34]:
# Aggregate the non-authorized history and tag its feature columns with "hist_".
history = aggregate_transactions(historical_transactions)
history = history.rename(columns=lambda name: name if name == 'card_id' else 'hist_' + name)
history.head()

Unnamed: 0,card_id,hist_transactions_count,hist_category_1_sum,hist_category_1_mean,hist_category_2_1.0_mean,hist_category_2_2.0_mean,hist_category_2_3.0_mean,hist_category_2_4.0_mean,hist_category_2_5.0_mean,hist_category_3_A_mean,...,hist_installments_std,hist_purchase_month_mean,hist_purchase_month_max,hist_purchase_month_min,hist_purchase_month_std,hist_purchase_date_ptp,hist_purchase_date_min,hist_purchase_date_max,hist_month_lag_min,hist_month_lag_max
0,C_ID_00007093c1,35,4.0,0.114286,0.0,0.0,0.885714,0.0,0.0,0.0,...,0.667367,5.914286,12,1,3.071419,28858113.0,1489250000.0,1518108000.0,-11,0
1,C_ID_0001238066,3,0.0,0.0,0.333333,0.0,0.0,0.0,0.666667,0.0,...,0.0,8.666667,12,2,5.773503,3609150.0,1514660000.0,1518269000.0,-2,0
2,C_ID_0001506ef0,4,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.75,...,0.5,5.5,12,2,4.725816,22098875.0,1496772000.0,1518871000.0,-8,0
3,C_ID_0001793786,27,2.0,0.074074,0.111111,0.296296,0.111111,0.0,0.0,0.814815,...,0.395847,6.481481,9,3,2.375684,16780236.0,1488636000.0,1505416000.0,-7,-1
4,C_ID_000183fdda,7,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,...,2.288689,8.285714,10,8,0.755929,6701589.0,1502099000.0,1508801000.0,-6,-4


In [35]:
# Aggregate the authorized transactions and tag their feature columns with "auth_".
authorized = aggregate_transactions(authorized_transactions)
authorized = authorized.rename(columns=lambda name: name if name == 'card_id' else 'auth_' + name)
authorized.head()

Unnamed: 0,card_id,auth_transactions_count,auth_category_1_sum,auth_category_1_mean,auth_category_2_1.0_mean,auth_category_2_2.0_mean,auth_category_2_3.0_mean,auth_category_2_4.0_mean,auth_category_2_5.0_mean,auth_category_3_A_mean,...,auth_installments_std,auth_purchase_month_mean,auth_purchase_month_max,auth_purchase_month_min,auth_purchase_month_std,auth_purchase_date_ptp,auth_purchase_date_min,auth_purchase_date_max,auth_month_lag_min,auth_month_lag_max
0,C_ID_00007093c1,114,24.0,0.210526,0.0,0.0,0.780702,0.0,0.008772,0.0,...,0.795159,6.517544,12,1,3.37149,32627654.0,1487081000.0,1519708000.0,-12,0
1,C_ID_0001238066,120,2.0,0.016667,0.783333,0.0,0.0,0.0,0.15,0.0,...,1.50105,7.275,12,1,4.895483,13110825.0,1506638000.0,1519748000.0,-5,0
2,C_ID_0001506ef0,62,0.0,0.0,0.032258,0.0,0.967742,0.0,0.0,1.0,...,0.0,6.887097,12,1,4.538017,34460275.0,1484411000.0,1518871000.0,-13,0
3,C_ID_0001793786,189,0.0,0.0,0.042328,0.359788,0.063492,0.0,0.0,1.0,...,0.0,6.698413,10,1,2.301491,24487497.0,1484994000.0,1509481000.0,-9,0
4,C_ID_000183fdda,137,4.0,0.029197,0.051095,0.007299,0.905109,0.0,0.007299,0.0,...,2.108912,6.810219,12,1,4.538289,15148616.0,1504444000.0,1519592000.0,-5,0


In [36]:
# Aggregate the new-merchant transactions and tag their feature columns with "new_".
new = aggregate_transactions(new_transactions)
new = new.rename(columns=lambda name: name if name == 'card_id' else 'new_' + name)
new.head()

Unnamed: 0,card_id,new_transactions_count,new_category_1_sum,new_category_1_mean,new_category_2_1.0_mean,new_category_2_2.0_mean,new_category_2_3.0_mean,new_category_2_4.0_mean,new_category_2_5.0_mean,new_category_3_A_mean,...,new_installments_std,new_purchase_month_mean,new_purchase_month_max,new_purchase_month_min,new_purchase_month_std,new_purchase_date_ptp,new_purchase_date_min,new_purchase_date_max,new_month_lag_min,new_month_lag_max
0,C_ID_00007093c1,2,0,0.0,0.5,0.0,0.5,0.0,0.0,0.0,...,0.0,4.0,4,4,0.0,537024.0,1522754000.0,1523291000.0,2,2
1,C_ID_0001238066,26,2,0.076923,0.769231,0.0,0.0,0.0,0.115385,0.0,...,2.079941,3.346154,4,3,0.485165,5195343.0,1519923000.0,1525118000.0,1,2
2,C_ID_0001506ef0,2,0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,...,0.0,3.0,3,3,0.0,471152.0,1521239000.0,1521710000.0,1,1
3,C_ID_0001793786,31,0,0.0,0.483871,0.258065,0.16129,0.0,0.032258,1.0,...,0.0,11.322581,12,11,0.475191,3981096.0,1510761000.0,1514742000.0,1,2
4,C_ID_000183fdda,11,0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,...,1.29334,3.272727,4,3,0.467099,5106807.0,1519994000.0,1525100000.0,1,2


In [37]:
def aggregate_per_month(history):
    """Summarise each card's month-by-month purchase behaviour.

    First computes count/sum/mean/min/max/std of ``purchase_amount`` and
    ``installments`` for every (card_id, month_lag) pair, then takes the mean
    and std of those monthly figures (and of ``month_lag`` itself) per card.

    Returns a frame with one row per ``card_id`` and flattened
    ``<column>_<statistic>`` column names.
    """
    def _flatten(frame):
        # Collapse the two-level header and turn the group keys into columns.
        frame.columns = ['_'.join(parts).strip() for parts in frame.columns.values]
        return frame.reset_index()

    monthly_stats = ['count', 'sum', 'mean', 'min', 'max', 'std']
    per_month = _flatten(
        history
        .groupby(['card_id', 'month_lag'])
        .agg({'purchase_amount': monthly_stats, 'installments': monthly_stats})
    )

    per_card = per_month.groupby('card_id').agg(['mean', 'std'])
    return _flatten(per_card)


In [38]:
# Per-month aggregates computed on `historical_transactions`.
final_group = aggregate_per_month(historical_transactions)
final_group.head(10)

Unnamed: 0,card_id,month_lag_mean,month_lag_std,purchase_amount_count_mean,purchase_amount_count_std,purchase_amount_sum_mean,purchase_amount_sum_std,purchase_amount_mean_mean,purchase_amount_mean_std,purchase_amount_min_mean,...,installments_sum_mean,installments_sum_std,installments_mean_mean,installments_mean_std,installments_min_mean,installments_min_std,installments_max_mean,installments_max_std,installments_std_mean,installments_std_std
0,C_ID_00007093c1,-5.5,3.605551,2.916667,1.621354,-1.200164,0.999049,-0.431906,0.260478,-0.613764,...,3.75,3.107908,1.176389,0.326634,1.0,0.0,1.5,0.797724,0.355262,0.480551
1,C_ID_0001238066,-1.0,1.414214,1.5,0.707107,-0.894964,0.42189,-0.596643,0.0,-0.596643,...,1.5,0.707107,1.0,0.0,1.0,0.0,1.0,0.0,0.0,
2,C_ID_0001506ef0,-3.333333,4.163332,1.333333,0.57735,-0.968594,0.395645,-0.73102,0.015849,-0.732397,...,0.333333,0.57735,0.166667,0.288675,0.0,0.0,0.333333,0.57735,0.707107,
3,C_ID_0001793786,-3.833333,2.316607,4.5,1.974842,-1.97089,2.037261,-0.423501,0.392159,-0.676499,...,0.833333,0.983192,0.291667,0.40052,0.166667,0.408248,0.5,0.547723,0.179558,0.279244
4,C_ID_000183fdda,-5.0,1.414214,3.5,3.535534,-0.47069,0.344856,-0.376174,0.478523,-0.579708,...,9.5,12.020815,2.0,1.414214,1.0,0.0,3.5,3.535534,2.366432,
5,C_ID_00024e244b,-5.0,4.0,2.428571,1.397276,-1.186659,1.21081,-0.558085,0.326273,-0.607249,...,0.714286,1.112697,0.297619,0.419041,0.142857,0.377964,0.428571,0.534522,0.269338,0.312603
6,C_ID_0002709b5a,-7.25,5.057997,1.5,1.0,-0.929675,0.699316,-0.604163,0.156637,-0.616865,...,2.75,2.362908,2.25,2.5,2.25,2.5,2.25,2.5,0.0,
7,C_ID_00027503e2,-4.0,2.915476,3.6,1.516575,-2.669652,1.128393,-0.741249,0.002457,-0.742989,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
8,C_ID_000298032a,-6.5,3.535534,1.0,0.0,-0.617958,0.093896,-0.617958,0.093896,-0.617958,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,,
9,C_ID_0002ba3c2e,-3.666667,3.559026,2.5,1.516575,-1.547448,0.910264,-0.641704,0.08043,-0.655965,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [43]:
# Preview the first rows of the per-month aggregate features.
final_group.head()

Unnamed: 0,card_id,month_lag_mean,month_lag_std,purchase_amount_count_mean,purchase_amount_count_std,purchase_amount_sum_mean,purchase_amount_sum_std,purchase_amount_mean_mean,purchase_amount_mean_std,purchase_amount_min_mean,...,installments_sum_mean,installments_sum_std,installments_mean_mean,installments_mean_std,installments_min_mean,installments_min_std,installments_max_mean,installments_max_std,installments_std_mean,installments_std_std
0,C_ID_00007093c1,-5.5,3.605551,2.916667,1.621354,-1.200164,0.999049,-0.431906,0.260478,-0.613764,...,3.75,3.107908,1.176389,0.326634,1.0,0.0,1.5,0.797724,0.355262,0.480551
1,C_ID_0001238066,-1.0,1.414214,1.5,0.707107,-0.894964,0.42189,-0.596643,0.0,-0.596643,...,1.5,0.707107,1.0,0.0,1.0,0.0,1.0,0.0,0.0,
2,C_ID_0001506ef0,-3.333333,4.163332,1.333333,0.57735,-0.968594,0.395645,-0.73102,0.015849,-0.732397,...,0.333333,0.57735,0.166667,0.288675,0.0,0.0,0.333333,0.57735,0.707107,
3,C_ID_0001793786,-3.833333,2.316607,4.5,1.974842,-1.97089,2.037261,-0.423501,0.392159,-0.676499,...,0.833333,0.983192,0.291667,0.40052,0.166667,0.408248,0.5,0.547723,0.179558,0.279244
4,C_ID_000183fdda,-5.0,1.414214,3.5,3.535534,-0.47069,0.344856,-0.376174,0.478523,-0.579708,...,9.5,12.020815,2.0,1.414214,1.0,0.0,3.5,3.535534,2.366432,


In [None]:
# Left-join every per-card feature table onto train and test, in this order,
# so that all cards in train/test are kept.
for card_features in (history, authorized, new, final_group, auth_mean):
    train = pd.merge(train, card_features, on='card_id', how='left')
    test = pd.merge(test, card_features, on='card_id', how='left')

In [None]:
# Model inputs: every column except the card identifier and the raw date.
non_feature_cols = ('card_id', 'first_active_month')
features = [col for col in train.columns if col not in non_feature_cols]
# Columns whose name contains "feature_".
categorical_feats = [col for col in features if 'feature_' in col]

In [None]:
# Report (rows, columns) of the merged train and test frames.
for merged in (train, test):
    print(merged.shape)

In [79]:
# Persist the engineered frames and the target to the current directory.
for table, csv_name in ((train, 'train.csv'), (test, 'test.csv'), (target, 'target.csv')):
    table.to_csv(csv_name, index=False)

In [81]:
# List the column names of the merged train frame.
train.columns

Index(['first_active_month', 'card_id', 'feature_1', 'feature_2', 'feature_3',
       'elapsed_time', 'hist_transactions_count', 'hist_category_1_sum',
       'hist_category_1_mean', 'hist_category_2_1.0_mean',
       ...
       'installments_mean_mean', 'installments_mean_std',
       'installments_min_mean', 'installments_min_std',
       'installments_max_mean', 'installments_max_std',
       'installments_std_mean', 'installments_std_std', 'authorized_flag_sum',
       'authorized_flag_mean'],
      dtype='object', length=139)

### Modelling

In [82]:
# Number of cross-validation folds. Set to 5 to agree with the splitter the
# training cell constructs (KFold(n_splits=5, ...)); the earlier value of 10
# was never used for splitting and only sized the feature-importance buffer.
nfolds = 5
folds = KFold(n_splits= nfolds, shuffle=True, random_state=15)

In [83]:
#settings for lightgbm
# Note: `min_child_samples` is an alias of `min_data_in_leaf` in LightGBM, so
# only one of the two may be given. The alias entry (previously 100) has been
# dropped and the primary name is kept, so the dict no longer carries two
# conflicting values for the same setting.
param = {'num_leaves': 50,
         'min_data_in_leaf': 30,
         'objective':'regression',
         'max_depth': 10,
         'learning_rate': 0.005,
         "boosting": "gbdt",
         "feature_fraction": 0.9,
         "bagging_freq": 1,
         "bagging_fraction": 0.9 ,
         "bagging_seed": 11,
         "metric": 'rmse',
         "lambda_l1": 0.1,
         "verbosity": -1}

In [84]:
# Buffers allocated here but not filled anywhere in this cell.
feature_importance_df = np.zeros((train.shape[1], nfolds))
mvalid = np.zeros(len(train))
mfull = np.zeros(len(test))

# 5-fold cross-validated LightGBM: out-of-fold predictions for train and
# fold-averaged predictions for test.
folds = KFold(n_splits=5, shuffle=True, random_state=15)
oof_lgb = np.zeros(len(train))
predictions_lgb = np.zeros(len(test))

# Loop invariants: the model inputs and the boosting-round ceiling
# (early stopping decides the actual number of rounds per fold).
train_features = train[features]
test_features = test[features]
num_round = 10000

# KFold only needs the number of rows, so the frame is passed directly
# instead of materialising `train.values` (a large object array, because the
# frame mixes strings, datetimes and numbers). The resulting splits are the same.
for fold_, (trn_idx, val_idx) in enumerate(folds.split(train)):
    print('-')
    print("Fold {}".format(fold_ + 1))
    trn_data = lgb.Dataset(train_features.iloc[trn_idx], label=target.iloc[trn_idx])
    val_data = lgb.Dataset(train_features.iloc[val_idx], label=target.iloc[val_idx])

    clf = lgb.train(param, trn_data, num_round, valid_sets = [trn_data, val_data], verbose_eval=100, early_stopping_rounds=200)
    # Predict with the best iteration found by early stopping.
    oof_lgb[val_idx] = clf.predict(train_features.iloc[val_idx], num_iteration=clf.best_iteration)
    predictions_lgb += clf.predict(test_features, num_iteration=clf.best_iteration) / folds.n_splits

np.save('oof_lgb', oof_lgb)
np.save('predictions_lgb', predictions_lgb)
# Cross-validated RMSE over all out-of-fold predictions.
np.sqrt(mean_squared_error(target.values, oof_lgb))


-
Fold 1
Training until validation scores don't improve for 200 rounds.
[100]	training's rmse: 3.71607	valid_1's rmse: 3.79195
[200]	training's rmse: 3.63982	valid_1's rmse: 3.74776
[300]	training's rmse: 3.58686	valid_1's rmse: 3.72418
[400]	training's rmse: 3.54636	valid_1's rmse: 3.71121
[500]	training's rmse: 3.51593	valid_1's rmse: 3.70345
[600]	training's rmse: 3.48859	valid_1's rmse: 3.69791
[700]	training's rmse: 3.46646	valid_1's rmse: 3.69348
[800]	training's rmse: 3.44639	valid_1's rmse: 3.69038
[900]	training's rmse: 3.42703	valid_1's rmse: 3.68837
[1000]	training's rmse: 3.40997	valid_1's rmse: 3.68687
[1100]	training's rmse: 3.3935	valid_1's rmse: 3.68564
[1200]	training's rmse: 3.37815	valid_1's rmse: 3.68485
[1300]	training's rmse: 3.36295	valid_1's rmse: 3.68429
[1400]	training's rmse: 3.34833	valid_1's rmse: 3.68384
[1500]	training's rmse: 3.33491	valid_1's rmse: 3.68384
[1600]	training's rmse: 3.32205	valid_1's rmse: 3.68363
[1700]	training's rmse: 3.30925	valid_1's 

3.66931138885437

In [89]:
#ximp = pd.DataFrame()
#ximp['feature'] = train.columns
#ximp['importance'] = feature_importance_df.mean(axis = 1)

#plt.figure(figsize=(14,14))
#sns.barplot(x="importance",
#            y="feature",
#            data=ximp.sort_values(by="importance",
#                                           ascending=False))
#plt.title('LightGBM Features (avg over folds)')
#plt.tight_layout()

In [86]:
# Build the submission table (card id + predicted target) and write it out.
xsub = pd.DataFrame({
    'card_id': test['card_id'],
    'target': predictions_lgb,
})
xsub.to_csv('sub_lgb.csv', index = False)

In [87]:
# Preview the first rows of the submission instead of displaying the whole frame.
xsub.head(10)

Unnamed: 0,card_id,target
0,C_ID_0ab67a22ab,-2.597241
1,C_ID_130fd0cbdd,-0.267385
2,C_ID_b709037bc5,-1.019097
3,C_ID_d27d835a9f,-0.189081
4,C_ID_2b5e3df5c2,-1.163584
5,C_ID_5814b4f13c,0.208605
6,C_ID_a1b3c75277,-0.025357
7,C_ID_f7cada36d3,0.350860
8,C_ID_9d2bc8dfc4,-0.850134
9,C_ID_6d8dba8475,-0.780043
