In [1]:
import gc
from datetime import datetime, timedelta,date
import warnings
import itertools
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
from sklearn.random_projection import GaussianRandomProjection
from sklearn.random_projection import SparseRandomProjection,johnson_lindenstrauss_min_dim
from sklearn.decomposition import PCA, FastICA,NMF,LatentDirichletAllocation,IncrementalPCA,MiniBatchSparsePCA
from sklearn.decomposition import TruncatedSVD,FactorAnalysis,KernelPCA

import seaborn as sns
import lightgbm as lgb
import matplotlib.pyplot as plt
from sklearn.metrics import mean_squared_error, log_loss
from sklearn.model_selection import StratifiedKFold, KFold

from scipy.stats import ks_2samp
import tqdm

# Notebook-wide settings.
# NOTE(review): silencing every warning also hides pandas/LightGBM
# deprecation and SettingWithCopy warnings raised by later cells.
warnings.filterwarnings('ignore')
np.random.seed(2018)
# Show every column when a DataFrame is displayed.
pd.options.display.max_columns = None

In [2]:
def get_prefix(group_col, target_col, prefix=None):
    """Build the column-name stem ``[<prefix>_]<group>_<target>``.

    A list argument is flattened by joining its items with ``'_'``; any
    other value is used unchanged (so it has to be a string).

    Parameters
    ----------
    group_col : str or list of str
        Grouping column name(s).
    target_col : str or list of str
        Aggregated column name(s).
    prefix : str, optional
        Extra leading token; omitted from the result when ``None``.

    Returns
    -------
    str
    """
    group_part = '_'.join(group_col) if isinstance(group_col, list) else group_col
    target_part = '_'.join(target_col) if isinstance(target_col, list) else target_col
    stem = group_part + '_' + target_part
    if prefix is None:
        return stem
    return prefix + '_' + stem
    
def groupby_helper(df, group_col, target_col, agg_method, prefix_param=None):
    """Aggregate ``target_col`` per ``group_col`` and return a flat DataFrame.

    Parameters
    ----------
    df : pandas.DataFrame
        Source rows.
    group_col : str or list of str
        Column(s) to group by; they become ordinary columns of the result.
    target_col : str or list of str
        Column(s) to aggregate.
    agg_method : list/tuple of str, or str
        * list/tuple (e.g. ``['max']``): one output column per method, named
          ``'<prefix>_<method>'`` where the prefix comes from ``get_prefix``.
        * single string (e.g. ``'size'``): pandas returns the aggregate under
          its own name (a Series named after ``target_col`` when
          ``target_col`` is a string), and no renaming is applied.
    prefix_param : str, optional
        Extra leading token forwarded to ``get_prefix``.

    Returns
    -------
    pandas.DataFrame
        The aggregate with the group keys restored via ``reset_index()``.

    Raises
    ------
    Exception
        Whatever pandas raises (missing column, mismatched column count, ...).
        Errors are no longer printed and swallowed: previously a failure
        before ``group_df`` was assigned surfaced as an unrelated
        unbound-variable error at the ``return``.
    """
    prefix = get_prefix(group_col, target_col, prefix_param)
    print(group_col, target_col, agg_method)
    group_df = df.groupby(group_col)[target_col].agg(agg_method)
    # Only a list of methods yields one column per method.  For a plain
    # string the old code iterated over the string's characters, which was
    # either a no-op (Series result) or an error that got swallowed.
    if isinstance(agg_method, (list, tuple)) and isinstance(group_df, pd.DataFrame):
        group_df.columns = ['{}_{}'.format(prefix, m) for m in agg_method]
    return group_df.reset_index()

In [3]:
def get_hist_default_prorcessing(df, as_of=datetime(2019, 1, 20)):
    """Add calendar and lag features to a transactions frame, in place.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain ``purchase_date``, ``authorized_flag``, ``category_1``,
        ``category_3`` and ``month_lag``.  The frame is modified in place.
    as_of : datetime.datetime, optional
        Anchor date for ``month_diff``.  Defaults to 2019-01-20, the value
        that used to be hard-coded.

    Returns
    -------
    pandas.DataFrame
        The same object as ``df``, with these columns added or overwritten:
        ``purchase_date`` (parsed), ``year``, ``weekofyear``, ``month``,
        ``dayofweek``, ``weekend``, ``hour``, ``authorized_flag`` and
        ``category_1`` (Y/N -> 1/0), ``category_3`` (A/B/C -> 0/1/2),
        ``month_diff`` and ``reference_date``.
    """
    df['purchase_date'] = pd.to_datetime(df['purchase_date'])
    purchase = df['purchase_date'].dt

    df['year'] = purchase.year
    # Series.dt.weekofyear was deprecated in pandas 1.1 and removed in 2.0;
    # isocalendar().week is its replacement.  Fall back to the old accessor
    # on pandas versions that predate isocalendar().
    if hasattr(purchase, 'isocalendar'):
        iso_week = purchase.isocalendar().week
        # isocalendar() returns a nullable UInt32; convert to the plain
        # dtypes weekofyear used to produce (float only if dates are missing).
        df['weekofyear'] = iso_week.astype('float64' if iso_week.isna().any() else 'int64')
    else:
        df['weekofyear'] = purchase.weekofyear
    df['month'] = purchase.month
    df['dayofweek'] = purchase.dayofweek
    # weekday 5 and 6 are Saturday and Sunday.
    df['weekend'] = (df.purchase_date.dt.weekday >= 5).astype(int)
    df['hour'] = purchase.hour

    # Unmapped values (including missing ones) become NaN.
    df['authorized_flag'] = df['authorized_flag'].map({'Y': 1, 'N': 0})
    df['category_1'] = df['category_1'].map({'Y': 1, 'N': 0})
    df['category_3'] = df['category_3'].map({'A': 0, 'B': 1, 'C': 2})

    # Whole 30-day periods between the purchase and `as_of`, shifted by month_lag.
    df['month_diff'] = ((as_of - df['purchase_date']).dt.days) // 30
    df['month_diff'] += df['month_lag']

    # YYYYMM code of the month that follows (purchase month - month_lag);
    # the // 12 and % 12 terms carry the overflow into the year.
    month_offset = df['month'] - df['month_lag']
    df['reference_date'] = (df['year'] + month_offset // 12) * 100 + ((month_offset % 12) + 1) * 1
    return df

In [4]:
def create_new_columns(name, aggs):
    """Return the flat column names ``'<name>_<key>_<agg>'`` for an agg spec.

    ``aggs`` maps a column name to an iterable of aggregation names; the
    output follows the mapping's key order, then each key's aggregation order.
    """
    columns = []
    for key in aggs.keys():
        for method in aggs[key]:
            columns.append(name + '_' + key + '_' + method)
    return columns

In [5]:
# Raw inputs, read from the relative `input/` directory.
historical_trans_df = pd.read_csv('input/historical_transactions.csv')
new_merchant_trans_df = pd.read_csv('input/new_merchant_transactions.csv')
# Only a handful of merchant attributes are kept.
merchant_df = pd.read_csv('input/merchants.csv')[[
    'merchant_id',
    'merchant_group_id',
    'category_4',
    'active_months_lag3',
    'active_months_lag6',
    'active_months_lag12',
]]

In [6]:
# Order rows by purchase_date.  The column is still in its as-read form here;
# it is only parsed with pd.to_datetime by get_hist_default_prorcessing.
historical_trans_df = historical_trans_df.sort_values(by='purchase_date')

In [7]:
# Add the calendar / lag feature columns.  The helper mutates the frame it is
# given and returns that same object, so re-running this cell re-applies the
# Y/N and A/B/C mappings to already-mapped columns.
historical_trans_df = get_hist_default_prorcessing(historical_trans_df)

In [8]:
# Labelled (train) and unlabelled (test) card tables.
train_df, test_df = pd.read_csv('input/train.csv'), pd.read_csv('input/test.csv')

In [9]:
# Step 1: flag the rows that fall in each card's highest month_lag.
monthlag_last = groupby_helper(historical_trans_df, ['card_id'], 'month_lag', ['max'])
historical_trans_df = historical_trans_df.merge(monthlag_last, on='card_id', how='left')
historical_trans_df.loc[historical_trans_df['month_lag'] == historical_trans_df['card_id_month_lag_max'], 'month_lag_reference'] = 1

# Step 2: repeat the flagging with the 'before' prefix.
# NOTE(review): at this point month_lag_reference only holds 1 or NaN, so the
# `!= 0` filter keeps every row and this step flags exactly the same rows as
# step 1.  If the intent was to also flag the month preceding the last one,
# the filter should be `!= 1`.  Left unchanged because it alters the features.
monthlag_last = groupby_helper(historical_trans_df.loc[historical_trans_df['month_lag_reference'] != 0], ['card_id'], 'month_lag', ['max'], 'before')
historical_trans_df = historical_trans_df.merge(monthlag_last, on='card_id', how='left')
historical_trans_df.loc[historical_trans_df['month_lag'] == historical_trans_df['before_card_id_month_lag_max'], 'month_lag_reference'] = 1
# Assign the result back instead of `df[col].fillna(0, inplace=True)`: the
# in-place call on a column selection does not update the frame under pandas
# Copy-on-Write, which would leave NaN and make the `== 0` masks below empty.
historical_trans_df['month_lag_reference'] = historical_trans_df['month_lag_reference'].fillna(0)

# Step 3: number of distinct month_lag values among the un-flagged rows of
# each card; cards with no such rows get 0.
group_df = groupby_helper(historical_trans_df.loc[historical_trans_df['month_lag_reference'] == 0], ['card_id'], 'month_lag', ['nunique'])
historical_trans_df = historical_trans_df.merge(group_df, on='card_id', how='left')
historical_trans_df['card_id_month_lag_nunique'] = historical_trans_df['card_id_month_lag_nunique'].fillna(0)
card_id_month_lag_nunique = groupby_helper(historical_trans_df, ['card_id'], 'card_id_month_lag_nunique', ['max'])

['card_id'] month_lag ['max']
['card_id'] month_lag ['max']
['card_id'] month_lag ['nunique']
['card_id'] card_id_month_lag_nunique ['max']


In [10]:
# Partition the history on the month_lag_reference flag (1 vs 0).
hist_trans_df_lastmonth = historical_trans_df[historical_trans_df['month_lag_reference'].eq(1)]
hist_trans_df_before_lastmonth = historical_trans_df[historical_trans_df['month_lag_reference'].eq(0)]

In [16]:
# 2 ** target, shifted down by a small constant.
# NOTE(review): the origin of 9.999999992192566e-11 is not visible here.
train_df['convert_target'] = np.power(2, train_df['target']) - 9.999999992192566e-11

# Stack train on top of test; the original row labels are kept, so index
# values repeat across the two parts.
all_df = pd.concat([train_df, test_df])

# Integer code for every distinct (feature_1, feature_2, feature_3) combination.
all_df['feature123'] = pd.factorize(
    all_df['feature_1'].astype(str) + '_' + all_df['feature_2'].astype(str) + '_' + all_df['feature_3'].astype(str)
)[0]

In [17]:
# Largest reference_date per card.  agg_method is passed as a plain string,
# so the aggregate column keeps the name 'reference_date'.
reference_max = groupby_helper(historical_trans_df, ['card_id'], 'reference_date', 'max')
all_df = pd.merge(all_df, reference_max, on='card_id', how='left')

['card_id'] reference_date max


### Subsector

In [74]:
# Row count per (card_id, subsector_id) within hist_trans_df_lastmonth; with
# the string 'size' the count column is named after the target, 'month_lag'.
subsector_lastmonth = groupby_helper(hist_trans_df_lastmonth, ['card_id', 'subsector_id'], 'month_lag', 'size')
# One row per card, one column per subsector.  pivot() arguments are passed
# by keyword because they are keyword-only from pandas 2.0 onward.
subsector_lastmonth_pivot = subsector_lastmonth.pivot(index='card_id', columns='subsector_id', values='month_lag').reset_index()
subsector_lastmonth_pivot.columns = ['card_id'] + ['lastmonth_subsector_id_{}'.format(col) for col in subsector_lastmonth_pivot.columns if col != 'card_id']
# A (card, subsector) pair with no rows is a count of zero.
subsector_lastmonth_pivot = subsector_lastmonth_pivot.fillna(0)

['card_id', 'subsector_id'] month_lag size


In [75]:
# Row count per (card_id, subsector_id) within hist_trans_df_before_lastmonth;
# with the string 'size' the count column is named after the target, 'month_lag'.
subsector_before_lastmonth = groupby_helper(hist_trans_df_before_lastmonth, ['card_id', 'subsector_id'], 'month_lag', 'size')
# One row per card, one column per subsector.  pivot() arguments are passed
# by keyword because they are keyword-only from pandas 2.0 onward.
subsector_before_lastmonth_pivot = subsector_before_lastmonth.pivot(index='card_id', columns='subsector_id', values='month_lag').reset_index()
subsector_before_lastmonth_pivot.columns = ['card_id'] + ['before_lastmonth_subsector_id_{}'.format(col) for col in subsector_before_lastmonth_pivot.columns if col != 'card_id']
# A (card, subsector) pair with no rows is a count of zero.
subsector_before_lastmonth_pivot = subsector_before_lastmonth_pivot.fillna(0)

['card_id', 'subsector_id'] month_lag size


In [76]:
# Left join: one row per card present in the last-month pivot.  Cards missing
# from the before-last-month pivot get NaN (not 0) in those columns.
subsector_id = pd.merge(subsector_lastmonth_pivot, subsector_before_lastmonth_pivot, on='card_id', how='left')

In [77]:
# Rows of all_df that carry a target value.
train_df = all_df[all_df['target'].notna()]

In [78]:
# Attach the target; cards absent from train_df end up with NaN.
subsector_id = pd.merge(subsector_id, train_df[['card_id', 'target']], on='card_id', how='left')

In [79]:
# Split on target availability: known target -> train, missing -> test.
subsector_id_train = subsector_id[subsector_id['target'].notna()]
subsector_id_test = subsector_id[subsector_id['target'].isna()]

In [80]:
# Flag rows whose target is below -30.  assign() builds a new frame, so no
# column is written onto a filtered slice of `subsector_id` (the previous
# form triggered SettingWithCopyWarning, hidden by the global warning filter).
subsector_id_train = subsector_id_train.assign(
    outliers=(subsector_id_train['target'] < -30).astype('int64')
)

# Model inputs: every column except identifiers, the label and helper flags.
train_columns = [
    c for c in subsector_id_train.columns
    if c not in ['card_id', 'first_active_month', 'target', 'outliers', 'rank']
]
train_columns

['lastmonth_subsector_id_-1',
 'lastmonth_subsector_id_1',
 'lastmonth_subsector_id_2',
 'lastmonth_subsector_id_3',
 'lastmonth_subsector_id_4',
 'lastmonth_subsector_id_5',
 'lastmonth_subsector_id_7',
 'lastmonth_subsector_id_8',
 'lastmonth_subsector_id_9',
 'lastmonth_subsector_id_10',
 'lastmonth_subsector_id_11',
 'lastmonth_subsector_id_12',
 'lastmonth_subsector_id_13',
 'lastmonth_subsector_id_14',
 'lastmonth_subsector_id_15',
 'lastmonth_subsector_id_16',
 'lastmonth_subsector_id_17',
 'lastmonth_subsector_id_18',
 'lastmonth_subsector_id_19',
 'lastmonth_subsector_id_20',
 'lastmonth_subsector_id_21',
 'lastmonth_subsector_id_22',
 'lastmonth_subsector_id_23',
 'lastmonth_subsector_id_24',
 'lastmonth_subsector_id_25',
 'lastmonth_subsector_id_26',
 'lastmonth_subsector_id_27',
 'lastmonth_subsector_id_28',
 'lastmonth_subsector_id_29',
 'lastmonth_subsector_id_30',
 'lastmonth_subsector_id_31',
 'lastmonth_subsector_id_32',
 'lastmonth_subsector_id_33',
 'lastmonth_subsec

In [81]:
# Work on a copy and move the label out of the feature frame.
train = subsector_id_train.copy()
target = train.pop('target')

In [82]:
# LightGBM settings shared by every fold.
# "min_child_samples": 20 was removed: it is an alias of min_data_in_leaf, and
# when both are given LightGBM keeps the primary name (30) and ignores the
# alias, so the entry was dead and misleading.
param = {'num_leaves': 31,
         'min_data_in_leaf': 30,
         'objective': 'regression',
         'max_depth': -1,
         'learning_rate': 0.015,
         "boosting": "gbdt",
         "feature_fraction": 0.9,
         "bagging_freq": 1,
         "bagging_fraction": 0.9,
         "metric": 'rmse',
         "lambda_l1": 0.1,
         "verbosity": -1,
         "nthread": 24,
         "seed": 6}

#prepare fit model with cross-validation
np.random.seed(2019)

# NOTE(review): KFold does not stratify, so the outlier labels passed to
# split() below have no effect on the folds.  If stratifying on `outliers`
# was intended, StratifiedKFold (already imported) is the splitter to use.
folds = KFold(n_splits=9, shuffle=True, random_state=4950)
oof = np.zeros(len(train))
# Size the buffer from the frame that is actually scored, so it cannot
# silently disagree with test_df.
predictions = np.zeros(len(subsector_id_test))
cv_score_list = []
fold_importances = []  # one frame per fold, concatenated once after the loop
num_round = 10000
for fold_, (trn_idx, val_idx) in enumerate(folds.split(train, train['outliers'].values)):
    print("fold {}".format(fold_ + 1))
    trn_data = lgb.Dataset(train.iloc[trn_idx][train_columns], label=target.iloc[trn_idx])
    val_data = lgb.Dataset(train.iloc[val_idx][train_columns], label=target.iloc[val_idx])

    # NOTE(review): verbose_eval / early_stopping_rounds were removed from
    # lgb.train in LightGBM 4.0 (replaced by callbacks); kept as-is for the
    # LightGBM version this notebook was run with.
    clf = lgb.train(param, trn_data, num_round, valid_sets=[trn_data, val_data], verbose_eval=100, early_stopping_rounds=100)
    oof[val_idx] = clf.predict(train.iloc[val_idx][train_columns], num_iteration=clf.best_iteration)

    # Per-fold RMSE on the held-out rows.
    cv_score = np.sqrt(mean_squared_error(target.iloc[val_idx], oof[val_idx]))
    cv_score_list.append(cv_score)

    #feature importance
    fold_importance_df = pd.DataFrame()
    fold_importance_df["Feature"] = train_columns
    fold_importance_df["importance"] = clf.feature_importance()
    fold_importance_df["fold"] = fold_ + 1
    fold_importances.append(fold_importance_df)

    #predictions: average of the per-fold models, in subsector_id_test row order
    predictions += clf.predict(subsector_id_test[train_columns], num_iteration=clf.best_iteration) / folds.n_splits

feature_importance_df = pd.concat(fold_importances, axis=0)

# Overall out-of-fold RMSE, then the per-fold scores and their spread.
cv_score = np.sqrt(mean_squared_error(target, oof))
print(cv_score)
print(cv_score_list)
print(np.std(cv_score_list))
withoutoutlier_predictions = predictions.copy()

fold 1
Training until validation scores don't improve for 100 rounds.
[100]	training's rmse: 3.83882	valid_1's rmse: 3.66375
[200]	training's rmse: 3.81774	valid_1's rmse: 3.66304
Early stopping, best iteration is:
[161]	training's rmse: 3.82507	valid_1's rmse: 3.66287
fold 2
Training until validation scores don't improve for 100 rounds.
[100]	training's rmse: 3.81069	valid_1's rmse: 3.89955
[200]	training's rmse: 3.79035	valid_1's rmse: 3.89892
[300]	training's rmse: 3.77432	valid_1's rmse: 3.8998
Early stopping, best iteration is:
[202]	training's rmse: 3.78999	valid_1's rmse: 3.89888
fold 3
Training until validation scores don't improve for 100 rounds.
[100]	training's rmse: 3.80891	valid_1's rmse: 3.90747
[200]	training's rmse: 3.78867	valid_1's rmse: 3.90583
[300]	training's rmse: 3.77278	valid_1's rmse: 3.90567
Early stopping, best iteration is:
[276]	training's rmse: 3.77621	valid_1's rmse: 3.90558
fold 4
Training until validation scores don't improve for 100 rounds.
[100]	train

In [83]:
# `oof` is in the row order of `train`; `predictions` is in the row order of
# `subsector_id_test` (the frame passed to clf.predict).  Take the test ids
# from that same frame: test_df keeps the CSV order, so pairing its card_id
# with `predictions` attached scores to the wrong cards.
subsector_train_pred = pd.DataFrame({'card_id': train['card_id'].values, 'subsector_id_pred': oof})
subsector_test_pred = pd.DataFrame({'card_id': subsector_id_test['card_id'].values, 'subsector_id_pred': predictions})
subsector_pred = pd.concat([subsector_train_pred, subsector_test_pred])
subsector_pred.to_csv('subsector_id_pred.csv', index=False)

### MerchantCategoryId

In [84]:
# Same construction as the subsector section, keyed on merchant_category_id
# (the `subsector_*` variable names are reused and overwritten).
# Row count per (card_id, merchant_category_id) within hist_trans_df_lastmonth;
# with the string 'size' the count column is named after the target, 'month_lag'.
subsector_lastmonth = groupby_helper(hist_trans_df_lastmonth, ['card_id', 'merchant_category_id'], 'month_lag', 'size')
# pivot() arguments are passed by keyword because they are keyword-only from
# pandas 2.0 onward.
subsector_lastmonth_pivot = subsector_lastmonth.pivot(index='card_id', columns='merchant_category_id', values='month_lag').reset_index()
subsector_lastmonth_pivot.columns = ['card_id'] + ['lastmonth_merchant_category_id_{}'.format(col) for col in subsector_lastmonth_pivot.columns if col != 'card_id']
# A (card, category) pair with no rows is a count of zero.
subsector_lastmonth_pivot = subsector_lastmonth_pivot.fillna(0)

['card_id', 'merchant_category_id'] month_lag size


In [85]:
# Row count per (card_id, merchant_category_id) within
# hist_trans_df_before_lastmonth; with the string 'size' the count column is
# named after the target, 'month_lag'.
subsector_before_lastmonth = groupby_helper(hist_trans_df_before_lastmonth, ['card_id', 'merchant_category_id'], 'month_lag', 'size')
# pivot() arguments are passed by keyword because they are keyword-only from
# pandas 2.0 onward.
subsector_before_lastmonth_pivot = subsector_before_lastmonth.pivot(index='card_id', columns='merchant_category_id', values='month_lag').reset_index()
subsector_before_lastmonth_pivot.columns = ['card_id'] + ['before_lastmonth_merchant_category_id_{}'.format(col) for col in subsector_before_lastmonth_pivot.columns if col != 'card_id']
# A (card, category) pair with no rows is a count of zero.
subsector_before_lastmonth_pivot = subsector_before_lastmonth_pivot.fillna(0)

['card_id', 'merchant_category_id'] month_lag size


In [86]:
# Left join: one row per card present in the last-month pivot.  Cards missing
# from the before-last-month pivot get NaN (not 0) in those columns.
subsector_id = pd.merge(subsector_lastmonth_pivot, subsector_before_lastmonth_pivot, on='card_id', how='left')

In [87]:
# Rows of all_df that carry a target value.
train_df = all_df[all_df['target'].notna()]

In [88]:
# Attach the target; cards absent from train_df end up with NaN.
subsector_id = pd.merge(subsector_id, train_df[['card_id', 'target']], on='card_id', how='left')

In [89]:
# Split on target availability: known target -> train, missing -> test.
subsector_id_train = subsector_id[subsector_id['target'].notna()]
subsector_id_test = subsector_id[subsector_id['target'].isna()]

In [90]:
# Flag rows whose target is below -30.  assign() builds a new frame, so no
# column is written onto a filtered slice of `subsector_id` (the previous
# form triggered SettingWithCopyWarning, hidden by the global warning filter).
subsector_id_train = subsector_id_train.assign(
    outliers=(subsector_id_train['target'] < -30).astype('int64')
)

# Model inputs: every column except identifiers, the label and helper flags.
train_columns = [
    c for c in subsector_id_train.columns
    if c not in ['card_id', 'first_active_month', 'target', 'outliers', 'rank']
]
train_columns

['lastmonth_merchant_category_id_-1',
 'lastmonth_merchant_category_id_2',
 'lastmonth_merchant_category_id_9',
 'lastmonth_merchant_category_id_11',
 'lastmonth_merchant_category_id_14',
 'lastmonth_merchant_category_id_16',
 'lastmonth_merchant_category_id_19',
 'lastmonth_merchant_category_id_21',
 'lastmonth_merchant_category_id_27',
 'lastmonth_merchant_category_id_33',
 'lastmonth_merchant_category_id_34',
 'lastmonth_merchant_category_id_36',
 'lastmonth_merchant_category_id_38',
 'lastmonth_merchant_category_id_40',
 'lastmonth_merchant_category_id_45',
 'lastmonth_merchant_category_id_49',
 'lastmonth_merchant_category_id_52',
 'lastmonth_merchant_category_id_53',
 'lastmonth_merchant_category_id_56',
 'lastmonth_merchant_category_id_57',
 'lastmonth_merchant_category_id_60',
 'lastmonth_merchant_category_id_63',
 'lastmonth_merchant_category_id_67',
 'lastmonth_merchant_category_id_68',
 'lastmonth_merchant_category_id_69',
 'lastmonth_merchant_category_id_71',
 'lastmonth_me

In [91]:
# Work on a copy and move the label out of the feature frame.
train = subsector_id_train.copy()
target = train.pop('target')

In [92]:
# LightGBM settings shared by every fold.
# "min_child_samples": 20 was removed: it is an alias of min_data_in_leaf, and
# when both are given LightGBM keeps the primary name (30) and ignores the
# alias, so the entry was dead and misleading.
param = {'num_leaves': 31,
         'min_data_in_leaf': 30,
         'objective': 'regression',
         'max_depth': -1,
         'learning_rate': 0.015,
         "boosting": "gbdt",
         "feature_fraction": 0.9,
         "bagging_freq": 1,
         "bagging_fraction": 0.9,
         "metric": 'rmse',
         "lambda_l1": 0.1,
         "verbosity": -1,
         "nthread": 24,
         "seed": 6}

#prepare fit model with cross-validation
np.random.seed(2019)

# NOTE(review): KFold does not stratify, so the outlier labels passed to
# split() below have no effect on the folds.  If stratifying on `outliers`
# was intended, StratifiedKFold (already imported) is the splitter to use.
folds = KFold(n_splits=9, shuffle=True, random_state=4950)
oof = np.zeros(len(train))
# Size the buffer from the frame that is actually scored, so it cannot
# silently disagree with test_df.
predictions = np.zeros(len(subsector_id_test))
cv_score_list = []
fold_importances = []  # one frame per fold, concatenated once after the loop
num_round = 10000
for fold_, (trn_idx, val_idx) in enumerate(folds.split(train, train['outliers'].values)):
    print("fold {}".format(fold_ + 1))
    trn_data = lgb.Dataset(train.iloc[trn_idx][train_columns], label=target.iloc[trn_idx])
    val_data = lgb.Dataset(train.iloc[val_idx][train_columns], label=target.iloc[val_idx])

    # NOTE(review): verbose_eval / early_stopping_rounds were removed from
    # lgb.train in LightGBM 4.0 (replaced by callbacks); kept as-is for the
    # LightGBM version this notebook was run with.
    clf = lgb.train(param, trn_data, num_round, valid_sets=[trn_data, val_data], verbose_eval=100, early_stopping_rounds=100)
    oof[val_idx] = clf.predict(train.iloc[val_idx][train_columns], num_iteration=clf.best_iteration)

    # Per-fold RMSE on the held-out rows.
    cv_score = np.sqrt(mean_squared_error(target.iloc[val_idx], oof[val_idx]))
    cv_score_list.append(cv_score)

    #feature importance
    fold_importance_df = pd.DataFrame()
    fold_importance_df["Feature"] = train_columns
    fold_importance_df["importance"] = clf.feature_importance()
    fold_importance_df["fold"] = fold_ + 1
    fold_importances.append(fold_importance_df)

    #predictions: average of the per-fold models, in subsector_id_test row order
    predictions += clf.predict(subsector_id_test[train_columns], num_iteration=clf.best_iteration) / folds.n_splits

feature_importance_df = pd.concat(fold_importances, axis=0)

# Overall out-of-fold RMSE, then the per-fold scores and their spread.
cv_score = np.sqrt(mean_squared_error(target, oof))
print(cv_score)
print(cv_score_list)
print(np.std(cv_score_list))
withoutoutlier_predictions = predictions.copy()

fold 1
Training until validation scores don't improve for 100 rounds.
[100]	training's rmse: 3.83379	valid_1's rmse: 3.66421
[200]	training's rmse: 3.80959	valid_1's rmse: 3.66558
Early stopping, best iteration is:
[110]	training's rmse: 3.83095	valid_1's rmse: 3.66411
fold 2
Training until validation scores don't improve for 100 rounds.
[100]	training's rmse: 3.80703	valid_1's rmse: 3.89815
[200]	training's rmse: 3.78399	valid_1's rmse: 3.895
[300]	training's rmse: 3.76623	valid_1's rmse: 3.89458
Early stopping, best iteration is:
[275]	training's rmse: 3.77036	valid_1's rmse: 3.89435
fold 3
Training until validation scores don't improve for 100 rounds.
[100]	training's rmse: 3.80558	valid_1's rmse: 3.90321
[200]	training's rmse: 3.78215	valid_1's rmse: 3.90002
[300]	training's rmse: 3.76451	valid_1's rmse: 3.89886
[400]	training's rmse: 3.74979	valid_1's rmse: 3.89881
Early stopping, best iteration is:
[340]	training's rmse: 3.75831	valid_1's rmse: 3.89844
fold 4
Training until valid

In [93]:
# `oof` is in the row order of `train`; `predictions` is in the row order of
# `subsector_id_test` (the frame passed to clf.predict).  Take the test ids
# from that same frame: test_df keeps the CSV order, so pairing its card_id
# with `predictions` attached scores to the wrong cards.
subsector_train_pred = pd.DataFrame({'card_id': train['card_id'].values, 'merchant_category_id_pred': oof})
subsector_test_pred = pd.DataFrame({'card_id': subsector_id_test['card_id'].values, 'merchant_category_id_pred': predictions})
subsector_pred = pd.concat([subsector_train_pred, subsector_test_pred])
subsector_pred.to_csv('merchant_category_id_pred.csv', index=False)