In [None]:
import numpy as np
import pandas as pd
import gc
import lightgbm as lgb
from sklearn.model_selection import StratifiedKFold
from scipy.stats import rankdata

In [None]:
# Read the competition's train and test tables, in that order.
train_df, test_df = (
    pd.read_csv(csv_path)
    for csv_path in (
        "/kaggle/input/santander-customer-transaction-prediction/train.csv",
        "/kaggle/input/santander-customer-transaction-prediction/test.csv",
    )
)

# Model inputs are the columns whose name begins with "var".
features = [column for column in train_df.columns if column.startswith("var")]

In [None]:
# For every variable, count how often each distinct value occurs across the
# train and test rows combined.
var_stats = {
    var: pd.concat([train_df[var], test_df[var]]).value_counts()
    for var in features
}

# For every test value, flag whether it occurs more than once in the
# combined data (True) or exactly once (False).
hist_series_list = [test_df[var].map(var_stats[var]) > 1 for var in features]

# Assemble all flag columns in one concat rather than inserting column by column.
hist_df = pd.concat(hist_series_list, axis=1)
hist_df.columns = features

In [None]:
# Correlation (pandas' default method) of each variable with the target,
# stored as a {column name: coefficient} dict.
target_column = train_df['target']
correlations = train_df[features].corrwith(target_column).to_dict()

In [None]:
def logit(p):
    """Return the log-odds ``log(p) - log(1 - p)`` of a probability.

    Works element-wise on scalars and NumPy arrays. Inputs of exactly 0 or 1
    produce ``-inf`` / ``inf`` because no clipping is applied.
    """
    log_success = np.log(p)
    log_failure = np.log(1 - p)
    return log_success - log_failure

def var_to_feat(vr, var_stats, feat_id, correlation, rank_denominator=200000.0):
    """Turn one raw variable column into a five-column feature matrix.

    Parameters
    ----------
    vr : pandas.Series
        Raw values of a single variable, one entry per row.
    var_stats : pandas.Series or dict
        Mapping from value to its occurrence count; used to look up the
        count of every value in ``vr``.
    feat_id : int
        Numeric identifier of the variable; repeated on every row.
    correlation : float
        Scalar multiplied into the normalized rank.
    rank_denominator : float, optional
        Divisor applied to the (average) rank of each value. Defaults to
        200000.0, the constant that used to be hard-coded; pass ``len(vr)``
        to normalize by the actual number of rows.

    Returns
    -------
    numpy.ndarray
        Array with one row per input value and the columns
        [value, count of the value, feature id, rank / rank_denominator,
        (rank / rank_denominator) * correlation].
    """
    values = vr.values
    new_df = pd.DataFrame()
    new_df["var"] = values
    # Look the counts up on the positional values, not on `vr` itself:
    # assigning a Series aligns on index, so a `vr` whose index is not
    # 0..n-1 would otherwise yield NaN / misaligned counts.
    new_df["hist"] = pd.Series(values).map(var_stats)
    new_df["feature_id"] = feat_id
    new_df["var_rank"] = new_df["var"].rank() / rank_denominator
    new_df["weighted_rank"] = new_df["var_rank"] * correlation
    return new_df.values  # DataFrame -> 2-D array


In [None]:
# Each variable contributes one full copy of the sample rows to the stacked
# training matrix, so the target vector is repeated once per variable.
# np.tile avoids building a huge intermediate Python list and follows the
# actual number of features instead of a hard-coded 200.
TARGET = np.tile(train_df['target'].values, len(features))

TRAIN = []
var_mean = {}
var_var  = {}
for var in features:
    # Column names look like 'var_27'; int(var[4:]) extracts the numeric id.
    tmp = var_to_feat(train_df[var], var_stats[var], int(var[4:]), correlations[var])
    var_mean[var] = np.mean(tmp[:,0])                   # mean of the raw values
    var_var[var]  = np.var(tmp[:,0])                    # variance of the raw values
    # NOTE(review): this centres the values and divides by the variance, not
    # the standard deviation. The inference cell reuses var_mean / var_var the
    # same way, so both must change together if this is ever switched to std.
    tmp[:,0] = (tmp[:,0]-var_mean[var])/var_var[var]
    TRAIN.append(tmp)  # one (n_samples, 5) block per variable
TRAIN = np.vstack(TRAIN)

# The raw training frame is no longer needed; release its memory.
del train_df
gc.collect()

print(TRAIN.shape, len(TARGET))

In [None]:
# Hyper-parameters shared by the classifier of every fold.
lgb_params = {
     'learning_rate': 0.04,
     'num_leaves': 31,
     'max_bin': 1023,
     'min_child_samples': 1000,
     'reg_alpha': 0.1,
     'reg_lambda': 0.2,
     'feature_fraction': 1.0,
     'bagging_freq': 1,
     'bagging_fraction': 0.85,
     'objective': 'binary',
     'n_jobs': -1,
     'n_estimators':200,
     'verbose':-1,}

MODELS = []
skf = StratifiedKFold(n_splits=10, shuffle=True, random_state=42)

for fold_, (train_indexes, valid_indexes) in enumerate(skf.split(TRAIN, TARGET)):
    print('Fold:', fold_ )
    # Build a fresh estimator for every fold. Re-fitting a single shared
    # instance would make every entry of MODELS point at the same object,
    # i.e. only the last fold's model would survive.
    model = lgb.LGBMClassifier(**lgb_params)
    model.fit( TRAIN[train_indexes], TARGET[train_indexes],
               eval_set = [(TRAIN[valid_indexes], TARGET[valid_indexes])],
               eval_metric='auc',
               categorical_feature = [2] )  # column 2 holds the feature id
    MODELS.append( model )

# The stacked training data is no longer needed once all folds are fitted.
del TRAIN, TARGET
_=gc.collect()

In [None]:
# Score the test set one variable at a time with every fold model in MODELS.
# Sizes are taken from the data instead of being hard-coded.
n_models = len(MODELS)
ypred = np.zeros( (len(test_df), len(features)) )
for feat,var in enumerate(features):
    tmp = var_to_feat(test_df[var], var_stats[var], int(var[4:]), correlations[var])
    # Apply the same centring/scaling that was fitted on the training values.
    tmp[:,0] = (tmp[:,0]-var_mean[var])/var_var[var]
    for model in MODELS:
        # Accumulate the average positive-class probability over the models.
        ypred[:,feat] += model.predict_proba( tmp )[:,1] / n_models
# Combine the per-variable probabilities: mean of their log-odds per row.
ypred = np.mean( logit(ypred), axis=1 )

In [None]:
# Work on an explicit copy so that adding columns does not operate on a
# slice of test_df (avoids pandas' SettingWithCopyWarning).
sub = test_df[['ID_code']].copy()
sub['target'] = ypred
# Replace the raw score by its rank scaled by the number of rows.
sub['target'] = sub['target'].rank() / len(sub)
# The same table is written under both file names.
sub.to_csv('golden_sub.csv', index=False)
sub.to_csv('submission.csv', index=False)