This notebook was part of an UpGrad initiative to help their first cohort get started with Kaggle competitions. To comply with the rules, I am sharing everything that was discussed during those sessions.

**Why do Kaggle**

* Learning new things
* Strengthen intuition for ML algorithms and techniques
* like competing with fellow kagglers

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from pathlib import Path

**Problem statement**
https://www.kaggle.com/c/santander-customer-transaction-prediction



In [None]:
# Read the competition's train and test tables from the Kaggle input directory.
DATA_PATH = "../input/santander-customer-transaction-prediction/"
data_dir = Path(DATA_PATH)

train, test = (
    pd.read_csv(str(data_dir / csv_name)) for csv_name in ("train.csv", "test.csv")
)

# Quick sanity check: (rows, columns) of each table.
print("Train and test shapes", train.shape, test.shape)

In [None]:
# Display the column labels of the train and test frames (as a tuple).
train.columns, test.columns

In [None]:
# Count how many training rows carry each value of the `target` column.
train.target.value_counts()

In [None]:
# https://www.listendata.com/2015/03/weight-of-evidence-woe-and-information.html
def woe(X, y):
    """Compute Weight of Evidence (WoE) and Information Value (IV) for one variable.

    Rows are grouped by the value of ``X``. For each group::

        woe = ln((group non-events / all non-events) / (group events / all events))
        iv  = (group non-events / all non-events - group events / all events) * woe

    Parameters
    ----------
    X : array-like
        Group label of every row (e.g. the bins produced by ``pd.qcut``).
    y : array-like
        Target of every row; rows equal to 1 are counted as events and rows
        equal to 0 as non-events.

    Returns
    -------
    tuple
        ``(woe, iv, total_iv)``: the per-row WoE of the row's group (Series
        named ``"woe"``), the per-row IV contribution of the row's group
        (Series named ``"iv"``), and the IV summed once per group.
    """
    tmp = pd.DataFrame()
    tmp["variable"] = X
    tmp["target"] = y

    # Per-group totals, computed from a single groupby object.
    target_by_group = tmp.groupby("variable")["target"]
    group_events = target_by_group.sum()
    group_nonevents = target_by_group.count() - group_events

    # Vectorised counts; the built-in sum() would iterate the Series in Python.
    events = (tmp["target"] == 1).sum()
    nonevents = (tmp["target"] == 0).sum()

    # Broadcast each group's share of non-events / events back onto its rows.
    nonevent_share = tmp["variable"].map(group_nonevents) / nonevents
    event_share = tmp["variable"].map(group_events) / events

    tmp["woe"] = np.log(nonevent_share / event_share)
    # Groups with no events or no non-events give +/-inf; treat them as neutral.
    tmp["woe"] = tmp["woe"].replace([np.inf, -np.inf], 0)
    tmp["iv"] = (nonevent_share - event_share) * tmp["woe"]

    # Every row of a group holds the same IV, so take one value per group.
    iv = tmp.groupby("variable")["iv"].last().sum()
    return tmp["woe"], tmp["iv"], iv

In [None]:
# Score each of the 200 raw features by Information Value and keep the top 50.
feats = list(map("var_{}".format, range(200)))
y = train["target"]

iv_values = []
for f in feats:
    # Bin the feature into (up to) 10 quantile buckets; duplicate edges are dropped.
    X = pd.qcut(train[f], q=10, duplicates="drop")
    # Only the third return value of woe() (the total IV) is needed here.
    iv_values.append(woe(X, y)[2])

# Positions of the 50 largest IVs, largest first.
iv_inds = np.argsort(iv_values)[::-1][:50]
iv_values = np.array(iv_values)[iv_inds]
feats = np.array(feats)[iv_inds]


In [None]:
# Horizontal bar chart: one bar per entry of `feats`, bar length from `iv_values`.
fig, ax = plt.subplots(figsize=(10, 16))
sns.barplot(x=iv_values, y=feats, orient="h", ax=ax)
plt.show()

## EDA

### Pointers
* Check out existing kernels
https://www.kaggle.com/gpreda/santander-eda-and-prediction
https://www.kaggle.com/artgor/santander-eda-fe-fs-and-models
https://www.kaggle.com/mjbahmani/santander-ml-explainability

* Check distributions
* Compare train and test distributions
* Identify important features (Most of the times feature engineering is going to be around features with high predictive power)
* Attach a logic to why features are important (Note: data is anonymised here, so this is hard to do)
* Check previous solutions to similar problems


### Observations
* Data normalization and imputation
* Weak correlations between features and target
* IV values ??
* Most variables have distribution close to normal
* Almost no correlation between different variables - What does it mean ??
* No NA values (already imputed??)
* Some features seem to have been clipped at one end
* Spikes in distributions (imputed values??)
* less unique 

In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import StratifiedKFold, cross_val_predict
from sklearn.metrics import roc_auc_score
from sklearn.preprocessing import StandardScaler

In [None]:
# Baseline model: logistic regression on the 200 standardized raw features.
feats = ["var_{}".format(i) for i in range(200)]
X = train[feats]
X_test = test[feats]
y = train["target"]

# `random_state` only has an effect when shuffling is enabled; recent
# scikit-learn releases raise a ValueError if it is set while shuffle=False.
cvlist = list(
    StratifiedKFold(n_splits=5, shuffle=True, random_state=12345786).split(X, y)
)

# Fit the scaler on the training data only and reuse those statistics for the
# test set, so test features land in the same space the model was trained on.
scaler = StandardScaler()
X_sc = scaler.fit_transform(X)
X_test_sc = scaler.transform(X_test)

lr = LogisticRegression()
# Out-of-fold probability of the positive class for every training row.
y_preds_lr = cross_val_predict(lr, X_sc, y, cv=cvlist, method="predict_proba")[:, 1]

# Refit on all training rows before scoring the test set.
lr.fit(X_sc, y)
y_test_preds_lr = lr.predict_proba(X_test_sc)[:, 1]
roc_auc_score(y, y_preds_lr)

In [None]:
# Overlay the distributions of the out-of-fold and test-set LR probabilities.
for preds in (y_preds_lr, y_test_preds_lr):
    sns.distplot(preds)
plt.show()

### Method -1 : train on full and predict on test
 - Rule of thumb: scale the number of boosting rounds by the ratio of the full training data to the data used for training during validation (here: 1500 rounds)

In [None]:
import lightgbm as lgb
#model = lgb.LGBMClassifier(n_estimators=2000, learning_rate=0.1, num_leaves=2, subsample=0.4, colsample_bytree=0.4)

#y_preds_lgb = np.zeros((len(y)))
#for i, (tr_idx, val_idx) in enumerate(cvlist):
#    X_dev, y_dev = X.iloc[tr_idx], y.iloc[tr_idx]
#    X_val, y_val = X.iloc[val_idx], y.iloc[val_idx]
#    model.fit(X_dev, y_dev, eval_set=[(X_val, y_val)], eval_metric="auc", verbose=50, early_stopping_rounds=200)
#    val_preds = model.predict_proba(X_val)[:, 1]
#    y_preds_lgb[val_idx] = val_preds
#    print("Score for fold {} is {}".format(i, roc_auc_score(y_val, val_preds)))
    
#print("Overall Score for oof predictions ", roc_auc_score(y, y_preds_lgb))

In [None]:
#model = lgb.LGBMClassifier(n_estimators=1500, learning_rate=0.1, num_leaves=8, subsample=0.6, colsample_bytree=0.6)
#model.fit(X, y)
#y_test_preds_lgb = model.predict_proba(X_test)[:, 1]


In [None]:
#sns.distplot(y_preds)
#sns.distplot(y_test_preds_lgb)

### Method 2 - use validation fold models to predict on test set


In [None]:
from scipy.stats import gmean

In [None]:
# Arithmetic mean of five example values (four 0.9s and one 0.98).
np.mean([0.9, 0.9, 0.9, 0.98, 0.9])

In [None]:
# Geometric mean of five example values (four 0.9s and one 0.98).
gmean([0.9, 0.9, 0.9, 0.98, 0.9])

In [None]:
import lightgbm as lgb

# Very large round budget: early stopping on the validation fold decides how
# many trees each fold model actually keeps.
# NOTE(review): LightGBM documents that row subsampling is only active when
# `subsample_freq` > 0 -- confirm whether `subsample=0.45` has any effect here.
# NOTE(review): newer LightGBM releases replace the fit-time `verbose` /
# `early_stopping_rounds` arguments with callbacks -- confirm installed version.
model = lgb.LGBMClassifier(
    n_estimators=200000,
    learning_rate=0.05,
    num_leaves=2,
    subsample=0.45,
    colsample_bytree=0.45,
)

y_preds_lgb = np.zeros(len(y))  # out-of-fold predictions, filled fold by fold
test_preds_allfolds = []  # one test-set prediction vector per fold model
for i, (tr_idx, val_idx) in enumerate(cvlist):
    X_dev, X_val = X.iloc[tr_idx], X.iloc[val_idx]
    y_dev, y_val = y.iloc[tr_idx], y.iloc[val_idx]
    model.fit(
        X_dev,
        y_dev,
        eval_set=[(X_val, y_val)],
        eval_metric="auc",
        verbose=50,
        early_stopping_rounds=200,
    )
    # Score the held-out fold and the full test set with this fold's model.
    val_preds = model.predict_proba(X_val)[:, 1]
    test_preds = model.predict_proba(X_test)[:, 1]
    y_preds_lgb[val_idx] = val_preds
    test_preds_allfolds.append(test_preds)
    print("Score for fold {} is {}".format(i, roc_auc_score(y_val, val_preds)))
    # break  # uncomment to stop after the first fold
print("Overall Score for oof predictions ", roc_auc_score(y, y_preds_lgb))

In [None]:
# Combine the per-fold test predictions with an element-wise geometric mean
# (axis 0 runs over the fold models).
y_test_preds_lgb = gmean(test_preds_allfolds, axis=0)

# Overlay the out-of-fold and blended test prediction distributions.
sns.distplot(y_preds_lgb)
sns.distplot(y_test_preds_lgb)

In [None]:
# Build the submission: one row per test ID with the blended LightGBM prediction.
# `.copy()` makes an independent frame, so adding a column does not write into
# a slice of `test` (which triggers pandas' SettingWithCopyWarning).
sub = test[["ID_code"]].copy()
# The blended fold predictions are stored in `y_test_preds_lgb`; the name
# `y_test_preds_lgb2` used here before is never assigned and raised a NameError.
sub["target"] = y_test_preds_lgb
sub.to_csv("submission_lgbm2_v1.csv", index=False)

### Modelling

Pointers:
*  Validation strategy -- Random KFold, holdout or temporal split ??
* What to trust: validation score or LB score?? Trust the score computed on more data; if the test data is larger, we should treat the LB as an additional fold
* Hyperparameter tuning -- Combination of manual tuning and Bayesian optimization libraries like `hyperopt` and `scikit-optimize`. Initial tuning on a single fold and then move to 5 folds.
* Always check validation and test set prediction distributions
* **Read forums and participate in discussions**

In [None]:
# Blend the out-of-fold predictions: 5% logistic regression, 95% LightGBM.
# (`y_preds` is never assigned in this notebook; the LightGBM out-of-fold
# predictions are stored in `y_preds_lgb`.)
weighted_preds = 0.05 * y_preds_lr + 0.95 * y_preds_lgb
roc_auc_score(y, weighted_preds)

In [None]:
# Load the submission file shipped with the
# `santander-lgb-new-features-rank-mean-10-folds` input and preview its first rows.
public_sub_path = "../input/santander-lgb-new-features-rank-mean-10-folds/submission_LGBM.csv"
public_sub = pd.read_csv(public_sub_path)
public_sub.head()

In [None]:
# Weighted average: 10% of the current `sub` predictions, 90% of `public_sub`.
# NOTE(review): the two `target` columns are combined by row index, not by
# `ID_code` -- confirm both frames list the test IDs in the same order.
# NOTE(review): `sub["target"]` is overwritten, so re-running this cell blends again.
own_weight, public_weight = 0.1, 0.9
sub["target"] = own_weight * sub["target"] + public_weight * public_sub["target"]
sub.to_csv("submission_blend.csv", index=False)