## Santander Customer Transaction Prediction


##  **Import libraries**
---

In [None]:
import gc
import warnings
warnings.filterwarnings('ignore')
import numpy as np
import pandas as pd
import seaborn as sns
import lightgbm as lgb
import xgboost as xgb
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import StratifiedKFold
from sklearn.preprocessing import StandardScaler 
from imblearn.over_sampling import SMOTE

### **Specify the seed for reproducing the results**

In [None]:
# Fix the random seed so that NumPy-driven randomness is repeatable across runs.
seed = 127
np.random.seed(seed)

# Keep automatic garbage collection switched on.
gc.enable()

### ***Load Data***
The code below loads the training and test data.

In [None]:
# Read the full training and test sets from the input directory.
# For a quick smoke test, read only a sample instead by calling
# pd.read_csv(path, nrows=3000) on each file.
train_df, test_df = (
    pd.read_csv(csv_path)
    for csv_path in ("../input/train.csv", "../input/test.csv")
)

### ***Data Insight***
The pandas `describe` function helps us understand the data by giving summary statistics such as the mean, standard deviation, and count.

In [None]:
# Display summary statistics (count, mean, std, min, quartiles, max) of the training data.
train_df.describe()

## **Data visualization**

1.  The dataset is imbalanced: class 0 has approximately 175,000 samples, while class 1 has only about 25,000.

In [None]:
# Bar chart of the number of training rows for each value of `target`.
sns.countplot(x='target',data=train_df)

> ### As the dataset is imbalanced, we will use cross-validation instead of a traditional train/test split.

In [None]:
# Labels: the `target` column of the training frame.
y = train_df["target"]
# Features: every column from the third one onwards (the first two columns are skipped).
X = train_df.iloc[:, 2:]

In [None]:
# Move the `ID_code` column out of the test frame (in place) and keep it as a
# one-column DataFrame; it is needed later to build the submission file.
ids = test_df.pop("ID_code").to_frame()

In [None]:
# Standardise every feature to zero mean and unit variance.
# The scaler is fitted on the training features only; the same statistics are
# then applied to the test features.
# NOTE: StandardScaler has no seed parameter (it is deterministic) and its
# options must be given by keyword, so the stray positional `seed` argument
# has been dropped.
sc = StandardScaler(with_mean=True, with_std=True)
# From here on `train_df` and `test_df` hold the scaled feature matrices
# returned by the scaler, not the original DataFrames.
train_df = sc.fit_transform(X)
test_df = sc.transform(test_df)

In [None]:
# Sanity check: dimensions of the scaled training matrix.
train_df.shape

In [None]:
# Hyper-parameters handed to LightGBM for every fold.
params = dict(
    tree_learner='serial',
    objective='binary',          # binary classification of `target`
    learning_rate=0.00742,
    num_leaves=3,                # very small trees
    bagging_freq=5,              # together with bagging_fraction: row subsampling
    bagging_fraction=0.33,
    boost_from_average=True,
    boost='gbrt',
    feature_fraction=0.04,       # column subsampling per tree
    max_depth=-1,                # no explicit depth limit
    metric='auc',
    min_data_in_leaf=85,
    min_sum_hessian_in_leaf=10.0,
    num_threads=16,
    verbosity=1,
)

In [None]:
# Upper bound on the number of boosting iterations; the training loop also
# passes an early-stopping setting, so this limit is a ceiling rather than a target.
num_round = 2000000
# Stratified, shuffled 11-fold cross-validation with a fixed seed.
folds = StratifiedKFold(n_splits=11, shuffle=True, random_state=seed)
# Materialise the (train_idx, valid_idx) pairs. `split` returns a one-shot
# generator; if it were kept lazy, running the training cell a second time
# would silently iterate over zero folds and leave the predictions at zero.
lstCV = list(folds.split(train_df, y))

In [None]:
# Zero-initialised prediction buffers:
#   train_pred - one entry per training row (filled with out-of-fold predictions)
#   test_pred  - one entry per test row (accumulates the fold-averaged prediction)
train_pred, test_pred = np.zeros(X.shape[0]), np.zeros(test_df.shape[0])

In [None]:
# Train one LightGBM model per cross-validation fold.
# For each fold the model is fitted on the training part, its predictions on
# the held-out part are stored in `train_pred` (out-of-fold predictions), and
# its test-set predictions are added to `test_pred`, weighted by 1 / n_splits
# so that the final `test_pred` is the average over all folds.
for fold_, (trn_idx, val_idx) in enumerate(lstCV):
    print("Fold Index {}".format(fold_))
    # The fold indices are positions. `y` is a pandas Series, so select with
    # `.iloc`; plain `y[idx]` is label-based and only gives the same rows
    # while the index happens to be 0..n-1.
    trn_data = lgb.Dataset(train_df[trn_idx], label=y.iloc[trn_idx])
    val_data = lgb.Dataset(train_df[val_idx], label=y.iloc[val_idx])
    # NOTE(review): newer LightGBM releases express `verbose_eval` and
    # `early_stopping_rounds` as callbacks (lgb.log_evaluation /
    # lgb.early_stopping) - confirm against the installed version.
    clf = lgb.train(
        params,
        trn_data,
        num_round,
        valid_sets=[trn_data, val_data],
        verbose_eval=5000,
        early_stopping_rounds=6000,
    )
    # Predict with the best iteration found by early stopping.
    train_pred[val_idx] = clf.predict(train_df[val_idx], num_iteration=clf.best_iteration)
    test_pred += clf.predict(test_df, num_iteration=clf.best_iteration) / folds.n_splits
# Overall ROC AUC of the out-of-fold predictions across the whole training set.
print("CV score: {:<8.10f}".format(roc_auc_score(y, train_pred)))

In [None]:
# Assemble the submission: one row per test ID with its averaged prediction,
# written to disk without the index column.
sub_df = pd.DataFrame({"ID_code": ids["ID_code"], "target": test_pred})
sub_df.to_csv("submission.csv", index=False)