# Santander Customer Transaction Prediction
## Can you identify who will make a transaction?

<img src="https://storage.googleapis.com/kaggle-organizations/141/thumbnail.jpg?r=890"
     alt="Competition organizer thumbnail" width="200px"
     style="float: left; margin-right: 10px;" />
     


In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import lightgbm as lgb
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import StratifiedKFold
import seaborn as sns

# Input data files are available in the "../input/" directory.
# For example, running this (by clicking run or pressing Shift+Enter) will list the files in the input directory

import os
import warnings
# Suppress warnings for the rest of the session, then show what is
# available in the input directory.
warnings.filterwarnings('ignore')
input_files = os.listdir("../input")
print(input_files)

# Any results you write to the current directory are saved as output.

# Load Data & Features

In [None]:
# Read the training and test tables from the input directory.
train_path = '../input/train.csv'
test_path = '../input/test.csv'
train_df = pd.read_csv(train_path)
test_df = pd.read_csv(test_path)

In [None]:
# (rows, columns) of the test frame first, then of the training frame.
shapes = (test_df.shape, train_df.shape)
shapes

In [None]:
# Preview the first five training rows.
train_df.head(n=5)

In [None]:
# Preview the first five test rows.
test_df.head(n=5)

**Missing data**

In [None]:
# True if any cell of the training frame is missing.
np.any(train_df.isna().values)

In [None]:
# True if any cell of the test frame is missing.
np.any(test_df.isna().values)

In [None]:
# Bar chart of how many rows fall into each target class.
sns.set_style('whitegrid')
# Pass the column as `x=` explicitly: recent seaborn releases treat the first
# positional argument as `data`, so a bare Series is no longer read as the
# variable to count.
sns.countplot(x=train_df['target'])

In [None]:
# Every column except the row identifier and the label is a model feature.
non_feature_cols = ('ID_code', 'target')
target = train_df['target']
features = [col for col in train_df.columns if col not in non_feature_cols]

# Run Model

In [None]:
# Summary statistics of the training frame, one row per column
# (describe() output transposed).
train_stats = train_df.describe().T
train_stats

In [None]:
# Preview the first five training rows.
train_df.head(n=5)

In [None]:
# Keep the test identifiers as an array for the submission file.
test_id = test_df.loc[:, 'ID_code'].values

In [None]:
# Feature-only copies: remove the identifier (and the label from the
# training frame).
train = train_df.drop(columns=['ID_code', 'target'])
test = test_df.drop(columns=['ID_code'])

In [None]:
def norm(x, stats=None):
    """Standardize the columns of ``x`` using per-column mean and std.

    Args:
        x: DataFrame whose columns are to be standardized.
        stats: DataFrame indexed by column name with ``'mean'`` and ``'std'``
            columns. Defaults to the module-level ``train_stats``.

    Returns:
        A DataFrame with exactly the columns of ``x`` (same order), each
        transformed as ``(value - mean) / std``. Columns that have no entry
        in ``stats`` come out as NaN.
    """
    if stats is None:
        stats = train_stats
    # Restrict the statistics to x's own columns. A plain ``x - stats['mean']``
    # aligns on the union of labels, so any extra row in ``stats`` (e.g. the
    # 'target' row produced by describing the full training frame) would be
    # added to the result as an all-NaN column.
    mean = stats['mean'].reindex(x.columns)
    std = stats['std'].reindex(x.columns)
    return (x - mean) / std
# From here on train_df / test_df are rebound to the normalized
# feature-only frames (train first, then test).
train_df, test_df = map(norm, (train, test))

In [None]:
# Parameter dictionary handed to lgb.train for every fold.
# Note: 'boost_from_average' is deliberately the string 'false', not a bool.
param = dict(
    bagging_freq=5,
    bagging_fraction=0.38,
    boost_from_average='false',
    boost='gbdt',
    feature_fraction=0.045,
    learning_rate=0.01,
    max_depth=-1,
    metric='auc',
    min_data_in_leaf=80,
    min_sum_hessian_in_leaf=10.0,
    num_leaves=15,
    num_threads=8,
    tree_learner='serial',
    objective='binary',
    verbosity=1,
)

In [None]:
# Cross-validation setup.
num_round = 10000  # upper bound on boosting rounds passed to lgb.train
kfold = 13         # number of stratified folds
# shuffle=True so that random_state actually takes effect. scikit-learn >= 0.24
# raises ValueError when random_state is set while shuffle is False.
folds = StratifiedKFold(n_splits=kfold, shuffle=True, random_state=44000)
# Out-of-fold predictions for the training rows and the accumulated
# (fold-averaged) predictions for the test rows.
oof = np.zeros(len(train_df))
predictions = np.zeros(len(test_df))

In [None]:
# Train one LightGBM model per fold, record its out-of-fold predictions and
# add its share of the averaged test prediction.
#
# LightGBM 4.0 removed the `verbose_eval` / `early_stopping_rounds` keyword
# arguments of lgb.train, so the same behaviour is requested via callbacks.
# `log_evaluation` is the newer name of `print_evaluation`; use whichever
# this LightGBM release provides.
_log_evaluation = getattr(lgb, "log_evaluation", None) or lgb.print_evaluation

for fold_, (trn_idx, val_idx) in enumerate(folds.split(train_df.values, target.values)):
    print("Fold {}".format(fold_))
    trn_data = lgb.Dataset(train_df.iloc[trn_idx], label=target.iloc[trn_idx])
    val_data = lgb.Dataset(train_df.iloc[val_idx], label=target.iloc[val_idx])
    # Fresh callback objects per fold so no early-stopping state carries over.
    fold_callbacks = [
        lgb.early_stopping(stopping_rounds=1000),
        _log_evaluation(period=500),
    ]
    clf = lgb.train(
        param,
        trn_data,
        num_round,
        valid_sets=[trn_data, val_data],
        callbacks=fold_callbacks,
    )
    # Predict with the best iteration found by early stopping.
    oof[val_idx] = clf.predict(train_df.iloc[val_idx], num_iteration=clf.best_iteration)
    predictions += clf.predict(test_df, num_iteration=clf.best_iteration) / folds.n_splits


In [None]:
# ROC AUC of the out-of-fold predictions against the true labels.
cv_auc = roc_auc_score(target, oof)
print("\n >> CV score: {:<8.5f}".format(cv_auc))

# Submission

In [None]:
# Pair each test identifier with its averaged prediction and write the
# result to submission.csv without the index column.
submission = pd.DataFrame({"ID_code": test_id, "target": predictions})
submission.to_csv("submission.csv", index=False)

In [None]:
# Preview the first five submission rows.
submission.head(n=5)