In [0]:
import numpy as np
import pandas as pd
import seaborn as sns
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import train_test_split
from sklearn.utils import resample
from sklearn.linear_model import LogisticRegression
from sklearn import metrics
import matplotlib.pyplot as plt
from sklearn.preprocessing import StandardScaler, RobustScaler, Binarizer, KernelCenterer
from sklearn.naive_bayes import GaussianNB
from sklearn.ensemble import VotingClassifier
from sklearn.ensemble import AdaBoostClassifier
import warnings
warnings.filterwarnings('ignore')

In [0]:
from google.colab import files
files.upload()

{}

In [0]:
!pip install -q kaggle

In [0]:
import os
!mkdir -p ~/.kaggle
!cp kaggle.json ~/.kaggle/

In [0]:
!chmod 600 /root/.kaggle/kaggle.json
!mkdir santander
!cd santander

In [0]:
!kaggle competitions download -c santander-customer-transaction-prediction

Downloading train.csv.zip to /content
 88% 108M/122M [00:00<00:00, 117MB/s] 
100% 122M/122M [00:01<00:00, 126MB/s]
Downloading sample_submission.csv.zip to /content
  0% 0.00/463k [00:00<?, ?B/s]
100% 463k/463k [00:00<00:00, 62.5MB/s]
Downloading test.csv.zip to /content
100% 122M/122M [00:01<00:00, 113MB/s]



In [0]:
!mv sample_submission.csv.zip santander/
!mv train.csv.zip santander/
!mv test.csv.zip santander/

In [0]:
!mv santander/train.csv.zip santander/train.zip
!mv santander/test.csv.zip santander/test.zip

In [0]:
!unzip santander/train.zip -d santander/
!unzip santander/test.zip -d santander/

Archive:  santander/train.zip
  inflating: santander/train.csv     
Archive:  santander/test.zip
  inflating: santander/test.csv      


In [0]:
!rm santander/train.zip santander/test.zip santander/sample_submission.csv.zip

In [0]:
train = pd.read_csv("santander/train.csv")
test = pd.read_csv("santander/test.csv")

train.columns

Index(['ID_code', 'target', 'var_0', 'var_1', 'var_2', 'var_3', 'var_4',
       'var_5', 'var_6', 'var_7',
       ...
       'var_190', 'var_191', 'var_192', 'var_193', 'var_194', 'var_195',
       'var_196', 'var_197', 'var_198', 'var_199'],
      dtype='object', length=202)

In [0]:
## Scaling and Standardization
sc = StandardScaler()

train_std = sc.fit_transform(train[train.columns.difference(['ID_code', 'target'])])
test_std = sc.transform(test[test.columns.difference(['ID_code'])])

In [0]:
train_std = pd.DataFrame(train_std)
test_std = pd.DataFrame(test_std)

train_std['target'] = train['target']
train_std['ID_code'] = train['ID_code']
test_std['ID_code'] = test['ID_code']

In [0]:
## Resampling: 

train_pos = train_std[train_std['target'] == 1]
train_neg = train_std[train_std['target'] == 0]

train_std['target'].value_counts()

0    179902
1     20098
Name: target, dtype: int64

In [0]:
## upsample minority class: https://elitedatascience.com/imbalanced-classes
## minority class - positive class
train_pos_upsampled = resample(train_pos, 
                                 replace=True,     # sample with replacement
                                 n_samples=len(train_neg),    # to match majority class
                                 random_state=42) 
 
# Combine majority class with upsampled minority class
train_upsampled = pd.concat([train_neg, train_pos_upsampled])

train_upsampled['target'].value_counts()


1    179902
0    179902
Name: target, dtype: int64

In [0]:
'''## SMOTE resampling

from imblearn.over_sampling import SMOTE

sm = SMOTE(random_state=0, ratio=1.0)
train_upsampled, target = sm.fit_sample(train_std[train_std.columns.difference(['ID_code','target'])], train_std['target'])

'''

In [0]:
'''target = train_upsampled['target']
train_upsampled = train_upsampled.drop(['ID_code', 'target'], axis = 1)'''

In [0]:
train_upsampled = train_upsampled.drop(['ID_code'], axis = 1)
test_1 = test_std.drop(['ID_code'], axis = 1)


In [0]:
def augment(train,num_n=1,num_p=2):
    newtrain=[train]
    
    n=train[train.target==0]
    for i in range(num_n):
        newtrain.append(n.apply(lambda x:x.values.take(np.random.permutation(len(n)))))
    
    for i in range(num_p):
        p=train[train.target>0]
        newtrain.append(p.apply(lambda x:x.values.take(np.random.permutation(len(p)))))
    return pd.concat(newtrain)

In [0]:
## LightGBM
import lightgbm
param = {
    'bagging_freq': 5,
    'bagging_fraction': 0.335,
    'boost_from_average':'false',
    'boost': 'gbdt',
    'feature_fraction': 0.041,
    'learning_rate': 0.0083,
    'max_depth': -1,
    'metric':'auc',
    'min_data_in_leaf': 80,
    'min_sum_hessian_in_leaf': 10.0,
    'num_leaves': 13,
    'num_threads': 8,
    'tree_learner': 'serial',
    'objective': 'binary', 
    'verbosity': -1
}

In [0]:
'''from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import RepeatedStratifiedKFold'''

In [0]:
'''result=np.zeros(test_1.shape[0])

rskf = RepeatedStratifiedKFold(n_splits=5, n_repeats=5,random_state=10)
for counter,(train_index, valid_index) in enumerate(rskf.split(train_upsampled, train_upsampled.target),1):
    print (counter)
    
    #Train data
    t=train_upsampled.iloc[train_index]
    t=augment(t)
    trn_data = lightgbm.Dataset(t.drop("target",axis=1), label=t.target)
    
    #Validation data
    v=train_upsampled.iloc[valid_index]
    val_data = lightgbm.Dataset(v.drop("target",axis=1), label=v.target)
    
    #Training
    model = lightgbm.train(param, trn_data, 10000, valid_sets = [trn_data, val_data], verbose_eval=500, early_stopping_rounds = 4000)
    result +=model.predict(test)
    '''

In [0]:
'''x, x_val, y, y_val = train_test_split(train_upsampled, train_upsampled.target, test_size=0.2, random_state=0)'''
x = augment(train_upsampled)
train_data = lightgbm.Dataset(x.drop('target', axis=1), label=x.target)
#val_data = lightgbm.Dataset(x_val.drop('target', axis=1), label=x_val.target)
clf = lightgbm.train(param, train_data, 15000, early_stopping_rounds = 3000, valid_sets = [train_data], verbose_eval=5000)

y_pred_lgbm = clf.predict(test_1, num_iteration=clf.best_iteration)


Training until validation scores don't improve for 3000 rounds.
[5000]	training's auc: 0.908438
[10000]	training's auc: 0.916664
[15000]	training's auc: 0.922357
Did not meet early stopping. Best iteration is:
[15000]	training's auc: 0.922357


In [0]:
x.shape

(899510, 201)

In [0]:
'''from sklearn.neighbors import KNeighborsClassifier
from mlxtend.classifier import StackingCVClassifier
from xgboost import XGBClassifier

RANDOM_SEED = 0

np.random.seed(RANDOM_SEED)

clf1 = LogisticRegression()
clf2 = XGBClassifier(objective="binary:logistic", eval_metric='auc', max_depth=4, gamma=5, min_child_weight=50, reg_lambda=1, reg_alpha=0, 
                       booster='gbtree', random_state=0, silent=None, n_estimators=200)
clf3 = GaussianNB()
#####clf4 = lightgbm(param)
#clf4 = lightgbm.train(param, train_data, 10000, early_stopping_rounds = 2000, valid_sets = [train_data, val_data], verbose_eval=5000)

lr = LogisticRegression()

sclf = StackingCVClassifier(classifiers=[clf1, clf2, clf3, clf4],
                            #use_probas=True,
                            meta_classifier=lr)

'''

In [0]:
'''from sklearn import model_selection
print('3-fold cross validation:\n')

for clf, label in zip([clf1, clf2, clf3, sclf], 
                      ['Logistic Regression', 
                       'XGBoost', 
                       'Gaussian Naive Bayes',
                       'StackingClassifier']):

    scores = model_selection.cross_val_score(clf, train_upsampled, target, 
                                              cv=3, scoring='accuracy')
    print("Accuracy: %0.2f (+/- %0.2f) [%s]" 
          % (scores.mean(), scores.std(), label))
    

3-fold cross validation:

Accuracy: 0.78 (+/- 0.00) [Logistic Regression]
Accuracy: 0.83 (+/- 0.00) [XGBoost]
Accuracy: 0.81 (+/- 0.00) [Gaussian Naive Bayes]''''''

In [0]:
'''sclf.fit(train_upsampled.values, target.values)
y_pred_stack = sclf.predict(test_1.values)
'''

In [0]:
submission = pd.DataFrame({'ID_code':test_std.ID_code,'target':y_pred_lgbm})
submission.to_csv('santander/submission.csv', index=False)