In [None]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split
from imblearn.over_sampling import SMOTE
from sklearn.model_selection import cross_val_score
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.svm import SVC
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import StratifiedKFold, KFold
from collections import Counter, defaultdict
from sklearn.metrics import roc_auc_score
from scipy.stats import norm, rankdata
import lightgbm
import xgboost
from sklearn import metrics
from sklearn.preprocessing import Imputer
from sklearn import metrics
import gc
import os
# Show which files are present in the input directory before loading anything.
input_files = os.listdir("../input")
print(input_files)

In [None]:
# Read the train and test tables from the input directory.
train_df, test_df = (
    pd.read_csv(csv_path)
    for csv_path in ("../input/train.csv", "../input/test.csv")
)

In [None]:
# Preview the raw training table; at this point it still holds the 'target'
# and 'ID_code' columns that the next cell removes.
train_df.head()

In [None]:
# Separate the label and the row identifiers from the feature columns.
# NOTE: pop() mutates the frames, so this cell can only be run once per load.
target = train_df.pop('target')
len_train = len(train_df)

# pop() returns the column and drops it from the frame in a single step.
train_ids = train_df.pop('ID_code')
test_ids = test_df.pop('ID_code')

# Cell output: the remaining shapes of both frames.
train_df.shape, test_df.shape

In [None]:
# Preview again after 'target' and 'ID_code' were removed from train_df.
train_df.head()

In [None]:
# Per-column means of the train and test frames, drawn on one axis so the
# two sets can be compared feature by feature.
mean_train = train_df.mean()
mean_test = test_df.mean()

fig, ax = plt.subplots(figsize=(15, 10))
ax.plot(mean_train, color='blue', label='train')
ax.plot(mean_test, color='pink', label='test')
# Label the figure so it can be read without knowing the colour convention.
ax.set(title='Per-feature mean: train vs. test', xlabel='feature', ylabel='mean')
ax.legend()
plt.show()

In [None]:
# Tabulate the per-feature means; the rows are labelled explicitly because the
# default integer labels do not say which row is test and which is train.
pd.DataFrame([mean_test, mean_train], index=['test', 'train'])

In [None]:
# Per-column standard deviations of the train and test frames, drawn on one
# axis so the two sets can be compared feature by feature.
std_train = train_df.std()
std_test = test_df.std()

fig, ax = plt.subplots(figsize=(15, 10))
ax.plot(std_train, color='blue', label='train')
ax.plot(std_test, color='pink', label='test')
# Label the figure so it can be read without knowing the colour convention.
ax.set(title='Per-feature standard deviation: train vs. test',
       xlabel='feature', ylabel='standard deviation')
ax.legend()
plt.show()

In [None]:
# Tabulate the per-feature standard deviations; the rows are labelled explicitly
# because the default integer labels do not say which row is test and which is train.
pd.DataFrame([std_test, std_train], index=['test', 'train'])

In [None]:
# Re-centre every test column: remove its own mean, then add the train mean.
# NOTE: train_df/test_df are overwritten here, so re-running this cell applies
# the shift and scaling a second time.
test_df = test_df.sub(mean_test, axis='columns').add(mean_train, axis='columns')

# Scale both frames by the train standard deviation of each column.
train_df = train_df.div(std_train, axis='columns')
test_df = test_df.div(std_train, axis='columns')

In [None]:
# Stack train on top of test so features are derived over both sets at once.
# The original row labels are kept, so rows are later recovered by position
# (the first len_train rows are train).
merged_df = pd.concat((train_df, test_df), axis=0)

In [None]:
# Shapes of the stacked frame and of its two inputs, in that order.
tuple(frame.shape for frame in (merged_df, train_df, test_df))

In [None]:
# Add more features
# Snapshot the base column names first: the loop appends new columns to
# merged_df, and only the original columns should be expanded.
base_cols = list(merged_df.columns)

for col in base_cols:
    # Normalize the data, so that it can be used in norm.cdf(), as though it is a standard normal variable
    merged_df[col] = ((merged_df[col] - merged_df[col].mean()) / merged_df[col].std()).astype('float32')
    z = merged_df[col]

    # Signed square root. The column is zero-centred at this point, so a plain
    # `z ** (1/2)` yields NaN for every negative value; take the root of the
    # magnitude and restore the sign instead.
    merged_df[col+'_sq'] = np.sign(z) * np.sqrt(np.abs(z))

    # Square
    z_pow2 = z * z
    merged_df[col+'_s'] = z_pow2

    # Cube (same left-to-right multiplication order as z * z * z)
    z_pow3 = z_pow2 * z
    merged_df[col+'_c'] = z_pow3

    # 4th power
    merged_df[col+'_q'] = z_pow3 * z

    # Rank of each value within the stacked train+test column (not normalized)
    merged_df[col+'_r'] = rankdata(z).astype('float32')

    # Cumulative normal percentile
    merged_df[col+'_n'] = norm.cdf(z).astype('float32')

In [None]:
# Trigger a garbage-collection pass after the feature-building loop; the value
# returned by gc.collect() is shown as the cell output.
gc.collect()

In [None]:
# Check the frame size after feature generation: the loop above adds six
# derived columns ('_sq', '_s', '_c', '_q', '_r', '_n') per base column.
merged_df.shape

In [None]:
# First LightGBM parameter set.
# NOTE: the following cell assigns `param` again, so these values are not the
# ones passed to training unless that cell is skipped.
param = dict(
    # Tree shape and binning
    num_leaves=10,
    max_bin=63,
    min_data_in_leaf=11,
    learning_rate=0.02,
    min_sum_hessian_in_leaf=0.00245,
    # Row / column sampling
    bagging_fraction=1.0,
    bagging_freq=5,
    feature_fraction=0.05,
    # Regularisation
    lambda_l1=4.972,
    lambda_l2=2.276,
    min_gain_to_split=0.65,
    max_depth=14,
    save_binary=True,
    # Seeds
    seed=1337,
    feature_fraction_seed=49,
    bagging_seed=49,
    drop_seed=49,
    data_random_seed=1337,
    # Objective and reporting
    objective='binary',
    boosting_type='gbdt',
    verbose=1,
    metric='auc',
    is_unbalance=True,
    boost_from_average=False,
)

In [None]:
# LightGBM parameter set used by the training loop (replaces any earlier `param`).
param = dict(
    # Row / column sampling
    bagging_freq=5,
    bagging_fraction=0.38,
    boost_from_average='false',  # passed as the string 'false', not a bool
    boost='gbdt',
    feature_fraction=0.045,
    # Learning rate and tree shape
    learning_rate=0.0095,
    max_depth=-1,
    metric='auc',
    min_data_in_leaf=80,
    min_sum_hessian_in_leaf=10.0,
    num_leaves=13,
    # Execution
    num_threads=8,
    tree_learner='serial',
    objective='binary',
    verbosity=1,
)

In [None]:
# Undo the stacking by position: the first len_train rows are the training
# rows, the remainder are the test rows.
x, x_test = merged_df.iloc[:len_train], merged_df.iloc[len_train:]
y = target

In [None]:
# Drop the names of the intermediate frames that are no longer needed, then
# run a garbage-collection pass (its return value is the cell output).
del merged_df, target, train_df, test_df
gc.collect()

In [None]:
# Stratified K-fold training with LightGBM.
#   oof         - out-of-fold prediction for every training row
#   predictions - test prediction averaged over the folds
nfold = 5
k = StratifiedKFold(n_splits=nfold, shuffle=True, random_state=49)

oof = np.zeros(len(y))
predictions = np.zeros(len(x_test))

for i, (train_idx, val_idx) in enumerate(k.split(x, y.values), start=1):
    print("\n fold {}".format(i))

    light_train = lightgbm.Dataset(x.iloc[train_idx].values,
                                  label = y.iloc[train_idx].values,
                                  free_raw_data = False)
    light_val = lightgbm.Dataset(x.iloc[val_idx].values,
                                label = y.iloc[val_idx].values,
                                free_raw_data = False)

    # Up to 10000 rounds; stop once validation AUC has not improved for 50 rounds.
    # NOTE(review): newer LightGBM releases move `verbose_eval` and
    # `early_stopping_rounds` into callbacks - confirm against the installed version.
    clf = lightgbm.train(param, light_train, 10000, valid_sets=[light_val], verbose_eval=50, early_stopping_rounds=50)

    # Score the held-out rows with the best iteration found by early stopping.
    oof[val_idx] = clf.predict(x.iloc[val_idx].values, num_iteration = clf.best_iteration)

    # Each fold contributes an equal share to the final test prediction.
    predictions += clf.predict(x_test.values, num_iteration= clf.best_iteration) /nfold

# Overall cross-validated score: every training row was predicted exactly once
# by a model that did not see it during fitting.
print("\n CV AUC: {:.5f}".format(roc_auc_score(y, oof)))

In [None]:
# Trigger a garbage-collection pass after the training loop; the value
# returned by gc.collect() is shown as the cell output.
gc.collect()

In [None]:
# Build the submission table (test IDs next to the fold-averaged predictions)
# and write it to disk without the row index.
sub_df = pd.DataFrame({'ID_code': test_ids, 'target': predictions})
sub_df.to_csv("sub1.csv", index=False)