In [None]:
# import package
import numpy as np 
import pandas as pd 
import gc
import datetime
import warnings
import seaborn as sns
import math

from sklearn.preprocessing import OneHotEncoder
from scipy.stats import norm, rankdata

# scaling
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import StandardScaler

# Dimension Reduction
from sklearn.decomposition import PCA
from sklearn.manifold import MDS
from sklearn.cluster import KMeans

# Visualisation
import matplotlib.pyplot as plt
import seaborn as sns

# Evaluation
from tqdm import tqdm_notebook
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score, roc_curve, precision_recall_curve
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split

# Model
from sklearn.model_selection import GridSearchCV
from sklearn.dummy import DummyClassifier
from sklearn.linear_model import LogisticRegression, Ridge, Lasso, RidgeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
import lightgbm as lgb
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier, GradientBoostingClassifier, ExtraTreesClassifier, ExtraTreesRegressor                   
from sklearn.naive_bayes import GaussianNB
from sklearn.naive_bayes import ComplementNB
from xgboost import XGBClassifier
#import xgboost as xgb
from catboost import CatBoostRegressor, CatBoostClassifier
from vecstack import stacking


# bayesian optimisation
import hyperopt
from hyperopt import hp
from hyperopt import tpe
from hyperopt import Trials
from hyperopt import fmin
import csv
from timeit import default_timer as timer
from hyperopt import STATUS_OK, hp, tpe, Trials, fmin
import ast

from numpy.random import RandomState

# Silence every warning for the rest of the session. Note that this also hides
# deprecation warnings raised by the libraries imported above.
warnings.filterwarnings('ignore')

In [None]:
# Load the train, test and sample-submission CSVs (in that order).
# All three are read with the 'utf-8-sig' encoding.
data_train, data_test, data_sample = (
    pd.read_csv(csv_path, encoding='utf-8-sig')
    for csv_path in (
        "../input/train.csv",
        "../input/test.csv",
        "../input/sample_submission.csv",
    )
)

## Check Data
1. Data shape
2. Columns
3. Missing values
4. Data types -> the model can only handle float-typed features

In [None]:
# [1] Report the (rows, columns) shape of each frame.
print("Train data shape:", data_train.shape)
print("Test data shape:", data_test.shape)

# [2] Report the columns that exist in one frame but not in the other.
train_cols = data_train.columns
test_cols = data_test.columns
only_in_train = train_cols[~train_cols.isin(test_cols)]
only_in_test = test_cols[~test_cols.isin(train_cols)]

print("check columns in train data but not in test data: \n",
      only_in_train)
print("check columns in test data but not in train data: \n",
      only_in_test)

In [None]:
# [3] Fraction of missing values per column of the train data.
null_mask = data_train.isnull()
missing_ratio = null_mask.sum() / null_mask.count()

# Two-column table: feature name and its missing-value ratio.
data_check_missing_val = (
    missing_ratio
    .to_frame("percentage of missing value")
    .reset_index()
    .rename(columns={"index": "feature_name"})
)

# Keep only the features whose ratio is non-zero.
has_missing = data_check_missing_val["percentage of missing value"] != 0
print("feature with missing value: ", data_check_missing_val.loc[has_missing, "feature_name"])


In [None]:
# [4] List the columns that are not float-typed (checked before any scaling).
def _non_float_columns(frame):
    """Return the names of the columns of `frame` whose dtype is not float."""
    return [name for name, dtype in frame.dtypes.items() if dtype != float]


print("train data columns that are not float type: ",
      _non_float_columns(data_train))

print("test data columns that are not float type: ",
      _non_float_columns(data_test))

In [None]:
# DataFrame.info() writes its report to stdout itself and returns None, so it
# must not be wrapped in print() -- that would emit a stray "None" line.
print("---> train data information <--- \n")
data_train.info()
print("---> test data information <--- \n")
data_test.info()

## Explore Data

1. Summary statistics -> the standard deviations and means are too high
2. Check for class imbalance
3. Check feature correlations
4. Check for duplicated values within each column

In [None]:
# [1] Distribution of the per-column standard deviation and mean (train data).
# describe() scans the whole frame, so compute it once and reuse it for both plots.
train_summary = data_train.describe()

plt.title("train data: distribution of standard deviation")
sns.distplot(train_summary.loc['std'])
plt.show()

plt.title("train data: distribution of mean")
sns.distplot(train_summary.loc['mean'])
plt.show()

In [None]:
# Distribution of the per-column standard deviation and mean (test data).
# describe() scans the whole frame, so compute it once and reuse it for both plots.
test_summary = data_test.describe()

plt.title("test data: distribution of standard deviation")
sns.distplot(test_summary.loc['std'])
plt.show()

plt.title("test data: distribution of mean")
sns.distplot(test_summary.loc['mean'])
plt.show()

In [None]:
# [2] check class imbalance
# Printed value is sum(target) / number of rows, i.e. a fraction rather than a
# percentage (presumably target is 0/1 -- verify against the data).
print("--> % of target data in the data <-- \n", 
      data_train["target"].sum()/len(data_train))

# Pass the column explicitly as `x=`: recent seaborn versions bind the first
# positional argument to `data`, so a positional Series is no longer read as x.
sns.countplot(x=data_train['target'])
plt.show()

In [None]:
# [3] check feature correlation
features = [col for col in data_train.columns if col not in ["ID_code", "target"]]
correlations = data_train[features].corr().abs().unstack().sort_values(kind="quicksort").reset_index()
correlations = correlations[correlations['level_0'] != correlations['level_1']]

In [None]:
# A cell only renders its LAST bare expression, so the first slice used to be
# silently discarded. Render both explicitly; `display` is provided by the
# IPython/Jupyter kernel.
display(correlations[:10])       # first 10 rows: lowest absolute correlations
display(correlations[:-11:-1])   # last 10 rows, reversed: highest first

In [None]:
# [4] check duplicate
val_of_col_train = []
num_of_dup_train = []
val_of_col_test = []
num_of_dup_test = []

for col in features:
    val_of_col_train.append(data_train[col].value_counts().nlargest(1).index.values.tolist())
    num_of_dup_train.append(data_train[col].value_counts().nlargest(1).values.tolist())
    val_of_col_test.append(data_test[col].value_counts().nlargest(1).index.values.tolist())
    num_of_dup_test.append(data_test[col].value_counts().nlargest(1).values.tolist())

In [None]:
# Column headers shared by both summary tables.
col_list = ["col_name", "most_freq_appearance_value", "# of duplicates"]


def _duplicates_table(names, top_values, top_counts):
    """Place the three parallel lists side by side as one labelled DataFrame."""
    parts = [pd.DataFrame(part) for part in (names, top_values, top_counts)]
    table = pd.concat(parts, axis=1)
    table.columns = col_list
    return table


data_check_duplicates_train = _duplicates_table(features, val_of_col_train, num_of_dup_train)
data_check_duplicates_test = _duplicates_table(features, val_of_col_test, num_of_dup_test)

In [None]:
# Ten train columns whose most frequent value occurs most often.
print("train data: top 10 col with most duplicate values")
data_check_duplicates_train.sort_values("# of duplicates", ascending=False).head(10)

In [None]:
# Ten test columns whose most frequent value occurs most often.
print("test data: top 10 col with most duplicate values")
data_check_duplicates_test.sort_values("# of duplicates", ascending=False).head(10)

## Adding features
- Compute row-wise aggregates (sum, min, max, ...) for each row

In [None]:
# Row-wise aggregate features, added in place to both the train and test frames.
# Maps each new column name to the DataFrame reduction applied with axis=1.
row_aggregates = {
    'sum': 'sum',
    'min': 'min',
    'max': 'max',
    'mean': 'mean',
    'std': 'std',
    'skew': 'skew',
    'kurt': 'kurtosis',
    'med': 'median',
}

# Never aggregate over the aggregate columns themselves. On a fresh top-to-bottom
# run `features` does not contain them, so this is a no-op; it only matters when
# the cell is re-run after `features` has been rebound to include the new columns.
base_features = [col for col in features if col not in row_aggregates]

for df in [data_train, data_test]:
    # Slice the base columns once instead of once per aggregate.
    base = df[base_features]
    for new_col, method in row_aggregates.items():
        df[new_col] = getattr(base, method)(axis=1)


## Model - LGBM
- Tree-based models do not need feature scaling
- Use Bayesian optimisation to tune the hyperparameters

In [None]:
# Label column of the train data and identifier column of the test data.
target = data_train['target']
ID = data_test["ID_code"]

# Every train column except the identifier and the label is a model feature.
# `features` and `new_features` are two names for the same list object.
excluded_cols = ["ID_code", "target"]
new_features = [name for name in data_train.columns if name not in excluded_cols]
features = new_features

In [None]:
# Number of columns that will be fed to the model.
n_features = len(features)
print("# of features in the data:", n_features)

## Optimal parameters from a Kaggle kernel

In [None]:
# LightGBM parameters taken from a Kaggle kernel.
# Author's notes: tuned for the case without the extra row-wise features
# (the original note also says "for model with 208 features"); when reused for
# the model with the extra features the reported score is 0.9.
param = dict(
    bagging_freq=5,
    bagging_fraction=0.4,
    boost_from_average='false',
    boost='gbdt',
    feature_fraction=0.05,
    learning_rate=0.01,
    max_depth=-1,
    metric='auc',
    min_data_in_leaf=80,
    min_sum_hessian_in_leaf=10.0,
    num_leaves=13,
    num_threads=8,
    tree_learner='serial',
    objective='binary',
    verbosity=1,
)

In [None]:
%%time

# use x fold to check since run so slow
folds = StratifiedKFold(n_splits=1, shuffle=False, random_state=0)
oof = np.zeros(len(data_train))
predictions = np.zeros(len(data_test))
feature_importance_df = pd.DataFrame()

for fold_, (trn_idx, val_idx) in enumerate(folds.split(data_train.values, target.values)):
    print("Fold {}".format(fold_))
    
    trn_data = lgb.Dataset(data_train.iloc[trn_idx][features], label=target.iloc[trn_idx])
    val_data = lgb.Dataset(data_train.iloc[val_idx][features], label=target.iloc[val_idx])

    num_round = 1000000
    clf = lgb.train(param, trn_data, num_round, valid_sets = [trn_data, val_data], 
                    verbose_eval=1000, early_stopping_rounds = 3000)
    
    # get 10 oof data -> cuz cv=10
    oof[val_idx] = clf.predict(data_train.iloc[val_idx][features], 
                               num_iteration=clf.best_iteration)
    
    fold_importance_df = pd.DataFrame()
    fold_importance_df["Feature"] = features
    fold_importance_df["importance"] = clf.feature_importance()
    fold_importance_df["fold"] = fold_ + 1
    feature_importance_df = pd.concat([feature_importance_df, fold_importance_df], axis=0)
    
    predictions += clf.predict(data_test[features], num_iteration=clf.best_iteration) / folds.n_splits

print("CV score: {:<8.5f}".format(roc_auc_score(target, oof)))

In [None]:
# Rank features by their importance averaged over the folds and keep the top 150.
mean_importance = (
    feature_importance_df[["Feature", "importance"]]
    .groupby("Feature")
    .mean()
)
ranked_importance = mean_importance.sort_values(by="importance", ascending=False)
cols = ranked_importance.head(150).index

# Per-fold rows of the selected features only.
is_top_feature = feature_importance_df.Feature.isin(cols)
best_features = feature_importance_df.loc[is_top_feature]

# Horizontal bar chart of the selected features, saved to FI.png.
fig, ax = plt.subplots(figsize=(14, 28))
sns.barplot(x="importance", y="Feature",
            data=best_features.sort_values(by="importance", ascending=False),
            ax=ax)
ax.set_title('Features importance (averaged/folds)')
fig.tight_layout()
fig.savefig('FI.png')

In [None]:
# Submission file: one row per test ID with its predicted target.
submit = pd.DataFrame({
    "ID_code": data_test["ID_code"].values,
    "target": predictions,
})
submit.to_csv("bayesian_optimisation_lgbm.csv", index=False)