In [244]:
import pandas as pd
import numpy as np
import os
import random
from IPython.display import display
from sklearn.metrics import precision_score, f1_score, recall_score, accuracy_score, roc_auc_score
from sklearn.model_selection import train_test_split, GridSearchCV, RandomizedSearchCV, validation_curve, cross_val_score
from sklearn.preprocessing import LabelEncoder, OneHotEncoder
from sklearn.preprocessing import Imputer, StandardScaler
from sklearn.ensemble import RandomForestClassifier
from scipy.stats import randint, uniform
from scikitplot.metrics import plot_confusion_matrix
from scikitplot.estimators import plot_feature_importances
import xgboost as xgb
import lightgbm as lgb
import catboost as cb
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

In [245]:
# Input CSV locations, relative to the notebook's working directory.
train_path, test_path = '../input/train.csv', '../input/test.csv'

# Loading

Load all the necessary csv files

In [246]:
# Read the training CSV into a DataFrame and preview its first rows.
train_data = pd.read_csv(train_path)
train_data.head()

# Preprocessing

1. Check all the dtypes of the input
2. Check all the statistics of data (use `describe(include='all')` of pandas)
3. Check for null values of all columns
4. Check for data imbalance
5. Fill the missing values for each column
6. Convert all the categorical input (object dtype) into int
7. Split the data into two sets ---> train and test

In [247]:
# Column dtypes and non-null counts (preprocessing step 1).
train_data.info()

In [248]:
# Summary statistics for every column, numeric and non-numeric alike (step 2).
train_data.describe(include='all')

In [249]:
# Number of missing values per column (step 3).
train_data.isnull().sum()

In [250]:
# Keep only the object-dtype columns to see which inputs still need integer encoding.
cat_df = train_data.select_dtypes(include=['object'])
cat_df.head()

In [251]:
# One count plot per column, laid out on a 3x3 grid inside figure 1.
plt.figure(1)

grid_columns = ['A', 'D', 'E', 'F', 'G', 'I', 'J', 'L', 'M']
for position, column in enumerate(grid_columns, start=1):
    # plt.subplot(3, 3, k) selects the same axes as the shorthand plt.subplot(33k).
    plt.subplot(3, 3, position)
    sns.countplot(data=train_data, x=column)

# Stretch the figure vertically (top=2.0) and add spacing between the panels
# so the nine plots and their tick labels do not overlap.
plt.subplots_adjust(top=2.0, bottom=0.08, left=0.10, right=0.95,
                    hspace=0.5, wspace=0.5)

plt.show()

In [252]:
# Class balance of column P (used as the label later on): bar chart plus raw counts.
sns.countplot(data=train_data, x='P')
plt.show()
print(train_data['P'].value_counts())

# Handle Missing Data
1. The easiest way is to drop the rows that contain NaN [a bit problematic if the test set also contains NaN values]
2. Fill with the most frequent value for categorical columns and the median for numerical ones.
3. Use the available data to build a model that predicts the missing values [Promising way! (how to proceed?)]

In [253]:
# Replace each listed column by 1-based integer category codes.
# pandas assigns code -1 to missing values, so they become 0 after the shift.
cols = ['A', 'D', 'E', 'F', 'G', 'I', 'J', 'L', 'M']
for item in cols:
    as_category = train_data[item].astype("category")
    train_data[item] = as_category.cat.codes + 1

train_data.head()

In [254]:
# cols = ['A', 'D', 'E', 'F', 'G', 'I', 'J', 'L', 'M']
# lb = LabelEncoder()

# for item in cols:
#     train_data[item] = lb.fit_transform(train_data[item])

# train_data.head()

In [255]:
# Column groups, one letter per column name.
# `not_cats` are the columns that get imputed and standardised further down.
cats = list('ADEFG')
not_cats = list('BCHN')

In [256]:
# Treat zeros in B and N as missing so the imputer fills them in.
for zero_is_missing in ('B', 'N'):
    train_data[zero_is_missing] = train_data[zero_is_missing].replace(0, np.nan)

In [257]:
# Mean-impute the NaNs in the `not_cats` columns.
# The fitted `imputer` is reused later on the competition test set.
imputer = Imputer(missing_values='NaN', strategy='mean')
values = train_data[not_cats]
transformed_values = pd.DataFrame(data=imputer.fit_transform(values),
                                  columns=not_cats)
print(transformed_values.head())

In [258]:
# Standardise the imputed columns with a freshly fitted StandardScaler.
ss = StandardScaler()
values = transformed_values[not_cats]
transformed = pd.DataFrame(data=ss.fit_transform(values), columns=not_cats)
display(transformed.head())

In [259]:
# Overwrite the imputed values in place with their standardised counterparts
# (DataFrame.update aligns on index and column labels).
transformed_values.update(transformed)
transformed_values.head()

In [260]:
import gc

# Write the imputed and scaled columns back into the training frame, in place.
train_data.update(transformed_values)
print(len(train_data))

# Drop the intermediate frames and ask the collector to reclaim their memory.
del transformed, transformed_values
gc.collect()

In [261]:
# Separate the label column P from the feature columns.
train_labels = train_data['P']
train_data = train_data.drop('P', axis='columns')

In [262]:
# Stratified 80/20 hold-out split with a fixed seed.
train, test, y_train, y_test = train_test_split(
    train_data,
    train_labels,
    test_size=0.20,
    stratify=train_labels,
    random_state=42,
)
for split_name, split_x, split_y in (('Training', train, y_train),
                                     ('Testing', test, y_test)):
    print(split_name, split_x.shape, split_y.shape)

# LightGBM

In [263]:
# Untuned LightGBM classifier; in this notebook it is only referenced by the
# commented-out randomized search below.
lg = lgb.LGBMClassifier(silent=False, sigmoid=1.0)

# param_dist = {"max_depth": [1, 2, 4, 5],
#               "learning_rate" : uniform(),
#               "num_leaves": [4, 15, 25],
#               "n_estimators": randint(1, 1001),
#               'lambda_l2': uniform()
# }
# n_iters_search = 10
# random_search = RandomizedSearchCV(lg, n_jobs=-1, param_distributions=param_dist, cv=10, scoring="accuracy", verbose=1, n_iter=n_iters_search)
# random_search.fit(train, y_train)
# print (random_search.best_estimator_)


# LightGBM parameters used by `lgb.train` below and by the LGBMClassifier in
# the stacking ensemble.
#
# 'objective' is set explicitly: `lgb.train` defaults to a regression
# objective, so without it the booster regresses on the 0/1 labels and its
# output is not a probability, although it is thresholded at 0.5 as one.
default_params = {
    'objective': 'binary',
    'max_depth': 6,
    # NOTE(review): 'booster' is XGBoost's spelling. LightGBM's key is
    # 'boosting' / 'boosting_type', so DART is probably not active here.
    # Left unchanged so the trained model type does not silently change - confirm intent.
    'booster': 'dart',
    'max_bin': 700,
    'learning_rate': 0.5,
    'num_leaves': 200,
    'n_estimators': 250,
    'lambda_l2': 0.05,
}


# n_estimators_range = np.linspace(1, 200, 10).astype('int')
# train_score, test_score = validation_curve(lgb.LGBMClassifier(**default_params),
#                                           train_data, train_labels,
#                                           param_name='n_estimators',
#                                           param_range=n_estimators_range,
#                                            cv=10,
#                                            scoring='accuracy'
#                                           )

# train_score_mean = np.mean(train_score, axis=1)
# test_score_mean = np.mean(test_score, axis=1)
# fig = plt.figure(figsize=(10, 6), dpi=100)
# plt.title('Validation Score with eta=0.3')
# plt.xlabel('Number of trees')
# plt.ylabel('Accuracy')

# plt.plot(n_estimators_range, train_score_mean, label='Training score', color='r')
# plt.plot(n_estimators_range, test_score_mean, label='Validation score', color='g')
# plt.legend(loc='best')
# plt.show()

# Train a LightGBM booster on the training split and evaluate it on the hold-out split.
dtrain = lgb.Dataset(train, label=y_train)
cat_features_name = ['A', 'D', 'E', 'F', 'G', 'I', 'J', 'L', 'M']
lgb_model = lgb.train(default_params, dtrain, categorical_feature=cat_features_name)

# Keep the continuous model output: ROC AUC is a ranking metric and must be
# computed on scores. On thresholded 0/1 labels it only measures one operating point.
y_score = lgb_model.predict(test)
y_pred = (y_score > 0.5).astype('int')
plot_confusion_matrix(y_test, y_pred, normalize=False)
plt.show()

print("\tPrecision: %1.3f" % precision_score(y_test, y_pred))
print("\tRecall: %1.3f" % recall_score(y_test, y_pred))
print("\tF1: %1.3f\n" % f1_score(y_test, y_pred))
print("\tAccuracy: %1.3f" % accuracy_score(y_test, y_pred))
print("\tROC: %1.3f" % roc_auc_score(y_test, y_score))

# Xgboost


In [264]:
# Untuned XGBoost classifier; in this notebook it is only referenced by the
# commented-out randomized search below.
xg = xgb.XGBClassifier(silent=False, objective='binary:logistic')

# param_dist = {"max_depth": randint(1, 5),
#               "gamma" : [0, 0.5, 1],
#               "n_estimators": randint(1, 1001),
#               "learning_rate": uniform(),
#               "colsample_bytree": uniform(),
#               "subsample":uniform(),
#               "reg_lambda":uniform(),
#               "reg_alpha":uniform()
# }
# n_iter_search = 30
# random_search = RandomizedSearchCV(xg, n_jobs=-1, param_distributions=param_dist, cv=15, scoring="accuracy", verbose=1, n_iter=n_iter_search)
# random_search.fit(train, y_train)
# print (random_search.best_estimator_)

# XGBoost parameter set; passed to XGBClassifier when the stacking ensemble is built.
default_param = dict(
    objective='binary:logistic',
    max_depth=2,
    booster='dart',
    learning_rate=0.3,
    silent=1,
    subsample=0.9599494371296681,
    colsample_bytree=0.8679014932353589,
    gamma=0,
    reg_lambda=0.45590387225044104,
    reg_alpha=0.5604423471664549,
)

# n_estimators_range = np.linspace(1, 300, 10).astype('int')
# train_score, test_score = validation_curve(xgb.XGBClassifier(**default_param),
#                                           train_data, train_labels,
#                                           param_name='n_estimators',
#                                           param_range=n_estimators_range,
#                                            cv=10,
#                                            scoring='accuracy'
#                                           )

# train_score_mean = np.mean(train_score, axis=1)
# test_score_mean = np.mean(test_score, axis=1)
# fig = plt.figure(figsize=(10, 6), dpi=100)
# plt.title('Validation Score with eta=0.3')
# plt.xlabel('Number of trees')
# plt.ylabel('Accuracy')

# plt.plot(n_estimators_range, train_score_mean, label='Training score', color='r')
# plt.plot(n_estimators_range, test_score_mean, label='Validation score', color='g')
# plt.legend(loc='best')
# plt.show()

# Configuration noted as "15 cv and 30 iters" --> '47 3 9 52'
# (presumably the confusion-matrix counts it produced - confirm).
xgb_model = xgb.XGBClassifier(
    base_score=0.5,
    booster='dart',
    colsample_bylevel=1,
    colsample_bytree=0.770338779380999,
    gamma=1,
    learning_rate=0.019096748632529414,
    max_delta_step=0,
    max_depth=1,
    min_child_weight=1,
    missing=None,
    n_estimators=500,
    n_jobs=1,
    nthread=None,
    objective='binary:logistic',
    random_state=0,
    reg_alpha=0.2825591812041359,
    reg_lambda=0.07528729199726858,
    subsample=0.43409905314113495,
)


#new   --> '46 4 7 54'
# xgb_model = xgb. XGBClassifier(base_score=0.5, booster='dart', colsample_bylevel=1,
#        colsample_bytree=0.23382821311638435, gamma=1,
#        learning_rate=0.03591284925507787, max_delta_step=0, max_depth=2,
#        min_child_weight=1, missing=None, n_estimators=600, n_jobs=1,
#        nthread=None, objective='binary:logistic', random_state=0,
#        reg_alpha=0.01173117437539295, reg_lambda=0.02099910740625921,
#        scale_pos_weight=1, seed=None, silent=False,
#        subsample=0.9310717459245332)

#best model
# xgb_model = xgb.XGBClassifier(base_score=0.5, booster='dart', colsample_bylevel=1,
#        colsample_bytree=0.7835814532405151, gamma=1,
#        learning_rate=0.3, max_delta_step=0, max_depth=5,
#        min_child_weight=1, missing=None, n_estimators=950, n_jobs=1,
#        nthread=None, objective='binary:logistic', random_state=0,
#        reg_alpha=0.08835265866227038, reg_lambda=0.9919841833494273, subsample=0.8716974299211524)

# Fit the XGBoost classifier and evaluate it on the hold-out split.
xgb_model.fit(train, y_train)
y_pred = xgb_model.predict(test)
y_pred = (y_pred>0.5).astype('int')
# Probability of the second class in `classes_` (assumed to be label 1 - confirm).
# ROC AUC needs these continuous scores, not the hard labels.
y_score = xgb_model.predict_proba(test)[:, 1]
plot_confusion_matrix(y_test, y_pred, normalize=False)
plt.show()

print("\tPrecision: %1.3f" % precision_score(y_test, y_pred))
print("\tRecall: %1.3f" % recall_score(y_test, y_pred))
print("\tF1: %1.3f\n" % f1_score(y_test, y_pred))
print("\tAccuracy: %1.3f" % accuracy_score(y_test, y_pred))
print("\tROC: %1.3f" % roc_auc_score(y_test, y_score))

# dtrain = xgb.DMatrix(train, label=y_train)
# dtest = xgb.DMatrix(test, label=y_test)
# num_round = 1000
# watchlist = [(dtrain, 'train'), (dtest, 'test')]
# bst = xgb.train(default_param, dtrain, num_round, watchlist, early_stopping_rounds=10, maximize=False, eval_metric="logloss")
# # make prediction
# y_pred = bst.predict(dtest)
# y_pred = (y_pred>0.5).astype('int')
# plot_confusion_matrix(y_test, y_pred, normalize=False)
# plt.show()

# print("\tPrecision: %1.3f" % precision_score(y_test, y_pred))
# print("\tRecall: %1.3f" % recall_score(y_test, y_pred))
# print("\tF1: %1.3f\n" % f1_score(y_test, y_pred))
# print("\tAccuracy: %1.3f" % accuracy_score(y_test, y_pred))
# print("\tROC: %1.3f" % roc_auc_score(y_test, y_pred))

# Free-form note kept as a bare string literal: four recorded numbers
# (presumably confusion-matrix counts - confirm) and the parameter set that
# produced them, labelled "bst model" by the author.
# NOTE(review): as the last expression of the cell, this string is also echoed as the cell output.
'''
45 5
6 55
default_param = { 'objective': 'binary:logistic',
                 'max_depth': 2,
                 'booster': 'dart',
                 'learning_rate':0.5,
                 'silent':1,
                 'subsample':0.9599494371296681,
                 'colsample_bytree':0.8679014932353589,
                 'gamma': 1,
                 'reg_lambda':0.45590387225044104,
                 'reg_alpha':0.5604423471664549
bst model
'''

# Random Forest

In [265]:
import time
from scipy.stats import randint as sp_randint

# Baseline forest for the commented-out randomized/grid searches below;
# `rf` is rebound to the tuned 1000-tree model at the end of this cell.
rf = RandomForestClassifier(n_estimators=50)

# # specify parameters and distributions to sample from
# param_dist = {"max_depth": np.arange(1, 4),
#               "max_features": sp_randint(1, 11),
#               "min_samples_split": sp_randint(2, 11),
#               "min_samples_leaf": sp_randint(1, 11),
#               "bootstrap": [True, False],
#               "criterion": ["gini", "entropy"]}

# # # run randomized search
# n_iter_search = 20
# random_search = RandomizedSearchCV(rf, param_distributions=param_dist,
#                                    n_iter=n_iter_search, verbose=1)

# start = time.time()
# random_search.fit(train, y_train)
# print("RandomizedSearchCV took %.2f seconds for %d candidates"
#       " parameter settings." % ((time.time() - start), n_iter_search))
# print (random_search.best_estimator_)

# # use a full grid over all parameters
# param_grid = {"max_depth": [3, 13, None],
#               "max_features": [1, 3, 10],
#               "min_samples_split": [2, 3, 10],
#               "min_samples_leaf": [1, 3, 10],
#               "bootstrap": [True, False],
#               "criterion": ["gini", "entropy"]}

# # run grid search
# grid_search = GridSearchCV(rf, param_grid=param_grid, verbose=1)
# start = time.time()
# grid_search.fit(train, y_train)
# print("GridSearchCV took %.2f seconds for %d candidate parameter settings."
#       % (time.time() - start, len(grid_search.cv_results_['params'])))
# print (grid_search.best_estimator_)

# Tree-shape parameters shared by the random forest and the gradient-boosting models.
default_param_rf = dict(
    max_depth=3,
    max_features=2,
    min_samples_leaf=2,
    min_samples_split=2,
)

# n_estimators_range = np.linspace(1, 200, 10).astype('int')
# train_score, test_score = validation_curve(RandomForestClassifier(**default_param),
#                                           train_data, train_labels,
#                                           param_name='n_estimators',
#                                           param_range=n_estimators_range,
#                                            cv=10,
#                                            scoring='accuracy'
#                                           )

# train_score_mean = np.mean(train_score, axis=1)
# test_score_mean = np.mean(test_score, axis=1)
# fig = plt.figure(figsize=(10, 6), dpi=100)
# plt.title('Validation Score with eta=0.3')
# plt.xlabel('Number of trees')
# plt.ylabel('Accuracy')

# plt.plot(n_estimators_range, train_score_mean, label='Training score', color='r')
# plt.plot(n_estimators_range, test_score_mean, label='Validation score', color='g')
# plt.legend(loc='best')
# plt.show()


# Fit the tuned random forest and evaluate it on the hold-out split.
rf = RandomForestClassifier(n_estimators=1000, **default_param_rf)
rf.fit(train, y_train)
y_pred = rf.predict(test)
y_pred = (y_pred>0.5).astype('int')
# Probability of the second class in `classes_` (assumed to be label 1 - confirm).
# ROC AUC needs these continuous scores, not the hard labels.
y_score = rf.predict_proba(test)[:, 1]
plot_confusion_matrix(y_test, y_pred, normalize=False)
plt.show()
plot_feature_importances(rf, feature_names=train_data.columns)
plt.show()


print("\tPrecision: %1.3f" % precision_score(y_test, y_pred))
print("\tRecall: %1.3f" % recall_score(y_test, y_pred))
print("\tF1: %1.3f\n" % f1_score(y_test, y_pred))
print("\tAccuracy: %1.3f" % accuracy_score(y_test, y_pred))
print("\tROC: %1.3f" % roc_auc_score(y_test, y_score))

In [266]:
from mlens.ensemble import SuperLearner
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

In [267]:
# Logistic-regression baseline, evaluated on the hold-out split.
lr = LogisticRegression()
lr.fit(train, y_train)
y_pred = (lr.predict(test)>0.5).astype('int')
# Probability of the second class in `classes_` (assumed to be label 1 - confirm).
# ROC AUC needs these continuous scores, not the hard labels.
y_score = lr.predict_proba(test)[:, 1]

plot_confusion_matrix(y_test, y_pred, normalize=False)
plt.show()
print("\tPrecision: %1.3f" % precision_score(y_test, y_pred))
print("\tRecall: %1.3f" % recall_score(y_test, y_pred))
print("\tF1: %1.3f\n" % f1_score(y_test, y_pred))
print("\tAccuracy: %1.3f" % accuracy_score(y_test, y_pred))
print("\tROC: %1.3f" % roc_auc_score(y_test, y_score))

In [268]:
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.ensemble import AdaBoostClassifier
# Gradient-boosting model using the same tree-shape parameters as the random
# forest, evaluated on the hold-out split.
gbc = GradientBoostingClassifier(learning_rate=0.01, n_estimators=500, **default_param_rf)
gbc.fit(train, y_train)
y_pred = (gbc.predict(test)>0.5).astype('int')
# Probability of the second class in `classes_` (assumed to be label 1 - confirm).
# ROC AUC needs these continuous scores, not the hard labels.
y_score = gbc.predict_proba(test)[:, 1]

plot_confusion_matrix(y_test, y_pred, normalize=False)
plt.show()
print("\tPrecision: %1.3f" % precision_score(y_test, y_pred))
print("\tRecall: %1.3f" % recall_score(y_test, y_pred))
print("\tF1: %1.3f\n" % f1_score(y_test, y_pred))
print("\tAccuracy: %1.3f" % accuracy_score(y_test, y_pred))
print("\tROC: %1.3f" % roc_auc_score(y_test, y_score))

In [269]:
# Candidate estimators. Only clf1 and clf2 (base layer) and clf4 (meta learner)
# are wired into the ensemble below; clf3, abc and lr are created but not added.
clf1 = lgb.LGBMClassifier(**default_params)
clf2 = xgb.XGBClassifier(**default_param)
clf3 = RandomForestClassifier(n_estimators=1000, **default_param_rf)
clf4 = GradientBoostingClassifier(learning_rate=0.01, n_estimators=500, **default_param_rf)
abc = AdaBoostClassifier(learning_rate=0.01, n_estimators=500)
lr = LogisticRegression()

# Stacked ensemble: one base layer feeding a gradient-boosting meta estimator.
ensemble = SuperLearner(scorer=accuracy_score, random_state=43, verbose=2)
base_layer = [clf1, clf2]
ensemble.add(base_layer)
ensemble.add_meta(clf4)

ensemble.fit(train, y_train)
preds = ensemble.predict(test)

# Accuracy is symmetric in its two arguments, so (y_true, y_pred) order gives the same value.
holdout_accuracy = accuracy_score(y_test, preds)
print("Prediction score: %.3f" % holdout_accuracy)

In [270]:
import keras
from keras.layers import Dense, Dropout
from keras.models import Sequential
from keras.callbacks import EarlyStopping

In [271]:
# Convert the hold-out split to plain numpy arrays before handing it to Keras.
train, y_train = np.array(train), np.array(y_train)
test, y_test = np.array(test), np.array(y_test)

batch_size = 32
epochs = 1000

# Small fully connected network: an input-width layer, a 10-unit layer and one sigmoid output.
n_features = train.shape[1]
model = Sequential([
    Dense(n_features, input_dim=n_features, activation='sigmoid'),
    Dense(10, activation='sigmoid'),
    Dense(1, activation='sigmoid'),
])
model.compile(optimizer='sgd', loss='binary_crossentropy', metrics=['acc'])

# Stop once the validation loss has not improved for 10 epochs.
# The hold-out split doubles as the validation data here.
es = EarlyStopping(monitor='val_loss', patience=10, verbose=1)
history = model.fit(
    train,
    y_train,
    batch_size=batch_size,
    epochs=epochs,
    validation_data=(test, y_test),
    verbose=1,
    callbacks=[es],
)

In [272]:
# summarize history for accuracy
plt.plot(history.history['acc'])
plt.plot(history.history['val_acc'])
plt.title('model accuracy')
plt.ylabel('accuracy')
plt.xlabel('epoch')
plt.legend(['train', 'test'], loc='upper left')
plt.show()
# summarize history for loss
plt.plot(history.history['loss'])
plt.plot(history.history['val_loss'])
plt.title('model loss')
plt.ylabel('loss')
plt.xlabel('epoch')
plt.legend(['train', 'test'], loc='upper left')
plt.show()

# Test

In [273]:
# Read the competition test CSV and preview its first rows.
test_data = pd.read_csv(test_path)
test_data.head()

In [274]:
# Number of missing values per column in the test set.
test_data.isnull().sum()

In [275]:
# Encode the test categoricals with the category -> code mapping of the
# TRAINING data. Deriving the codes from the test file alone numbers the
# categories by whichever values appear there, so the same category can get a
# different integer than the one the models were trained on.
cols = ['A', 'D', 'E', 'F', 'G', 'I', 'J', 'L', 'M']

# Re-read the raw columns: `train_data` already holds integer codes at this point.
# Reading the raw test columns too makes this cell safe to re-run.
raw_train_cats = pd.read_csv(train_path, usecols=cols)
raw_test_cats = pd.read_csv(test_path, usecols=cols)

for item in cols:
    train_categories = raw_train_cats[item].astype("category").cat.categories
    # Values missing or unseen in training get code -1, i.e. 0 after the shift.
    # That is the same value the training encoding gives to missing entries.
    test_data[item] = pd.Categorical(raw_test_cats[item], categories=train_categories).codes + 1

test_data.head()

In [276]:
# cols = ['A', 'D', 'E', 'F', 'G', 'I', 'J', 'L', 'M']
# lb = LabelEncoder()

# for item in cols:
#     test_data[item] = lb.transform(train_data[item])

# test_data.head()

In [277]:
# Treat zeros in B and N as missing, mirroring the training preprocessing.
for zero_is_missing in ('B', 'N'):
    test_data[zero_is_missing] = test_data[zero_is_missing].replace(0, np.nan)

In [278]:
# Fill NaNs in the test columns using the imputer fitted on the training data.
values = test_data[not_cats]
transformed_test_values = pd.DataFrame(data=imputer.transform(values),
                                       columns=not_cats)
print(transformed_test_values.head())

In [279]:
# Standardise the test columns with the scaler that was fitted on the
# TRAINING data (`ss` from the preprocessing section). Re-fitting a new
# scaler here would centre and scale the test set by its own statistics, so
# its features would no longer be on the scale the models were trained on.
values = transformed_test_values[not_cats]
transformed_test = pd.DataFrame(data=ss.transform(values), columns=not_cats)
display(transformed_test.head())

In [280]:
# Overwrite the imputed test values in place with their standardised counterparts
# (DataFrame.update aligns on index and column labels).
transformed_test_values.update(transformed_test)
transformed_test_values.head()

In [281]:
import gc

# Write the imputed and scaled columns back into the test frame, in place.
test_data.update(transformed_test_values)
print(len(test_data))

# Drop the intermediate frames and ask the collector to reclaim their memory.
del transformed_test, transformed_test_values
gc.collect()

In [282]:
# Submission frame: `id` copied from the test set, `P` filled in per model below.
sub = pd.DataFrame(columns=['id', 'P'])
sub['id'] = test_data['id']

In [283]:
def _to_labels(raw_predictions):
    """Threshold predictions at 0.5 and return them as an int array."""
    return (raw_predictions > 0.5).astype('int')


# Hard 0/1 predictions on the competition test set from each fitted model.
test_lgb_pred = _to_labels(lgb_model.predict(test_data))
# test_xgb_pred = bst.predict(xgb.DMatrix(test_data))
# test_xgb_pred = (test_xgb_pred>0.5).astype('int')
test_rf_pred = _to_labels(rf.predict(test_data))
test_ens_pred = _to_labels(ensemble.predict(test_data))
test_gbc_pred = _to_labels(gbc.predict(test_data))

In [None]:
# Write one submission CSV per model, reusing the same `sub` frame each time.
submission_runs = [
    ('submit_lgb.csv', test_lgb_pred),
    # ('submit_xgb.csv', test_xgb_pred),  # disabled together with the xgb test predictions
    ('submit_rf.csv', test_rf_pred),
    ('submit_emsemble.csv', test_ens_pred),
    ('submit_gbc.csv', test_gbc_pred),
]
for out_name, model_predictions in submission_runs:
    sub['P'] = model_predictions
    sub.to_csv(out_name, index=False)