In [1]:
import os
import gc
import logging
import time
import math
from operator import itemgetter

import numpy as np
import pandas as pd

import matplotlib.pyplot as plt
import seaborn as sns

import lightgbm as lgb

from scipy.stats.kde import gaussian_kde

from sklearn import metrics
from sklearn.metrics import roc_auc_score as AUC
from sklearn.metrics import mean_squared_error, roc_auc_score, roc_curve, accuracy_score, auc
from sklearn.model_selection import StratifiedKFold
from sklearn.preprocessing import MinMaxScaler, StandardScaler, PolynomialFeatures
from sklearn.linear_model import LogisticRegression, LogisticRegressionCV
#from sklearn import svm
from sklearn.model_selection import cross_val_score, train_test_split
from sklearn.base import clone
from sklearn.model_selection import StratifiedKFold, StratifiedShuffleSplit, KFold
from sklearn.neighbors import KernelDensity
from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor
from sklearn.tree import DecisionTreeClassifier, DecisionTreeRegressor

from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import chi2
#from sklearn import cluster

from keras.layers import Activation, Dropout, Flatten, Dense, GlobalMaxPooling2D, BatchNormalization, Input, Conv2D
from keras import callbacks
from keras import metrics as keras_metrics
from keras.optimizers import Adam
from keras import backend as K
import keras
from keras.models import Model, Sequential
from keras.models import model_from_json
from keras import regularizers
from keras.losses import binary_crossentropy
from keras.callbacks import ModelCheckpoint, LearningRateScheduler, EarlyStopping, ReduceLROnPlateau
from keras.wrappers.scikit_learn import KerasClassifier

import tensorflow as tf

Using TensorFlow backend.


In [2]:
%matplotlib inline

In [3]:
batch_size = 100

In [4]:
train_df = pd.read_csv('../input/train.csv')

In [5]:
test_df = pd.read_csv('../input/test.csv')

In [6]:
def auc(y_true, y_pred): 
    return tf.py_func(roc_auc_score, (y_true, y_pred), tf.double)

In [7]:
def step_decay(epoch):
    initial_lrate = 0.1
    drop = 0.5
    epochs_drop = 10.0
    lrate = initial_lrate * math.pow(drop, math.floor((1 + epoch) / epochs_drop))
    return lrate

In [8]:
class LossHistory(keras.callbacks.Callback):
    def on_train_begin(self, logs={}):
        self.losses = []
        self.lr = []
 
    def on_epoch_end(self, batch, logs={}):
        self.losses.append(logs.get('loss'))
        self.lr.append(step_decay(len(self.losses)))

In [9]:
def simply_blend(prediction_dataframes, weights, target_column_name='target'):
    blended_prediction_df = pd.DataFrame(data=np.zeros(prediction_dataframes[0].shape[0]), columns=[target_column_name])
    for prediction_df, weight in zip(prediction_dataframes, weights):
        blended_prediction_df[target_column_name] = blended_prediction_df[target_column_name] + weight * prediction_df[target_column_name]
        #blended_prediction_df.add(weight * prediction_df)
    return blended_prediction_df / sum(weights)

In [10]:
def train_keras_nn(train_features, train_targets, train_meta_df, train_meta_column_name, model, batch_size=100, epochs=20, n_splits=5):
    loss_history = LossHistory()
    lrate = LearningRateScheduler(step_decay)
    callbacks_list = [EarlyStopping(monitor='val_auc', patience=20, mode='max'), loss_history, annealer]
    sss = StratifiedShuffleSplit(n_splits=n_splits)
    start_time = time.time()
    print(type(train_features))
    for train_index, val_index in sss.split(train_features, train_targets):
        #print(type(train_index))
        #print(train_index)
        X_train = train_features[train_index]
        X_val = train_features[val_index]
        Y_train = train_targets[train_index]
        Y_val = train_targets[val_index]
        #X_tr, Y_tr = augment(X_train, Y_train)
        #print("{} iteration".format(i+1))
        history= model.fit(X_train, Y_train, batch_size=batch_size, epochs=epochs, callbacks=callbacks_list, verbose=1, validation_data=(X_val,Y_val))
        predictions = model.predict(X_val)[:, 0]
        print("val_index.shape: ", val_index.shape)
        print("predictions.shape: ", predictions.shape)
        train_meta_df[train_meta_column_name].iloc[val_index] = predictions
        #history= sequential_nn_model.fit(X_train, Y_train, batch_size=batch_size, epochs=50, callbacks=callbacks_list, verbose=1, validation_data=(X_val,Y_val))
        del X_train, X_val, Y_train, Y_val
        gc.collect()
    print("Run time {} min".format((time.time() - start_time) / 60))
    return model, train_meta_df

In [11]:
def train_lgbm(train_df, test_df, target, features, param, num_round=1000000):
    start_time = time.time()
    folds = StratifiedKFold(n_splits=5, shuffle=False, random_state=44000)
    oof = np.zeros(len(train_df))
    predictions = np.zeros(len(test_df))
    feature_importance_df = pd.DataFrame()
    lgb_classifier = None
    for fold_, (trn_idx, val_idx) in enumerate(folds.split(train_df.values, target.values)):
        print("Fold {}".format(fold_))
        trn_data = lgb.Dataset(train_df.iloc[trn_idx][features], label=target.iloc[trn_idx])
        val_data = lgb.Dataset(train_df.iloc[val_idx][features], label=target.iloc[val_idx])
        num_round = num_round
        clf = lgb.train(
            param,
            trn_data,
            num_round,
            valid_sets=[trn_data, val_data],
            verbose_eval=1000,
            early_stopping_rounds=3000
        )
        lgb_classifier = clf
        oof[val_idx] = clf.predict(train_df.iloc[val_idx][features], num_iteration=clf.best_iteration)
        fold_importance_df = pd.DataFrame()
        fold_importance_df['Feature'] = features
        fold_importance_df['importance'] = clf.feature_importance()
        fold_importance_df['fold'] = fold_ + 1
        feature_importance_df = pd.concat([feature_importance_df, fold_importance_df], axis=0)
        predictions += clf.predict(test_df[features], num_iteration=clf.best_iteration) / folds.n_splits
    print("Total run time {} min:".format((time.time() - start_time) / 60))
    print("CV score: {:<8.5f}".format(roc_auc_score(target, oof)))
    return oof, predictions, feature_importance_df, clf

In [12]:
def calculate_pdf_difference(feat, df_feature, df_target, IQR_multiplier, bin_bandwidth_multiplier, print_number_bins):
    #Agreggating feature values in bin format using the Freedman-Diaconis rule
    IQR = df_feature[feat].quantile([0.75]).values - df_feature[feat].quantile([0.25]).values #Interquartile range (IQR)
    n = len(df_feature[feat])
    bin_size = IQR_multiplier*IQR/n**(1/3)
    bin_number = int(np.round((df_feature[feat].max() - df_feature[feat].min())/bin_size))
    binvalues = pd.cut(df_feature[feat], bins = bin_number, labels = range(bin_number)).astype('float')
    
    if print_number_bins:
        print('There are {} bins in the feature {}'.format(bin_number, feat))

    #Calculate the PDFs using the df_target
    pdf_0 = KernelDensity(kernel='gaussian', bandwidth=bin_size*bin_bandwidth_multiplier)
    pdf_0.fit(np.array(df_target[feat][df_target['target'] == 0]).reshape(-1,1))

    pdf_1 = KernelDensity(kernel='gaussian', bandwidth=bin_size*bin_bandwidth_multiplier)
    pdf_1.fit(np.array(df_target[feat][df_target['target'] == 1]).reshape(-1,1))

    #Creates an X array with the average feature value for each bin
    x = np.array(np.arange(min(df_feature[feat]) + bin_size/2 ,max(df_feature[feat]), bin_size)).reshape(-1,1)

    #gets the pdf values based on the X array
    log_pdf_0 = np.exp(pdf_0.score_samples(x))
    log_pdf_1 = np.exp(pdf_1.score_samples(x))

    #creates a dictionary that links the bin number with the PDFs value difference
    pdf_dict = dict()
    for i in range(bin_number):
        pdf_dict[i] = log_pdf_1[i] - log_pdf_0[i] 

    #gets the PDF difference for each row of the dataset based on its equivalent bin.
    bin_pdf_values = np.array(itemgetter(*list(binvalues))(pdf_dict))

    return bin_pdf_values, x, log_pdf_0, log_pdf_1

In [13]:
#features = [c for c in train_df.columns if c not in ['ID_code', 'target']]
train_features_df = train_df[train_df.columns.drop(['ID_code', 'target'])].astype(np.float32)

In [14]:
target_series = train_df['target']

In [15]:
train_id_codes = train_df['ID_code']

In [16]:
train_values, holdout_test_values, train_target_values, holdout_test_target_values = train_test_split(
    train_features_df.values,
    target_series.values,
    test_size=0.2,
    random_state=0
)

In [17]:
train_values_df, holdout_test_df, train_target_series, holdout_test_target_series = train_test_split(
    train_features_df,
    target_series,
    test_size=0.2,
    random_state=0
)

In [18]:
input_dim = train_values.shape[1]

In [19]:
kernel_regularizer=regularizers.l2(0.01)
model = Sequential()
model.add(Dense(128, activation='relu', input_shape=(input_dim,)))
#model.add(PreLU(alpha=.001))
model.add(Dropout(0.6))
model.add(BatchNormalization())
model.add(Dense(64, input_shape=(input_dim / 2, ), activation='relu'))
#model.add(PreLU(alpha=.001))
model.add(Dropout(0.6))
model.add(BatchNormalization())
model.add(Dense(32, input_shape=(input_dim / 4, ), activation='relu'))
#model.add(PreLU(alpha=.001))
model.add(Dropout(0.5))
model.add(BatchNormalization())
model.add(Dense(1, activation='sigmoid'))

annealer = LearningRateScheduler(lambda x: 1e-2 * 0.95 ** x)

In [20]:
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy', auc])

In [21]:
sequential_nn_model = Sequential()
sequential_nn_model.add(Dense(batch_size, input_dim=input_dim, kernel_initializer='normal', activation='relu'))
sequential_nn_model.add(Dropout(0.1))
#sequential_nn_model.add(Dropout(0.4))
sequential_nn_model.add(BatchNormalization())
sequential_nn_model.add(Dense(batch_size, input_dim=input_dim / 2, kernel_initializer='normal', activation='sigmoid'))
####sequential_nn_model.add(Dense(batch_size, input_dim=input_dim / 10, kernel_initializer='normal', activation='sigmoid'))
sequential_nn_model.add(Dropout(0.1))
####sequential_nn_model.add(Dropout(0.4))
sequential_nn_model.add(BatchNormalization())
sequential_nn_model.add(Dense(batch_size, input_dim=input_dim / 4, kernel_initializer='normal', activation='relu'))
sequential_nn_model.add(Dropout(0.1))
sequential_nn_model.add(BatchNormalization())
sequential_nn_model.add(Dense(batch_size, input_dim=input_dim / 4, kernel_initializer='normal', activation='sigmoid'))
#sequential_nn_model.add(Dense(batch_size, input_shape=(100, 200), kernel_initializer='normal', activation='sigmoid'))
#sequential_nn_model.add(Dropout(0.76))
#sequential_nn_model.add(Dropout(0.24))
sequential_nn_model.add(Dropout(0.1))
sequential_nn_model.add(BatchNormalization())
sequential_nn_model.add(Dense(batch_size, input_dim=input_dim / 30, kernel_initializer='normal', activation='relu'))
sequential_nn_model.add(Dropout(0.1))
sequential_nn_model.add(BatchNormalization())
sequential_nn_model.add(Dense(batch_size, input_dim=input_dim / 30, kernel_initializer='normal', activation='sigmoid'))
sequential_nn_model.add(Dropout(0.1))
sequential_nn_model.add(BatchNormalization())
sequential_nn_model.add(Dense(1, kernel_initializer='normal', activation='sigmoid'))

In [22]:
sequential_nn_model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy', auc])

In [23]:
def make_sequential_model_min(input_dim):
    sequential_nn_model_min = Sequential()
    sequential_nn_model_min.add(Dense(batch_size, input_dim=input_dim, kernel_initializer='normal', activation='relu'))
    sequential_nn_model_min.add(Dropout(0.4))
    sequential_nn_model_min.add(BatchNormalization())
    sequential_nn_model_min.add(Dense(batch_size, input_dim=input_dim / 10, kernel_initializer='normal', activation='sigmoid'))
    sequential_nn_model_min.add(Dropout(0.4))
    sequential_nn_model_min.add(Dense(1, kernel_initializer='normal', activation='sigmoid'))
    return sequential_nn_model_min

In [24]:
sequential_nn_model_min = make_sequential_model_min(input_dim)

In [25]:
sequential_nn_model_min.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy', auc])

In [26]:
train_meta_df = train_features_df.copy()

In [27]:
train_meta_df['nn_meta'] = np.NAN

In [28]:
train_meta_df['bayes_meta'] = np.NAN

In [29]:
#train_meta_df['nn_meta'].head()

In [30]:
#prediction = pd.Series([0.2, 0.22, 0.3, 0.4])

In [31]:
#train_meta_df['nn_meta'].iloc[[2, 3, 4, 5]]

In [32]:
#train_meta_df['nn_meta'].iloc[[2, 3, 4, 5]] = prediction.values

In [33]:
#train_meta_df['nn_meta'].iloc[[2, 3, 4, 5]]

In [34]:
#train_meta_df['nn_meta'].iloc[[0, 1]]

In [35]:
#train_values, holdout_test_values, train_target_values, holdout_test_target_values
train_nn_result = train_keras_nn(train_values, train_target_values, train_meta_df, 'nn_meta', sequential_nn_model_min, batch_size=512)

<class 'numpy.ndarray'>
Train on 144000 samples, validate on 16000 samples
Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20
val_index.shape:  (16000,)
predictions.shape:  (16000,)
Train on 144000 samples, validate on 16000 samples
Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20
val_index.shape:  (16000,)
predictions.shape:  (16000,)
Train on 144000 samples, validate on 16000 samples
Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20


Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20
val_index.shape:  (16000,)
predictions.shape:  (16000,)
Train on 144000 samples, validate on 16000 samples
Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20
val_index.shape:  (16000,)
predictions.shape:  (16000,)
Train on 144000 samples, validate on 16000 samples
Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20


Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20
val_index.shape:  (16000,)
predictions.shape:  (16000,)
Run time 3.289361608028412 min


In [36]:
sequential_nn_model_min, train_meta_df = train_nn_result

In [37]:
train_meta_df['nn_meta']

0              NaN
1              NaN
2              NaN
3              NaN
4              NaN
5         0.059299
6         0.113107
7         0.403068
8              NaN
9              NaN
10        0.029102
11             NaN
12             NaN
13        0.353431
14        0.174848
15             NaN
16        0.081822
17             NaN
18             NaN
19        0.279812
20             NaN
21             NaN
22        0.158060
23        0.266456
24        0.225823
25        0.121764
26        0.117786
27             NaN
28        0.149639
29             NaN
            ...   
199970         NaN
199971         NaN
199972         NaN
199973         NaN
199974         NaN
199975         NaN
199976         NaN
199977         NaN
199978         NaN
199979         NaN
199980         NaN
199981         NaN
199982         NaN
199983         NaN
199984         NaN
199985         NaN
199986         NaN
199987         NaN
199988         NaN
199989         NaN
199990         NaN
199991      

In [38]:
uniques_dict = {column_name: train_df[column_name].unique() for column_name in train_df.columns.drop(['ID_code','target']).tolist()}

In [39]:
uniques_dict_counts = {column_name: uniques.shape[0] for column_name, uniques in uniques_dict.items()}

In [40]:
uniques_counts_series = pd.Series(uniques_dict_counts)

In [41]:
print(uniques_counts_series.unique().shape)

(200,)


In [42]:
print(uniques_counts_series.max())

169968


In [43]:
print(uniques_counts_series.min())

451


In [None]:
lgbm_param = {
    'bagging_freq': 5,
    'bagging_fraction': 0.4,
    'boost_from_average':'false',
    'boost': 'gbdt',
    'feature_fraction': 0.05,
    'learning_rate': 0.01,
    'max_depth': -1,  
    'metric':'auc',
    'min_data_in_leaf': 80,
    'min_sum_hessian_in_leaf': 10.0,
    'num_leaves': 13,
    'num_threads': 8,
    'tree_learner': 'serial',
    'objective': 'binary', 
    'verbosity': 1
}

In [None]:
'''
cols = (feature_importance_df[["Feature", "importance"]]
        .groupby("Feature")
        .mean()
        .sort_values(by="importance", ascending=False)[:150].index)
best_features = feature_importance_df.loc[feature_importance_df.Feature.isin(cols)]

plt.figure(figsize=(14,28))
sns.barplot(x="importance", y="Feature", data=best_features.sort_values(by="importance",ascending=False))
plt.title('Features importance (averaged/folds)')
plt.tight_layout()
plt.savefig('FI.png')
'''

In [None]:
min_max_scaler = MinMaxScaler()

In [None]:
#train_df_rows_count = train_df.shape[0]

In [None]:
#train_df_rows_count

In [None]:
#uniques_count_more_1_2 = uniques_counts_series[uniques_counts_series > train_df_rows_count / 2]

In [None]:
#uniques_count_more_1_2.shape

In [None]:
#uniques_count_more_1_4_less_1_2 = uniques_counts_series[uniques_counts_series < train_df_rows_count / 2]

In [None]:
#uniques_count_less_1_2_more_1_4 = uniques_count_more_1_4_less_1_2[uniques_count_more_1_4_less_1_2 > train_df_rows_count / 4]

In [None]:
#uniques_count_more_1_4_less_1_2.shape

In [None]:
#uniques_count_less_1_4 = uniques_counts_series[uniques_counts_series < train_df_rows_count / 4]

In [None]:
#uniques_count_less_1_4.shape

In [None]:
train_results_whole = train_lgbm(
    train_values,
    test_df,
    train_target_values,
    train_df.columns.drop(['ID_code', 'target']).tolist(),
    lgbm_param
)

In [None]:
#oof_ucm_whole, predictions_whole, feature_importance_whole, clf_ucm_whole = train_results_whole

In [None]:
#predictions_df_whole = pd.DataFrame(data=predictions_whole, columns=['target'])

In [None]:
#submission_whole_only_df = pd.DataFrame({'ID_code': ID_code, 'target': predictions_df_whole['target'].values.astype('float32')})

In [None]:
#submission_whole_only_df.to_csv('submission_whole_only.csv', index=False)

In [None]:
#secquential_nn_model_min = train_keras_nn(train_polinomial_values_ucm_1_2, train_target_values_ucm_1_2, sequential_nn_model_min_1_2, batch_size=512, epochs=30)

In [None]:
#loss_and_metrics = sequential_nn_model_min.evaluate(holdout_test_polinomial_values_ucm_1_2, holdout_test_target_values_ucm_1_2, batch_size=100)

In [None]:
#sequential_nn_model_min.save('secquential_nn_model_min.txt')

In [None]:
test_id_code = test_df['ID_code']

In [None]:
mlp_submission_df = pd.read_csv('submission_mlp_not_my.csv')

In [None]:
whole_submission_df = pd.read_csv('submission_whole_only.csv')

In [None]:
simply_blend_gb_and_nn_df = simply_blend([mlp_submission_df, whole_submission_df], [1, 1])

In [None]:
submission_df = pd.DataFrame({"ID_code" : test_id_code.values, "target" : simply_blend_gb_and_nn_df['target']})

In [None]:
submission_df.to_csv('simply_blend_gb_and_nn.csv', index=False)

In [None]:
submission_df.head()

In [None]:
train_meta_df['log_reg_meta'] = np.NAN

In [None]:
log_reg_model = LogisticRegression(C=1, n_jobs=10, penalty="l2", solver='lbfgs')
#log_reg_model = LogisticRegression(C=1, n_jobs=10, solver='newton-cg')

In [None]:
train = train_df
target = train_target_series

In [None]:
kfold = KFold(n_splits=5, random_state=2, shuffle=True)
start_time = time.time()
aucs=[]

test_preds = []
# for early stopping
# it takes a long time if using all the samples.
samples = train.shape[0]
for fold,(train_idx, val_idx) in enumerate(kfold.split(train, target)):
    print("####################################")
    print("############fold:", fold)
    sample_x = train.iloc[train_idx].values
    sample_y = target.iloc[train_idx].values
    
    sample_val_x = train.iloc[val_idx].values
    sample_val_y = target.iloc[val_idx].values
    
    log_reg_model.fit(sample_x,sample_y)
    y_pred_prob = log_reg_model.predict_proba(sample_x)[:,1]
    #y_val_pred_prob = model.predict_proba(sample_val_x)[:,1]
    y_val_pred_prob = model.predict_proba(sample_val_x)[:, 0]
    print("type(val_idx): ", type(val_idx))
    print("type(y_val_pred_proba): ", type(y_val_pred_prob))
    print("val_idx.shape: ", val_idx.shape)
    print("y_val_pred_proba.shape: ", y_val_pred_prob.shape)
    print("val_idx: ", val_idx)
    print("y_val_pred_proba: ", y_val_pred_prob)    
    train_meta_df['log_reg_meta'].iloc[val_idx] = y_val_pred_prob
   
    train_auc = metrics.roc_auc_score(sample_y,y_pred_prob)
    val_auc = metrics.roc_auc_score(sample_val_y,y_val_pred_prob)
    print("train auc:{},val auc:{}".format(train_auc,val_auc))
    aucs.append([train_auc,val_auc])
    #test_preds.append(model.predict_proba(test)[:,1])
    
end_time=time.time()
val_aucs=[auc[1] for auc in aucs]
print("using {} samples,total time:{}s,mean val auc:{}".format(samples,end_time-start_time,np.mean(val_aucs)))

In [None]:
train_meta_df['log_reg_meta']

In [None]:
(train_meta_df['log_reg_meta'].isna() == False).shape

In [None]:
'''
feat1, feat2 = 'var_81', 'var_139'

fig = plt.subplots(figsize=(15, 5))

#plot pdf feat 1
bin_pdf_values, x, log_pdf_0, log_pdf_1 = calculate_pdf_difference(feat = feat1, df_feature = full, df_target = train_df, IQR_multiplier = 2, bin_bandwidth_multiplier = 1.5, print_number_bins = True)

plt.subplot(1, 2, 1)

sns.kdeplot(train_df[feat1][train_df['target'] == 0], shade=False, color="b", label = 'target = 0')
sns.kdeplot(train_df[feat1][train_df['target'] == 1], shade=False, color="r", label = 'target = 1')
plt.plot(x, log_pdf_0)
plt.plot(x, log_pdf_1) 
plt.title(feat1)
plt.xlabel('Feature Values')
plt.ylabel('Probability')

#plot pdf feat 2
bin_pdf_values, x, log_pdf_0, log_pdf_1 = calculate_pdf_difference(feat = feat2, df_feature = full, df_target = train_df, IQR_multiplier = 2, bin_bandwidth_multiplier = 1.5, print_number_bins = True)

plt.subplot(1, 2, 2)
sns.kdeplot(train_df[feat2][train_df['target'] == 0], shade=False, color="b", label = 'target = 0')
sns.kdeplot(train_df[feat2][train_df['target'] == 1], shade=False, color="r", label = 'target = 1')
plt.plot(x, log_pdf_0)
plt.plot(x, log_pdf_1) 
plt.title(feat2)
plt.xlabel('Feature Values')
plt.ylabel('Probability')

plt.show()
'''