In [None]:
import gc

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from imblearn.over_sampling import SMOTE
from lightgbm import LGBMClassifier
from sklearn.metrics import roc_auc_score, precision_recall_curve, roc_curve, average_precision_score
from sklearn.model_selection import KFold

pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)

In [None]:
# Read the train and test tables from CSV files under ../data/.
train = pd.read_csv('../data/train.csv')
test = pd.read_csv('../data/test.csv')

In [None]:
# Report the (rows, columns) shape of both tables.
print('train shape:',train.shape)
print('test shape:',test.shape)

In [None]:
def train_model(train_, test_, y_, folds_):
    """Train one LightGBM classifier per fold and average the test predictions.

    Parameters
    ----------
    train_ : pandas.DataFrame
        Training rows. Every column except ``SK_ID_CURR`` is used as a feature.
    test_ : pandas.DataFrame
        Test rows. Must contain ``SK_ID_CURR`` and every feature column.
    y_ : array-like
        Binary target, aligned row-for-row with ``train_``. Series, arrays and
        single-column frames are accepted (flattened to 1-D).
    folds_ : object with ``split(X)`` and ``n_splits``
        Cross-validation splitter, e.g. ``sklearn.model_selection.KFold``.

    Returns
    -------
    oof_preds : numpy.ndarray
        Out-of-fold probability of class 1 for every training row.
    submission : pandas.DataFrame
        ``SK_ID_CURR`` and ``TARGET`` (mean of the per-fold test probabilities).
    feature_importance_df : pandas.DataFrame
        One row per (feature, fold) with columns ``feature``, ``importance``, ``fold``.
    """
    # Derive everything from the arguments; never from notebook-level globals.
    feats = [f_ for f_ in train_.columns if f_ not in ['SK_ID_CURR']]
    train_x = train_[feats]
    test_x = test_[feats]
    # Positional 1-D target so both pandas objects and resampler output (arrays) work.
    y_arr = np.asarray(y_).ravel()

    oof_preds = np.zeros(train_x.shape[0])
    sub_preds = np.zeros(test_x.shape[0])
    fold_importances = []

    for n_fold, (trn_idx, val_idx) in enumerate(folds_.split(train_x)):
        trn_x, trn_y = train_x.iloc[trn_idx], y_arr[trn_idx]
        val_x, val_y = train_x.iloc[val_idx], y_arr[val_idx]

        # NOTE(review): `silent` and the `verbose` / `early_stopping_rounds` fit keywords
        # follow the LightGBM API this notebook was written against; newer releases
        # expect callbacks instead - confirm against the installed version.
        clf = LGBMClassifier(
            n_estimators = 4000,
            learning_rate = 0.03,
            num_leaves = 30,
            colsample_bytree = .8,
            subsample = .9,
            max_depth = 7,
            reg_alpha = .1,
            min_split_gain = .01,
            min_child_weight = 2,
            silent = -1,
            verbose = -1
            )
        clf.fit(trn_x, trn_y,
            eval_set = [(trn_x, trn_y), (val_x, val_y)],
            eval_metric = 'auc', verbose = 100, early_stopping_rounds = 100)

        oof_preds[val_idx] = clf.predict_proba(val_x, num_iteration = clf.best_iteration_)[:, 1]
        # Each fold contributes an equal share to the averaged test prediction.
        sub_preds += clf.predict_proba(test_x, num_iteration=clf.best_iteration_)[:, 1] / folds_.n_splits

        fold_importances.append(pd.DataFrame({
            'feature': feats,
            'importance': clf.feature_importances_,
            'fold': n_fold + 1,
        }))
        print('fold %2d AUC %.6f'%(n_fold+1, roc_auc_score(val_y, oof_preds[val_idx])))
        del clf, trn_x, trn_y, val_x, val_y
        gc.collect()

    print('full AUC score %.6f'%roc_auc_score(y_arr, oof_preds))
    # Concatenate once instead of growing a frame inside the loop.
    if fold_importances:
        feature_importance_df = pd.concat(fold_importances, axis=0)
    else:
        feature_importance_df = pd.DataFrame()

    # Build the submission on a copy so the caller's test frame is left untouched.
    submission = test_[['SK_ID_CURR']].copy()
    submission['TARGET'] = sub_preds

    return oof_preds, submission, feature_importance_df

In [None]:
# Replace missing values with the sentinel -1 in both tables before resampling.
train = train.fillna(-1)
#print(train.isnull().sum())
test = test.fillna(-1)
# NOTE(review): `y` is not assigned in any cell visible above this one, so this cell only
# runs if `y` is already in the kernel (merge_data() further down returns one) - confirm
# the intended execution order.
# NOTE(review): the whole training set is oversampled before the folds are split, so the
# validation folds presumably contain synthetic rows derived from training rows - verify
# that the reported AUC is not inflated by this.
# NOTE(review): `kind=` on SMOTE and `fit_sample` depend on the installed imbalanced-learn
# version - confirm they are still available.
sm = SMOTE(random_state=42, kind='borderline2')
train, y = sm.fit_sample(train, y)
# 5 shuffled folds with a fixed seed.
folds = KFold(n_splits=5, shuffle=True, random_state=0)
oof_preds, test_preds, importances = train_model(train, test, y, folds)
# Persist the SK_ID_CURR / TARGET frame returned by train_model.
test_preds.to_csv('../data/submission2.csv', index=False)


In [1]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Data files are available in the "../data/" directory.
# For example, running this cell (by clicking Run or pressing Shift+Enter) lists the files in the data directory.

import os
print(os.listdir("../data/"))
import gc
# Any results you write to the current directory are saved as output.

pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)

from sklearn.metrics import roc_auc_score, precision_recall_curve, roc_curve, average_precision_score
from lightgbm import LGBMClassifier
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import KFold
from imblearn.over_sampling import SMOTE

def _is_text_column(column):
    """Return True when ``column`` holds strings (object dtype or a pandas string dtype)."""
    return pd.api.types.is_object_dtype(column) or pd.api.types.is_string_dtype(column)


def _mean_by_applicant(frame, count_col, prefix):
    """Average the numeric columns of ``frame`` per ``SK_ID_CURR``.

    ``count_col`` is first overwritten (in place) with the number of non-null
    ``count_col`` values the applicant has in ``frame``, so its mean becomes a
    per-applicant row count. Result columns are renamed to ``prefix + name``.
    """
    counts = frame[['SK_ID_CURR', count_col]].groupby('SK_ID_CURR').count()
    frame[count_col] = frame['SK_ID_CURR'].map(counts[count_col])
    # numeric_only=True skips leftover text columns explicitly; without it, recent pandas
    # raises TypeError instead of silently dropping them.
    averaged = frame.groupby('SK_ID_CURR').mean(numeric_only=True)
    averaged.columns = [prefix + f_ for f_ in averaged.columns]
    return averaged


def merge_data(data_dir='../data/'):
    """Build the train/test feature tables by joining per-applicant averages.

    Each auxiliary table is one-hot encoded where it has text columns, averaged
    per ``SK_ID_CURR`` and left-joined onto the application tables.

    Parameters
    ----------
    data_dir : str, optional
        Directory holding the CSV files. Defaults to ``'../data/'``.

    Returns
    -------
    train : pandas.DataFrame
        ``application_train.csv`` without ``TARGET``, text columns integer-encoded,
        plus the aggregated ``bur_``, ``prev_``, ``pos_``, ``cc_bal_`` and ``inst_`` columns.
    test : pandas.DataFrame
        ``application_test.csv`` transformed the same way.
    y : pandas.Series
        The ``TARGET`` column removed from ``train``.
    """
    def read(name):
        return pd.read_csv(os.path.join(data_dir, name))

    # --- bureau_balance: averaged per SK_ID_BUREAU -------------------------------
    bur_bal = read('bureau_balance.csv')
    print('bureau_balance shape:', bur_bal.shape)
    bur_bal = pd.concat([bur_bal, pd.get_dummies(bur_bal.STATUS, prefix='bur_bal_status')],
                        axis=1).drop('STATUS', axis=1)
    bur_cnts = bur_bal[['SK_ID_BUREAU', 'MONTHS_BALANCE']].groupby('SK_ID_BUREAU').count()
    bur_bal['bur_cnt'] = bur_bal['SK_ID_BUREAU'].map(bur_cnts['MONTHS_BALANCE'])
    avg_bur_bal = bur_bal.groupby('SK_ID_BUREAU').mean(numeric_only=True)
    avg_bur_bal.columns = ['bur_bal_' + f_ for f_ in avg_bur_bal.columns]
    del bur_bal, bur_cnts
    gc.collect()

    # --- bureau: one-hot the three text columns, attach balance averages ---------
    bur = read('bureau.csv')
    print('bureau shape:', bur.shape)
    bur_text_cols = ['CREDIT_ACTIVE', 'CREDIT_CURRENCY', 'CREDIT_TYPE']
    bur_dummies = [pd.get_dummies(bur[col_], prefix=prefix_)
                   for col_, prefix_ in zip(bur_text_cols, ['ca', 'cc', 'ct'])]
    bur_full = pd.concat([bur] + bur_dummies, axis=1).drop(bur_text_cols, axis=1)
    del bur, bur_dummies
    gc.collect()
    bur_full = bur_full.merge(right=avg_bur_bal.reset_index(), how='left', on='SK_ID_BUREAU',suffixes=('', '_bur_bal'))
    avg_bur = _mean_by_applicant(bur_full, 'SK_ID_BUREAU', 'bur_')
    del bur_full, avg_bur_bal
    gc.collect()

    # --- previous_application: one-hot every text column -------------------------
    prev = read('previous_application.csv')
    print('previous_application shape:', prev.shape)
    prev_cat_features = [f_ for f_ in prev.columns if _is_text_column(prev[f_])]
    # Build all dummy frames first and concatenate once (no frame growing in a loop).
    prev_dum = [pd.get_dummies(prev[f_], prefix=f_) for f_ in prev_cat_features]
    prev = pd.concat([prev] + prev_dum, axis=1)
    del prev_dum
    gc.collect()
    avg_prev = _mean_by_applicant(prev, 'SK_ID_PREV', 'prev_')
    del prev
    gc.collect()

    # --- POS_CASH_balance --------------------------------------------------------
    pos = read('POS_CASH_balance.csv')
    print('pos_cash_balance shape:', pos.shape)
    pos = pd.concat([pos, pd.get_dummies(pos['NAME_CONTRACT_STATUS'], prefix='ncs')], axis=1)
    avg_pos = _mean_by_applicant(pos, 'SK_ID_PREV', 'pos_')
    del pos
    gc.collect()

    # --- credit_card_balance -----------------------------------------------------
    cc_bal = read('credit_card_balance.csv')
    print('credit_card_balance shape:', cc_bal.shape)
    cc_bal = pd.concat([cc_bal, pd.get_dummies(cc_bal['NAME_CONTRACT_STATUS'], prefix='ncs')], axis=1)
    avg_cc_bal = _mean_by_applicant(cc_bal, 'SK_ID_PREV', 'cc_bal_')
    del cc_bal
    gc.collect()

    # --- installments_payments ---------------------------------------------------
    inst = read('installments_payments.csv')
    print('installment_payment shape:', inst.shape)
    avg_inst = _mean_by_applicant(inst, 'SK_ID_PREV', 'inst_')
    del inst
    gc.collect()

    # --- application tables ------------------------------------------------------
    train = read('application_train.csv')
    test = read('application_test.csv')
    print('train shape:', train.shape)
    print('test shape:', test.shape)
    y = train['TARGET']
    del train['TARGET']
    cat_feats = [f_ for f_ in train.columns if _is_text_column(train[f_])]
    for f_ in cat_feats:
        # Integer-encode, similar to a label encoder: codes come from the training
        # column; test values not seen in training (and missing values) become -1.
        train[f_], indexer = pd.factorize(train[f_])
        test[f_] = indexer.get_indexer(test[f_])

    # Left-join every aggregate so applicants without history keep NaN features.
    for aggregate in (avg_bur, avg_prev, avg_pos, avg_cc_bal, avg_inst):
        aggregate = aggregate.reset_index()
        train = train.merge(right = aggregate, how='left', on='SK_ID_CURR')
        test = test.merge(right = aggregate, how='left', on='SK_ID_CURR')
    del aggregate, avg_bur, avg_prev, avg_pos, avg_cc_bal, avg_inst
    gc.collect()
    return train, test, y
    
def train_model(train_, test_, y_, folds_, feats_=None, features_=None):
    """Train one LightGBM classifier per fold and average the test predictions.

    Parameters
    ----------
    train_ : pandas.DataFrame or 2-D array-like
        Training feature matrix (arrays are wrapped in a DataFrame).
    test_ : pandas.DataFrame or 2-D array-like
        Test matrix; after naming it must contain ``SK_ID_CURR`` and every feature.
    y_ : array-like
        Binary target aligned with ``train_``; flattened to 1-D.
    folds_ : object with ``split(X)`` and ``n_splits``
        Cross-validation splitter, e.g. ``sklearn.model_selection.KFold``.
    feats_ : sequence of str, optional
        Column names to assign to ``train_``; these are also the model features.
        When omitted, the existing ``train_`` columns except ``SK_ID_CURR`` are used.
    features_ : sequence of str, optional
        Column names to assign to ``test_``. When omitted, ``test_`` keeps its columns.

    Returns
    -------
    oof_preds : numpy.ndarray
        Out-of-fold probability of class 1 for every training row.
    submission : pandas.DataFrame
        ``SK_ID_CURR`` and ``TARGET`` (mean of the per-fold test probabilities).
    feature_importance_df : pandas.DataFrame
        One row per (feature, fold) with columns ``feature``, ``importance``, ``fold``.
    """
    # Shallow copies: relabelling columns below must not touch the caller's frames.
    train_ = train_.copy(deep=False) if isinstance(train_, pd.DataFrame) else pd.DataFrame(train_)
    test_ = test_.copy(deep=False) if isinstance(test_, pd.DataFrame) else pd.DataFrame(test_)
    print(train_.shape)
    print(test_.shape)
    if feats_ is not None:
        train_.columns = feats_
    if features_ is not None:
        test_.columns = features_

    if feats_ is not None:
        feats = list(feats_)
    else:
        feats = [f_ for f_ in train_.columns if f_ != 'SK_ID_CURR']
    # Avoid copying the (large) training matrix when it already holds exactly the features.
    train_x = train_ if list(train_.columns) == feats else train_[feats]
    test_x = test_[feats]
    # 1-D target: passing a single-column frame makes scikit-learn emit column_or_1d warnings.
    y_arr = np.asarray(y_).ravel()

    oof_preds = np.zeros(train_x.shape[0])
    sub_preds = np.zeros(test_x.shape[0])
    fold_importances = []

    for n_fold, (trn_idx, val_idx) in enumerate(folds_.split(train_x)):
        trn_x, trn_y = train_x.iloc[trn_idx], y_arr[trn_idx]
        val_x, val_y = train_x.iloc[val_idx], y_arr[val_idx]

        # NOTE(review): `silent` and the `verbose` / `early_stopping_rounds` fit keywords
        # follow the LightGBM API that produced the recorded output; newer releases
        # expect callbacks instead - confirm against the installed version.
        clf = LGBMClassifier(
            n_estimators = 4000,
            learning_rate = 0.03,
            num_leaves = 30,
            colsample_bytree = .8,
            subsample = .9,
            max_depth = 7,
            reg_alpha = .1,
            min_split_gain = .01,
            min_child_weight = 2,
            silent = -1,
            verbose = -1
            )
        clf.fit(trn_x, trn_y,
            eval_set = [(trn_x, trn_y), (val_x, val_y)],
            eval_metric = 'auc', verbose = 100, early_stopping_rounds = 100)

        oof_preds[val_idx] = clf.predict_proba(val_x, num_iteration = clf.best_iteration_)[:, 1]
        # Each fold contributes an equal share to the averaged test prediction.
        sub_preds += clf.predict_proba(test_x, num_iteration=clf.best_iteration_)[:, 1] / folds_.n_splits

        fold_importances.append(pd.DataFrame({
            'feature': feats,
            'importance': clf.feature_importances_,
            'fold': n_fold + 1,
        }))
        print('fold %2d AUC %.6f'%(n_fold+1, roc_auc_score(val_y, oof_preds[val_idx])))
        del clf, trn_x, trn_y, val_x, val_y
        gc.collect()

    # Score against the target that was passed in, not a notebook-level global.
    print('full AUC score %.6f'%roc_auc_score(y_arr, oof_preds))
    if fold_importances:
        feature_importance_df = pd.concat(fold_importances, axis=0)
    else:
        feature_importance_df = pd.DataFrame()

    submission = test_[['SK_ID_CURR']].copy()
    submission['TARGET'] = sub_preds

    return oof_preds, submission, feature_importance_df
    
def display_importance(feature_importance_df_, num):
    """Plot the ``num`` features with the highest mean importance and return their names.

    ``feature_importance_df_`` needs the columns ``feature`` and ``importance``
    (one row per feature and fold). The bar plot is written to
    ``lgbm_importances.png``; the returned value is the index of selected features.
    """
    mean_importance = feature_importance_df_[['feature', 'importance']].groupby('feature').mean()
    ranked = mean_importance.sort_values(by='importance', ascending=False)
    cols = ranked[:num].index

    # Keep the per-fold rows of the selected features so seaborn can aggregate them.
    is_selected = feature_importance_df_.feature.isin(cols)
    best_features = feature_importance_df_[is_selected]
    plot_rows = best_features.sort_values(by = 'importance', ascending=False)

    plt.figure(figsize=(8, 10))
    sns.barplot(x='importance', y='feature', data = plot_rows)
    plt.title('LightGBM Feature(average over folds)')
    plt.tight_layout()
    plt.savefig('lgbm_importances.png')

    return cols
    
def reduce_mem_usage(df, use_float16=True):
    """Downcast the columns of ``df`` in place to reduce memory usage.

    * signed / unsigned integer columns get the narrowest integer type of the
      same signedness whose range (limits included) holds the column's min and max;
    * float columns get float16 or float32 when min and max fit, else float64;
    * object / string columns become ``category``;
    * every other dtype (bool, datetime, category, nullable extension types) is left alone.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to shrink; it is modified in place.
    use_float16 : bool, optional
        Allow float16 as a target (default, the historical behaviour). The check only
        looks at the value range, so float16/float32 downcasts can round values;
        pass ``False`` to use float32 as the narrowest float type.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object.
    """
    start_mem = df.memory_usage().sum() / 1024**2
    print('Memory usage of dataframe is {:.2f} MB'.format(start_mem))

    signed_types = (np.int8, np.int16, np.int32, np.int64)
    unsigned_types = (np.uint8, np.uint16, np.uint32, np.uint64)
    float_types = (np.float16, np.float32) if use_float16 else (np.float32,)

    for col in df.columns:
        col_type = df[col].dtype
        # Only plain numpy dtypes have a `kind`; extension dtypes are handled separately.
        kind = col_type.kind if isinstance(col_type, np.dtype) else None

        if kind == 'O' or (kind is None and pd.api.types.is_string_dtype(col_type)):
            df[col] = df[col].astype('category')
            continue
        if kind not in ('i', 'u', 'f'):
            # bool, datetime, category, nullable ints...: min/max comparisons are not meaningful.
            continue

        c_min = df[col].min()
        c_max = df[col].max()
        if kind == 'f':
            candidates, limits = float_types, np.finfo
            fallback = np.float64
        else:
            candidates = signed_types if kind == 'i' else unsigned_types
            limits = np.iinfo
            fallback = None  # leave integer columns unchanged when nothing fits

        target = fallback
        for candidate in candidates:
            bounds = limits(candidate)
            # NaN min/max (empty or all-NaN column) fails both comparisons -> fallback.
            if bounds.min <= c_min and c_max <= bounds.max:
                target = candidate
                break
        if target is not None:
            df[col] = df[col].astype(target)

    end_mem = df.memory_usage().sum() / 1024**2
    print('Memory usage after optimization is: {:.2f} MB'.format(end_mem))
    saved_pct = 100 * (start_mem - end_mem) / start_mem if start_mem else 0.0
    print('Decreased by {:.1f}%'.format(saved_pct))

    return df
    
def feat_ext_source(df):
    """Add pairwise ratio and log features of the three EXT_SOURCE columns to ``df``.

    For the ratios, missing values are replaced by -1 and 0.1 is added to every
    value before dividing. The log features use the raw columns plus 1, so
    missing inputs stay missing. ``df`` is modified in place and returned.
    """
    source_cols = ('EXT_SOURCE_1', 'EXT_SOURCE_2', 'EXT_SOURCE_3')
    # Shifted copies are computed up front, before any column is added to df.
    shifted = [df[name].fillna(-1) + 1e-1 for name in source_cols]

    # (new column, numerator position, denominator position) within `shifted`
    ratio_specs = (
        ('EXT_SOURCE_1over2_NAminus1_Add0.1', 0, 1),
        ('EXT_SOURCE_2over1_NAminus1_Add0.1', 1, 0),
        ('EXT_SOURCE_1over3_NAminus1_Add0.1', 0, 2),
        ('EXT_SOURCE_3over1_NAminus1_Add0.1', 2, 0),
        ('EXT_SOURCE_2over3_NAminus1_Add0.1', 1, 2),
        ('EXT_SOURCE_3over2_NAminus1_Add0.1', 2, 1),
    )
    for new_col, num_pos, den_pos in ratio_specs:
        df[new_col] = shifted[num_pos] / shifted[den_pos]

    log_cols = ('EXT_SOURCE_1_log', 'EXT_SOURCE_2_log', 'EXT_SOURCE_3_log')
    for raw_col, log_col in zip(source_cols, log_cols):
        df[log_col] = np.log(df[raw_col] + 1)
    return df


['.ipynb_checkpoints', 'application_test.csv', 'application_train.csv', 'bureau.csv', 'bureau_balance.csv', 'credit_card_balance.csv', 'HomeCredit_columns_description.csv', 'installments_payments.csv', 'POS_CASH_balance.csv', 'previous_application.csv', 'readme.md', 'sample_submission.csv', 'test.csv', 'train.csv']


In [2]:
# Build the merged tables, then for each of train and test: add the EXT_SOURCE
# features, downcast the dtypes and replace the remaining NaNs with the sentinel -1.
gc.enable()
train, test, y = merge_data()
train = reduce_mem_usage(feat_ext_source(train)).fillna(-1)
test = reduce_mem_usage(feat_ext_source(test)).fillna(-1)


bureau_balance shape: (27299925, 3)
bureau shape: (1716428, 17)
previous_application shape: (1670214, 37)
pos_cash_balance shape: (10001358, 8)
credit_card_balance shape: (3840312, 23)
installment_payment shape: (13605401, 8)
train shape: (307511, 122)
test shape: (48744, 121)
Memory usage of dataframe is 893.87 MB
Memory usage after optimization is: 231.09 MB
Decreased by 74.1%
Memory usage of dataframe is 141.69 MB
Memory usage after optimization is: 36.63 MB
Decreased by 74.1%


In [3]:
# Shapes of the train and test frames at this point of the notebook.
print(train.shape, test.shape)

(307511, 389) (48744, 389)


In [4]:
# `features` keeps every column name (identifier included) for relabelling the test
# frame later; `feats` is the same list without SK_ID_CURR and is what train keeps.
features = train.columns
feats = [name for name in features if name != 'SK_ID_CURR']
train = train[feats]

In [5]:
# train has lost the SK_ID_CURR column in the previous cell; test was left unchanged.
print(train.shape, test.shape)

(307511, 388) (48744, 389)


In [6]:
# Oversample the training set with SMOTE, then define 5 shuffled folds with a fixed seed.
# NOTE(review): resampling happens before the folds are split, so validation folds
# presumably contain synthetic rows derived from training rows - verify that the
# fold AUCs reported below are not inflated by this.
# NOTE(review): `kind=` on SMOTE and `fit_sample` depend on the installed imbalanced-learn
# version - confirm they are still available.
sm = SMOTE(random_state=42, kind='borderline2')
train, y = sm.fit_sample(train, y)
folds = KFold(n_splits=5, shuffle=True, random_state=0)

In [7]:
# Cross-validated training: `feats` names the train columns, `features` the test columns.
oof_preds, test_preds, importances = train_model(train, test, y, folds,feats, features)
# Persist the SK_ID_CURR / TARGET frame returned by train_model.
test_preds.to_csv('../data/submission2.csv', index=False)

(565372, 388)
(48744, 389)


  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)


Training until validation scores don't improve for 100 rounds.
[100]	training's auc: 0.977122	valid_1's auc: 0.97665
[200]	training's auc: 0.979736	valid_1's auc: 0.978764
[300]	training's auc: 0.981421	valid_1's auc: 0.979774
[400]	training's auc: 0.982649	valid_1's auc: 0.980254
[500]	training's auc: 0.983691	valid_1's auc: 0.980523
[600]	training's auc: 0.984652	valid_1's auc: 0.98067
[700]	training's auc: 0.985553	valid_1's auc: 0.980715
[800]	training's auc: 0.9864	valid_1's auc: 0.980766
[900]	training's auc: 0.987195	valid_1's auc: 0.980778
[1000]	training's auc: 0.987952	valid_1's auc: 0.980784
Early stopping, best iteration is:
[963]	training's auc: 0.987673	valid_1's auc: 0.980793
fold  1 AUC 0.980793


  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)


Training until validation scores don't improve for 100 rounds.
[100]	training's auc: 0.977124	valid_1's auc: 0.976613
[200]	training's auc: 0.979692	valid_1's auc: 0.978754
[300]	training's auc: 0.981358	valid_1's auc: 0.979812
[400]	training's auc: 0.982583	valid_1's auc: 0.980373
[500]	training's auc: 0.983642	valid_1's auc: 0.980675
[600]	training's auc: 0.984601	valid_1's auc: 0.980874
[700]	training's auc: 0.985482	valid_1's auc: 0.981016
[800]	training's auc: 0.986343	valid_1's auc: 0.981083
[900]	training's auc: 0.987143	valid_1's auc: 0.981125
[1000]	training's auc: 0.987865	valid_1's auc: 0.981159
[1100]	training's auc: 0.988568	valid_1's auc: 0.981177
[1200]	training's auc: 0.989211	valid_1's auc: 0.981209
[1300]	training's auc: 0.989831	valid_1's auc: 0.981231
Early stopping, best iteration is:
[1268]	training's auc: 0.989639	valid_1's auc: 0.981234
fold  2 AUC 0.981234


  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)


Training until validation scores don't improve for 100 rounds.
[100]	training's auc: 0.977122	valid_1's auc: 0.976519
[200]	training's auc: 0.979703	valid_1's auc: 0.978621
[300]	training's auc: 0.981355	valid_1's auc: 0.979701
[400]	training's auc: 0.982612	valid_1's auc: 0.980252
[500]	training's auc: 0.98368	valid_1's auc: 0.98052
[600]	training's auc: 0.984647	valid_1's auc: 0.98067
[700]	training's auc: 0.985565	valid_1's auc: 0.980761
[800]	training's auc: 0.986399	valid_1's auc: 0.980824
[900]	training's auc: 0.987176	valid_1's auc: 0.980871
[1000]	training's auc: 0.987908	valid_1's auc: 0.980889
[1100]	training's auc: 0.988578	valid_1's auc: 0.980924
[1200]	training's auc: 0.989201	valid_1's auc: 0.980923
Early stopping, best iteration is:
[1145]	training's auc: 0.98887	valid_1's auc: 0.980938
fold  3 AUC 0.980938


  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)


Training until validation scores don't improve for 100 rounds.
[100]	training's auc: 0.976968	valid_1's auc: 0.977021
[200]	training's auc: 0.979635	valid_1's auc: 0.979267
[300]	training's auc: 0.98133	valid_1's auc: 0.98026
[400]	training's auc: 0.982555	valid_1's auc: 0.980751
[500]	training's auc: 0.983619	valid_1's auc: 0.981021
[600]	training's auc: 0.984589	valid_1's auc: 0.981166
[700]	training's auc: 0.985486	valid_1's auc: 0.981237
[800]	training's auc: 0.986321	valid_1's auc: 0.981276
[900]	training's auc: 0.987132	valid_1's auc: 0.981295
[1000]	training's auc: 0.987896	valid_1's auc: 0.981315
[1100]	training's auc: 0.988608	valid_1's auc: 0.981324
[1200]	training's auc: 0.989298	valid_1's auc: 0.981322
Early stopping, best iteration is:
[1107]	training's auc: 0.98866	valid_1's auc: 0.981327
fold  4 AUC 0.981327


  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)


Training until validation scores don't improve for 100 rounds.
[100]	training's auc: 0.977157	valid_1's auc: 0.976387
[200]	training's auc: 0.979773	valid_1's auc: 0.978476
[300]	training's auc: 0.981424	valid_1's auc: 0.97948
[400]	training's auc: 0.982672	valid_1's auc: 0.979985
[500]	training's auc: 0.983728	valid_1's auc: 0.980248
[600]	training's auc: 0.984682	valid_1's auc: 0.980429
[700]	training's auc: 0.985629	valid_1's auc: 0.980522
[800]	training's auc: 0.98647	valid_1's auc: 0.980606
[900]	training's auc: 0.987264	valid_1's auc: 0.980665
[1000]	training's auc: 0.98801	valid_1's auc: 0.980693
[1100]	training's auc: 0.988705	valid_1's auc: 0.980715
[1200]	training's auc: 0.989335	valid_1's auc: 0.980733
[1300]	training's auc: 0.989948	valid_1's auc: 0.98074
[1400]	training's auc: 0.990537	valid_1's auc: 0.980767
[1500]	training's auc: 0.991076	valid_1's auc: 0.980771
[1600]	training's auc: 0.991586	valid_1's auc: 0.980777
Early stopping, best iteration is:
[1544]	training's a

In [None]:
# Load ../data/test.csv into its own variable and preview the first rows.
test_org = pd.read_csv('../data/test.csv')
test_org.head()

In [None]:
# Build the submission frame from the original test identifiers and the averaged
# fold predictions. `test_preds` is the frame returned by train_model; the former
# code read `test_` (a local of train_model) and wrote into an undefined `test_pred`.
# NOTE(review): assumes test_org rows are in the same order as the rows scored by
# train_model - confirm before submitting.
test_pred = pd.DataFrame({
    'SK_ID_CURR': test_org['SK_ID_CURR'].values,
    'TARGET': test_preds['TARGET'].values,
})

In [None]:
# Preview the first rows of train.
train.head()

In [None]:
# Empty frame; the following cell fills it with per-column missing-value counts.
nan_count = pd.DataFrame()

In [None]:
# One row per train column: its name and its number of missing values.
nan_count['features'] = train.columns
nan_count['nan_nums'] = list(train.isnull().sum())
# Show the first 100 rows.
nan_count.head(100)

In [None]:
#train.isnull().sum()
# Columns with more than 155000 missing values: print how many there are, then list them.
print(nan_count[nan_count['nan_nums'] > 155000].shape)
nan_count[nan_count['nan_nums'] > 155000]

In [None]:
# Preview the effect of filling NaNs with -1. Only the head is shown: display.max_rows
# is set to None in this notebook, so a bare frame would be rendered in full.
# fillna returns a new frame here; train itself is not modified.
train.fillna(-1).head()

In [None]:
# Small demo frame with missing values, used to try out isnull() / fillna().
import numpy as np
df = pd.DataFrame([[np.nan, 2, np.nan, 0],
                    [3, 4, np.nan, 1],
                    [np.nan, np.nan, np.nan, 5],
                    [np.nan, 3, np.nan, 4]],
                    columns=list('ABCD'))
# Number of missing values per column.
df.isnull().sum()

In [None]:
# Displays a filled copy; the result is not assigned, so df keeps its NaNs.
df.fillna(-1)

In [None]:
# Display df as it currently is.
df