In [1]:
# coding: utf-8

import os

import numpy as np
import pandas as pd

from over_sample import smote


def reduce_mem_usage(df):
    """Downcast the columns of ``df`` to smaller dtypes to reduce memory usage.

    The frame is modified in place (columns are reassigned) and also returned.
    Memory usage before and after the conversion is printed.

    Conversion rules, per column:

    * NumPy signed / unsigned integer columns are narrowed to the smallest
      integer type of the same signedness whose range covers [min, max]
      (bounds are inclusive).
    * NumPy float columns are narrowed to float16 or float32 when their value
      range fits, otherwise stored as float64. Only the range is checked, so
      precision can be lost.
    * Object / string columns are converted to ``category``.
    * Every other dtype (bool, datetime, category, nullable extension types,
      ...) is left untouched, which also makes the function safe to call again
      on its own output.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame whose columns should be downcast.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object, after conversion.
    """
    start_mem = df.memory_usage().sum() / 1024**2
    print('Memory usage of dataframe is {:.2f} MB'.format(start_mem))

    # Candidate types ordered from narrowest to widest, keyed by dtype kind.
    int_candidates = {
        'i': (np.int8, np.int16, np.int32, np.int64),
        'u': (np.uint8, np.uint16, np.uint32, np.uint64),
    }
    float_candidates = (np.float16, np.float32)

    for col in df.columns:
        col_type = df[col].dtype

        if pd.api.types.is_object_dtype(col_type) or isinstance(col_type, pd.StringDtype):
            df[col] = df[col].astype('category')
            continue

        # Only plain NumPy integer/float columns are downcast. bool, datetime,
        # categorical and extension dtypes either cannot be compared against
        # numeric limits or would grow when cast to a float type.
        if not isinstance(col_type, np.dtype) or col_type.kind not in 'iuf':
            continue

        c_min = df[col].min()
        c_max = df[col].max()

        if col_type.kind in int_candidates:
            # If nothing fits (e.g. an empty column whose min/max is NaN) the
            # column keeps its current dtype.
            for candidate in int_candidates[col_type.kind]:
                limits = np.iinfo(candidate)
                if limits.min <= c_min and c_max <= limits.max:
                    df[col] = df[col].astype(candidate)
                    break
        else:
            for candidate in float_candidates:
                limits = np.finfo(candidate)
                if limits.min <= c_min and c_max <= limits.max:
                    df[col] = df[col].astype(candidate)
                    break
            else:
                # Out of float32 range, infinite, or all-NaN: keep full width.
                df[col] = df[col].astype(np.float64)

    end_mem = df.memory_usage().sum() / 1024**2
    print('Memory usage after optimization is: {:.2f} MB'.format(end_mem))
    # Guard the ratio: a frame can report zero bytes of memory.
    if start_mem > 0:
        decreased = 100 * (start_mem - end_mem) / start_mem
    else:
        decreased = 0.0
    print('Decreased by {:.1f}%'.format(decreased))

    return df

def reduce(data_dir='../data'):
    """Fetch the datasets from ``smote()``, downcast them and export them as CSV.

    Steps:

    1. Call ``smote()`` to obtain train / test / validation data, the labels
       and the feature name list.
    2. Wrap train, validation and test features in DataFrames and shrink their
       dtypes with ``reduce_mem_usage``.
    3. Label the train and test columns with ``features``; drop ``SK_ID_CURR``
       from the training frame only.
    4. Write ``train_.csv``, ``y_.csv``, ``test_.csv``, ``val_x_.csv`` and
       ``val_y_.csv`` (without the index) into ``data_dir``.

    Parameters
    ----------
    data_dir : str, optional
        Directory receiving the CSV files; created if it does not exist.
        Defaults to ``'../data'``.

    Returns
    -------
    tuple
        ``(train, test, y, val_x, val_y, features, feats)`` where ``feats`` is
        ``features`` without ``'SK_ID_CURR'``.
    """
    print('reduce memory usage...'.center(50, '*'))

    # Make sure the target directory exists before the expensive work starts,
    # so a missing folder is not discovered only at write time.
    if not os.path.isdir(data_dir):
        os.makedirs(data_dir)

    train, test, y, val_x, val_y, features = smote()

    train = reduce_mem_usage(pd.DataFrame(train))
    val_x = reduce_mem_usage(pd.DataFrame(val_x))
    test = reduce_mem_usage(pd.DataFrame(test))
    y = pd.DataFrame(y)
    val_y = pd.DataFrame(val_y)

    feats = [f_ for f_ in features if f_ not in ['SK_ID_CURR']]
    train.columns = features
    train = train[feats]
    test.columns = features
    # NOTE(review): val_x is neither relabelled nor stripped of SK_ID_CURR,
    # unlike train -- confirm downstream code selects val_x[feats] itself.

    # Each frame is written exactly once.
    outputs = (
        ('train_.csv', train),
        ('y_.csv', y),
        ('test_.csv', test),
        ('val_x_.csv', val_x),
        ('val_y_.csv', val_y),
    )
    for file_name, frame in outputs:
        frame.to_csv(os.path.join(data_dir, file_name), index=False)

    return train, test, y, val_x, val_y, features, feats




In [2]:
# Run the pipeline defined above: fetch the datasets from smote(), downcast
# their dtypes and export them as CSV files under ../data.
train, test, y,val_x, val_y, features, feats = reduce()

**************reduce memory usage...**************
********************smote ...*********************
*****************NaN process ...******************
****************feature combine...****************
******************merge data ...******************
bureau_balance shape: (27299925, 3)
bureau shape: (1716428, 17)
previous_application shape: (1670214, 37)
pos_cash_balance shape: (10001358, 8)
credit_card_balance shape: (3840312, 23)
installment_payment shape: (13605401, 8)
train shape: (307511, 122)
test shape: (48744, 121)
(307511, 389) (48744, 389)
Memory usage of dataframe is 823.49 MB
Memory usage after optimization is: 212.73 MB
Decreased by 74.2%
Memory usage of dataframe is 91.50 MB
Memory usage after optimization is: 23.64 MB
Decreased by 74.2%
Memory usage of dataframe is 145.04 MB
Memory usage after optimization is: 37.47 MB
Decreased by 74.2%


In [15]:
# Preview the training features returned by reduce(); SK_ID_CURR has already
# been dropped from this frame.
train.head()

Unnamed: 0,NAME_CONTRACT_TYPE,CODE_GENDER,FLAG_OWN_CAR,FLAG_OWN_REALTY,CNT_CHILDREN,AMT_INCOME_TOTAL,AMT_CREDIT,AMT_ANNUITY,AMT_GOODS_PRICE,NAME_TYPE_SUITE,...,inst_AMT_PAYMENT,EXT_SOURCE_1over2_NAminus1_Add0.1,EXT_SOURCE_2over1_NAminus1_Add0.1,EXT_SOURCE_1over3_NAminus1_Add0.1,EXT_SOURCE_3over1_NAminus1_Add0.1,EXT_SOURCE_2over3_NAminus1_Add0.1,EXT_SOURCE_3over2_NAminus1_Add0.1,EXT_SOURCE_1_log,EXT_SOURCE_2_log,EXT_SOURCE_3_log
0,0.0,1.0,0.0,0.0,0.0,112500.0,495000.0,21933.0,495000.0,0.0,...,37261.109375,1.194336,0.836914,1.369141,0.730469,1.146484,0.872559,0.522461,0.443359,0.387939
1,0.0,1.0,1.0,0.0,0.0,213750.0,3060000.0,77485.5,3060000.0,0.0,...,4135.241211,0.73584,1.358398,1.291016,0.774414,1.754883,0.569824,0.416504,0.552734,0.320312
2,0.0,1.0,0.0,0.0,0.0,450000.0,1417495.5,136687.5,1363500.0,0.0,...,33406.464844,1.487305,0.672363,1.40918,0.709961,0.947266,1.055664,0.604492,0.422363,0.444824
3,0.0,0.0,0.0,0.0,0.0,112500.0,467257.5,21910.5,328500.0,0.0,...,5104.342285,-1.522461,-0.656738,-1.567383,-0.637695,1.029297,0.971191,-1.0,0.399414,0.387939
4,0.0,0.0,0.0,1.0,0.0,157500.0,848745.0,36090.0,675000.0,0.0,...,9121.095703,-1.067383,-0.936523,-1.820312,-0.549316,1.705078,0.586426,-1.0,0.555664,0.33252


In [16]:
# Preview the test features; reduce() keeps SK_ID_CURR in this frame.
test.head()

Unnamed: 0,SK_ID_CURR,NAME_CONTRACT_TYPE,CODE_GENDER,FLAG_OWN_CAR,FLAG_OWN_REALTY,CNT_CHILDREN,AMT_INCOME_TOTAL,AMT_CREDIT,AMT_ANNUITY,AMT_GOODS_PRICE,...,inst_AMT_PAYMENT,EXT_SOURCE_1over2_NAminus1_Add0.1,EXT_SOURCE_2over1_NAminus1_Add0.1,EXT_SOURCE_1over3_NAminus1_Add0.1,EXT_SOURCE_3over1_NAminus1_Add0.1,EXT_SOURCE_2over3_NAminus1_Add0.1,EXT_SOURCE_3over2_NAminus1_Add0.1,EXT_SOURCE_1_log,EXT_SOURCE_2_log,EXT_SOURCE_3_log
0,100001,0,1,0,0,0,135000.0,568800.0,20560.5,450000.0,...,5885.132324,0.958496,1.042969,3.285156,0.304443,3.427734,0.291748,0.561035,0.582031,0.147949
1,100005,0,0,0,0,0,99000.0,222768.0,17370.0,180000.0,...,6240.205078,1.698242,0.588867,1.248047,0.80127,0.734863,1.360352,0.447998,0.255859,0.359863
2,100013,0,0,1,0,0,202500.0,663264.0,69777.0,630000.0,...,9740.235352,-1.125,-0.888672,-1.265625,-0.790039,1.125,0.88916,-1.0,0.530273,0.476807
3,100028,0,1,0,0,2,315000.0,1575000.0,49018.5,1575000.0,...,4356.731445,1.026367,0.974121,0.87793,1.138672,0.855469,1.168945,0.422363,0.411865,0.478027
4,100038,0,0,1,1,1,180000.0,625500.0,32067.0,625500.0,...,11100.337891,0.574707,1.740234,-0.335693,-2.978516,-0.583984,-1.711914,0.184082,0.354736,-1.0


In [17]:
# Preview the training labels (a single-column DataFrame built from y).
y.head()

Unnamed: 0,0
0,0
1,0
2,0
3,1
4,0


In [18]:
# Preview the validation features; unlike train, reduce() does not drop
# SK_ID_CURR from this frame.
val_x.head()

Unnamed: 0,SK_ID_CURR,NAME_CONTRACT_TYPE,CODE_GENDER,FLAG_OWN_CAR,FLAG_OWN_REALTY,CNT_CHILDREN,AMT_INCOME_TOTAL,AMT_CREDIT,AMT_ANNUITY,AMT_GOODS_PRICE,...,inst_AMT_PAYMENT,EXT_SOURCE_1over2_NAminus1_Add0.1,EXT_SOURCE_2over1_NAminus1_Add0.1,EXT_SOURCE_1over3_NAminus1_Add0.1,EXT_SOURCE_3over1_NAminus1_Add0.1,EXT_SOURCE_2over3_NAminus1_Add0.1,EXT_SOURCE_3over2_NAminus1_Add0.1,EXT_SOURCE_1_log,EXT_SOURCE_2_log,EXT_SOURCE_3_log
32413,137571,0,1,0,0,0,90000.0,337500.0,20394.0,337500.0,...,10200.322266,-1.148438,-0.871094,1.0,1.0,-0.871094,-1.148438,-1.0,0.520996,-1.0
226702,362592,0,0,1,1,0,216000.0,835380.0,31086.0,675000.0,...,-1.0,-1.296875,-0.770996,-2.800781,-0.356934,2.160156,0.462891,-1.0,0.466309,0.199951
106022,223011,0,1,1,0,3,90000.0,450000.0,20979.0,450000.0,...,13899.285156,-4.507812,-0.221924,1.0,1.0,-0.221924,-4.507812,-1.0,0.095032,-1.0
62854,172904,0,1,0,0,0,247500.0,2290743.0,67108.5,2047500.0,...,71848.914062,-4.050781,-0.246948,-2.957031,-0.338135,0.72998,1.369141,-1.0,0.115356,0.186035
232481,369273,0,0,1,1,0,135000.0,472500.0,44635.5,454500.0,...,15523.151367,1.185547,0.843262,-1.005859,-0.994141,-0.848633,-1.178711,0.59082,0.509277,-1.0


In [19]:
# Reload the exported training CSV to check the round trip; it was written
# with index=False, so no extra index column should appear.
train_ = pd.read_csv('../data/train_.csv')
train_.head()

Unnamed: 0,NAME_CONTRACT_TYPE,CODE_GENDER,FLAG_OWN_CAR,FLAG_OWN_REALTY,CNT_CHILDREN,AMT_INCOME_TOTAL,AMT_CREDIT,AMT_ANNUITY,AMT_GOODS_PRICE,NAME_TYPE_SUITE,...,inst_AMT_PAYMENT,EXT_SOURCE_1over2_NAminus1_Add0.1,EXT_SOURCE_2over1_NAminus1_Add0.1,EXT_SOURCE_1over3_NAminus1_Add0.1,EXT_SOURCE_3over1_NAminus1_Add0.1,EXT_SOURCE_2over3_NAminus1_Add0.1,EXT_SOURCE_3over2_NAminus1_Add0.1,EXT_SOURCE_1_log,EXT_SOURCE_2_log,EXT_SOURCE_3_log
0,0.0,1.0,0.0,0.0,0.0,112500.0,495000.0,21933.0,495000.0,0.0,...,37261.11,1.194,0.837,1.369,0.7305,1.146,0.8726,0.5225,0.4434,0.388
1,0.0,1.0,1.0,0.0,0.0,213750.0,3060000.0,77485.5,3060000.0,0.0,...,4135.241,0.736,1.358,1.291,0.7744,1.755,0.57,0.4165,0.5527,0.3203
2,0.0,1.0,0.0,0.0,0.0,450000.0,1417495.5,136687.5,1363500.0,0.0,...,33406.465,1.487,0.6724,1.409,0.71,0.9473,1.056,0.6045,0.4224,0.4448
3,0.0,0.0,0.0,0.0,0.0,112500.0,467257.5,21910.5,328500.0,0.0,...,5104.3423,-1.522,-0.6567,-1.567,-0.6377,1.029,0.971,-1.0,0.3994,0.388
4,0.0,0.0,0.0,1.0,0.0,157500.0,848745.0,36090.0,675000.0,0.0,...,9121.096,-1.067,-0.9365,-1.82,-0.5493,1.705,0.5864,-1.0,0.5557,0.3325


In [8]:
# NOTE(review): the saved output of this cell carries an older execution count
# than the reload cell and shows an extra 'Unnamed: 0' column -- it appears to
# predate the index=False export. Re-run or remove this cell.
train_.head()

Unnamed: 0.1,Unnamed: 0,NAME_CONTRACT_TYPE,CODE_GENDER,FLAG_OWN_CAR,FLAG_OWN_REALTY,CNT_CHILDREN,AMT_INCOME_TOTAL,AMT_CREDIT,AMT_ANNUITY,AMT_GOODS_PRICE,...,inst_AMT_PAYMENT,EXT_SOURCE_1over2_NAminus1_Add0.1,EXT_SOURCE_2over1_NAminus1_Add0.1,EXT_SOURCE_1over3_NAminus1_Add0.1,EXT_SOURCE_3over1_NAminus1_Add0.1,EXT_SOURCE_2over3_NAminus1_Add0.1,EXT_SOURCE_3over2_NAminus1_Add0.1,EXT_SOURCE_1_log,EXT_SOURCE_2_log,EXT_SOURCE_3_log
0,0,0.0,1.0,0.0,0.0,0.0,112500.0,495000.0,21933.0,495000.0,...,37261.11,1.194,0.837,1.369,0.7305,1.146,0.8726,0.5225,0.4434,0.388
1,1,0.0,1.0,1.0,0.0,0.0,213750.0,3060000.0,77485.5,3060000.0,...,4135.241,0.736,1.358,1.291,0.7744,1.755,0.57,0.4165,0.5527,0.3203
2,2,0.0,1.0,0.0,0.0,0.0,450000.0,1417495.5,136687.5,1363500.0,...,33406.465,1.487,0.6724,1.409,0.71,0.9473,1.056,0.6045,0.4224,0.4448
3,3,0.0,0.0,0.0,0.0,0.0,112500.0,467257.5,21910.5,328500.0,...,5104.3423,-1.522,-0.6567,-1.567,-0.6377,1.029,0.971,-1.0,0.3994,0.388
4,4,0.0,0.0,0.0,1.0,0.0,157500.0,848745.0,36090.0,675000.0,...,9121.096,-1.067,-0.9365,-1.82,-0.5493,1.705,0.5864,-1.0,0.5557,0.3325


In [11]:
# Scratch check: write a small all-zero 4x3 frame with index=False.
sample = pd.DataFrame(np.zeros((4,3)))
sample.to_csv('../data/sample.csv', index = False)

In [12]:
# Scratch check: read ../data/sample.csv back and inspect its columns to see
# whether an index column was added by the export.
sample = pd.read_csv('../data/sample.csv')
sample.head()

Unnamed: 0,0,1,2
0,0.0,0.0,0.0
1,0.0,0.0,0.0
2,0.0,0.0,0.0
3,0.0,0.0,0.0
