In [5]:
import pandas as pd 
import numpy as np 
import os , sys
from plotly import __version__
from plotly.offline import download_plotlyjs, init_notebook_mode, plot, iplot

def reduce_mem_usage(df):
    """ iterate through all the columns of a dataframe and modify the data type
        to reduce memory usage.        
    """
    start_mem = df.memory_usage().sum() / 1024**2
    print('Memory usage of dataframe is {:.2f} MB'.format(start_mem))
    
    for col in df.columns:
        col_type = df[col].dtype
        
        if col_type != object:
            c_min = df[col].min()
            c_max = df[col].max()
            if str(col_type)[:3] == 'int':
                if c_min > np.iinfo(np.int8).min and c_max < np.iinfo(np.int8).max:
                    df[col] = df[col].astype(np.int8)
                elif c_min > np.iinfo(np.int16).min and c_max < np.iinfo(np.int16).max:
                    df[col] = df[col].astype(np.int16)
                elif c_min > np.iinfo(np.int32).min and c_max < np.iinfo(np.int32).max:
                    df[col] = df[col].astype(np.int32)
                elif c_min > np.iinfo(np.int64).min and c_max < np.iinfo(np.int64).max:
                    df[col] = df[col].astype(np.int64)  
            else:
                if c_min > np.finfo(np.float16).min and c_max < np.finfo(np.float16).max:
                    df[col] = df[col].astype(np.float16)
                elif c_min > np.finfo(np.float32).min and c_max < np.finfo(np.float32).max:
                    df[col] = df[col].astype(np.float32)
                else:
                    df[col] = df[col].astype(np.float64)
        else:
            pass
            #df[col] = df[col].astype('category')

    end_mem = df.memory_usage().sum() / 1024**2
    print('Memory usage after optimization is: {:.2f} MB'.format(end_mem))
    print('Decreased by {:.1f}%'.format(100 * (start_mem - end_mem) / start_mem))
    
    return df


def sep_col(df):
    ca_cols = []
    int_cols = []
    float_cols = []
    other_cols = []
    for col in df.columns:
        if df[col].dtype in ['category', 'object']:
            ca_cols.append(col)
        elif df[col].dtype in ['int8', 'int16', 'int32']:
            int_cols.append(col)
        elif df[col].dtype in ['float16', 'float32', 'float64']:
            float_cols.append(col)
        else:
            other_cols.append(col)
    return ca_cols, int_cols, float_cols, other_cols


def get_data(file):
    """create a dataframe and optimize its memory usage"""
    df = pd.read_csv(file)
    df = reduce_mem_usage(df)
    return df

pd.options.display.max_columns = 1000
pd.options.display.max_rows = 1000
tar_dir = '../../.kaggle/competitions/home-credit-default-risk/'

In [2]:
os.listdir(tar_dir)

['application_test.csv',
 'application_train.csv',
 'bureau.csv',
 'bureau_balance.csv',
 'credit_card_balance.csv',
 'HomeCredit_columns_description.csv',
 'installments_payments.csv',
 'POS_CASH_balance.csv',
 'previous_application.csv',
 'sample_submission.csv']

In [6]:
df = get_data(tar_dir+'installments_payments.csv')


Memory usage of dataframe is 830.41 MB
Memory usage after optimization is: 311.40 MB
Decreased by 62.5%


In [9]:
df.head()

Unnamed: 0,SK_ID_PREV,SK_ID_CURR,NUM_INSTALMENT_VERSION,NUM_INSTALMENT_NUMBER,DAYS_INSTALMENT,DAYS_ENTRY_PAYMENT,AMT_INSTALMENT,AMT_PAYMENT
0,1054186,161674,1.0,6,-1180.0,-1187.0,6948.359863,6948.359863
1,1330831,151639,0.0,34,-2156.0,-2156.0,1716.525024,1716.525024
2,2085231,193053,2.0,1,-63.0,-63.0,25425.0,25425.0
3,2452527,199697,1.0,3,-2418.0,-2426.0,24350.130859,24350.130859
4,2714724,167756,1.0,2,-1383.0,-1366.0,2165.040039,2160.584961


In [10]:
df.describe()

Unnamed: 0,SK_ID_PREV,SK_ID_CURR,NUM_INSTALMENT_VERSION,NUM_INSTALMENT_NUMBER,DAYS_INSTALMENT,DAYS_ENTRY_PAYMENT,AMT_INSTALMENT,AMT_PAYMENT
count,13605400.0,13605400.0,13605401.0,13605400.0,13605401.0,13602496.0,13605400.0,13602500.0
mean,1903365.0,278444.9,,18.8709,,,16750.76,16915.04
std,536202.9,102718.3,0.0,26.66407,,,49642.95,53759.81
min,1000001.0,100001.0,0.0,1.0,-2922.0,-4920.0,0.0,0.0
25%,1434191.0,189639.0,0.0,4.0,-1654.0,-1662.0,4226.085,3398.265
50%,1896520.0,278685.0,1.0,8.0,-818.0,-827.0,8884.08,8125.515
75%,2369094.0,367530.0,1.0,19.0,-361.0,-370.0,16710.21,16108.42
max,2843499.0,456255.0,178.0,277.0,-1.0,-1.0,3771488.0,3771488.0


In [31]:
temp = df.groupby(['SK_ID_CURR', 'SK_ID_PREV'])


In [33]:
temp[['NUM_INSTALMENT_NUMBER', 'NUM_INSTALMENT_VERSION'] ].agg(['mean', 'max'])

Unnamed: 0_level_0,Unnamed: 1_level_0,NUM_INSTALMENT_NUMBER,NUM_INSTALMENT_NUMBER,NUM_INSTALMENT_VERSION,NUM_INSTALMENT_VERSION
Unnamed: 0_level_1,Unnamed: 1_level_1,mean,max,mean,max
SK_ID_CURR,SK_ID_PREV,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2
100001,1369693,2.500000,4,1.250000,2.0
100001,1851984,3.000000,4,1.000000,1.0
100002,1038818,10.000000,19,1.052734,2.0
100003,1810518,4.000000,7,1.142578,2.0
100003,2396755,6.500000,12,1.000000,1.0
100003,2636178,3.500000,6,1.000000,1.0
100004,1564014,2.000000,3,1.333008,2.0
100005,2495675,5.000000,9,1.111328,2.0
100006,2078043,1.000000,1,2.000000,2.0
100006,2190416,5.500000,10,1.000000,1.0


In [30]:
df[(df['SK_ID_CURR']==100002      ) & (df['SK_ID_PREV']==1038818       )].sort_values('NUM_INSTALMENT_NUMBER')

Unnamed: 0,SK_ID_PREV,SK_ID_CURR,NUM_INSTALMENT_VERSION,NUM_INSTALMENT_NUMBER,DAYS_INSTALMENT,DAYS_ENTRY_PAYMENT,AMT_INSTALMENT,AMT_PAYMENT
2144879,1038818,100002,1.0,1,-565.0,-587.0,9251.775391,9251.775391
2163032,1038818,100002,1.0,2,-535.0,-562.0,9251.775391,9251.775391
1675768,1038818,100002,1.0,3,-505.0,-529.0,9251.775391,9251.775391
3343696,1038818,100002,1.0,4,-475.0,-498.0,9251.775391,9251.775391
2841063,1038818,100002,1.0,5,-445.0,-468.0,9251.775391,9251.775391
1807424,1038818,100002,1.0,6,-415.0,-446.0,9251.775391,9251.775391
3594829,1038818,100002,1.0,7,-385.0,-412.0,9251.775391,9251.775391
210205,1038818,100002,1.0,8,-355.0,-375.0,9251.775391,9251.775391
607863,1038818,100002,1.0,9,-325.0,-344.0,9251.775391,9251.775391
2646927,1038818,100002,1.0,10,-295.0,-312.0,9251.775391,9251.775391


In [13]:
df_credit = get_data(tar_dir+'credit_card_balance.csv')

Memory usage of dataframe is 673.88 MB
Memory usage after optimization is: 263.69 MB
Decreased by 60.9%


In [57]:
import plotly.graph_objs as go

res = plot([go.Scatter(x=[1, 2, 3], y=[3, 1, 6]), 
            go.Histogram(x=[1, 2, 3], y=[3, 1, 6]), 
            go.Bar(x=[1, 2, 3], y=[3, 1, 6])])

SyntaxError: unexpected EOF while parsing (<ipython-input-57-6c03aa53dd1a>, line 5)

'file://C:\\Users\\kent\\Documents\\GitHub\\temp-plot.html'

In [55]:
from IPython.display import HTML
HTML('<iframe src={} width=100% height=400></iframe>'.format('temp-plot.html'))
#IFrame(res,width=700, height=350)

In [None]:
train = get