In [1]:
import os
cwd = os.getcwd()
data_dir = cwd + '/dataset'
os.listdir(data_dir)

['application_test.csv',
 '.DS_Store',
 'HomeCredit_columns_description.csv',
 'POS_CASH_balance.csv',
 'credit_card_balance.csv',
 'installments_payments.csv',
 'application_train.csv',
 'bureau.csv',
 'previous_application.csv',
 'HomeCredit_columns_description.xls',
 'bureau_balance.csv',
 'sample_submission.csv']

In [2]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import plotly.offline as py
py.init_notebook_mode(connected=True)
import plotly.graph_objs as go
import plotly.tools as tls

%matplotlib inline
pd.set_option('max_columns',500)
pd.set_option('max_rows',500)

In [3]:
data = pd.read_csv(data_dir + "/application_train.csv")

In [None]:
data.head(20)

In [None]:
#null data clean
#print(appli_train.isnull().any())
nan_tmp = data.isnull().any()
#print(nan_tmp.loc[nan_tmp==True].index.values[0])
#get the null columns into the drop list
nan_drop_list = list(nan_tmp.loc[nan_tmp==True].index.values)
print(len(nan_drop_list))

In [None]:
clean_data_list = data.columns.drop(nan_drop_list)
clean_data = data[clean_data_list]

In [None]:
nan_drop_list.insert(0,"TARGET")
nan_drop_list.insert(0,"SK_ID_CURR")
nan_data = data[nan_drop_list]

In [None]:
nan_data.head(5)

In [None]:
clean_data.head(20)

In [None]:
del data

In [None]:
a = clean_data[['NAME_CONTRACT_TYPE']].groupby("NAME_CONTRACT_TYPE").groups.keys()
list(a)

In [None]:
def draw_graph(x0,x1,x,y,names=['default1','default2'], features=['default1','default2']):
    """Use to generate a stacked histogram of x0 & x1 value counts, mixed with a scatter of percentage.
        x0: trace 1 x value,list()
        x1: trace 2 x value,list()
        x: trace 3 x value, for percentage,list()
        y: trace 3 y value, for percentage,list()
        names: the names of x0, x1 stand for,list(),len=2
        features: The features we compared,list(),len=2
    """
    trace1 = go.Histogram(
        x=x0,
        histnorm='count',
        name=names[0],
        opacity=0.75,
        yaxis='y2'
    )
    trace2 = go.Histogram(
        x=x1,
        name=names[1],
        opacity=0.75,       
        yaxis='y2'
    )
    trace3 = go.Scatter(
        x=x,
        y=y,
        name='Percentage'
 
    )
    data = [trace1, trace2, trace3]

    layout = go.Layout(
        title=features[0]+' .vs '+features[1],
        xaxis=dict(
            title='Value',
            categoryorder = "category ascending"
        ),
        yaxis2=dict(
            title='Count',
            overlaying='y',
            side='right'
        ),
        yaxis=dict(
            title='Percentage',
            range=[0,np.max([np.max(y)+0.01,0.1])]
        ),
        bargap=0.2,
        bargroupgap=0.1,
        barmode='stack'
    )
    fig = go.Figure(data=data, layout=layout)
    py.iplot(fig, filename='styled histogram')
    
def gen_graph(features):
    '''
        use to generate the parameters to draw the graph
        fetaures: stands for compared 2 features,list(),len=2
    '''
    corr = clean_data[features]
    #bar graph
    x0 = corr.loc[corr[features[0]]==0,features[1]].values
    x1 = corr.loc[corr[features[0]]==1,features[1]].values
    names = [features[0]+'=0',features[0]+'=1']
    #scatter graph, for pecentage
    data_type = clean_data[[features[1]]].groupby(features[1]).groups.keys()
    x = list(data_type)
    y = []
    for a in data_type:
        y.append(corr.loc[corr[features[1]]==a,features[0]].mean())
    ind = np.lexsort((y,x))
    x = [x[i] for i in ind]
    y = [y[i] for i in ind]
    draw_graph(x0,x1,x,y,names, features)

## 特征分析

** 'TARGET' vs 'NAME_CONTRACT_TYPE' **

In [None]:
features = ['TARGET','NAME_CONTRACT_TYPE']
gen_graph(features)
#draw_graph(x0,x1,x,y,names, features)

** 'TARGET' vs 'CODE_GENDER'**

In [None]:
features = ['TARGET','CODE_GENDER']
gen_graph(features)
#draw_graph(x0,x1,x,y,names, features)

In [None]:
clean_data.loc[clean_data['CODE_GENDER']=='XNA'].head(5)

** 'TARGET' vs "FLAG_OWN_CAR" ** 

In [None]:
features = ['TARGET','FLAG_OWN_CAR']
gen_graph(features)
#draw_graph(x0,x1,x,y,names, features)

** 'TARGET' vs "FLAG_OWN_REALTY" ** 

In [None]:
features = ['TARGET','FLAG_OWN_REALTY']
gen_graph(features)
#draw_graph(x0,x1,x,y,names, features)

*** 'TARGET' vs "CNT_CHILDREN" ***

In [None]:
features = ['TARGET','CNT_CHILDREN']
corr = clean_data[features]
#corr[features[1]] = pd.qcut(corr[features[1]],[0.5,1.5],labels=['None','One','Multi'])
corr.loc[corr[features[1]]>1, features[1]] = 'Multi'
corr.loc[corr[features[1]]==0, features[1]] = 'Zero'
corr.loc[corr[features[1]]==1, features[1]] = 'One'
 #bar graph
x0 = corr.loc[corr[features[0]]==0,features[1]].values
x1 = corr.loc[corr[features[0]]==1,features[1]].values
names = [features[0]+'=0',features[0]+'=1']
#scatter graph, for pecentage
data_type = corr[[features[1]]].groupby(features[1]).groups.keys()
x = list(data_type)
y = []
for a in data_type:
    y.append(corr.loc[corr[features[1]]==a,features[0]].mean())
ind = np.lexsort((y,x))
x = [x[i] for i in ind]
y = [y[i] for i in ind]
draw_graph(x0,x1,x,y,names, features)

** 'TARGET' vs "AMT_INCOME_TOTAL" **

In [None]:
features = ['TARGET','AMT_INCOME_TOTAL']
corr = clean_data[features]
corr[features[1]] = pd.qcut(corr[features[1]],3,labels=['low','median','high'])
 #bar graph
x0 = corr.loc[corr[features[0]]==0,features[1]].values
x1 = corr.loc[corr[features[0]]==1,features[1]].values
names = [features[0]+'=0',features[0]+'=1']
#scatter graph, for pecentage
data_type = corr[[features[1]]].groupby(features[1]).groups.keys()
x = list(data_type)
y = []
for a in data_type:
    y.append(corr.loc[corr[features[1]]==a,features[0]].mean())
ind = np.lexsort((y,x))
x = [x[i] for i in ind]
y = [y[i] for i in ind]
draw_graph(x0,x1,x,y,names, features)

** 'TARGET' vs "AMT_CREDIT" ** 

In [None]:
features = ['TARGET','AMT_CREDIT']
corr = clean_data[features]
corr[features[1]] = pd.qcut(corr[features[1]],3,labels=['low','median','high'])
 #bar graph
x0 = corr.loc[corr[features[0]]==0,features[1]].values
x1 = corr.loc[corr[features[0]]==1,features[1]].values
names = [features[0]+'=0',features[0]+'=1']
#scatter graph, for pecentage
data_type = corr[[features[1]]].groupby(features[1]).groups.keys()
x = list(data_type)
y = []
for a in data_type:
    y.append(corr.loc[corr[features[1]]==a,features[0]].mean())
ind = np.lexsort((y,x))
x = [x[i] for i in ind]
y = [y[i] for i in ind]
draw_graph(x0,x1,x,y,names, features)

In [None]:
features = ['TARGET','NAME_INCOME_TYPE']
gen_graph(features)

** TARGET vs NAME_EDUCATION_TYPE **

In [None]:
features = ['TARGET','NAME_EDUCATION_TYPE']
gen_graph(features)

** TARGET vs NAME_FAMILY_STATUS **

In [None]:
features = ['TARGET','NAME_FAMILY_STATUS']
gen_graph(features)

**TARGET vs NAME_HOUSING_TYPE ** 

In [None]:
features = ['TARGET','NAME_HOUSING_TYPE']
gen_graph(features)

** TARGET vs DAYS_BIRTH **

In [None]:
corr.head(5)

In [None]:
temp = [[x[i],y[i]] for i in range(len(x))]
tmp = np.array(temp)
tmp

In [None]:
ind = np.lexsort((y,x))
y = [y[i] for i in ind]
x = [x[i] for i in ind]
x,y

In [None]:
np.max([np.max(y),0.1])

In [None]:
'''clean_columns = clean_data.columns.drop(["SK_ID_CURR","TARGET"]).values
for column in clean_columns:
    #print(column)
    if type(clean_data[column].values[0]) == type('a'):
        data_tmp = clean_data[[column, 'TARGET']]
        value_keys= data_tmp.groupby(column).groups.keys()
        for key in value_keys:
            x = data_tmp.loc[data_tmp[column]==key, 'TARGET']'''


In [8]:
data.loc[data["SK_ID_CURR"]==215354]

Unnamed: 0,SK_ID_CURR,TARGET,NAME_CONTRACT_TYPE,CODE_GENDER,FLAG_OWN_CAR,FLAG_OWN_REALTY,CNT_CHILDREN,AMT_INCOME_TOTAL,AMT_CREDIT,AMT_ANNUITY,AMT_GOODS_PRICE,NAME_TYPE_SUITE,NAME_INCOME_TYPE,NAME_EDUCATION_TYPE,NAME_FAMILY_STATUS,NAME_HOUSING_TYPE,REGION_POPULATION_RELATIVE,DAYS_BIRTH,DAYS_EMPLOYED,DAYS_REGISTRATION,DAYS_ID_PUBLISH,OWN_CAR_AGE,FLAG_MOBIL,FLAG_EMP_PHONE,FLAG_WORK_PHONE,FLAG_CONT_MOBILE,FLAG_PHONE,FLAG_EMAIL,OCCUPATION_TYPE,CNT_FAM_MEMBERS,REGION_RATING_CLIENT,REGION_RATING_CLIENT_W_CITY,WEEKDAY_APPR_PROCESS_START,HOUR_APPR_PROCESS_START,REG_REGION_NOT_LIVE_REGION,REG_REGION_NOT_WORK_REGION,LIVE_REGION_NOT_WORK_REGION,REG_CITY_NOT_LIVE_CITY,REG_CITY_NOT_WORK_CITY,LIVE_CITY_NOT_WORK_CITY,ORGANIZATION_TYPE,EXT_SOURCE_1,EXT_SOURCE_2,EXT_SOURCE_3,APARTMENTS_AVG,BASEMENTAREA_AVG,YEARS_BEGINEXPLUATATION_AVG,YEARS_BUILD_AVG,COMMONAREA_AVG,ELEVATORS_AVG,ENTRANCES_AVG,FLOORSMAX_AVG,FLOORSMIN_AVG,LANDAREA_AVG,LIVINGAPARTMENTS_AVG,LIVINGAREA_AVG,NONLIVINGAPARTMENTS_AVG,NONLIVINGAREA_AVG,APARTMENTS_MODE,BASEMENTAREA_MODE,YEARS_BEGINEXPLUATATION_MODE,YEARS_BUILD_MODE,COMMONAREA_MODE,ELEVATORS_MODE,ENTRANCES_MODE,FLOORSMAX_MODE,FLOORSMIN_MODE,LANDAREA_MODE,LIVINGAPARTMENTS_MODE,LIVINGAREA_MODE,NONLIVINGAPARTMENTS_MODE,NONLIVINGAREA_MODE,APARTMENTS_MEDI,BASEMENTAREA_MEDI,YEARS_BEGINEXPLUATATION_MEDI,YEARS_BUILD_MEDI,COMMONAREA_MEDI,ELEVATORS_MEDI,ENTRANCES_MEDI,FLOORSMAX_MEDI,FLOORSMIN_MEDI,LANDAREA_MEDI,LIVINGAPARTMENTS_MEDI,LIVINGAREA_MEDI,NONLIVINGAPARTMENTS_MEDI,NONLIVINGAREA_MEDI,FONDKAPREMONT_MODE,HOUSETYPE_MODE,TOTALAREA_MODE,WALLSMATERIAL_MODE,EMERGENCYSTATE_MODE,OBS_30_CNT_SOCIAL_CIRCLE,DEF_30_CNT_SOCIAL_CIRCLE,OBS_60_CNT_SOCIAL_CIRCLE,DEF_60_CNT_SOCIAL_CIRCLE,DAYS_LAST_PHONE_CHANGE,FLAG_DOCUMENT_2,FLAG_DOCUMENT_3,FLAG_DOCUMENT_4,FLAG_DOCUMENT_5,FLAG_DOCUMENT_6,FLAG_DOCUMENT_7,FLAG_DOCUMENT_8,FLAG_DOCUMENT_9,FLAG_DOCUMENT_10,FLAG_DOCUMENT_11,FLAG_DOCUMENT_12,FLAG_DOCUMENT_13,FLAG_DOCUMENT_14,FLAG_DOCUMENT_15,FLAG_DOCUMENT_16,FLAG_DOCUMENT_17,FLAG_DOCUMENT_18,FLAG_DOCUMENT_19,FLAG_DOCUMENT_20,FLAG_DOCUMENT_21,AMT_REQ_CREDIT_BUREAU_HOUR,AMT_REQ_CREDIT_BUREAU_DAY,AMT_REQ_CREDIT_BUREAU_WEEK,AMT_REQ_CREDIT_BUREAU_MON,AMT_REQ_CREDIT_BUREAU_QRT,AMT_REQ_CREDIT_BUREAU_YEAR
99355,215354,0,Cash loans,F,Y,N,0,270000.0,720000.0,21181.5,720000.0,Unaccompanied,Working,Higher education,Married,House / apartment,0.028663,-17520,-939,-1203.0,-1064,4.0,1,1,0,1,0,0,Accountants,2.0,2,2,MONDAY,13,0,0,0,0,0,0,Trade: type 7,0.873736,0.522745,0.231439,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,1.0,1.0,1.0,1.0,-1553.0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0.0,0.0,3.0
