In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load in

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the "../input/" directory.
# For example, running this (by clicking run or pressing Shift+Enter) will list the files in the input directory
import os
print(os.listdir("../input"))

# Suppress every warning: the 'ignore' action is installed with no category or module restriction.
# NOTE(review): this also hides deprecation warnings from pandas/seaborn - consider narrowing the filter.
import warnings
warnings.filterwarnings('ignore')

# matplotlib and seaborn for plotting
import matplotlib.pyplot as plt
import seaborn as sns

## Read in Data

In [None]:
# Paths of the application tables inside the Kaggle input directory
train_csv_path = "../input/application_train.csv"
test_csv_path = "../input/application_test.csv"

# Load both tables into DataFrames
train = pd.read_csv(train_csv_path)
test = pd.read_csv(test_csv_path)

## Peek at Data

In [None]:
# Preview the first rows of the training table
train.head()

In [None]:
# Preview the first rows of the test table
test.head()

## Check Missing Values

In [None]:
# Function to calculate missing values by column
def missing_values_table(df):
    """Summarize the columns of ``df`` that contain missing values.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to inspect.

    Returns
    -------
    pandas.DataFrame
        One row per column of ``df`` that has at least one null value,
        indexed by column name, with the columns 'Missing Values' (null
        count) and '% of Total Values' (null count as a percentage of the
        number of rows, rounded to one decimal). Rows are sorted by
        percentage, highest first.

    Notes
    -----
    A two-line summary (total columns, columns with missing values) is
    printed as a side effect.
    """
    # Count nulls once and derive the percentage from the same counts
    mis_val = df.isnull().sum()
    mis_val_percent = 100 * mis_val / len(df)

    # Side-by-side table: column 0 holds the counts, column 1 the percentages
    mis_val_table = pd.concat([mis_val, mis_val_percent], axis=1)
    mis_val_table_ren_columns = mis_val_table.rename(
        columns={0: 'Missing Values', 1: '% of Total Values'})

    # Keep only columns that really have nulls. Filtering on the count (not
    # the percentage) matters for a zero-row frame, where the percentage is
    # 0/0 = NaN and a "!= 0" test would wrongly keep every column.
    has_missing = mis_val_table_ren_columns['Missing Values'] > 0
    mis_val_table_ren_columns = (
        mis_val_table_ren_columns[has_missing]
        .sort_values('% of Total Values', ascending=False)
        .round(1)
    )

    # Print some summary information
    print("Your selected dataframe has {} columns.\n"
          "There are {} columns that have missing values."
          .format(df.shape[1], mis_val_table_ren_columns.shape[0]))

    # Return the dataframe with missing information
    return mis_val_table_ren_columns

In [None]:
# Build the missing-value summary for the training table
missing_train = missing_values_table(train)
# Display at most the first 100 rows of that summary
missing_train.head(100)

# Data Types

In [None]:
# Count how many columns of the training table share each dtype
train.dtypes.value_counts()

In [None]:
# Number of unique classes in each object column.
# DataFrame.nunique counts the distinct non-null values per column, which is
# what applying Series.nunique column by column computed.
train.select_dtypes('object').nunique()

# Descriptive Statistics (To detect outliers and anomalies in continuous variables)

## Age

In [None]:
# Dividing by -365 converts DAYS_BIRTH from days to years and flips its sign
# (presumably the column stores negative day offsets - TODO confirm)
(train['DAYS_BIRTH'] / -365).describe()

## Amount of time at current job

In [None]:
# Summary statistics of DAYS_EMPLOYED in its raw units (no conversion applied here)
train['DAYS_EMPLOYED'].describe()

## Number of Children

In [None]:
# Summary statistics of the CNT_CHILDREN column
train['CNT_CHILDREN'].describe()

## Income

In [None]:
# Summary statistics of the AMT_INCOME_TOTAL column
train['AMT_INCOME_TOTAL'].describe()

## Number of Days before application that client changed his/her registration

In [None]:
# Summary statistics of DAYS_REGISTRATION in its raw units (no conversion applied here)
train['DAYS_REGISTRATION'].describe()

# Distributions

## Target (repaid loan or not)

In [None]:
# TARGET value 0 means the loan was repaid, value 1 means it was not repaid.
plt.figure(figsize=(15,5))
# Name the column via x= and pass the frame via data=: positional data arguments
# are not accepted by newer seaborn releases.
sns.countplot(x='TARGET', data=train)
plt.xlabel('Target (0 = repaid, 1 = not repaid)')
plt.ylabel('Count')
plt.title('Distribution of Loan Repayment');

## Contract Type

In [None]:
# Bar chart of how many rows fall into each contract type
plt.figure(figsize=(15,5))
# Name the column via x= instead of passing its values positionally next to
# data=, which newer seaborn rejects (its first positional parameter is `data`).
sns.countplot(x='NAME_CONTRACT_TYPE', data=train)
plt.xlabel('Contract Type')
plt.ylabel('Count')
plt.title('Distribution of Contract Types');

## Gender

In [None]:
# Bar chart of how many rows fall into each gender code
plt.figure(figsize=(15,5))
# Name the column via x= instead of passing its values positionally next to
# data=, which newer seaborn rejects (its first positional parameter is `data`).
sns.countplot(x='CODE_GENDER', data=train)
plt.xlabel('Gender')
plt.ylabel('Number of Clients')
plt.title('Distribution of Gender');

## Education Type/Level

In [None]:
# Bar chart of how many rows fall into each education type
plt.figure(figsize=(15,5))
# Name the column via x= instead of passing its values positionally next to
# data=, which newer seaborn rejects (its first positional parameter is `data`).
sns.countplot(x='NAME_EDUCATION_TYPE', data=train)
plt.xlabel('Education Type/Level')
plt.ylabel('Number of Clients')
plt.title('Distribution of Education Type/Level');

## Car Ownership

In [None]:
# Bar chart of how many rows have each FLAG_OWN_CAR value
plt.figure(figsize=(15,5))
# Name the column via x= instead of passing its values positionally next to
# data=, which newer seaborn rejects (its first positional parameter is `data`).
sns.countplot(x='FLAG_OWN_CAR', data=train)
plt.xlabel('Car Ownership (Y = Yes, N = No)')
plt.ylabel('Number of Clients')
plt.title('Distribution of Car Ownership');

## Home Ownership

In [None]:
# Bar chart of how many rows have each FLAG_OWN_REALTY value
plt.figure(figsize=(15,5))
# Name the column via x= instead of passing its values positionally next to
# data=, which newer seaborn rejects (its first positional parameter is `data`).
sns.countplot(x='FLAG_OWN_REALTY', data=train)
plt.xlabel('Home Ownership (Y = Yes, N = No)')
plt.ylabel('Number of Clients')
plt.title('Distribution of Home Ownership');

## Number of Children

In [None]:
# Bar chart of how many rows have each CNT_CHILDREN value
plt.figure(figsize=(15,5))
# Name the column via x= instead of passing its values positionally next to
# data=, which newer seaborn rejects (its first positional parameter is `data`).
sns.countplot(x='CNT_CHILDREN', data=train)
plt.xlabel('Number of Children')
plt.ylabel('Number of Clients')
plt.title('Distribution of Children Per Client');

## Family Status

In [None]:
# Bar chart of how many rows fall into each family status
plt.figure(figsize=(15,5))
# Name the column via x= instead of passing its values positionally next to
# data=, which newer seaborn rejects (its first positional parameter is `data`).
sns.countplot(x='NAME_FAMILY_STATUS', data=train)
plt.xlabel('Family Status')
plt.ylabel('Number of Clients')
plt.title('Family Status Distribution');

## Housing Type

In [None]:
# Bar chart of how many rows fall into each housing type
plt.figure(figsize=(15,5))
# Name the column via x= instead of passing its values positionally next to
# data=, which newer seaborn rejects (its first positional parameter is `data`).
sns.countplot(x='NAME_HOUSING_TYPE', data=train)
plt.xlabel('Housing Type')
plt.ylabel('Number of Clients')
plt.title('Housing Type Distribution');

## Age of Client

In [None]:
# Make DAYS_BIRTH non-negative. This overwrites the column in place (the
# following age plot reads it), and abs() keeps the cell safe to re-run.
train['DAYS_BIRTH'] = train['DAYS_BIRTH'].abs()

plt.figure(figsize=(15,5))
# distplot overlays a KDE by default, which normalizes the histogram, so the
# y-axis is a density rather than a client count.
# NOTE(review): distplot is deprecated in recent seaborn releases; move to
# histplot/displot once the environment's seaborn version is confirmed.
sns.distplot(train['DAYS_BIRTH'] / 365, bins=5)
plt.xlabel('Age (Years)')
plt.ylabel('Density')
plt.title('Age Distribution');

## Age and Target

In [None]:
plt.figure(figsize=(15,5))

# Age in years; abs() makes the cell independent of whether DAYS_BIRTH has
# already been sign-flipped by an earlier cell.
age_years = train['DAYS_BIRTH'].abs() / 365

# KDE plot of loans that were repaid on time
sns.kdeplot(age_years[train['TARGET'] == 0], label = 'target == 0')

# KDE plot of loans which were not repaid on time
sns.kdeplot(age_years[train['TARGET'] == 1], label = 'target == 1')

# Labeling of plot; the explicit legend call makes the two labels visible
# even when seaborn does not draw a legend on its own.
plt.legend()
plt.xlabel('Age (years)')
plt.ylabel('Density')
plt.title('Distribution of Ages');

In [None]:
plt.figure(figsize=(15,5))
# distplot overlays a KDE by default, so the y-axis is a density.
sns.distplot(train['REGION_POPULATION_RELATIVE'],bins=5)
# Label the figure so it can be read on its own
plt.xlabel('Region Population (Relative)')
plt.ylabel('Density')
plt.title('Distribution of Relative Region Population');

In [None]:
plt.figure(figsize=(15,5))

# The column is plotted as-is: the "/ 365" days-to-years conversion copied
# from the age plot does not apply to REGION_POPULATION_RELATIVE.

# KDE plot of loans that were repaid on time
sns.kdeplot(train.loc[train['TARGET'] == 0, 'REGION_POPULATION_RELATIVE'], label = 'target == 0')

# KDE plot of loans which were not repaid on time
sns.kdeplot(train.loc[train['TARGET'] == 1, 'REGION_POPULATION_RELATIVE'], label = 'target == 1')

# Labeling of plot; the explicit legend call makes the two labels visible
# even when seaborn does not draw a legend on its own.
plt.legend()
plt.xlabel('Region Population (Relative)')
plt.ylabel('Density')
plt.title('Distribution of Relative Region Population by Target');

# Correlations with Target for Ext Source Variables

In [None]:
# Pull TARGET together with the three EXT_SOURCE columns ...
ext_source_columns = ['TARGET', 'EXT_SOURCE_1', 'EXT_SOURCE_2', 'EXT_SOURCE_3']
select_data = train.loc[:, ext_source_columns]

# ... and display their pairwise correlation matrix
data_corrs = select_data.corr()
data_corrs