In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load in 

# numpy and pandas for data manipulation
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# sklearn preprocessing for dealing with categorical variables
from sklearn.preprocessing import LabelEncoder

# File system management
import os

# Suppress warnings
import warnings
warnings.filterwarnings('ignore')

# matplotlib and seaborn for plotting
import matplotlib.pyplot as plt
import seaborn as sns

In [None]:
# List the files available in the input directory (the path is relative to the notebook's working directory)
print(os.listdir("../input/"))

In [None]:
# Training data: read the application table, report its (rows, columns) shape,
# and display the first rows as the cell output
app_train = pd.read_csv('../input/application_train.csv')
print("Training data shape: ", app_train.shape)
app_train.head()

In [None]:
# Summary statistics of the training data columns (count, mean, std, min, quartiles, max)
app_train.describe()

In [None]:
# Test data: read the application table, report its (rows, columns) shape,
# and display the first rows as the cell output
app_test = pd.read_csv('../input/application_test.csv')
print("Test data shape: ", app_test.shape)
app_test.head()

# Exploratory Data Analysis (EDA)
EDA is an open-ended process in which we calculate statistics and look for anomalies to discover what the data can tell us. For example, the author looks at the "TARGET" column and discovers that we have a class imbalance problem. I will need to understand exactly what that is and why it is a problem by reading the article he provides.

In [None]:
# Number of non-null observations in the TARGET column. It is printed explicitly
# because a notebook cell only displays the value of its LAST expression; a bare
# expression in the middle of the cell is evaluated and then thrown away.
print("Non-null TARGET observations:", app_train['TARGET'].count())
# Number of observations of each class in the column (shown as the cell output)
app_train['TARGET'].value_counts()

In [None]:
# Plot the histogram of the "TARGET" column
app_train['TARGET'].astype(int).plot.hist()
plt.show()
# The diagram shows a class imbalance problem
# Focus 1: Implement several approaches from current research on how to deal with this type of problem

# Checkpoint 1: Write a function that returns summary of missing values
Save this code for my future reference

In [None]:
def missing_values_table(df):
    """
    Summarize the missing values of a data frame, column by column.

    Inputs:
    - df--pandas dataframe: the data frame whose missing values are under investigation.

    Outputs:
    - missing_values_tab--pandas dataframe: one row per column of df that has at least
      one missing value, with the columns 'Missing Values' (count) and
      '% of Total Values' (percentage of rows, rounded to 1 decimal), sorted from the
      most to the least affected column.

    Side effects: prints the number of columns of df and the number of columns that
    have missing values.

    Acknowledgement: Based on Will Koehrsen's Kaggle kernel with slight simplifications.
    """
    # Count the number of missing values in each column
    missing_value_counts = df.isnull().sum() # pandas series

    # Find the percentages of missing values within each column.
    # An empty frame has no rows to divide by; report 0.0 instead of NaN (0 / 0).
    n_rows = len(df)
    if n_rows:
        missing_value_percentage = (100 * missing_value_counts / n_rows).round(1) # pandas series
    else:
        missing_value_percentage = missing_value_counts.astype(float)

    # Make a table of missing values with named columns.
    # pd.concat with axis=1 places the two series side by side.
    missing_values_tab = pd.concat([missing_value_counts, missing_value_percentage],
                                   axis = 1, keys=['Missing Values', '% of Total Values'])

    # Keep the columns that really have missing values. The filter uses the raw counts,
    # not the rounded percentage: a column with very few missing values rounds to 0.0%
    # and would otherwise disappear from the summary.
    has_missing = missing_values_tab['Missing Values'] > 0

    # Sort by percentage of missing values, descending; the raw count breaks ties
    # between columns whose rounded percentages are equal.
    missing_values_tab = missing_values_tab[has_missing].sort_values(
                         ['% of Total Values', 'Missing Values'], ascending=False)

    # Print some summary information
    print("Your selected data frame has %d columns." %(df.shape[1]))
    print("There are %d columns that have missing values." %(missing_values_tab.shape[0]))

    return missing_values_tab

In [None]:
# Build the missing-value summary of the training data; the call itself prints the column counts
missing_values = missing_values_table(app_train)

# Encoding Categorical Variables

In [None]:
# Number of columns of each dtype in the training data
app_train.dtypes.value_counts()

In [None]:
# Number of unique classes in each object column.
# select_dtypes keeps only the columns of the requested type, here 'object'.
# apply then calls the given function once per column (axis=0).
# pd.Series.nunique excludes NaN by default, so missing values are not counted as a class!
app_train.select_dtypes('object').apply(pd.Series.nunique, axis=0)

In [None]:
# Encoding plan for the categorical (object) columns:
# - One-hot encoding creates one indicator column per category. It is the safest choice
#   because it does not impose arbitrary numeric values on the categories, but it adds
#   extra dimensions to the data set, which exposes our models to noise. PCA or another
#   dimensionality reduction method can be used to counter that.
# - For now, Label Encoding is used for categorical variables with at most 2 categories
#   and One-Hot Encoding (next cell) for the other categorical variables.
lb = LabelEncoder()
lb_count = 0
lb_name = []
# Walk through every column of the training data
for col in app_train.columns:
    train_column = app_train[col]
    # Only object columns are candidates for label encoding
    if train_column.dtype != 'object':
        continue
    # Skip columns with more than 2 distinct values
    if len(train_column.unique()) > 2:
        continue
    # Learn the labels from the training column only ...
    lb.fit(train_column)
    # ... then apply the same mapping to both the training and the test data
    app_train[col] = lb.transform(train_column)
    app_test[col] = lb.transform(app_test[col])

    # Keep track of which columns were label encoded, and how many
    lb_name.append(col)
    lb_count += 1

print("Number of columns were label-encoded:", lb_count)
print("Name of columns were label-encoded:", lb_name)

# The data set is modified by this cell, so re-running it reports 0 encoded columns.
# Also note that although EMERGENCYSTATE_MODE shows 2 types of value in the nunique cell above,
# it actually has 3: the value that was not shown is NaN, which the apply/nunique call ignored.

In [None]:
# One-hot encode the remaining categorical columns of both data sets.
# NaN values are still ignored: they do not get an indicator column of their own.
dim_before = app_train.shape[1]
print("Number of columns of training data BEFORE one-hot encoding:", dim_before)

# The same encoding call is applied to the training and the test data
app_train, app_test = pd.get_dummies(app_train), pd.get_dummies(app_test)

dim_after = app_train.shape[1]
print("Number of columns of training data AFTER one-hot encoding:", dim_after)

print("\nNumber of columns of test data after one-hot encoding: ", app_test.shape[1])

# The two data sets no longer have the same number of columns.
# Print every training column that has no counterpart in the test data.
test_columns = app_test.columns
for col in app_train.columns:
    if col in test_columns:
        continue
    print(col)

In [None]:
# Make the training and test data share exactly the same feature columns.

# Hold on to the target first: it is put back on the training data after the alignment
train_labels = app_train['TARGET']
# axis=1 aligns on columns only; join='inner' keeps the columns present in both frames
aligned_train, aligned_test = app_train.align(app_test, join = 'inner', axis = 1)
app_train = aligned_train
app_test = aligned_test
# Restore the target column on the training data
app_train['TARGET'] = train_labels

print('Shape of training data:', app_train.shape)
print('Shape of test data:', app_test.shape)

# Handle Anomalies
We want to identify anomalies by looking at each column

In [None]:
# Inspect DAYS_BIRTH anomalies in training data. Dividing by -365 converts days to years
# and flips the sign. This looks normal
(app_train['DAYS_BIRTH'] / -365).describe()

In [None]:
# Inspect DAYS_EMPLOYED and observe that the column doesn't look normal.
# The max should be negative and should not be that big.
app_train['DAYS_EMPLOYED'].describe()

In [None]:
# Histogram of DAYS_EMPLOYED in the training data, to make the suspicious values visible
app_train['DAYS_EMPLOYED'].plot.hist(title = "Days Employment Histogram")
plt.xlabel("Days Employment")
plt.show()

In [None]:
# Out of curiosity: do the clients in the anomalous subset default more often than the others?
# A row is anomalous when DAYS_EMPLOYED holds the placeholder value 365243.
is_anom = app_train['DAYS_EMPLOYED'] == 365243
anom = app_train[is_anom]
non_anom = app_train[~is_anom]

# TARGET is 0/1, so its mean times 100 is the default rate in percent
non_anom_rate = 100 * non_anom['TARGET'].mean()
anom_rate = 100 * anom['TARGET'].mean()

print("The non-anomalies default on %0.2f%% of loans" %(non_anom_rate))
print("The anomalies default on %0.2f%% of loans" %(anom_rate))
print("There are %d anomalous data points" %(len(anom)))

# It looks like the anomalies default on loans less often than the non-anomalies

In [None]:
# Handling anomalies depends on the exact situation. However, one of the safest approaches is to set
# them to missing values and then have them filled in (using imputation) before machine learning.
# In this case, since all the anomalies share the same value, we want to fill them with one common value.
# First, we'll replace these anomalies with np.nan. Later, we'll replace the NaN's with an appropriate number.
# Because the replacement erases the information, we also create an indicator column for these anomalies.

# Create an anomalous flag column (must be computed BEFORE the values are replaced)
app_train['DAYS_EMPLOYED_ANOM'] = app_train['DAYS_EMPLOYED'] == 365243

# Replace anomalous values with nan. The result is assigned back to the frame instead of
# calling replace(..., inplace=True) on the selected column: that form is a chained
# assignment on a temporary object and is not guaranteed to modify app_train.
app_train['DAYS_EMPLOYED'] = app_train['DAYS_EMPLOYED'].replace({365243: np.nan})

# Plot the histogram
app_train['DAYS_EMPLOYED'].plot.hist(title = 'Days Employment Histogram')
plt.xlabel('Days employed')
plt.show()

In [None]:
# Do the same thing to the test data. This is very important, but people forget it all the time!
# Most machine learning platforms will spit out some result no matter what, which is extremely dangerous!
# Is there any way to check the dimensions of the training and test data to make sure this won't happen???

# Flag the anomalies first, then replace them with nan
app_test['DAYS_EMPLOYED_ANOM'] = app_test['DAYS_EMPLOYED'] == 365243
# Assign the result back instead of using replace(..., inplace=True) on the selected column:
# that form is a chained assignment and is not guaranteed to modify app_test.
app_test['DAYS_EMPLOYED'] = app_test['DAYS_EMPLOYED'].replace({365243: np.nan})
# Count the anomalies from the flag column, so that values that were already missing are not counted
print('There are %d anomalies in the test data out of %d data points' %(app_test['DAYS_EMPLOYED_ANOM'].sum(), len(app_test)))

# Study Correlations
Another way for us to understand more about the data is looking at the correlations between features and the target

In [None]:
# Find the correlations of every column with the target and sort them in ascending order
correlations = app_train.corr()['TARGET'].sort_values()

# Display correlations: with an ascending sort, the tail holds the most positive values
# (including TARGET with itself) and the head holds the most negative ones
print('Most Positive Correlations: \n', correlations.tail(15))
print('\nMost Negative Correlations: \n', correlations.head(15))

In [None]:
# Note that TARGET and DAYS_BIRTH are correlated.
# This makes sense since as people get older, they are more responsible with their loans.
# Note that since 1 represents default and 0 represents repaid, we need to adjust our data
# a little bit: take the absolute value so that DAYS_BIRTH grows with the client's age.
app_train['DAYS_BIRTH'] = abs(app_train['DAYS_BIRTH'])
# Apply the same transformation to the test data. Otherwise the feature would have opposite
# signs in the two data sets, and every scaler/model fitted on the training data would see
# out-of-range DAYS_BIRTH values at prediction time.
app_test['DAYS_BIRTH'] = abs(app_test['DAYS_BIRTH'])
app_train['DAYS_BIRTH'].corr(app_train['TARGET'])

In [None]:
# Distribution of the clients' age

# Style used by this plot and the following ones
plt.style.use('fivethirtyeight')

# DAYS_BIRTH is expressed in days; convert it to years for the histogram
ages_in_years = app_train['DAYS_BIRTH'] / 365
plt.hist(ages_in_years, bins = 25, edgecolor = 'k')
plt.title('Age of Client')
plt.xlabel('Age (years)')
plt.ylabel('Count')
plt.show()

In [None]:
# Kernel density estimate of the age distribution, one curve per TARGET value
plt.figure(figsize=(10,8))

# target == 0: loans that were repaid on time; target == 1: loans that defaulted
for target_value, curve_label in ((0, 'target == 0'), (1, 'target == 1')):
    ages_in_years = app_train.loc[app_train['TARGET'] == target_value, 'DAYS_BIRTH'] / 365
    sns.kdeplot(ages_in_years, label = curve_label)

# Labeling of plot
plt.title('Distribution of Age')
plt.xlabel('Age (years)')
plt.ylabel('Density')
plt.show()

In [None]:
# Average failure to repay loans by age bracket

# Put the age information into a separate dataframe. .copy() makes it an independent
# frame, so the columns added below are not chained assignments on a slice of app_train.
age_data = app_train[['TARGET', 'DAYS_BIRTH']].copy()
age_data['YEARS_BIRTH'] = age_data['DAYS_BIRTH'] / 365

# Bin the age data. np.linspace(20, 70, num = 11) yields the 11 edges 20, 25, ..., 70,
# i.e. ten 5-year brackets.
age_data['YEARS_BINNED'] = pd.cut(age_data['YEARS_BIRTH'], bins = np.linspace(20, 70, num = 11))

age_data.head()

In [None]:
# Average of every column of age_data within each age bracket; the mean of TARGET is the default rate
age_groups = age_data.groupby('YEARS_BINNED').mean()
age_groups

In [None]:
# Bar plot of the default rate per age bracket
plt.figure(figsize = (8, 8))

# One bar per age bin: the bin label on the x axis, the mean of TARGET (in percent) as height
default_rate_by_bin = age_groups['TARGET']
plt.bar(default_rate_by_bin.index.astype(str), 100 * default_rate_by_bin)

# Axis decoration
plt.xticks(rotation = 75)
plt.xlabel('Age Group (years)')
plt.ylabel('Failure to Repay (%)')
plt.title('Failure to Repay by Age Group')
plt.show()

# This shows that the youngest clients fail to repay their loans most often

In [None]:
# We now study the external sources (EXT_SOURCE_1/2/3) together with DAYS_BIRTH:
# pairwise correlations between these features and the target
ext_data = app_train[['TARGET', 'EXT_SOURCE_1', 'EXT_SOURCE_2', 'EXT_SOURCE_3', 'DAYS_BIRTH']]
ext_data_corrs = ext_data.corr()
ext_data_corrs

In [None]:
# Express ext_data_corrs in heatmap style
plt.figure(figsize = (8,6))

# annot = True writes each correlation value in its cell; vmin/vmax fix the color scale range
sns.heatmap(ext_data_corrs, cmap = plt.cm.RdYlBu_r, vmin = -0.25, annot = True, vmax = 0.6)
plt.title('Correlation Heatmap')
plt.show()

# Note that all three external sources are negatively correlated with the TARGET variable, which
# indicates that as the value of an external source increases, the applicant is more likely to repay the loan

In [None]:
# Distribution of each external source, split by TARGET value
plt.figure(figsize = (10, 12))

ext_sources = ['EXT_SOURCE_1', 'EXT_SOURCE_2', 'EXT_SOURCE_3']

# One subplot per source, stacked vertically (subplot positions are 1-based)
for panel, source in enumerate(ext_sources, start = 1):
    plt.subplot(3, 1, panel)

    # target == 0: repaid loans; target == 1: defaulted loans
    for target_value, curve_label in ((0, 'target == 0'), (1, 'target == 1')):
        sns.kdeplot(ext_data.loc[ext_data['TARGET'] == target_value, source], label = curve_label)

    plt.title('Distribution of %s by Target Value' % source)
    plt.xlabel('%s' % source)
    plt.ylabel('Density')

plt.tight_layout(h_pad = 4)

In [None]:
# Copy the data for plotting
plot_data = ext_data.drop(columns = ['DAYS_BIRTH']).copy()

# Add in the age of the client in years
plot_data['YEARS_BIRTH'] = age_data['YEARS_BIRTH']

# Drop rows with na values, then keep the rows whose index LABEL is <= 100000.
# NOTE(review): .loc slices by label (inclusive), not by position, so after dropna this keeps
# fewer than 100000 rows; use .iloc[:100000] if "first 100000 rows" is what is wanted.
plot_data = plot_data.dropna().loc[:100000, :]

# Function to calculate correlation coefficient between two columns
# NOTE(review): corr_func is defined here but never mapped onto the grid in this cell.
def corr_func(x, y, **kwargs):
    """Annotate the current axes with the correlation coefficient r of x and y.

    x, y: the two data vectors handed over for one panel.
    **kwargs: accepted and ignored.
    """
    r = np.corrcoef(x, y)[0][1]
    ax = plt.gca()
    # Position (.2, .8) is expressed in axes coordinates (fractions of the panel)
    ax.annotate("r = {:.2f}".format(r),
                xy=(.2, .8), xycoords=ax.transAxes,
                size = 20)

# Create the pairgrid object: one row/column per feature (every column except TARGET),
# colored by TARGET
# NOTE(review): newer seaborn releases name the `size` argument `height` - confirm against
# the installed version.
grid = sns.PairGrid(data = plot_data, size = 3, diag_sharey=False,
                    hue = 'TARGET', 
                    vars = [x for x in list(plot_data.columns) if x != 'TARGET'])

# Upper is a scatter plot
grid.map_upper(plt.scatter, alpha = 0.2)

# Diagonal is a kernel density estimate of each feature
grid.map_diag(sns.kdeplot)

# Bottom is density plot
grid.map_lower(sns.kdeplot, cmap = plt.cm.OrRd_r);

plt.suptitle('Ext Source and Age Features Pairs Plot', size = 32, y = 1.05);

# Feature Engineering
To cite a quote from Andrew Ng, "Applied machine learning is basically feature engineering."

In [None]:
# First, we need to fill in missing values (finally!!!)
# Remember two things: (1) never ever touch the target variable,
# and (2) always apply to the test data the transformation learned on the training data

# Get features that we want to transform
poly_features = app_train[['EXT_SOURCE_1', 'EXT_SOURCE_2', 'EXT_SOURCE_3', 'DAYS_BIRTH', 'TARGET']]
poly_features_test = app_test[['EXT_SOURCE_1', 'EXT_SOURCE_2', 'EXT_SOURCE_3', 'DAYS_BIRTH']]


# Imputer for handling missing values
from sklearn.preprocessing import Imputer
imputer = Imputer(strategy = 'median') # each missing value is replaced by the median of its column

# Set the target aside so that it is neither imputed nor transformed
poly_target = poly_features['TARGET']
poly_features = poly_features.drop(columns = ['TARGET'])

# Need to impute missing values.
# The medians are learned from the training data only (fit_transform) and then reused
# as-is for the test data (transform). Re-fitting the imputer on the test data would fill
# the two data sets with different values for the same feature.
poly_features = imputer.fit_transform(poly_features)
poly_features_test = imputer.transform(poly_features_test)

# Now we can transform our features
# Create the polynomial object with specified degree
from sklearn.preprocessing import PolynomialFeatures
poly_transformer = PolynomialFeatures(degree = 3)

In [None]:
# Fit the polynomial transformer on the (imputed) training features only
poly_transformer.fit(poly_features)

# Transform the training and the test features with the same fitted transformer
poly_features = poly_transformer.transform(poly_features)
poly_features_test = poly_transformer.transform(poly_features_test)
print('Polynomial Features shape: ', poly_features.shape)

In [None]:
# Names of the generated polynomial features, built from the names of the four input columns
# NOTE(review): newer scikit-learn releases expose this as get_feature_names_out - confirm
# against the installed version.
feature_names = poly_transformer.get_feature_names(input_features = ['EXT_SOURCE_1', 'EXT_SOURCE_2', 'EXT_SOURCE_3', 'DAYS_BIRTH'])
print(feature_names)

In [None]:
# Now we want to see whether any of these new features are correlated with the target.

# Convert np array to dataframe
poly_features = pd.DataFrame(poly_features, columns = feature_names)
# Add in the target (set aside before the imputation)
poly_features['TARGET'] = poly_target
# Find correlations with the target variable, sorted in ascending order
poly_corrs = poly_features.corr()['TARGET'].sort_values()
# Display most negative (head) and most positive (tail) correlations
print(poly_corrs.head(10))
print(poly_corrs.tail())

In [None]:
# Put test features into dataframe
poly_features_test = pd.DataFrame(poly_features_test, columns = feature_names)

# Merge polynomial features into training dataframe
# NOTE(review): poly_features presumably shares column names with app_train (the original
# input features, TARGET); pandas merge would then suffix both copies with _x / _y - confirm
# the resulting column names.
key = 'SK_ID_CURR'
poly_features[key] = app_train[key] # key to join
app_train_poly = app_train.merge(poly_features, on = key, how = 'left')

# Merge polynomial features into test dataframe
poly_features_test[key] = app_test[key]
app_test_poly = app_test.merge(poly_features_test, on = key, how = 'left')

# Align the dataframes: keep only the columns present in both (axis = 1, inner join)
app_train_poly, app_test_poly = app_train_poly.align(app_test_poly, join = 'inner', axis = 1)

# Print out the new shapes
print('Training data with polynomial features shape:', app_train_poly.shape)
print('Test data with polynomial features shape:', app_test_poly.shape)

# Domain knowledge feature
Copy this code because we don't have that kind of domain expertise. Note that the new columns are added to copies of the original training and test data, not to the originals themselves.

In [None]:
def add_domain_features(df):
    """
    Return a copy of df extended with four ratio features.

    Inputs:
    - df--pandas dataframe: must contain the columns AMT_CREDIT, AMT_INCOME_TOTAL,
      AMT_ANNUITY, DAYS_EMPLOYED and DAYS_BIRTH. It is not modified.

    Outputs:
    - pandas dataframe: a copy of df with these columns appended, in this order:
      CREDIT_INCOME_PERCENT  = AMT_CREDIT / AMT_INCOME_TOTAL
      ANNUITY_INCOME_PERCENT = AMT_ANNUITY / AMT_INCOME_TOTAL
      CREDIT_TERM            = AMT_ANNUITY / AMT_CREDIT
      DAYS_EMPLOYED_PERCENT  = DAYS_EMPLOYED / DAYS_BIRTH
    """
    df = df.copy()
    df['CREDIT_INCOME_PERCENT'] = df['AMT_CREDIT'] / df['AMT_INCOME_TOTAL']
    df['ANNUITY_INCOME_PERCENT'] = df['AMT_ANNUITY'] / df['AMT_INCOME_TOTAL']
    df['CREDIT_TERM'] = df['AMT_ANNUITY'] / df['AMT_CREDIT']
    df['DAYS_EMPLOYED_PERCENT'] = df['DAYS_EMPLOYED'] / df['DAYS_BIRTH']
    return df

# A single function builds the features for both data sets, so the training and
# the test data can never drift apart.
app_train_domain = add_domain_features(app_train)
app_test_domain = add_domain_features(app_test)

In [None]:
# Shapes of the training and test data after the domain features were added
print(app_train_domain.shape)
print(app_test_domain.shape)


In [None]:
# Visualize the new domain features: kernel density estimate of each one, split by TARGET value

# Create a new figure
plt.figure(figsize = (10, 12))

domain_features = ['CREDIT_INCOME_PERCENT', 'ANNUITY_INCOME_PERCENT',
                   'CREDIT_TERM', 'DAYS_EMPLOYED_PERCENT']

# One subplot per feature, stacked vertically (subplot positions are 1-based)
for panel, source in enumerate(domain_features, start = 1):
    plt.subplot(4, 1, panel)
    # target == 0: repaid loans; target == 1: defaulted loans
    for target_value, curve_label in ((0, 'target == 0'), (1, 'target == 1')):
        sns.kdeplot(app_train_domain.loc[app_train_domain['TARGET'] == target_value, source], label = curve_label)
    # Label each subplot
    plt.xlabel('%s' % source)
    plt.ylabel('Density')
    plt.title('Distribution of %s by Target Value' % source)

plt.tight_layout(h_pad = 2.5)

# Classification tasks
The following lists all the models that will be used. For each model, we will simultaneously try it on the original data, the data with polynomial features, and the data with domain-knowledge features.
1. Baseline model
2. Logistic regression
3. Random forest

In [None]:
# 1. Baseline model
# This is purely guessing. For each case, flip a coin. If, say, H, then declare 0. If, say, T, then declare 1.

In [None]:
# 2. Logistic regression
# We'll start by imputing the missing values of the features, then scale them.
# Never ever touch the target variable: it is separated from the features first.

from sklearn.preprocessing import MinMaxScaler, Imputer

# Isolate the features from the target variable.
# The check makes the cell work whether or not TARGET is still a column of app_train.
if 'TARGET' in app_train:
    train = app_train.drop(columns=['TARGET'])
    print('TARGET was in app_train')
    print('TARGET would be dropped from app_train')
else:
    train = app_train.copy()
    print('TARGET was not in app_train')
    # The message states what this branch really does: nothing is added to app_train
    print('app_train would be copied as-is for train')

print('\n----------------------NEXT STEP----------------------\n')

# Feature names (kept because train becomes a plain array below)
features = list(train.columns)

# Imputation: the medians are learned from the training features only
imputer = Imputer(strategy = 'median')
imputer.fit(train)
print('BEFORE IMPUTATION...')
missing_values_table(train)
print()
print('AFTER IMPUTATION...')
train = imputer.transform(train)
# imputer.transform returns an array; wrap it in a dataframe to reuse the summary function
missing_values_table(pd.DataFrame(train))

# Scaling: the scaler is fitted on the training features only
scaler = MinMaxScaler()
scaler.fit(train)
train = scaler.transform(train)

# Apply the imputer and the scaler fitted on the training data to the test data
test = imputer.transform(app_test)
test = scaler.transform(test)

print()
print('Training data shape:', train.shape)
print('Test data shape:', test.shape)



In [None]:
# Now logistic regression
from sklearn.linear_model import LogisticRegression
# Step 1: Create the model object
log_reg = LogisticRegression(C = 0.001) # C is the inverse of the regularization strength: smaller C = stronger regularization
# Step 2: Fit on training data (train holds the imputed and scaled features, train_labels the target)
log_reg.fit(train, train_labels)

In [None]:
# Step 3: Predict. predict_proba returns one column per class; column 1 is kept
# (presumably the probability of TARGET == 1 - verify against log_reg.classes_)
log_reg_pred = log_reg.predict_proba(test)[:, 1]

In [None]:
# Submission dataframe: the test application ids with their predicted probabilities.
# .copy() makes submit an independent frame, so adding the TARGET column is not
# a chained assignment on a slice of app_test.
submit = app_test[['SK_ID_CURR']].copy()
submit['TARGET'] = log_reg_pred
submit.head()

In [None]:
# Save the submission to a csv file, without the index column
submit.to_csv('log_reg_baseline.csv', index = False)

In [None]:
# Random Forest

from sklearn.ensemble import RandomForestClassifier # Note that RandomForestRegressor is for regression tasks

# Step 1: Define the model (random_state fixes the seed so that runs are reproducible)
random_forest = RandomForestClassifier(n_estimators = 100, random_state = 50, verbose = 1, n_jobs = -1)

# Step 2: Fit the model on training data
random_forest.fit(train, train_labels)

# Extract feature importances and pair each one with its feature name
feature_importance_values = random_forest.feature_importances_
feature_importances = pd.DataFrame({'feature': features, 'importance': feature_importance_values})

# Step 3: Make predictions on the test data (column 1 of predict_proba is kept)
predictions = random_forest.predict_proba(test)[:, 1]

In [None]:
# Make the submission dataframe: the test application ids with the random forest predictions.
# .copy() makes submit an independent frame, so adding the TARGET column is not
# a chained assignment on a slice of app_test.
submit = app_test[['SK_ID_CURR']].copy()
submit['TARGET'] = predictions

# Save the submission dataframe
submit.to_csv('random_forest_baseline.csv', index = False)

In [None]:
# First rows of the submission that was just saved
submit.head()

In [None]:
# The ten features with the highest importance in the random forest
feature_importances.sort_values('importance', ascending=False).head(10)

In [None]:
# Let's predict on polynomial features

# Extract feature names
poly_features_names = list(app_train_poly.columns)

# Imputation: fit on the training data, reuse the fitted imputer for the test data.
# Note that this cell rebinds the names imputer, scaler and poly_features_test used earlier.
imputer = Imputer(strategy='median')
poly_features_train = imputer.fit_transform(app_train_poly)
poly_features_test  = imputer.transform(app_test_poly)

# Scale the polynomial features: fit on the training data, reuse the fitted scaler for the test data
scaler = MinMaxScaler()
poly_features_train = scaler.fit_transform(poly_features_train)
poly_features_test  = scaler.transform(poly_features_test)

In [None]:
# Step 1: Define the model (same settings as the baseline random forest)
random_forest_poly = RandomForestClassifier(n_estimators=100, random_state=50, verbose=1, n_jobs=-1)

# Step 2: Fit the model on the training data with polynomial features
random_forest_poly.fit(poly_features_train, train_labels)

# Step 3: Make predictions on the test data (column 1 of predict_proba is kept)
poly_predictions = random_forest_poly.predict_proba(poly_features_test)[:, 1]

In [None]:
# Submission for the random forest trained on the polynomial features.
# .copy() makes submit an independent frame, so adding the TARGET column is not
# a chained assignment on a slice of app_test.
submit = app_test[['SK_ID_CURR']].copy()
# Use the predictions of the model trained on the engineered features (poly_predictions),
# not those of the baseline random forest (predictions).
submit['TARGET'] = poly_predictions

# Save the submission dataframe
submit.to_csv('random_forest_baseline_engineered.csv', index = False)