In [None]:
import matplotlib.pyplot as plt
%matplotlib inline

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder
from sklearn import linear_model
from sklearn.neighbors import KNeighborsClassifier

from sklearn import metrics
from sklearn.metrics import mean_squared_error, r2_score
import math
import pandas as pd
import seaborn as sns
import numpy as np

This Kaggle dataset is based on credit loan applications for homes. Each row corresponds to a fictitious client, and the dataset indicates whether or not that client defaulted on their loan. Our aim is to analyse the data and build a model that predicts future or potential loan defaults with a reasonable degree of accuracy.

In [None]:
# Load the application data; the first CSV column is used as the row index.
credit = pd.read_csv(filepath_or_buffer='./data/application_train.csv', index_col=0)

In [None]:
# (rows, columns) of the loaded application data.
credit.shape

In [None]:
# Class-balance check: count the rows of each TARGET class.
n_target_0 = credit['TARGET'].eq(0).sum()
n_target_1 = credit['TARGET'].eq(1).sum()
print(n_target_0)
print(n_target_1)

# TARGET == 1 rows expressed as a percentage of TARGET == 0 rows;
# a small value here means the data is heavily skewed towards TARGET == 0.
imbalance = n_target_1 / n_target_0 * 100
imbalance

Our dataset is highly imbalanced. Our models were initially predicting with a misleadingly high accuracy of 98%.
We looked at methods for solving this issue; most simply, we randomly removed a chunk of the data so that the sample size for TARGET==0 was the same as for TARGET==1. The models therefore train on an equal number of samples per class, which gives a more realistic picture of their accuracy.

In [None]:
# Equalize the class sizes for training by undersampling:
# randomly remove TARGET == 0 rows until both classes have the same row count.
majority_rows = credit.query('TARGET == 0')
minority_count = int((credit['TARGET'] == 1).sum())

# Derive the number of rows to drop from the data instead of hard-coding it,
# so the cell keeps working if the input file changes size.
rows_to_drop = max(len(majority_rows) - minority_count, 0)

# A fixed seed makes the same rows get dropped on every run (reproducible results).
df = credit.drop(majority_rows.sample(n=rows_to_drop, random_state=42).index)
df.shape

In [None]:
# Check for NA values: number of missing entries per column.
# (`isnull` is an alias of `isna`, so a single call is enough; only the last
# expression of a cell is displayed anyway.)
df.isna().sum()

In [None]:
def missing_values_table(df):
    """Summarise the columns of ``df`` that contain missing values.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to inspect.

    Returns
    -------
    pandas.DataFrame
        One row per column of ``df`` that has at least one missing value,
        with the columns ``'Missing Values'`` (count) and
        ``'% of Total Values'`` (share of rows, rounded to one decimal),
        sorted by the percentage in descending order.

    Also prints a short summary with the total number of columns in ``df``
    and the number of columns that have missing values.
    """
    mis_val = df.isnull().sum()
    mis_val_percent = 100 * mis_val / len(df)

    # Both series are unnamed, so concat labels the columns 0 and 1.
    mis_val_table = pd.concat([mis_val, mis_val_percent], axis=1)
    mis_val_table_ren_columns = mis_val_table.rename(
        columns={0: 'Missing Values', 1: '% of Total Values'})

    # Keep only the columns that actually have missing values.
    has_missing = mis_val_table_ren_columns.iloc[:, 1] != 0
    mis_val_table_ren_columns = (
        mis_val_table_ren_columns[has_missing]
        .sort_values('% of Total Values', ascending=False)
        .round(1)
    )

    # Report on the frame that was passed in (the original referenced an
    # undefined name `default`, which raised NameError).
    print("Your selected dataframe has " + str(df.shape[1]) + " columns.\n"
          "There are " + str(mis_val_table_ren_columns.shape[0]) +
          " columns that have missing values.")
    return mis_val_table_ren_columns

# Build the missing-value summary for `df` and display it as the cell output.
missing_values = missing_values_table(df)
missing_values
    # which columns have missing data

Finding the correlation of each feature column with the TARGET.

In [None]:
# Get corr
# Work on the already-loaded `credit` frame instead of re-reading the CSV into
# `df`: re-reading used a different path and silently replaced the balanced
# `df` built earlier with the full, imbalanced data.
# Only numeric columns can be correlated, so select them explicitly
# (DataFrame.corr raises on string columns in recent pandas versions).
numeric_credit = credit.select_dtypes(include='number')
df_corr = numeric_credit.corr()['TARGET'].to_frame()

# Number of missing values per feature, aligned on the feature name.
df_corr['MissingValue'] = numeric_credit[df_corr.index].isna().sum()
#df_corr = df_corr[df_corr['MissingValue']>30000].sort_values(by=['TARGET'])
df_corr['Abs_correlation'] = df_corr['TARGET'].abs()
df_corr = df_corr.sort_values(by=['Abs_correlation'], ascending=False)

#get top 20
# The cut-off is the authors' threshold on |correlation|; TARGET itself
# (correlation 1.0 with itself) is removed from the table.
df_corr_top20 = df_corr[df_corr['Abs_correlation'] >= 0.034199].drop(index='TARGET')
df_corr_top20.columns = ['Correlation', 'Count_MissingValues', 'Abs_correlation']
df_corr_top20

We use a heatmap to look for obvious patterns, helping us find correlated features to focus on or remove.

In [None]:
top20 = list(df_corr_top20.index)

# Select TARGET plus the top features in one step (TARGET first). This avoids
# inserting a column into a slice of another frame (SettingWithCopyWarning) and
# reads from the full `credit` data the correlations were ranked on.
df_top20 = credit[['TARGET'] + top20]

fig, ax = plt.subplots(figsize=(40, 12))
sns.heatmap(df_top20.corr(), annot=True, ax=ax)
ax.set_title('Pairwise correlation of TARGET and the top correlated features')
plt.show()

EDA - Plotting the various features against the TARGET to identify the ones that have the most correlation and to gain further insight to refine the model.


In [None]:
def plot_stats(feature, label_rotation=True, horizontal_layout=True):
    """Bar-plot the share of TARGET == 1 for each category of ``feature``.

    Reads the module-level ``credit`` frame, averages ``TARGET`` per category
    of ``feature`` and draws one bar per category, highest share first.

    Parameters
    ----------
    feature : str
        Name of the (categorical) column of ``credit`` to group by.
    label_rotation : bool, default True
        Rotate the x tick labels by 90 degrees.
    horizontal_layout : bool, default True
        Kept for backward compatibility. Both settings draw a single axes of
        the same size; when False the seaborn "pastel" colour codes are
        additionally activated, as before.
    """
    # Mean of TARGET per category. Assumes TARGET is coded 0/1, so the mean is
    # the fraction of rows with TARGET == 1; scale it to match the "[%]" label.
    cat_perc = credit[[feature, 'TARGET']].groupby([feature], as_index=False).mean()
    cat_perc['TARGET'] = cat_perc['TARGET'] * 100
    cat_perc = cat_perc.sort_values(by='TARGET', ascending=False)

    if not horizontal_layout:
        sns.set_color_codes("pastel")

    _, ax = plt.subplots(figsize=(12, 8))
    sns.barplot(ax=ax, x=feature, y='TARGET', order=cat_perc[feature], data=cat_perc)
    if label_rotation:
        ax.tick_params(axis='x', labelrotation=90)
    ax.set_ylabel('Percent of target with value 1 [%]', fontsize=10)
    ax.tick_params(axis='both', which='major', labelsize=10)

    plt.show()

In [None]:
# Mean TARGET per contract type.
plot_stats(feature='NAME_CONTRACT_TYPE')

In [None]:
# Mean TARGET per gender code.
plot_stats(feature='CODE_GENDER')

In [None]:
# Mean TARGET per value of each ownership flag, one figure per flag.
for ownership_flag in ('FLAG_OWN_CAR', 'FLAG_OWN_REALTY'):
    plot_stats(ownership_flag)

In [None]:
# Mean TARGET per family status.
plot_stats(feature='NAME_FAMILY_STATUS')

In [None]:
# Mean TARGET per income type.
plot_stats(feature='NAME_INCOME_TYPE')

In [None]:
# Mean TARGET per occupation type; rotated labels, non-horizontal layout flag.
plot_stats('OCCUPATION_TYPE', label_rotation=True, horizontal_layout=False)

In [None]:
# NOTE(review): FLAG_OWN_CAR is already plotted in an earlier cell; this repeats it.
plot_stats(feature="FLAG_OWN_CAR")

In [None]:
# NOTE(review): NAME_INCOME_TYPE is already plotted in an earlier cell; this repeats it.
plot_stats(feature="NAME_INCOME_TYPE")

In [None]:
# Mean TARGET per education level.
plot_stats(feature='NAME_EDUCATION_TYPE')

In [None]:
# Mean TARGET per housing type.
plot_stats(feature="NAME_HOUSING_TYPE")

In [None]:
# `train` is not defined anywhere in this notebook; the EDA cells work on `credit`.
fig, ax = plt.subplots(figsize=(12, 5))
ax.set_title("Distribution of AMT_CREDIT")
# histplot with a KDE overlay on a density scale replaces the deprecated sns.distplot.
ax = sns.histplot(credit["AMT_CREDIT"], kde=True, stat="density", ax=ax)

In [None]:
# `train` is not defined anywhere in this notebook; the EDA cells work on `credit`.
fig, ax = plt.subplots(figsize=(12, 5))
# The plotted column is AMT_ANNUITY, so the title names that column
# (it previously said AMT_INCOME_TOTAL).
ax.set_title("Distribution of AMT_ANNUITY")
# Missing values are dropped before plotting; histplot with a KDE overlay on a
# density scale replaces the deprecated sns.distplot.
ax = sns.histplot(credit["AMT_ANNUITY"].dropna(), kde=True, stat="density", ax=ax)

In [None]:
# Find the correlation of the positive days since birth and target.
# `train` is not defined anywhere in this notebook, so read from `credit`.
# The absolute value is kept in its own variable rather than written back, so
# the loaded data is not modified by this exploratory cell.
days_birth_positive = credit['DAYS_BIRTH'].abs()
days_birth_positive.corr(credit['TARGET'])

In [None]:
from scipy.stats import boxcox  # NOTE(review): unused in this cell
from matplotlib import pyplot   # NOTE(review): unused in this cell

# Histogram of log-transformed income.
# The original chained `.sns.histplot(...)` onto a Series (AttributeError) and
# read from `train`, which is not defined in this notebook; use `credit`.
log_income = np.log(credit['AMT_INCOME_TOTAL'])
ax = sns.histplot(log_income)
ax.set(title='Distribution of log(AMT_INCOME_TOTAL)', xlabel='log(AMT_INCOME_TOTAL)')
plt.show()

Pre Processing data

In [None]:
# Separate the label column from the feature columns.
y = df['TARGET']
X = df.drop(columns='TARGET')

A large amount of data was missing from columns we considered potentially important. As a compromise, we imputed the missing data with SimpleImputer instead of deleting those columns completely, because their influence on the TARGET had not been verified.

In [None]:
from sklearn.impute import SimpleImputer

# Impute NaN values with the column mean.
# `np.nan` is used because the `np.NaN` alias no longer exists in NumPy 2.
imputer = SimpleImputer(missing_values=np.nan, strategy="mean")

# A mean only exists for numeric columns, so restrict the imputer to them;
# non-numeric (categorical) columns are carried over unchanged.
numeric_cols = df.select_dtypes(include="number").columns
idf = df.copy()
idf[numeric_cols] = imputer.fit_transform(df[numeric_cols])
df = idf
# NOTE(review): the imputer is fitted on all rows before the train/test split,
# so test rows contribute to the imputed means.

In [None]:
from sklearn.preprocessing import StandardScaler

# Normalize value columns such as income.
scaler = StandardScaler()

# Scale numeric feature columns only: the label (TARGET) must keep its class
# values, and non-numeric columns cannot be standardised.
feature_cols = df.select_dtypes(include="number").columns.drop("TARGET", errors="ignore")
idf = df.copy()
idf[feature_cols] = scaler.fit_transform(df[feature_cols])
df = idf

# summarize transformed data
np.set_printoptions(precision=3)

One Hot Encoding used to turn categorical data into numerical for more accurate modelling

In [None]:
# One-hot encode the categorical feature columns.
# TARGET is dropped first: leaving the label inside the feature matrix lets
# every model read the answer directly (label leakage).
X = pd.get_dummies(df.drop(columns=['TARGET'], errors='ignore'), drop_first=True)

Training Data

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for testing; the seed fixes which rows are held out.
split_parts = train_test_split(X, y, test_size=0.2, random_state=42)
X_train, X_test, y_train, y_test = split_parts

# Report the shape of each part of the split.
for part_label, part in (('X_train', X_train), ('y_train', y_train),
                         ('X_Test', X_test), ('y_test', y_test)):
    print(f'{part_label} shape: {part.shape}')

Modelling the data

Logistic Regression is used with a low value of C (the inverse regularisation strength, i.e. strong regularisation) to reduce overfitting.

In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

# Fit a logistic regression with C = 0.0001 and predict the held-out rows.
logreg = LogisticRegression(C=0.0001).fit(X_train, y_train)
y_pred_log = logreg.predict(X_test)

log_accuracy = accuracy_score(y_test, y_pred_log)
print('Accuracy')
print(log_accuracy)

In [None]:
# mean_absolute_error is not imported anywhere else in the notebook; without
# this import the last line of the cell raises NameError.
from sklearn.metrics import mean_absolute_error

# Error metrics for the logistic-regression predictions on the test split.
mse_log = mean_squared_error(y_test, y_pred_log)
print(f"Mean squared error:{mse_log: .2f}")
print(f"Root Mean squared error: {math.sqrt(mse_log) :.2f}")
print(f'Variance score: {r2_score(y_test, y_pred_log):.2f}')
print(f"Mean absolute error:{mean_absolute_error(y_test, y_pred_log): .2f}")

In [1]:
from sklearn.metrics import confusion_matrix  # NOTE(review): unused in this cell
from sklearn.metrics import classification_report

# Per-class precision / recall / F1 for the logistic-regression predictions.
# `y_pred` does not exist yet at this point of the notebook (it is only
# assigned in the later XGBoost cell); the predictions available here are
# `y_pred_log`.
print(classification_report(y_test, y_pred_log))

NameError: name 'y_test' is not defined

In [None]:
# Sweep k = 1..30 for KNN and record the test accuracy of each fit.
k_range = range(1, 31)
scores = []
for k in k_range:
    knn = KNeighborsClassifier(n_neighbors=k).fit(X_train, y_train)
    y_pred_knn = knn.predict(X_test)
    k_accuracy = metrics.accuracy_score(y_test, y_pred_knn)
    scores.append(k_accuracy)
# After the loop `knn` and `y_pred_knn` refer to the last k tried.
print(scores)

In [None]:
# Test accuracy for each k tried in the KNN sweep, with labelled axes so the
# figure can be read on its own.
fig, ax = plt.subplots()
ax.plot(k_range, scores)
ax.set(title='KNN test accuracy by number of neighbours',
       xlabel='n_neighbors (k)', ylabel='Accuracy')
plt.show()

In [None]:
# mean_absolute_error is not imported anywhere else in the notebook; without
# this import the last line of the cell raises NameError.
from sklearn.metrics import mean_absolute_error

# Error metrics for the KNN predictions on the test split.
# NOTE: `y_pred_knn` holds the predictions of the last k of the sweep (k = 30),
# not of the best-scoring k.
mse_knn = mean_squared_error(y_test, y_pred_knn)
print(f"Mean squared error:{mse_knn: .2f}")
print(f"Root Mean squared error: {math.sqrt(mse_knn) :.2f}")
print(f'Variance score: {r2_score(y_test, y_pred_knn):.2f}')
print(f"Mean absolute error:{mean_absolute_error(y_test, y_pred_knn): .2f}")

In [None]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score

# Random forest: 200 trees, at most 18 leaves per tree, all CPU cores, fixed seed.
rf_params = dict(n_estimators=200, max_leaf_nodes=18, n_jobs=-1, random_state=42)
rnd_clf = RandomForestClassifier(**rf_params)
rnd_clf.fit(X_train, y_train)

# Accuracy on the held-out rows.
y_pred_rf = rnd_clf.predict(X_test)
rf_accuracy = accuracy_score(y_test, y_pred_rf)
print(rf_accuracy)

In [None]:
from sklearn.decomposition import PCA

# Fit PCA (no n_components given) on the training features, then project both
# the training and the test features with the fitted transform.
# NOTE(review): X_train / X_test are overwritten with their projections, so
# re-running this cell feeds already-projected data back into PCA.
pca = PCA()  
X_train = pca.fit_transform(X_train)  
X_test = pca.transform(X_test)
# Share of the total variance captured by each principal component.
explained_variance = pca.explained_variance_ratio_  
print(explained_variance)

In [None]:
from xgboost import XGBClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.metrics import classification_report
from imblearn.over_sampling import SMOTE  # NOTE(review): imported but not used in this cell

# `train` is never defined in this notebook, so the original
# `train.drop(['TARGET'], axis=1)` raised NameError on a fresh kernel.
# Reuse the feature matrix `X` and the labels `y` built in the pre-processing
# cells instead.
# NOTE(review): confirm that these are the intended inputs for this model.

# Stratified hold-out split (class proportions preserved in both parts).
# No oversampling is actually applied in this cell.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y)

print(X.shape)
#XGB default settings resulted in the highest accuracy

model = XGBClassifier()
model.fit(X_train, y_train)
y_pred = model.predict(X_test)

accuracy1 = accuracy_score(y_test, y_pred)

In [None]:
# Report the XGBoost test accuracy to four decimals.  # no missing data
print(f'accuracy is {accuracy1:.4f} with XGBBoost')

In [None]:
# Per-class precision / recall / F1 for the predictions stored in `y_pred`.
xgb_report = classification_report(y_test, y_pred)
print(xgb_report)

In [None]:
from sklearn import preprocessing

# Ask the fitted booster for its per-feature scores using the 'weight'
# importance type; the result maps feature name -> score.
booster = model.get_booster()
importance_df = booster.get_score(importance_type='weight')

# Tabulate the (feature, score) pairs and show the ten highest scores.
score_pairs = list(importance_df.items())
features_impt_xgb = pd.DataFrame(score_pairs, columns=['FEATURE', 'SCORE'])
top_features = features_impt_xgb.sort_values('SCORE', ascending=False).head(10)
print(top_features)