In [None]:
# Load required libraries
import os
import pandas as pd
import numpy as np 
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.metrics import confusion_matrix

In [None]:
# Check the current working directory before it is changed below
os.getcwd()

In [None]:
# Set working directory.
# The path is a raw string so that "\p" is not parsed as an (invalid) escape
# sequence. The BANK_LOAN_PROJECT_DIR environment variable overrides the
# default so the notebook can run on machines without an E: drive.
PROJECT_DIR = os.environ.get("BANK_LOAN_PROJECT_DIR", r"E:\project")
os.chdir(PROJECT_DIR)

In [None]:
# Load the data; the relative file name is resolved against the current working directory
loan_data = pd.read_csv("bank-loan.csv")

In [None]:
# Print the number of distinct (non-missing) values found in every column
for feature, n_unique in loan_data.nunique().items():
    print(feature, ':', n_unique)

In [None]:
# Treat 'ed' and 'default' as categorical variables rather than numbers
for categorical_name in ('ed', 'default'):
    loan_data[categorical_name] = loan_data[categorical_name].astype('category')

In [None]:
# Divide the data into a train and a test set.
# Rows whose 'default' value is known form the train set, which is used to
# build and test the model; rows whose 'default' value is missing form the
# test set, for which the model will predict the target.
has_target = loan_data['default'].notna()
train = pd.DataFrame(loan_data.loc[has_target])
test = pd.DataFrame(loan_data.loc[~has_target])
# The last column of test (the default variable) is dropped
test = test.drop(columns=[test.columns[-1]])

In [None]:
# Keep the names of the numerical and of the categorical columns of train
# in two separate indexes
num_var = train.select_dtypes(include=[np.number]).columns
cat_var = train.select_dtypes(include=['category']).columns

# Exploratory Data Analysis

In [None]:
# Take a first look at the data:
# display the first five rows of the train data set
train.head()

In [None]:
# Display the first five rows of the test data set
test.head()

In [None]:
# Get the number of observations and variables (rows, columns) in the train data set
train.shape

In [None]:
# Check the data type of every column of the train data set
train.dtypes

In [None]:
# Get further info about the train data set
# (column data types and non-null counts)
train.info()

In [None]:
# Get summary statistics of the train data set
train.describe()

In [None]:
# Get the number of observations and variables (rows, columns) in the test data set
test.shape

In [None]:
# Check the data type of every column of the test data set
test.dtypes

In [None]:
# Get info about the test data set (column data types and non-null counts)
test.info()

In [None]:
# Get summary statistics of the test data set
test.describe()

In [None]:
# Unique values in the last column of train (the target)
train[train.columns[-1]].unique()

In [None]:
# Number of unique values in the last column of train (the target)
train[train.columns[-1]].nunique()

In [None]:
# Number of observations per value of the last column of train (the target)
train[train.columns[-1]].value_counts()
# we have 517 '0' values and 183 '1' values
# we can see that the data is unbalanced

In [None]:
# Use the dark grid style for every chart drawn from here on
sns.set_style('darkgrid')
# Pie chart with the share of each value of the target (last column of train)
plt.figure(figsize=(5,5))
target_counts = train[train.columns[-1]].value_counts()
target_counts.plot.pie(autopct='%1.2f%%')

In [None]:
# Count plot of every categorical variable of the train data set,
# drawn side by side in a 1 x 2 grid
plt.figure(figsize=(14,5))
for i,col in enumerate(cat_var):
    plt.subplot(1,2,i+1)
    # The Series is passed as `x=`: a positional argument is read as `data`
    # by newer seaborn releases and no longer produces per-category counts.
    sns.countplot(x=train[col], alpha=0.6)
    plt.xlabel(col,fontsize=20)
    plt.ylabel('count',fontsize=20)

In [None]:
# Count plot of the categorical variables of the test data set.
# The last entry of cat_var is skipped because the last column of the
# original data was dropped from test.
plt.figure(figsize=(14,5))
for i,col in enumerate(cat_var[0:-1]):
    plt.subplot(1,2,i+1)
    # The Series is passed as `x=`: a positional argument is read as `data`
    # by newer seaborn releases and no longer produces per-category counts.
    sns.countplot(x=test[col], alpha=0.6)
    plt.xlabel(col,fontsize=20)
    plt.ylabel('count',fontsize=20)

In [None]:
# Pair grid over the numeric columns of train:
# histograms on the diagonal, scatter plots above it, line plots below it
numeric_train = train.select_dtypes([np.number])
g = sns.PairGrid(numeric_train)
g.map_diag(plt.hist, color='orange', alpha=0.8)
g.map_upper(plt.scatter)
g.map_lower(sns.lineplot)

In [None]:
# Histogram of every numeric variable of the train data set on a 4 x 2 grid
# NOTE(review): sns.distplot is deprecated in recent seaborn releases - confirm
# the installed version before upgrading.
print("Distribution of Train features")
plt.figure(figsize=(30,40))
for position, col in enumerate(num_var, start=1):
    plt.subplot(4, 2, position)
    sns.distplot(train[col], kde=False)
    plt.xlabel(col, fontsize=30)

In [None]:
# Histogram of every numeric variable of the test data set on a 4 x 2 grid
# NOTE(review): sns.distplot is deprecated in recent seaborn releases - confirm
# the installed version before upgrading.
print("Distribution of Test features")
plt.figure(figsize=(30,40))
for position, col in enumerate(num_var, start=1):
    plt.subplot(4, 2, position)
    sns.distplot(test[col], kde=False, color='orange')
    plt.xlabel(col, fontsize=30)

In [None]:
# Distribution of every numeric feature per target class
# Kernel Density Estimate (KDE): one curve for the rows whose target is 0
# and one for the rows whose target is 1
print("Distribution of features per target class")
plt.figure(figsize=(30,40))
target = train[train.columns[-1]]
for position, col in enumerate(num_var, start=1):
    plt.subplot(4, 2, position)
    sns.distplot(train.loc[target == 0, col], hist=False, label='0', color='blue')
    sns.distplot(train.loc[target == 1, col], hist=False, label='1', color='orange')
    plt.xlabel(col, fontsize=30)

In [None]:
# Compare the density of every numeric feature between train and test
print("Distribution of features for test and train dataset")
plt.figure(figsize=(30,40))
for position, col in enumerate(num_var, start=1):
    plt.subplot(4, 2, position)
    for frame, frame_label, frame_color in ((train, 'train', 'blue'), (test, 'test', 'orange')):
        sns.distplot(frame[col], hist=False, label=frame_label, color=frame_color)
    plt.xlabel(col, fontsize=30)

In [None]:
# Box plot of every column of train except the last one, grouped by the
# last column (the target), on a 4 x 2 grid
# NOTE(review): the loop covers all non-target columns, not only those in
# num_var - confirm the categorical column is meant to be plotted as well.
print("Boxplot of Train features")
plt.figure(figsize=(30,40))
target_name = train.columns[-1]
for position, col in enumerate(train.columns.values[:-1], start=1):
    plt.subplot(4, 2, position)
    sns.boxplot(x=target_name, y=col, data=train, orient='v')
    plt.xlabel(target_name, fontsize=30)
    plt.ylabel(col, fontsize=30)
# from plots we can clearly see that there are outliers in our dataset

# Missing Value Analysis

In [None]:
# Get the count of missing values in every column of train
train.isnull().sum()

In [None]:
# Get the count of missing values in every column of test
test.isnull().sum()

# Outlier Analysis

In [None]:
# # handling outliers in Train data-set
# Q1 = train.quantile(0.25)
# Q3 = train.quantile(0.75)
# IQR = Q3 - Q1

# for feature in num_var:
#     train.loc[(train[feature] < (Q1[feature] - 1.5 * IQR[feature])) | 
#               (train[feature] > (Q3[feature] + 1.5 * IQR[feature])),feature] = np.nan
    
#     #Impute with mean
#     #train[column_name] = train[column_name].fillna(train[column_name].mean())

#     #Impute with median
#     train[feature] = train[feature].fillna(train[feature].median())

In [None]:
# # handling outliers in Test data-set
# Q1 = test.quantile(0.25)
# Q3 = test.quantile(0.75)
# IQR = Q3 - Q1

# for feature in num_var:
#     test.loc[(test[feature] < (Q1[feature] - 1.5 * IQR[feature])) | 
#               (test[feature] > (Q3[feature] + 1.5 * IQR[feature])),feature] = np.nan
    
#     #Impute with mean
#     #test[feature] = test[feature].fillna(test[feature].mean())

#     #Impute with median
#     test[feature] = test[feature].fillna(test[feature].median())

# Feature Selection

In [None]:
# Correlation plot
# Only numeric columns are correlated. Selecting them explicitly does not rely
# on pandas silently dropping the category columns, which newer pandas
# releases no longer do by default.
corr = train.select_dtypes([np.number]).corr()
# Boolean mask hiding the upper triangle (diagonal included), so every pair
# of variables is shown only once
mask = np.triu(np.ones_like(corr, dtype=bool))
with sns.axes_style("white"):
    f, ax = plt.subplots(figsize=(10, 8))
    ax = sns.heatmap(corr, mask=mask, cmap=sns.diverging_palette(200, 20, as_cmap=True), square=True, annot=True)

In [None]:
# Largest and smallest correlation between two *different* variables.
# The diagonal (always 1) is blanked out on a copy: writing through
# `corr.values` mutates `corr` itself and fails when pandas hands out a
# read-only array (copy-on-write).
corr_offdiag = corr.to_numpy(dtype=float, copy=True)
np.fill_diagonal(corr_offdiag, np.nan)
np.nanmax(corr_offdiag), np.nanmin(corr_offdiag)

In [None]:
# Chi-square test of independence between the last categorical variable
# (presumably the target - verify against cat_var) and every categorical variable
from scipy.stats import chi2_contingency
reference_name = cat_var[-1]
# print each variable name followed by the p-value of its test
for i in cat_var:
    print(i)
    contingency_table = pd.crosstab(train[reference_name], train[i])
    chi2, p, dof, ex = chi2_contingency(contingency_table)
    print(p)
# replace variables whose p-value is more than 0.05

In [None]:
# creating dummy variables for categorical data
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder

# One-hot encode the column at position 1 (presumably 'ed' - verify against
# the column order of the CSV) and pass every other column through unchanged.
# The encoder is fitted on the train features ONLY and then applied to test
# with `transform`, so both frames get exactly the same dummy columns in the
# same order. Re-fitting on test produced fewer columns whenever a category
# was absent from test, which previously had to be patched by inserting a
# zero column by hand.
# handle_unknown='ignore' encodes a category unseen in train as all zeros
# instead of raising; sparse_threshold=0 forces a dense array.
ct = ColumnTransformer(
    transformers=[('encoder', OneHotEncoder(handle_unknown='ignore'), [1])],
    remainder='passthrough',
    sparse_threshold=0,
)

# The target (last column of train) is set aside so the transformer sees the
# same columns for train and test, then appended again as the last column.
target_values = train.iloc[:, -1].to_numpy()
train = pd.DataFrame(ct.fit_transform(train.iloc[:, :-1]))
train[train.shape[1]] = target_values
test = pd.DataFrame(ct.transform(test))

# Sanity check: test has every train column except the target
assert test.shape[1] == train.shape[1] - 1, "train/test feature columns differ"

In [None]:
# Display the first five rows of train after the dummy-variable encoding
train.head()

In [None]:
# Display the first five rows of test after the dummy-variable encoding
test.head()

In [None]:
# Cast the encoded frames to concrete numeric dtypes.
# Features become float so that continuous columns keep their decimals
# (an integer cast truncates them); only the target, the last column of
# train, is cast to int. `np.integer` is an abstract type and must not be
# used as a dtype.
target_column = train.columns[-1]
train = train.astype(float)
train[target_column] = train[target_column].astype(int)
test = test.astype(float)

# Sampling

In [None]:
# Store the independent variables (every column but the last) and the
# dependent variable (last column) of train as separate arrays for model
# training; test_result holds all columns of test
X = train.iloc[:, :-1].to_numpy()
y = train.iloc[:, -1].to_numpy()
test_result = test.iloc[:, :].to_numpy()

In [None]:
# Splitting the dataset into the Training set and Test set (80 % / 20 %)
from sklearn.model_selection import train_test_split
# stratify=y keeps the class proportions of the unbalanced target the same
# in both parts, so the hold-out metrics are not skewed by an unlucky split
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size = 0.20, random_state = 42, stratify = y)

# Scaling

In [None]:
# Standardise the features: the scaler is fitted on X_train only and the
# same fitted scaler is then applied to X_train, X_test and test_result
from sklearn.preprocessing import StandardScaler
sc = StandardScaler().fit(X_train)
X_train = sc.transform(X_train)
X_test = sc.transform(X_test)
test_result = sc.transform(test_result)

# Model Development

In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from xgboost import XGBClassifier

def classification_model(model, X=None, y=None):
    """Fit and return the classifier selected by ``model``.

    Parameters
    ----------
    model : str
        Key of the classifier to fit: 'logistic', 'KNN', 'NB', 'DT', 'RF'
        or 'SVC'.
    X, y : array-like, optional
        Training features and labels. When omitted, the notebook-level
        ``X_train`` and ``y_train`` are used, as before.

    Returns
    -------
    The fitted scikit-learn estimator.

    Raises
    ------
    ValueError
        If ``model`` is not one of the supported keys. Previously the string
        'Incorrect Input' was returned, which only failed later with an
        unrelated AttributeError when ``.predict`` was called on it.
    """
    # Each entry builds its estimator lazily, so only the selected
    # classifier is ever instantiated.
    builders = {
        'logistic': lambda: LogisticRegression(random_state = 42),
        'KNN': lambda: KNeighborsClassifier(n_neighbors = 7, metric = 'minkowski', p = 2),
        'NB': lambda: GaussianNB(),
        'DT': lambda: DecisionTreeClassifier(criterion = 'entropy', random_state = 42),
        'RF': lambda: RandomForestClassifier(n_estimators = 625,
                                             criterion = 'entropy', random_state = 42),
        'SVC': lambda: SVC(kernel = 'rbf', C = 0.75, gamma = 0.2, random_state = 42),
    }

    # Validate first so a typo fails fast, before any data is touched
    if not isinstance(model, str) or model not in builders:
        raise ValueError(
            "Incorrect Input: %r (expected one of %s)" % (model, ', '.join(builders))
        )

    features = X_train if X is None else X
    labels = y_train if y is None else y
    return builders[model]().fit(features, labels)

In [None]:
# Choose classifier: exactly one of the assignments below should be active;
# the value is the key passed to classification_model

model_selected = 'logistic'
#model_selected = 'KNN'
#model_selected = 'NB'
#model_selected = 'DT'
#model_selected = 'RF'
#model_selected = 'SVC'

# Fit the selected classifier and show its configuration
classifier = classification_model(model_selected)
print(classifier)

# Predicting the Test set results (the hold-out part X_test)
y_pred = classifier.predict(X_test)

In [None]:
# libraries to compute the metrics and to draw the confusion matrix,
# ROC curve and precision-recall curve
from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score
from sklearn.metrics import f1_score
# The Display classes replace plot_confusion_matrix and
# plot_precision_recall_curve, which were removed from scikit-learn.
from sklearn.metrics import ConfusionMatrixDisplay
from sklearn.metrics import PrecisionRecallDisplay
from sklearn.metrics import precision_recall_curve
from sklearn.metrics import average_precision_score
from sklearn.metrics import roc_curve
from sklearn.metrics import roc_auc_score

# Making the Confusion Matrix
cm = confusion_matrix(y_test, y_pred)

# let us save TP, TN, FP, FN
# (row = true class, column = predicted class, so ravel() yields this order)
TN, FP, FN, TP = cm.ravel()

# Continuous scores for the positive class. ROC and precision-recall curves
# need scores, not hard 0/1 predictions: with hard predictions the curve
# collapses to a single operating point and the AUC is understated.
if hasattr(classifier, 'predict_proba'):
    y_score = classifier.predict_proba(X_test)[:, 1]
else:
    # e.g. SVC fitted without probability estimates
    y_score = classifier.decision_function(X_test)

accuracy = accuracy_score(y_test, y_pred)
FNR = (FN*100)/(FN+TP)  # false negative rate, in percent
precision = precision_score(y_test, y_pred)
recall = recall_score(y_test, y_pred)
# stored as `f1` so that the imported f1_score function is not shadowed
f1 = f1_score(y_test, y_pred)
roc_auc = roc_auc_score(y_test, y_score)
fpr, tpr, _ = roc_curve(y_test, y_score)

print(cm)
print('Accuracy is: %0.4f' %accuracy)
print('FNR is: %0.3f' %FNR)
print('Precision is: %0.4f' %precision)
print('Recall is: %0.4f' %recall)
print('F1 Score is: %0.4f' %f1)

# plot Confusion Matrix
ConfusionMatrixDisplay(confusion_matrix=cm, display_labels=classifier.classes_).plot()
plt.title('Confusion Matrix')
plt.show()

# Receiver Operating Characteristic
# plot model roc curve against the diagonal of a random classifier
plt.plot([0, 1], [0, 1], lw=2, linestyle='--')
plt.plot(fpr, tpr, marker='.', lw=2, label='area = %0.2f' % roc_auc)
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('ROC Curve')
plt.legend(loc="lower right")
plt.show()

#Precision-Recall Curve
pr_precision, pr_recall, _ = precision_recall_curve(y_test, y_score)
disp = PrecisionRecallDisplay(precision=pr_precision, recall=pr_recall,
                              average_precision=average_precision_score(y_test, y_score),
                              estimator_name=type(classifier).__name__)
disp.plot()
disp.ax_.set_title('Precision-Recall curve')

# K-Fold Cross Validation

In [None]:
# Applying k-Fold Cross Validation (10 folds) to the chosen classifier on
# the training part, then reporting mean and spread of the fold scores
from sklearn.model_selection import cross_val_score
accuracies = cross_val_score(classifier, X_train, y_train, cv=10)
mean_accuracy_pct = accuracies.mean() * 100
std_accuracy_pct = accuracies.std() * 100
print("Accuracy: {:.2f} %".format(mean_accuracy_pct))
print("Standard Deviation: {:.2f} %".format(std_accuracy_pct))

# Predicting Test data-set and saving output file

In [None]:
# Predict the target for the (scaled) test features
test_pred = classifier.predict(test_result)

# Store the predictions as a new 'default' column of test
test['default'] = test_pred
# Writing a csv (output).
# The frame holding the predictions is `test`; `test_output` was never
# defined, so the original call raised a NameError.
test.to_csv("test_output_python.csv", index = False)

In [None]:
# visualizing output target variable (last column of test).
# The Series is passed as `x=`: a positional argument is read as `data`
# by newer seaborn releases and no longer produces per-category counts.
sns.countplot(x=test[test.columns[-1]], alpha=0.6)

In [None]:
# Number of observations per value of the last column of test
test[test.columns[-1]].value_counts()

In [None]:
# Display the first five rows of test
test.head()