In [None]:
from sklearn.model_selection import GridSearchCV
import seaborn as sns
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

In [None]:
# Location of the raw German credit CSV (Colab Google Drive mount).
DATA_PATH = "/content/drive/My Drive/german_credit_data.csv"

# The first CSV column is a saved row index (0, 1, 2, ...). Read it as the
# index so it is not carried along as a data column and later fed to the
# models as if it were a feature.
df = pd.read_csv(DATA_PATH, index_col=0, low_memory=False)
df.head(3)

Unnamed: 0.1,Unnamed: 0,Age,Sex,Job,Housing,Saving accounts,Checking account,Credit amount,Duration,Purpose,Risk
0,0,67,male,2,own,,little,1169,6,radio/TV,good
1,1,22,female,2,own,little,moderate,5951,48,radio/TV,bad
2,2,49,male,1,own,little,,2096,12,education,good


In [None]:
# Print column dtypes and non-null counts, then show the (rows, columns) tuple
# as the cell result.
df.info()
df.shape

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 11 columns):
Unnamed: 0          1000 non-null int64
Age                 1000 non-null int64
Sex                 1000 non-null object
Job                 1000 non-null int64
Housing             1000 non-null object
Saving accounts     817 non-null object
Checking account    606 non-null object
Credit amount       1000 non-null int64
Duration            1000 non-null int64
Purpose             1000 non-null object
Risk                1000 non-null object
dtypes: int64(5), object(6)
memory usage: 86.1+ KB


(1000, 11)

In [None]:
# Summarise missing values per column: absolute count and share of all rows (%).
null_counts = df.isnull().sum()
df_null = pd.DataFrame({'Count': null_counts, 'Percent': 100 * null_counts / len(df)})
# Show only the columns that actually contain NaNs.
df_null.loc[df_null['Count'] > 0]

Unnamed: 0,Count,Percent
Saving accounts,183,18.3
Checking account,394,39.4


In [None]:
# Treat a missing account entry as its own category instead of dropping rows.
# Assign the result back: calling fillna(..., inplace=True) on a selected
# column operates on an intermediate object and is not guaranteed to update
# `df` itself (it does nothing under pandas Copy-on-Write).
df = df.fillna({
    "Saving accounts": "No Saving account",
    "Checking account": "No Checking account",
})

In [None]:
# Preview the first three rows after filling the missing account values.
df.head(n=3)


Unnamed: 0.1,Unnamed: 0,Age,Sex,Job,Housing,Saving accounts,Checking account,Credit amount,Duration,Purpose,Risk
0,0,67,male,2,own,No Saving account,little,1169,6,radio/TV,good
1,1,22,female,2,own,little,moderate,5951,48,radio/TV,bad
2,2,49,male,1,own,little,No Checking account,2096,12,education,good


In [None]:
# Encode the target as integer category codes. Codes follow the sorted
# category order, so 'bad' becomes 0 and 'good' becomes 1.
df["Risk"] = df["Risk"].astype('category').cat.codes
# Class balance of the encoded target.
df["Risk"].value_counts()

1    700
0    300
Name: Risk, dtype: int64

In [None]:
# Preview the first three rows with the encoded Risk column.
df.head(n=3)

Unnamed: 0.1,Unnamed: 0,Age,Sex,Job,Housing,Saving accounts,Checking account,Credit amount,Duration,Purpose,Risk
0,0,67,male,2,own,No Saving account,little,1169,6,radio/TV,1
1,1,22,female,2,own,little,moderate,5951,48,radio/TV,0
2,2,49,male,1,own,little,No Checking account,2096,12,education,1


In [None]:
# One-hot encode the remaining categorical columns; the first level of each
# is dropped so the indicator columns are not perfectly collinear.
df = pd.get_dummies(data=df, drop_first=True)

In [None]:
# Preview the first three rows of the one-hot encoded frame.
df.head(n=3)

Unnamed: 0.1,Unnamed: 0,Age,Job,Credit amount,Duration,Risk,Sex_male,Housing_own,Housing_rent,Saving accounts_little,Saving accounts_moderate,Saving accounts_quite rich,Saving accounts_rich,Checking account_little,Checking account_moderate,Checking account_rich,Purpose_car,Purpose_domestic appliances,Purpose_education,Purpose_furniture/equipment,Purpose_radio/TV,Purpose_repairs,Purpose_vacation/others
0,0,67,2,1169,6,1,1,1,0,0,0,0,0,1,0,0,0,0,0,0,1,0,0
1,1,22,2,5951,48,0,0,1,0,1,0,0,0,0,1,0,0,0,0,0,1,0,0
2,2,49,1,2096,12,1,1,1,0,1,0,0,0,0,0,0,0,0,1,0,0,0,0


In [None]:
from sklearn.model_selection import cross_val_score, cross_val_predict
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.naive_bayes import GaussianNB
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import GridSearchCV
from xgboost import XGBClassifier
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from imblearn.over_sampling import SMOTE



In [None]:
def _print_report(title, y_true, y_pred):
    """Print accuracy, classification report and confusion matrix for one split."""
    print(title)
    print("accuracy score: {0:.4f}\n".format(accuracy_score(y_true, y_pred)))
    print("Classification Report: \n {}\n".format(classification_report(y_true, y_pred)))
    print("Confusion Matrix: \n {}\n".format(confusion_matrix(y_true, y_pred)))


def print_score(clf, X_train, y_train, X_test, y_test, train=True):
    """Print evaluation metrics for an already fitted classifier.

    Parameters
    ----------
    clf : estimator
        Fitted classifier exposing ``predict``.
    X_train, y_train
        Training features and labels; used when ``train`` is truthy.
    X_test, y_test
        Held-out features and labels; used when ``train`` is falsy.
    train : bool, default True
        If truthy, report on the training data and additionally print the mean
        and standard deviation of a 10-fold cross-validated accuracy computed
        on ``X_train``/``y_train``. Otherwise report on the test data.

    Returns
    -------
    None
        All results are printed.
    """
    if train:
        # Predict once and reuse the result for all three metrics.
        _print_report("Train Result:\n", y_train, clf.predict(X_train))

        res = cross_val_score(clf, X_train, y_train, cv=10, scoring='accuracy')
        print("Average Accuracy: \t {0:.4f}".format(np.mean(res)))
        print("Accuracy SD: \t\t {0:.4f}".format(np.std(res)))
    else:
        # Plain `else` so any falsy value (not only the literal False) selects
        # the test report instead of silently printing nothing.
        _print_report("Test Result:\n", y_test, clf.predict(X_test))
        

In [None]:
# Hold out 40% of the rows for testing. The target is imbalanced (700 vs 300),
# so stratify on it to keep the same class ratio in both splits.
features = df.drop('Risk', axis=1)
target = df['Risk']
X_train, X_test, y_train, y_test = train_test_split(
    features, target, test_size=0.4, random_state=101, stratify=target
)

In [None]:
# Standardise the features: the scaler statistics are learned from the
# training split only and then applied unchanged to both splits.
sc = StandardScaler()
sc.fit(X_train)
X_train = sc.transform(X_train)
X_test = sc.transform(X_test)

In [None]:
# Balance the training data by oversampling the minority class with SMOTE.
# `sampling_strategy=1.0` asks for a minority/majority ratio of 1 after
# resampling. It replaces the removed `ratio` argument, and `fit_resample`
# replaces the removed `fit_sample` method.
sm = SMOTE(random_state=12, sampling_strategy=1.0)
x_train_r, y_train_r = sm.fit_resample(X_train, y_train)



In [None]:
#Running the Gaussian Naive Bayes model
#gnb = GaussianNB()
#gnb.fit(x_train_r,y_train_r)

In [None]:
#print_score(gnb, x_train_r, y_train_r, X_test, y_test, train=False)

In [None]:
##Running the Decision Tree Algorithm
#dt_clf = DecisionTreeClassifier(random_state=21)
#dt_clf = dt_clf.fit(x_train_r,y_train_r)
#print_score(dt_clf, x_train_r, y_train_r, X_test, y_test, train=False)

In [None]:
#finding the best hyperparameter for decision tree algorithm
#sample_split_range = list(range(1, 50))
#max_depth = list(range(40,100,10))
#param_grid = dict(min_samples_split=sample_split_range,max_depth=max_depth)
#grid = GridSearchCV(dt_clf, param_grid, cv=10, scoring='accuracy')
#grid.fit(x_train_r, y_train_r)

In [None]:
#print(grid.best_score_)
#print(grid.best_params_)
#print(grid.best_estimator_)

In [None]:
# Decision tree fitted on the SMOTE-balanced training data.
# Only the non-default hyperparameters are passed. The previous call was a
# pasted estimator repr that also set `presort` and `min_impurity_split`,
# which newer scikit-learn releases no longer accept; every other argument it
# listed was already the library default.
dt_clf = DecisionTreeClassifier(
    max_depth=40,
    min_samples_leaf=10,
    min_samples_split=19,
    random_state=21,
)
dt_clf = dt_clf.fit(x_train_r, y_train_r)
# Report on the held-out test split only.
print_score(dt_clf, x_train_r, y_train_r, X_test, y_test, train=False)

Test Result:

accuracy score: 0.6667

Classification Report: 
               precision    recall  f1-score   support

           0       0.47      0.41      0.44       104
           1       0.74      0.78      0.76       226

    accuracy                           0.67       330
   macro avg       0.61      0.60      0.60       330
weighted avg       0.66      0.67      0.66       330


Confusion Matrix: 
 [[ 43  61]
 [ 49 177]]



In [None]:
# Running XGBoost Algorithm
# `n_jobs` is the supported name for the thread count; `nthread` is a
# deprecated alias in the scikit-learn wrapper.
xgb = XGBClassifier(
    learning_rate=0.1,
    max_depth=5,
    gamma=2,
    min_child_weight=1,
    n_jobs=3,
    n_estimators=200,
)
xgb.fit(x_train_r, y_train_r)
# Report on the held-out test split only.
print_score(xgb, x_train_r, y_train_r, X_test, y_test, train=False)

Test Result:

accuracy score: 0.7121

Classification Report: 
               precision    recall  f1-score   support

           0       0.55      0.46      0.50       104
           1       0.77      0.83      0.80       226

    accuracy                           0.71       330
   macro avg       0.66      0.64      0.65       330
weighted avg       0.70      0.71      0.70       330


Confusion Matrix: 
 [[ 48  56]
 [ 39 187]]



GridSearchCV(cv=10, error_score=nan,
             estimator=XGBClassifier(base_score=0.5, booster='gbtree',
                                     colsample_bylevel=1, colsample_bynode=1,
                                     colsample_bytree=1, gamma=2,
                                     learning_rate=0.1, max_delta_step=0,
                                     max_depth=5, min_child_weight=1,
                                     missing=None, n_estimators=100, n_jobs=1,
                                     nthread=2, objective='binary:logistic',
                                     random_state=0, reg_alpha=0, reg_lambda=1,
                                     scale_pos_weight=1, seed=None, silent=None,
                                     subsample=1, verbosity=1),
             iid='deprecated', n_jobs=None,
             param_grid={'max_depth': [5, 10, 15],
                         'n_estimators': [40, 50, 60, 70, 80, 90]},
             pre_dispatch='2*n_jobs', refit=True, return_tra

0.8189603283173735
{'mean_fit_time': array([0.04199417, 0.05387514, 0.05886395, 0.06891041, 0.07826343,
       0.08790283, 0.08159027, 0.09360023, 0.11472991, 0.12748103,
       0.14420331, 0.16719038, 0.08436632, 0.10518494, 0.12547629,
       0.15370865, 0.16740589, 0.189101  ]), 'std_fit_time': array([0.00187057, 0.00697721, 0.00086081, 0.00480217, 0.00360795,
       0.00480642, 0.01316416, 0.00192608, 0.00574803, 0.00379272,
       0.00547301, 0.00876492, 0.00508366, 0.00432844, 0.0056146 ,
       0.01026786, 0.00760557, 0.0056067 ]), 'mean_score_time': array([0.00089097, 0.00101585, 0.00089328, 0.00093811, 0.0010546 ,
       0.00100796, 0.00112054, 0.0012871 , 0.00117276, 0.00114281,
       0.00118511, 0.00120018, 0.00107956, 0.00125444, 0.00159624,
       0.00130851, 0.0013037 , 0.00128825]), 'std_score_time': array([9.76783199e-05, 1.46162443e-04, 6.80396709e-05, 1.20104009e-04,
       1.37027687e-04, 1.35723723e-04, 1.41485124e-04, 3.04211125e-04,
       1.39073563e-04, 1.84426

In [None]:
# Running XGBoost Algorithm
# Same model settings as the pasted estimator repr, minus the deprecated
# arguments: `silent` and `seed` (both were None) and `missing=None` are
# dropped, and the conflicting `n_jobs=1` / `nthread=2` pair is replaced by a
# single `n_jobs=2`.
xgb = XGBClassifier(
    base_score=0.5,
    booster='gbtree',
    colsample_bylevel=1,
    colsample_bynode=1,
    colsample_bytree=1,
    gamma=2,
    learning_rate=0.1,
    max_delta_step=0,
    max_depth=10,
    min_child_weight=1,
    n_estimators=50,
    n_jobs=2,
    objective='binary:logistic',
    random_state=0,
    reg_alpha=0,
    reg_lambda=1,
    scale_pos_weight=1,
    subsample=1,
    verbosity=1,
)
xgb.fit(x_train_r, y_train_r)
# Report on the held-out test split only.
print_score(xgb, x_train_r, y_train_r, X_test, y_test, train=False)

Test Result:

accuracy score: 0.7121

Classification Report: 
               precision    recall  f1-score   support

           0       0.56      0.42      0.48       104
           1       0.76      0.85      0.80       226

    accuracy                           0.71       330
   macro avg       0.66      0.63      0.64       330
weighted avg       0.70      0.71      0.70       330


Confusion Matrix: 
 [[ 44  60]
 [ 35 191]]



In [None]:
# Running the KNN algorithm: k = 5 neighbours, trained on the SMOTE-balanced data.
knn = KNeighborsClassifier(n_neighbors=5)
knn.fit(X=x_train_r, y=y_train_r)

KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
                     metric_params=None, n_jobs=None, n_neighbors=5, p=2,
                     weights='uniform')

In [None]:
# Report the KNN model's metrics on the held-out test split only.
print_score(
    clf=knn,
    X_train=x_train_r,
    y_train=y_train_r,
    X_test=X_test,
    y_test=y_test,
    train=False,
)

Test Result:

accuracy score: 0.6650

Classification Report: 
               precision    recall  f1-score   support

           0       0.48      0.69      0.56       127
           1       0.82      0.66      0.73       273

    accuracy                           0.67       400
   macro avg       0.65      0.67      0.65       400
weighted avg       0.71      0.67      0.68       400


Confusion Matrix: 
 [[ 87  40]
 [ 94 179]]

