In [1]:
# Import libraries
import numpy as np
import pandas as pd
from sklearn.naive_bayes import GaussianNB, BernoulliNB, MultinomialNB
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.metrics import accuracy_score
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

In [2]:
# Load dataset into panda DataFrame
df = pd.read_csv("german_credit_card.csv")
print(df.shape)
display(df.head())
display(df.describe())

(1000, 21)


Unnamed: 0,Creditability,Account Balance,Duration of Credit (month),Payment Status of Previous Credit,Purpose,Credit Amount,Value Savings/Stocks,Length of current employment,Instalment per cent,Sex & Marital Status,...,Duration in Current address,Most valuable available asset,Age (years),Concurrent Credits,Type of apartment,No of Credits at this Bank,Occupation,No of dependents,Telephone,Foreign Worker
0,1,1,18,4,2,1049,1,2,4,2,...,4,2,21,3,1,1,3,1,1,1
1,1,1,9,4,0,2799,1,3,2,3,...,2,1,36,3,1,2,3,2,1,1
2,1,2,12,2,9,841,2,4,2,2,...,4,1,23,3,1,1,2,1,1,1
3,1,1,12,4,0,2122,1,3,3,3,...,2,1,39,3,1,2,2,2,1,2
4,1,1,12,4,0,2171,1,3,4,3,...,4,2,38,1,2,2,2,1,1,2


Unnamed: 0,Creditability,Account Balance,Duration of Credit (month),Payment Status of Previous Credit,Purpose,Credit Amount,Value Savings/Stocks,Length of current employment,Instalment per cent,Sex & Marital Status,...,Duration in Current address,Most valuable available asset,Age (years),Concurrent Credits,Type of apartment,No of Credits at this Bank,Occupation,No of dependents,Telephone,Foreign Worker
count,1000.0,1000.0,1000.0,1000.0,1000.0,1000.0,1000.0,1000.0,1000.0,1000.0,...,1000.0,1000.0,1000.0,1000.0,1000.0,1000.0,1000.0,1000.0,1000.0,1000.0
mean,0.7,2.577,20.903,2.545,2.828,3271.248,2.105,3.384,2.973,2.682,...,2.845,2.358,35.542,2.675,1.928,1.407,2.904,1.155,1.404,1.037
std,0.458487,1.257638,12.058814,1.08312,2.744439,2822.75176,1.580023,1.208306,1.118715,0.70808,...,1.103718,1.050209,11.35267,0.705601,0.530186,0.577654,0.653614,0.362086,0.490943,0.188856
min,0.0,1.0,4.0,0.0,0.0,250.0,1.0,1.0,1.0,1.0,...,1.0,1.0,19.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0
25%,0.0,1.0,12.0,2.0,1.0,1365.5,1.0,3.0,2.0,2.0,...,2.0,1.0,27.0,3.0,2.0,1.0,3.0,1.0,1.0,1.0
50%,1.0,2.0,18.0,2.0,2.0,2319.5,1.0,3.0,3.0,3.0,...,3.0,2.0,33.0,3.0,2.0,1.0,3.0,1.0,1.0,1.0
75%,1.0,4.0,24.0,4.0,3.0,3972.25,3.0,5.0,4.0,3.0,...,4.0,3.0,42.0,3.0,2.0,2.0,3.0,1.0,2.0,1.0
max,1.0,4.0,72.0,4.0,10.0,18424.0,5.0,5.0,4.0,4.0,...,4.0,4.0,75.0,3.0,3.0,4.0,4.0,2.0,2.0,2.0


In [3]:
# Check for null values
df.isnull().sum()

Creditability                        0
Account Balance                      0
Duration of Credit (month)           0
Payment Status of Previous Credit    0
Purpose                              0
Credit Amount                        0
Value Savings/Stocks                 0
Length of current employment         0
Instalment per cent                  0
Sex & Marital Status                 0
Guarantors                           0
Duration in Current address          0
Most valuable available asset        0
Age (years)                          0
Concurrent Credits                   0
Type of apartment                    0
No of Credits at this Bank           0
Occupation                           0
No of dependents                     0
Telephone                            0
Foreign Worker                       0
dtype: int64

In [4]:
# Get columns from dataset 
# X = All columns except Creditability
X = np.array(df.iloc[:,2:])
# y = Creditability
y = np.array(df.iloc[:,0])

# Split into training and testing set
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size= 0.33, random_state = 20)

In [5]:
# Use Naive Bayes learning model - Gaussian
clf = GaussianNB()
clf.fit(X_train, y_train)
GaussianNB(priors=None)
pred = clf.predict(X_test)

# Evaluate accuracy
accuracy = accuracy_score(y_test, pred)*100
print('Accuracy of Gaussian NB model is equal to ' + str(round(accuracy, 2)) + ' %.')

Accuracy of Gaussian NB model is equal to 70.91 %.


In [6]:
# Use Naive Bayes learning model - Bernoulli
clf = BernoulliNB()
clf.fit(X_train, y_train)
pred = clf.predict(X_test)

# Evaluate accuracy
accuracy = accuracy_score(y_test, pred)*100
print('Accuracy of Bernoulli NB model is equal to ' + str(round(accuracy, 2)) + ' %.')

Accuracy of Bernoulli NB model is equal to 74.24 %.


In [7]:
# Use Naive Bayes learning model - Multinomial
clf = MultinomialNB()
clf.fit(X_train, y_train)
pred = clf.predict(X_test)

# Evaluate accuracy
accuracy = accuracy_score(y_test, pred)*100
print('Accuracy of Multinomial NB model is equal to ' + str(round(accuracy, 2)) + ' %.')

Accuracy of Multinomial NB model is equal to 64.85 %.
