In [1]:
#Importing necessary librarires
import pandas as pd
import matplotlib.pylab as plt
import seaborn as sns
import numpy as np

In [2]:
# importing data from  CSV file
dataset=pd.read_csv('C:\\Users\\Hp\\Desktop\\Ena AI Project German-credit-risk-Prediction\\german_credit_data1.csv') 

dataset.shape #displays the shape of the dataset
dataset.sample(10) #displays the details from random 10 rows
dataset.rename(columns={'Unnamed: 0': 'ID'},inplace=True) #renames the first row unnamed to ID
dataset.describe(include='all') #provides an information overview for all columns
dataset.head() #displays the initial details of the data set

Unnamed: 0,ID,Age,Sex,Job,Housing,Saving accounts,Checking account,Credit amount,Duration,Purpose,Risk
0,0,67,male,2,own,,little,1169,6,radio/TV,good
1,1,22,female,2,own,little,moderate,5951,48,radio/TV,bad
2,2,49,male,1,own,little,,2096,12,education,good
3,3,45,male,2,free,little,little,7882,42,furniture/equipment,good
4,4,53,male,2,free,little,little,4870,24,car,bad


In [3]:
dataset.index #displays the range of index
dataset.set_index('ID',inplace=True) #sets the index to ID so we can ignore tht column later 
dataset.sample(5)

Unnamed: 0_level_0,Age,Sex,Job,Housing,Saving accounts,Checking account,Credit amount,Duration,Purpose,Risk
ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
99,36,male,3,rent,,moderate,7057,20,car,good
540,23,male,2,rent,little,moderate,1534,12,radio/TV,bad
259,57,female,1,own,moderate,,1154,11,radio/TV,good
463,38,male,2,own,,moderate,754,12,education,good
680,56,female,2,own,little,,1538,6,education,good


In [4]:
#Target is Risk with values good or bad. We will set 1 to represent good and 0 bad in terms of risk

risk_mappings={'good':1,'bad':0}
dataset['Risk']=dataset['Risk'].map(risk_mappings) #Using a mapping technique to convert category from string to int
dataset.head()

Unnamed: 0_level_0,Age,Sex,Job,Housing,Saving accounts,Checking account,Credit amount,Duration,Purpose,Risk
ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
0,67,male,2,own,,little,1169,6,radio/TV,1
1,22,female,2,own,little,moderate,5951,48,radio/TV,0
2,49,male,1,own,little,,2096,12,education,1
3,45,male,2,free,little,little,7882,42,furniture/equipment,1
4,53,male,2,free,little,little,4870,24,car,0


In [5]:
#converting the age attribute from a continuous to a categorical values 
dataset['Age']=dataset['Age'].fillna('adult') #Takes care of missing values by filling in the Nan as adult
cut_points=[19,30,40,50,60,100] #Divides into categories based on age range i.e. binning
label_names=['adult','middle_age','above_middle_age','old_age','senior_citizen'] #provides a label i.e. categorical value to each range
dataset['Age']=pd.cut(dataset['Age'],cut_points,labels=label_names)
dataset.head()

Unnamed: 0_level_0,Age,Sex,Job,Housing,Saving accounts,Checking account,Credit amount,Duration,Purpose,Risk
ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
0,senior_citizen,male,2,own,,little,1169,6,radio/TV,1
1,adult,female,2,own,little,moderate,5951,48,radio/TV,0
2,above_middle_age,male,1,own,little,,2096,12,education,1
3,above_middle_age,male,2,free,little,little,7882,42,furniture/equipment,1
4,old_age,male,2,free,little,little,4870,24,car,0


In [6]:
#There are missing values in column Savings Account, thus we can use the ffill method to manage them

dataset['Saving accounts'].fillna(method='ffill',inplace=True) #replacies the Nan by ffill method
dataset['Saving accounts'].unique() #displays the unique values in the savings account
dataset['Saving accounts'].value_counts() #displays the number of unique values in the savings account 

little        740
moderate      127
quite rich     75
rich           57
Name: Saving accounts, dtype: int64

In [7]:
#Same missing value replacement steps as with Savings Account, but for Checking account
dataset['Checking account'].fillna(method='bfill',inplace=True) 
dataset['Checking account'].unique()
dataset['Checking account'].value_counts()


little      460
moderate    430
rich        110
Name: Checking account, dtype: int64

In [8]:
#Converthing String to Integer categorical values as main part of the data cleaning stage
#Age conversion by using mapping technique
age_mappings={'senior_citizen':4, 'adult':0, 'above_middle_age':2, 'old_age':3, 'middle_age':1}
dataset['Age']=dataset['Age'].map(age_mappings)

In [9]:
#Gender to categorical values
dataset['Sex'].unique()
sex_mappings={'male':1,'female':0}
dataset["Sex"]=dataset["Sex"].map(sex_mappings)

In [10]:
# Housing to categorical values
dataset["Housing"].unique()
Housing_mappings={'own':2,'free':1,'rent':0}
dataset["Housing"]=dataset['Housing'].map(Housing_mappings)


In [11]:

# savings account to categorical values
dataset=dataset.dropna()
dataset['Saving accounts'].unique()
saving_mappings={'little':0,'moderate':1,'quite rich':2,'rich':3}
dataset['Saving accounts']=dataset['Saving accounts'].map(saving_mappings)


In [12]:

# checking account to categorical 
dataset['Checking account'].unique()
dataset['Checking account'].describe()
checking_mappings={'little':0,'moderate':1,'rich':2}
dataset['Checking account']=dataset['Checking account'].map(checking_mappings)


In [13]:

# conversion of credited amount to categorical values with a pd.cut method wherewe can customize range of our own
cut_points=[0,2000,3000,6000,8000,20000]
labels=['too small','small','big','too big','bigger']
dataset['Credit amount']=pd.cut(dataset['Credit amount'],cut_points,labels=labels)
amount_mappings={'big':2, 'small':1, "too big":3, "bigger":4, 'too small':0}
dataset['Credit amount']=dataset['Credit amount'].map(amount_mappings)



In [14]:

# Duration to categorical values with a pd.cut method wherewe can customize range of our own
cut_points=[0,10,20,40,60,100]
labels=[0,1,2,3,4]
dataset['Duration']=pd.cut(dataset['Duration'],cut_points,labels=labels)

In [15]:
#Purpose to categorical
amount_mappings={'radio/TV':1, 'education':0, 'furniture/equipment':2, 'car':3, 'business':4,
       'domestic appliances':5, 'repairs':6, 'vacation/others':7}
dataset['Purpose']=dataset['Purpose'].map(amount_mappings)

In [17]:
dataset.sample(30)

Unnamed: 0_level_0,Age,Sex,Job,Housing,Saving accounts,Checking account,Credit amount,Duration,Purpose,Risk
ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
664,2,0,1,2,0,2,0,0,0,1
260,0,1,2,2,0,0,0,1,2,1
299,1,1,2,2,3,1,1,2,2,1
671,1,1,2,2,1,1,2,2,4,1
277,2,1,1,2,0,0,0,1,2,1
974,1,0,2,2,0,2,1,2,1,1
459,1,1,2,2,0,0,2,1,1,1
320,0,1,3,2,0,1,2,2,3,0
349,1,1,2,1,3,1,0,0,0,0
734,1,0,2,2,0,1,2,0,3,1


In [18]:
#Training Step
from sklearn.model_selection import train_test_split

testset=dataset['Risk']
dataset=dataset.drop(['Risk'],axis=1)
x_train,x_test,y_train,y_test=train_test_split(dataset,testset,test_size=0.3,random_state=0) #good practice to split it 30% for test and 70% for train

In [32]:
#Will try several supervised machine learning algorthims (classification) and evaluate their accuracy for our data set
# Gaussian Naive Bayes
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import accuracy_score

gaussian = GaussianNB()
gaussian.fit(x_train, y_train)
y_pred = gaussian.predict(x_test)
accuracyGaussNB = round(accuracy_score(y_pred, y_test) * 100, 2)
print ("The Accuracy of Gaussian Naive Bayes is:", accuracyGaussNB)

The Accuracy of Gaussian Naive Bayes is: 70.33


In [33]:
#Decision Tree
from sklearn.tree import DecisionTreeClassifier
decisiontree = DecisionTreeClassifier()
decisiontree.fit(x_train, y_train)
y_pred = decisiontree.predict(x_test)
accuracyDecisionTree = round(accuracy_score(y_pred, y_test) * 100, 2)
print ("The Accuracy of the Decision Tree is:", accuracyDecisionTree)

The Accuracy of the Decision Tree is: 58.33


In [34]:
# Logistic Regression
from sklearn.linear_model import LogisticRegression

logreg = LogisticRegression()
logreg.fit(x_train, y_train)
y_pred = logreg.predict(x_test)
accuracyLogisticRegression = round(accuracy_score(y_pred, y_test) * 100, 2)
print("The Accuracy of the Logistic Regression is:", accuracyLogisticRegression)


The Accuracy of the Logistic Regression is: 70.0




In [35]:
#To preview the accuracies of the three chosen methods in one table, we can create a model
models = pd.DataFrame({
    'Model': ['Logistic Regression', 'Naive Bayes',
              'Decision Tree'],
    'Score': [accuracyLogisticRegression, accuracyGaussNB, accuracyDecisionTree]})
models.sort_values(by='Score', ascending=False)

Unnamed: 0,Model,Score
1,Naive Bayes,70.33
0,Logistic Regression,70.0
2,Decision Tree,58.33


In [36]:
print("The Naive Bayes and the Logistic Regression both scored about a 70% accuracy. The decision tree however, scored a bit under 60%")


The Naive Bayes and the Logistic Regression both scored about a 70% accuracy. The decision tree however, scored a bit under 60%
