In [1]:
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn import preprocessing
import pandas as pd

In [2]:
df_credit = pd.read_csv('./german_credit_data.csv', index_col=0)
df_credit.head()

Unnamed: 0,Age,Sex,Job,Housing,Saving accounts,Checking account,Credit amount,Duration,Purpose,Risk
0,67,male,2,own,,little,1169,6,radio/TV,good
1,22,female,2,own,little,moderate,5951,48,radio/TV,bad
2,49,male,1,own,little,,2096,12,education,good
3,45,male,2,free,little,little,7882,42,furniture/equipment,good
4,53,male,2,free,little,little,4870,24,car,bad


In [3]:
df_credit.shape

(1000, 10)

In [54]:
# df_credit.dropna(axis=0, how='any', inplace=True)
# after drop na (522,10), almost half the data is gone so not feasible
# df_credit.shape

In [4]:
# Check which column has nan values
df_credit.isna().any()

Age                 False
Sex                 False
Job                 False
Housing             False
Saving accounts      True
Checking account     True
Credit amount       False
Duration            False
Purpose             False
Risk                False
dtype: bool

In [5]:
df_credit.fillna(value='na', inplace=True)
df_credit

Unnamed: 0,Age,Sex,Job,Housing,Saving accounts,Checking account,Credit amount,Duration,Purpose,Risk
0,67,male,2,own,na,little,1169,6,radio/TV,good
1,22,female,2,own,little,moderate,5951,48,radio/TV,bad
2,49,male,1,own,little,na,2096,12,education,good
3,45,male,2,free,little,little,7882,42,furniture/equipment,good
4,53,male,2,free,little,little,4870,24,car,bad
...,...,...,...,...,...,...,...,...,...,...
995,31,female,1,own,little,na,1736,12,furniture/equipment,good
996,40,male,3,own,little,little,3857,30,car,good
997,38,male,2,own,little,na,804,12,radio/TV,good
998,23,male,2,free,little,little,1845,45,radio/TV,bad


In [6]:
df_credit[df_credit['Risk']=='good'].shape

(700, 10)

In [7]:
df_credit[df_credit['Risk']=='bad'].shape

(300, 10)

In [10]:
# Split the age into category

interval = (18, 25, 35, 60, 120)

cats = [1, 2, 3, 4]
df_credit["age_cat"] = pd.cut(df_credit['Age'], interval, labels=cats)

In [11]:
df_credit2 = df_credit.copy()

df_credit2

Unnamed: 0,Age,Sex,Job,Housing,Saving accounts,Checking account,Credit amount,Duration,Purpose,Risk,age_cat
0,67,male,2,own,na,little,1169,6,radio/TV,good,4
1,22,female,2,own,little,moderate,5951,48,radio/TV,bad,1
2,49,male,1,own,little,na,2096,12,education,good,3
3,45,male,2,free,little,little,7882,42,furniture/equipment,good,3
4,53,male,2,free,little,little,4870,24,car,bad,3
...,...,...,...,...,...,...,...,...,...,...,...
995,31,female,1,own,little,na,1736,12,furniture/equipment,good,2
996,40,male,3,own,little,little,3857,30,car,good,3
997,38,male,2,own,little,na,804,12,radio/TV,good,3
998,23,male,2,free,little,little,1845,45,radio/TV,bad,1


The data is biased towards Good.

Good credit risk consist of 70% of the dataset.

## Transforming text to int for categorial data

In [12]:
# With Ordinal Encoder and Label Encoder
target_encoder = preprocessing.LabelEncoder()
label = preprocessing.OrdinalEncoder()
df_credit2['sex_code'] = label.fit_transform(df_credit2[['Sex']])
df_credit2['housing_code'] = label.fit_transform(df_credit2[['Housing']])
df_credit2['savings_acc'] = label.fit_transform(df_credit2[['Saving accounts']])
df_credit2['checking_acc'] = label.fit_transform(df_credit2[['Checking account']])
df_credit2['purpose_code'] = label.fit_transform(df_credit2[['Purpose']])
                                                          
df_credit2['risk_code'] = target_encoder.fit_transform(df_credit2['Risk'])
df_credit2.head()


Unnamed: 0,Age,Sex,Job,Housing,Saving accounts,Checking account,Credit amount,Duration,Purpose,Risk,age_cat,sex_code,housing_code,savings_acc,checking_acc,purpose_code,risk_code
0,67,male,2,own,na,little,1169,6,radio/TV,good,4,1.0,1.0,2.0,0.0,5.0,1
1,22,female,2,own,little,moderate,5951,48,radio/TV,bad,1,0.0,1.0,0.0,1.0,5.0,0
2,49,male,1,own,little,na,2096,12,education,good,3,1.0,1.0,0.0,2.0,3.0,1
3,45,male,2,free,little,little,7882,42,furniture/equipment,good,3,1.0,0.0,0.0,0.0,4.0,1
4,53,male,2,free,little,little,4870,24,car,bad,3,1.0,0.0,0.0,0.0,1.0,0


In [13]:
df_credit2[df_credit2['checking_acc']==1]

Unnamed: 0,Age,Sex,Job,Housing,Saving accounts,Checking account,Credit amount,Duration,Purpose,Risk,age_cat,sex_code,housing_code,savings_acc,checking_acc,purpose_code,risk_code
1,22,female,2,own,little,moderate,5951,48,radio/TV,bad,1,0.0,1.0,0.0,1.0,5.0,0
7,35,male,3,rent,little,moderate,6948,36,car,good,2,1.0,2.0,0.0,1.0,1.0,1
9,28,male,3,own,little,moderate,5234,30,car,bad,2,1.0,1.0,0.0,1.0,1.0,0
10,25,female,2,rent,little,moderate,1295,12,car,bad,1,0.0,2.0,0.0,1.0,1.0,0
12,22,female,2,own,little,moderate,1567,12,radio/TV,good,1,0.0,1.0,0.0,1.0,5.0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
977,42,male,2,own,na,moderate,2427,18,business,good,3,1.0,1.0,2.0,1.0,0.0,1
979,25,male,2,rent,moderate,moderate,1264,15,car,bad,1,1.0,2.0,1.0,1.0,1.0,0
980,49,male,2,own,little,moderate,8386,30,furniture/equipment,bad,3,1.0,1.0,0.0,1.0,4.0,0
989,48,male,1,own,little,moderate,1743,24,radio/TV,good,3,1.0,1.0,0.0,1.0,5.0,1


In [14]:
df_credit2[df_credit2['Checking account']=='moderate']

Unnamed: 0,Age,Sex,Job,Housing,Saving accounts,Checking account,Credit amount,Duration,Purpose,Risk,age_cat,sex_code,housing_code,savings_acc,checking_acc,purpose_code,risk_code
1,22,female,2,own,little,moderate,5951,48,radio/TV,bad,1,0.0,1.0,0.0,1.0,5.0,0
7,35,male,3,rent,little,moderate,6948,36,car,good,2,1.0,2.0,0.0,1.0,1.0,1
9,28,male,3,own,little,moderate,5234,30,car,bad,2,1.0,1.0,0.0,1.0,1.0,0
10,25,female,2,rent,little,moderate,1295,12,car,bad,1,0.0,2.0,0.0,1.0,1.0,0
12,22,female,2,own,little,moderate,1567,12,radio/TV,good,1,0.0,1.0,0.0,1.0,5.0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
977,42,male,2,own,na,moderate,2427,18,business,good,3,1.0,1.0,2.0,1.0,0.0,1
979,25,male,2,rent,moderate,moderate,1264,15,car,bad,1,1.0,2.0,1.0,1.0,1.0,0
980,49,male,2,own,little,moderate,8386,30,furniture/equipment,bad,3,1.0,1.0,0.0,1.0,4.0,0
989,48,male,1,own,little,moderate,1743,24,radio/TV,good,3,1.0,1.0,0.0,1.0,5.0,1


In [69]:
df_credit2[df_credit2['Checking account']=='na']

Unnamed: 0,Age,Sex,Job,Housing,Saving accounts,Checking account,Credit amount,Duration,Purpose,Risk,sex_code,housing_code,savings_acc,checking_acc,purpose_code,risk_code
2,49,male,1,own,little,na,2096,12,education,good,1.0,1.0,0.0,2.0,3.0,1
5,35,male,1,free,na,na,9055,36,education,good,1.0,0.0,2.0,2.0,3.0,1
6,53,male,2,own,quite rich,na,2835,24,furniture/equipment,good,1.0,1.0,3.0,2.0,4.0,1
8,61,male,1,own,rich,na,3059,12,radio/TV,good,1.0,1.0,4.0,2.0,5.0,1
16,53,male,2,own,na,na,2424,24,radio/TV,good,1.0,1.0,2.0,2.0,5.0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
990,37,male,1,own,na,na,3565,12,education,good,1.0,1.0,2.0,2.0,3.0,1
991,34,male,1,own,moderate,na,1569,15,radio/TV,good,1.0,1.0,1.0,2.0,5.0,1
994,50,male,2,own,na,na,2390,12,car,good,1.0,1.0,2.0,2.0,1.0,1
995,31,female,1,own,little,na,1736,12,furniture/equipment,good,0.0,1.0,0.0,2.0,4.0,1


In [15]:
x = df_credit2.drop(['Age', 'Sex', 'Housing', 'Saving accounts', 'Checking account', 'Purpose', 'Risk', 'risk_code'], axis=1)
y = df_credit2['risk_code']

In [16]:
x

Unnamed: 0,Job,Credit amount,Duration,age_cat,sex_code,housing_code,savings_acc,checking_acc,purpose_code
0,2,1169,6,4,1.0,1.0,2.0,0.0,5.0
1,2,5951,48,1,0.0,1.0,0.0,1.0,5.0
2,1,2096,12,3,1.0,1.0,0.0,2.0,3.0
3,2,7882,42,3,1.0,0.0,0.0,0.0,4.0
4,2,4870,24,3,1.0,0.0,0.0,0.0,1.0
...,...,...,...,...,...,...,...,...,...
995,1,1736,12,2,0.0,1.0,0.0,2.0,4.0
996,3,3857,30,3,1.0,1.0,0.0,0.0,1.0
997,2,804,12,3,1.0,1.0,0.0,2.0,5.0
998,2,1845,45,1,1.0,0.0,0.0,0.0,5.0


In [17]:
y

0      1
1      0
2      1
3      1
4      0
      ..
995    1
996    1
997    1
998    0
999    1
Name: risk_code, Length: 1000, dtype: int64

In [18]:
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.25, random_state=42)

## Logistic Regression

In [19]:
clf = LogisticRegression(random_state=42, tol=1e-5)
clf.fit(x_train, y_train)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [20]:
from sklearn.metrics import confusion_matrix, accuracy_score, classification_report
predicted = clf.predict(x_test)
result = confusion_matrix(y_test, predicted)
print(result)
accuracy = accuracy_score(y_test, predicted)
print("accuracy score: " + accuracy.astype(str))
report = classification_report(y_test, predicted)
print(report)

[[ 26  46]
 [ 17 161]]
accuracy score: 0.748
              precision    recall  f1-score   support

           0       0.60      0.36      0.45        72
           1       0.78      0.90      0.84       178

    accuracy                           0.75       250
   macro avg       0.69      0.63      0.64       250
weighted avg       0.73      0.75      0.73       250



## Neural Network

In [24]:
from sklearn.neural_network import MLPClassifier

nn = MLPClassifier(tol=1e-3)
nn.fit(x_train, y_train)

predicted_nn = nn.predict(x_test)

In [25]:
result = confusion_matrix(y_test, predicted_nn)
print(result)
accuracy = accuracy_score(y_test, predicted_nn)
print("accuracy score: " + accuracy.astype(str))
report = classification_report(y_test, predicted_nn)
print(report)

[[ 43  29]
 [109  69]]
accuracy score: 0.448
              precision    recall  f1-score   support

           0       0.28      0.60      0.38        72
           1       0.70      0.39      0.50       178

    accuracy                           0.45       250
   macro avg       0.49      0.49      0.44       250
weighted avg       0.58      0.45      0.47       250



## SVM

In [26]:
from sklearn.svm import LinearSVC

clf = LinearSVC(random_state=0, tol=1e-5)
clf.fit(x_train, y_train)



In [27]:
predicted = clf.predict(x_test)
result = confusion_matrix(y_test, predicted)
print(result)
accuracy = accuracy_score(y_test, predicted)
print("accuracy score: " + accuracy.astype(str))
report = classification_report(y_test, predicted)
print(report)

[[  0  72]
 [  0 178]]
accuracy score: 0.712
              precision    recall  f1-score   support

           0       0.00      0.00      0.00        72
           1       0.71      1.00      0.83       178

    accuracy                           0.71       250
   macro avg       0.36      0.50      0.42       250
weighted avg       0.51      0.71      0.59       250



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
