In [1]:
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn import preprocessing
import pandas as pd

In [52]:
df_credit = pd.read_csv('./german_credit_data.csv', index_col=0)
df_credit.head()

Unnamed: 0,Age,Sex,Job,Housing,Saving accounts,Checking account,Credit amount,Duration,Purpose,Risk
0,67,male,2,own,,little,1169,6,radio/TV,good
1,22,female,2,own,little,moderate,5951,48,radio/TV,bad
2,49,male,1,own,little,,2096,12,education,good
3,45,male,2,free,little,little,7882,42,furniture/equipment,good
4,53,male,2,free,little,little,4870,24,car,bad


In [53]:
df_credit.shape

(1000, 10)

In [54]:
# df_credit.dropna(axis=0, how='any', inplace=True)
# after drop na (522,10), almost half the data is gone so not feasible
# df_credit.shape

In [55]:
# Check which column has nan values
df_credit.isna().any()

Age                 False
Sex                 False
Job                 False
Housing             False
Saving accounts      True
Checking account     True
Credit amount       False
Duration            False
Purpose             False
Risk                False
dtype: bool

In [56]:
df_credit.fillna(value='na', inplace=True)
df_credit

Unnamed: 0,Age,Sex,Job,Housing,Saving accounts,Checking account,Credit amount,Duration,Purpose,Risk
0,67,male,2,own,na,little,1169,6,radio/TV,good
1,22,female,2,own,little,moderate,5951,48,radio/TV,bad
2,49,male,1,own,little,na,2096,12,education,good
3,45,male,2,free,little,little,7882,42,furniture/equipment,good
4,53,male,2,free,little,little,4870,24,car,bad
...,...,...,...,...,...,...,...,...,...,...
995,31,female,1,own,little,na,1736,12,furniture/equipment,good
996,40,male,3,own,little,little,3857,30,car,good
997,38,male,2,own,little,na,804,12,radio/TV,good
998,23,male,2,free,little,little,1845,45,radio/TV,bad


In [57]:
df_credit[df_credit['Risk']=='good'].shape

(700, 10)

In [58]:
df_credit[df_credit['Risk']=='bad'].shape

(300, 10)

In [59]:
df_credit2 = df_credit.copy()

The data is biased towards Good.

Good credit risk consist of 70% of the dataset.

## Transforming text to int for categorial data

In [60]:
# With Ordinal Encoder and Label Encoder
target_encoder = preprocessing.LabelEncoder()
label = preprocessing.OrdinalEncoder()
df_credit2['sex_code'] = label.fit_transform(df_credit2[['Sex']])
df_credit2['housing_code'] = label.fit_transform(df_credit2[['Housing']])
df_credit2['savings_acc'] = label.fit_transform(df_credit2[['Saving accounts']])
df_credit2['checking_acc'] = label.fit_transform(df_credit2[['Checking account']])
df_credit2['purpose_code'] = label.fit_transform(df_credit2[['Purpose']])
                                                          
df_credit2['risk_code'] = target_encoder.fit_transform(df_credit2['Risk'])
df_credit2.head()


Unnamed: 0,Age,Sex,Job,Housing,Saving accounts,Checking account,Credit amount,Duration,Purpose,Risk,sex_code,housing_code,savings_acc,checking_acc,purpose_code,risk_code
0,67,male,2,own,na,little,1169,6,radio/TV,good,1.0,1.0,2.0,0.0,5.0,1
1,22,female,2,own,little,moderate,5951,48,radio/TV,bad,0.0,1.0,0.0,1.0,5.0,0
2,49,male,1,own,little,na,2096,12,education,good,1.0,1.0,0.0,2.0,3.0,1
3,45,male,2,free,little,little,7882,42,furniture/equipment,good,1.0,0.0,0.0,0.0,4.0,1
4,53,male,2,free,little,little,4870,24,car,bad,1.0,0.0,0.0,0.0,1.0,0


In [61]:
df_credit2[df_credit2['checking_acc']==1]

Unnamed: 0,Age,Sex,Job,Housing,Saving accounts,Checking account,Credit amount,Duration,Purpose,Risk,sex_code,housing_code,savings_acc,checking_acc,purpose_code,risk_code
1,22,female,2,own,little,moderate,5951,48,radio/TV,bad,0.0,1.0,0.0,1.0,5.0,0
7,35,male,3,rent,little,moderate,6948,36,car,good,1.0,2.0,0.0,1.0,1.0,1
9,28,male,3,own,little,moderate,5234,30,car,bad,1.0,1.0,0.0,1.0,1.0,0
10,25,female,2,rent,little,moderate,1295,12,car,bad,0.0,2.0,0.0,1.0,1.0,0
12,22,female,2,own,little,moderate,1567,12,radio/TV,good,0.0,1.0,0.0,1.0,5.0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
977,42,male,2,own,na,moderate,2427,18,business,good,1.0,1.0,2.0,1.0,0.0,1
979,25,male,2,rent,moderate,moderate,1264,15,car,bad,1.0,2.0,1.0,1.0,1.0,0
980,49,male,2,own,little,moderate,8386,30,furniture/equipment,bad,1.0,1.0,0.0,1.0,4.0,0
989,48,male,1,own,little,moderate,1743,24,radio/TV,good,1.0,1.0,0.0,1.0,5.0,1


In [62]:
df_credit2[df_credit2['Checking account']=='moderate']

Unnamed: 0,Age,Sex,Job,Housing,Saving accounts,Checking account,Credit amount,Duration,Purpose,Risk,sex_code,housing_code,savings_acc,checking_acc,purpose_code,risk_code
1,22,female,2,own,little,moderate,5951,48,radio/TV,bad,0.0,1.0,0.0,1.0,5.0,0
7,35,male,3,rent,little,moderate,6948,36,car,good,1.0,2.0,0.0,1.0,1.0,1
9,28,male,3,own,little,moderate,5234,30,car,bad,1.0,1.0,0.0,1.0,1.0,0
10,25,female,2,rent,little,moderate,1295,12,car,bad,0.0,2.0,0.0,1.0,1.0,0
12,22,female,2,own,little,moderate,1567,12,radio/TV,good,0.0,1.0,0.0,1.0,5.0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
977,42,male,2,own,na,moderate,2427,18,business,good,1.0,1.0,2.0,1.0,0.0,1
979,25,male,2,rent,moderate,moderate,1264,15,car,bad,1.0,2.0,1.0,1.0,1.0,0
980,49,male,2,own,little,moderate,8386,30,furniture/equipment,bad,1.0,1.0,0.0,1.0,4.0,0
989,48,male,1,own,little,moderate,1743,24,radio/TV,good,1.0,1.0,0.0,1.0,5.0,1


In [69]:
df_credit2[df_credit2['Checking account']=='na']

Unnamed: 0,Age,Sex,Job,Housing,Saving accounts,Checking account,Credit amount,Duration,Purpose,Risk,sex_code,housing_code,savings_acc,checking_acc,purpose_code,risk_code
2,49,male,1,own,little,na,2096,12,education,good,1.0,1.0,0.0,2.0,3.0,1
5,35,male,1,free,na,na,9055,36,education,good,1.0,0.0,2.0,2.0,3.0,1
6,53,male,2,own,quite rich,na,2835,24,furniture/equipment,good,1.0,1.0,3.0,2.0,4.0,1
8,61,male,1,own,rich,na,3059,12,radio/TV,good,1.0,1.0,4.0,2.0,5.0,1
16,53,male,2,own,na,na,2424,24,radio/TV,good,1.0,1.0,2.0,2.0,5.0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
990,37,male,1,own,na,na,3565,12,education,good,1.0,1.0,2.0,2.0,3.0,1
991,34,male,1,own,moderate,na,1569,15,radio/TV,good,1.0,1.0,1.0,2.0,5.0,1
994,50,male,2,own,na,na,2390,12,car,good,1.0,1.0,2.0,2.0,1.0,1
995,31,female,1,own,little,na,1736,12,furniture/equipment,good,0.0,1.0,0.0,2.0,4.0,1


In [70]:
x = df_credit2.drop(['Sex', 'Housing', 'Saving accounts', 'Checking account', 'Purpose', 'Risk', 'risk_code'], axis=1)
y = df_credit2['risk_code']

In [71]:
x

Unnamed: 0,Age,Job,Credit amount,Duration,sex_code,housing_code,savings_acc,checking_acc,purpose_code
0,67,2,1169,6,1.0,1.0,2.0,0.0,5.0
1,22,2,5951,48,0.0,1.0,0.0,1.0,5.0
2,49,1,2096,12,1.0,1.0,0.0,2.0,3.0
3,45,2,7882,42,1.0,0.0,0.0,0.0,4.0
4,53,2,4870,24,1.0,0.0,0.0,0.0,1.0
...,...,...,...,...,...,...,...,...,...
995,31,1,1736,12,0.0,1.0,0.0,2.0,4.0
996,40,3,3857,30,1.0,1.0,0.0,0.0,1.0
997,38,2,804,12,1.0,1.0,0.0,2.0,5.0
998,23,2,1845,45,1.0,0.0,0.0,0.0,5.0


In [72]:
y

0      1
1      0
2      1
3      1
4      0
      ..
995    1
996    1
997    1
998    0
999    1
Name: risk_code, Length: 1000, dtype: int64

In [73]:
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.25, random_state=42)

In [74]:
clf = LogisticRegression(random_state=42, tol=1e-5)
clf.fit(x_train, y_train)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [75]:
from sklearn.metrics import confusion_matrix, accuracy_score, classification_report
predicted = clf.predict(x_test)
result = confusion_matrix(y_test, predicted)
print(result)
accuracy = accuracy_score(y_test, predicted)
print("accuracy score: " + accuracy.astype(str))
report = classification_report(y_test, predicted)
print(report)

[[ 25  47]
 [ 17 161]]
accuracy score: 0.744
              precision    recall  f1-score   support

           0       0.60      0.35      0.44        72
           1       0.77      0.90      0.83       178

    accuracy                           0.74       250
   macro avg       0.68      0.63      0.64       250
weighted avg       0.72      0.74      0.72       250

