In [203]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.metrics import f1_score
from sklearn.metrics import confusion_matrix
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from imblearn.over_sampling import SMOTENC
from sklearn.preprocessing import OrdinalEncoder
from lime.lime_tabular import LimeTabularExplainer

In [204]:
attribute_names = [
    "Status of existing checking account",
    "Duration in month",
    "Credit history",
    "Purpose",
    "Credit amount",
    "Savings account/bonds",
    "Present employment since",
    "Installment rate in percentage of disposable income",
    "Personal status and sex",
    "Other debtors / guarantors",
    "Present residence since",
    "Property",
    "Age in years",
    "Other installment plans",
    "Housing",
    "Number of existing credits at this bank",
    "Job",
    "Number of people being liable to provide maintenance for",
    "Telephone",
    "foreign worker",
    "Credit status"
]

In [205]:
df = pd.read_csv("~/JupyterNotebooks/timskiProekt/german.data", header=None,delim_whitespace=True, names=attribute_names)

In [206]:
df.head()

Unnamed: 0,Status of existing checking account,Duration in month,Credit history,Purpose,Credit amount,Savings account/bonds,Present employment since,Installment rate in percentage of disposable income,Personal status and sex,Other debtors / guarantors,...,Property,Age in years,Other installment plans,Housing,Number of existing credits at this bank,Job,Number of people being liable to provide maintenance for,Telephone,foreign worker,Credit status
0,A11,6,A34,A43,1169,A65,A75,4,A93,A101,...,A121,67,A143,A152,2,A173,1,A192,A201,1
1,A12,48,A32,A43,5951,A61,A73,2,A92,A101,...,A121,22,A143,A152,1,A173,1,A191,A201,2
2,A14,12,A34,A46,2096,A61,A74,2,A93,A101,...,A121,49,A143,A152,1,A172,2,A191,A201,1
3,A11,42,A32,A42,7882,A61,A74,2,A93,A103,...,A122,45,A143,A153,1,A173,2,A191,A201,1
4,A11,24,A33,A40,4870,A61,A73,3,A93,A101,...,A124,53,A143,A153,2,A173,2,A191,A201,2


In [207]:
for label in [
    "Status of existing checking account",
    "Credit history",
    "Purpose",
    "Savings account/bonds",
    "Present employment since",
    "Personal status and sex",
    "Other debtors / guarantors",
    "Property",
    "Other installment plans",
    "Housing",
    "Job",
    "Telephone",
    "foreign worker",
]:
    inx = attribute_names.index(label)
    df[label] = df[label].transform(lambda x: x.split(f"A{inx+1}")[1])
df.head()

Unnamed: 0,Status of existing checking account,Duration in month,Credit history,Purpose,Credit amount,Savings account/bonds,Present employment since,Installment rate in percentage of disposable income,Personal status and sex,Other debtors / guarantors,...,Property,Age in years,Other installment plans,Housing,Number of existing credits at this bank,Job,Number of people being liable to provide maintenance for,Telephone,foreign worker,Credit status
0,1,6,4,3,1169,5,5,4,3,1,...,1,67,3,2,2,3,1,2,1,1
1,2,48,2,3,5951,1,3,2,2,1,...,1,22,3,2,1,3,1,1,1,2
2,4,12,4,6,2096,1,4,2,3,1,...,1,49,3,2,1,2,2,1,1,1
3,1,42,2,2,7882,1,4,2,3,3,...,2,45,3,3,1,3,2,1,1,1
4,1,24,3,0,4870,1,3,3,3,1,...,4,53,3,3,2,3,2,1,1,2


In [208]:
X_train, X_test, y_train, y_test = train_test_split(df.iloc[:, :-1], df.iloc[:, -1], test_size=0.2,random_state=0)

In [209]:
lda_model = LinearDiscriminantAnalysis()
lda_preds = lda_model.fit(X_train, y_train).predict(X_test)
lda_acc = accuracy_score(y_test, lda_preds)
lda_f1 = f1_score(y_test, lda_preds)
lda_confusion_matrix = confusion_matrix(y_test, lda_preds, labels=[1, 2])
print('LDA Accuracy: {}'.format(lda_acc))
print('LDA F1 score: {}'.format(lda_f1))
print('LDA Confusion matrix: \n{}'.format(lda_confusion_matrix))

LDA Accuracy: 0.745
LDA F1 score: 0.8210526315789473
LDA Confusion matrix: 
[[117  25]
 [ 26  32]]


In [210]:
qda_model = QuadraticDiscriminantAnalysis()
qda_preds = qda_model.fit(X_train, y_train).predict(X_test)
qda_acc = accuracy_score(y_test,qda_preds)
qda_f1 = f1_score(y_test, qda_preds)
qda_confusion_matrix = confusion_matrix(y_test, qda_preds, labels=[1, 2])
print('QDA Accuracy: {}'.format(qda_acc))
print('QDA F1 score: {}'.format(qda_f1))
print('QDA Confusion matrix: \n{}'.format(qda_confusion_matrix))

QDA Accuracy: 0.725
QDA F1 score: 0.795539033457249
QDA Confusion matrix: 
[[107  35]
 [ 20  38]]


In [211]:
logreg_model = LogisticRegression(max_iter=1000)
logreg_preds = logreg_model.fit(X_train, y_train).predict(X_test)
logreg_acc = accuracy_score(y_test, logreg_preds)
logreg_f1 = f1_score(y_test, logreg_preds)
logreg_confusion_matrix = confusion_matrix(y_test, logreg_preds, labels=[1, 2])
print('Logistic Regression Accuracy: {}'.format(logreg_acc))
print('Logistic Regression F1 score: {}'.format(logreg_f1))
print('Logistic Regression Confusion matrix: \n{}'.format(logreg_confusion_matrix))

Logistic Regression Accuracy: 0.735
Logistic Regression F1 score: 0.81786941580756
Logistic Regression Confusion matrix: 
[[119  23]
 [ 30  28]]


In [212]:
dtc_model = DecisionTreeClassifier()
dtc_preds = dtc_model.fit(X_train,y_train).predict(X_test)
dtc_acc = accuracy_score(y_test, dtc_preds)
dtc_f1 = f1_score(y_test, dtc_preds)
dtc_confusion_matrix = confusion_matrix(y_test, dtc_preds, labels=[1, 2])
print('Decision Tree Accuracy: {}'.format(dtc_acc))
print('Decision Tree F1 score: {}'.format(dtc_f1))
print('Decision Tree Confusion matrix: \n{}'.format(dtc_confusion_matrix))

Decision Tree Accuracy: 0.695
Decision Tree F1 score: 0.779783393501805
Decision Tree Confusion matrix: 
[[108  34]
 [ 27  31]]


In [213]:
sm = SMOTENC(sampling_strategy='minority', random_state=7,categorical_features=[0,2,3,5,6,8,9,11,13,14,16,18,19])

oversampled_X, oversampled_Y = sm.fit_resample(df.drop('Credit status', axis=1), df['Credit status'])
oversampled = pd.concat([ pd.DataFrame(oversampled_X), pd.DataFrame(oversampled_Y)], axis=1)
oversampled.columns = df.columns

In [214]:
oversampled.iloc[0:2000]

Unnamed: 0,Status of existing checking account,Duration in month,Credit history,Purpose,Credit amount,Savings account/bonds,Present employment since,Installment rate in percentage of disposable income,Personal status and sex,Other debtors / guarantors,...,Property,Age in years,Other installment plans,Housing,Number of existing credits at this bank,Job,Number of people being liable to provide maintenance for,Telephone,foreign worker,Credit status
0,1,6,4,3,1169,5,5,4,3,1,...,1,67,3,2,2,3,1,2,1,1
1,2,48,2,3,5951,1,3,2,2,1,...,1,22,3,2,1,3,1,1,1,2
2,4,12,4,6,2096,1,4,2,3,1,...,1,49,3,2,1,2,2,1,1,1
3,1,42,2,2,7882,1,4,2,3,3,...,2,45,3,3,1,3,2,1,1,1
4,1,24,3,0,4870,1,3,3,3,1,...,4,53,3,3,2,3,2,1,1,2
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1395,1,30,2,3,3091,1,2,2,3,1,...,2,26,3,2,1,3,1,1,1,2
1396,1,10,2,3,757,1,3,3,2,1,...,1,51,3,2,1,3,1,1,1,2
1397,1,13,2,3,786,1,2,4,2,1,...,2,49,3,2,1,2,1,1,1,2
1398,2,36,4,2,5202,1,3,2,2,1,...,2,40,3,2,1,4,1,2,1,2


In [215]:
X_train, X_test, y_train, y_test = train_test_split(oversampled.iloc[:, :-1], oversampled.iloc[:, -1], test_size=0.2,random_state=0)

In [216]:
lda_model = LinearDiscriminantAnalysis()
lda_preds = lda_model.fit(X_train, y_train).predict(X_test)
lda_acc = accuracy_score(y_test, lda_preds)
lda_f1 = f1_score(y_test, lda_preds)
lda_confusion_matrix = confusion_matrix(y_test, lda_preds, labels=[1, 2])
print('LDA Accuracy: {}'.format(lda_acc))
print('LDA F1 score: {}'.format(lda_f1))
print('LDA Confusion matrix: \n{}'.format(lda_confusion_matrix))

LDA Accuracy: 0.7714285714285715
LDA F1 score: 0.7681159420289855
LDA Confusion matrix: 
[[106  35]
 [ 29 110]]


In [217]:
qda_model = QuadraticDiscriminantAnalysis()
qda_preds = qda_model.fit(X_train, y_train).predict(X_test)
qda_acc = accuracy_score(y_test,qda_preds)
qda_f1 = f1_score(y_test, qda_preds)
qda_confusion_matrix = confusion_matrix(y_test, qda_preds, labels=[1, 2])
print('QDA Accuracy: {}'.format(qda_acc))
print('QDA F1 score: {}'.format(qda_f1))
print('QDA Confusion matrix: \n{}'.format(qda_confusion_matrix))

QDA Accuracy: 0.7285714285714285
QDA F1 score: 0.7007874015748031
QDA Confusion matrix: 
[[ 89  52]
 [ 24 115]]


In [218]:
logreg_model = LogisticRegression(max_iter=1000)
logreg_preds = logreg_model.fit(X_train, y_train).predict(X_test)
logreg_acc = accuracy_score(y_test, logreg_preds)
logreg_f1 = f1_score(y_test, logreg_preds)
logreg_confusion_matrix = confusion_matrix(y_test, logreg_preds, labels=[1, 2])
print('Logistic Regression Accuracy: {}'.format(logreg_acc))
print('Logistic Regression F1 score: {}'.format(logreg_f1))
print('Logistic Regression Confusion matrix: \n{}'.format(logreg_confusion_matrix))

Logistic Regression Accuracy: 0.7714285714285715
Logistic Regression F1 score: 0.7746478873239437
Logistic Regression Confusion matrix: 
[[110  31]
 [ 33 106]]


In [219]:
dtc_model = DecisionTreeClassifier()
dtc_preds = dtc_model.fit(X_train,y_train).predict(X_test)
dtc_acc = accuracy_score(y_test, dtc_preds)
dtc_f1 = f1_score(y_test, dtc_preds)
dtc_confusion_matrix = confusion_matrix(y_test, dtc_preds, labels=[1, 2])
print('Decision Tree Accuracy: {}'.format(dtc_acc))
print('Decision Tree F1 score: {}'.format(dtc_f1))
print('Decision Tree Confusion matrix: \n{}'.format(dtc_confusion_matrix))

Decision Tree Accuracy: 0.725
Decision Tree F1 score: 0.7259786476868327
Decision Tree Confusion matrix: 
[[102  39]
 [ 38 101]]
