In [110]:
import numpy as np

from sklearn.metrics import f1_score
from sklearn.metrics import accuracy_score
from sklearn.metrics import recall_score
from sklearn.metrics import precision_score
from sklearn.metrics.cluster import normalized_mutual_info_score

from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split

from sklearn.svm import SVC
from sklearn import tree
from sklearn.cluster import KMeans
from sklearn.neural_network import MLPClassifier

In [22]:
with open('german.data-numeric') as f:
    lines = f.readlines()
    
data = [line.split() for line in lines]
data = np.array(data).astype('int32')

array([[ 1,  6,  4, ...,  0,  1,  1],
       [ 2, 48,  2, ...,  0,  1,  2],
       [ 4, 12,  4, ...,  1,  0,  1],
       ...,
       [ 4, 12,  2, ...,  0,  1,  1],
       [ 1, 45,  2, ...,  0,  1,  2],
       [ 2, 45,  4, ...,  0,  1,  1]])

In [37]:
X = data[:,:-1]
y = data[:,-1]

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3)

print(X.shape, y.shape)

(1000, 24) (1000,)


In [26]:
#these are the labels
np.unique(y)

array([1, 2])

In [50]:
# apply SVM
clf = make_pipeline(StandardScaler(), SVC())
clf.fit(X_train, y_train)

print(' train accuracy: {}\n test accuracy: {}\n recall: {}\n precision: {} \n f score: {}'.format(
accuracy_score(y_train, clf.predict(X_train)),
accuracy_score(y_test, clf.predict(X_test)),
recall_score(y_test, clf.predict(X_test)),
precision_score(y_test, clf.predict(X_test)),
f1_score(y_test, clf.predict(X_test))
))

 train accuracy: 0.8642857142857143
 test accuracy: 0.7533333333333333
 recall: 0.880184331797235
 precision: 0.799163179916318 
 f score: 0.8377192982456141


In [64]:
#Decision Tree
clf = tree.DecisionTreeClassifier(max_depth = 2)
clf.fit(X_train, y_train)

print(' train accuracy: {}\n test accuracy: {}\n recall: {}\n precision: {} \n f score: {}'.format(
accuracy_score(y_train, clf.predict(X_train)),
accuracy_score(y_test, clf.predict(X_test)),
recall_score(y_test, clf.predict(X_test)),
precision_score(y_test, clf.predict(X_test)),
f1_score(y_test, clf.predict(X_test))
))

 train accuracy: 0.7228571428571429
 test accuracy: 0.7333333333333333
 recall: 0.9631336405529954
 precision: 0.7437722419928826 
 f score: 0.8393574297188756


In [107]:
# k-means
for i in range(2,11):
    print('##### number of clusters: ',i," ########")
    kmeans = KMeans(n_clusters=i).fit(X)
    for k in range(i):
        print('cluster: ', k+1)
        b = y[np.where(kmeans.labels_ == k)[0]]
        for j in [1,2]:
            count = np.count_nonzero(b==j)
            print(j,": ", count)

##### number of clusters:  2  ########
cluster:  1
1 :  103
2 :  77
cluster:  2
1 :  597
2 :  223
##### number of clusters:  3  ########
cluster:  1
1 :  529
2 :  183
cluster:  2
1 :  143
2 :  86
cluster:  3
1 :  28
2 :  31
##### number of clusters:  4  ########
cluster:  1
1 :  209
2 :  92
cluster:  2
1 :  393
2 :  134
cluster:  3
1 :  17
2 :  25
cluster:  4
1 :  81
2 :  49
##### number of clusters:  5  ########
cluster:  1
1 :  280
2 :  116
cluster:  2
1 :  83
2 :  49
cluster:  3
1 :  185
2 :  81
cluster:  4
1 :  17
2 :  25
cluster:  5
1 :  135
2 :  29
##### number of clusters:  6  ########
cluster:  1
1 :  142
2 :  28
cluster:  2
1 :  43
2 :  38
cluster:  3
1 :  276
2 :  115
cluster:  4
1 :  12
2 :  22
cluster:  5
1 :  62
2 :  24
cluster:  6
1 :  165
2 :  73
##### number of clusters:  7  ########
cluster:  1
1 :  137
2 :  27
cluster:  2
1 :  40
2 :  37
cluster:  3
1 :  250
2 :  106
cluster:  4
1 :  13
2 :  22
cluster:  5
1 :  61
2 :  22
cluster:  6
1 :  54
2 :  39
cluster:  7
1 :  1

In [108]:
# the k-means does not work here

In [155]:
# Neural network
clf = MLPClassifier(hidden_layer_sizes = (10)).fit(X_train, y_train)
print(' train accuracy: {}\n test accuracy: {}\n recall: {}\n precision: {} \n f score: {}'.format(
accuracy_score(y_train, clf.predict(X_train)),
accuracy_score(y_test, clf.predict(X_test)),
recall_score(y_test, clf.predict(X_test)),
precision_score(y_test, clf.predict(X_test)),
f1_score(y_test, clf.predict(X_test))
))

 train accuracy: 0.7571428571428571
 test accuracy: 0.7733333333333333
 recall: 0.8617511520737328
 precision: 0.8311111111111111 
 f score: 0.8461538461538461




In [112]:
# Neural network
clf = MLPClassifier().fit(X_train, y_train)
print(' train accuracy: {}\n test accuracy: {}\n recall: {}\n precision: {} \n f score: {}'.format(
accuracy_score(y_train, clf.predict(X_train)),
accuracy_score(y_test, clf.predict(X_test)),
recall_score(y_test, clf.predict(X_test)),
precision_score(y_test, clf.predict(X_test)),
f1_score(y_test, clf.predict(X_test))
))

 train accuracy: 0.8285714285714286
 test accuracy: 0.78
 recall: 0.8847926267281107
 precision: 0.8240343347639485 
 f score: 0.8533333333333334




In [166]:
# Neural network
clf = MLPClassifier(hidden_layer_sizes = (64,64), tol = 1e-100, n_iter_no_change = 200).fit(X_train, y_train)
print(' train accuracy: {}\n test accuracy: {}\n recall: {}\n precision: {} \n f score: {}'.format(
accuracy_score(y_train, clf.predict(X_train)),
accuracy_score(y_test, clf.predict(X_test)),
recall_score(y_test, clf.predict(X_test)),
precision_score(y_test, clf.predict(X_test)),
f1_score(y_test, clf.predict(X_test))
))

 train accuracy: 0.8628571428571429
 test accuracy: 0.7533333333333333
 recall: 0.8755760368663594
 precision: 0.8016877637130801 
 f score: 0.8370044052863436




In [134]:
# Neural network
clf = MLPClassifier(hidden_layer_sizes = (128), tol = 1e-10).fit(X_train, y_train)
print(' train accuracy: {}\n test accuracy: {}\n recall: {}\n precision: {} \n f score: {}'.format(
accuracy_score(y_train, clf.predict(X_train)),
accuracy_score(y_test, clf.predict(X_test)),
recall_score(y_test, clf.predict(X_test)),
precision_score(y_test, clf.predict(X_test)),
f1_score(y_test, clf.predict(X_test))
))

 train accuracy: 0.8328571428571429
 test accuracy: 0.7966666666666666
 recall: 0.8571428571428571
 precision: 0.8611111111111112 
 f score: 0.859122401847575




In [157]:
# Neural network
clf = MLPClassifier(hidden_layer_sizes = (64,128,128,64,32), tol = 1e-100,n_iter_no_change = 60).fit(X_train, y_train)
print(' train accuracy: {}\n test accuracy: {}\n recall: {}\n precision: {} \n f score: {}'.format(
accuracy_score(y_train, clf.predict(X_train)),
accuracy_score(y_test, clf.predict(X_test)),
recall_score(y_test, clf.predict(X_test)),
precision_score(y_test, clf.predict(X_test)),
f1_score(y_test, clf.predict(X_test))
))

 train accuracy: 0.9428571428571428
 test accuracy: 0.7833333333333333
 recall: 0.8525345622119815
 precision: 0.8486238532110092 
 f score: 0.8505747126436782


