In [26]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import matplotlib.colors as cls
from sklearn.model_selection import train_test_split
from sklearn.datasets import load_digits

# Load the handwritten-digits dataset and keep the feature matrix / label
# vector under the short names used by the rest of the notebook.
dataset = load_digits()
X = dataset.data
y = dataset.target

# np.bincount counts the occurrences of each value in an array of
# non-negative ints; position i of the result holds how many times the value i
# appears, so the counts run from digit 0 up to the largest label (9).
per_class_counts = np.bincount(y)

# zip() pairs the elements of its iterables position by position and stops at
# the shortest one, so each class name is printed next to its sample count.
for label, n_samples in zip(dataset.target_names, per_class_counts):
    print(label, n_samples)

0 178
1 182
2 177
3 183
4 181
5 182
6 181
7 179
8 174
9 180


In [27]:
# Turn the 10-class problem into an imbalanced binary one: digit 1 becomes the
# positive class (label 1) and every other digit becomes the negative class
# (label 0). The boolean mask is cast back to y's dtype so the result is a new
# integer array and y itself is left untouched.
y_binary_imbalanced = (y == 1).astype(y.dtype)

# Show the first 30 labels before and after the relabelling, side by side.
preview = slice(None, 30)
print('Orinal labels:    ', y[preview])
print('New binary labels:', y_binary_imbalanced[preview])

Orinal labels:     [0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9]
New binary labels: [0 1 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0]


In [28]:
# Class balance of the binary labels: index 0 is the negative-class count,
# index 1 the positive-class count.
binary_class_counts = np.bincount(y_binary_imbalanced)
binary_class_counts

array([1615,  182])

In [29]:
from sklearn.svm import SVC
from sklearn.model_selection import train_test_split
# Hold out a test set with a fixed seed so every model below is evaluated on
# the same split of the imbalanced binary labels.
X_train, X_test, y_train, y_test = train_test_split(
    X, y_binary_imbalanced, random_state=0
)

# Fit an RBF-kernel SVM and report its plain accuracy on the held-out set.
svm = SVC(kernel='rbf', C=1)
svm.fit(X_train, y_train)
svm.score(X_test, y_test)



0.9088888888888889

## Dummy Classifiers

In [30]:
from sklearn.dummy import DummyClassifier
# Baseline: with strategy='most_frequent' the dummy classifier ignores the
# features and always predicts the most frequent class seen in y_train.
dummy_majority = DummyClassifier(strategy='most_frequent').fit(X_train, y_train)
y_dummy_predictions = dummy_majority.predict(X_test)

# Accuracy of the constant majority-class prediction on the test set. Only the
# last expression of a cell is displayed, so this score is the cell's output.
dummy_majority.score(X_test, y_test)

0.9044444444444445

In [31]:
# Same train/test split as before, this time with a linear kernel; the
# displayed value is the accuracy on the held-out set.
svm = SVC(kernel='linear', C=1)
svm.fit(X_train, y_train)
svm.score(X_test, y_test)

0.9777777777777777

## Confusion matrix

### Binary confusion matrix

In [32]:
from sklearn.metrics import confusion_matrix
# Majority-class baseline: fit, predict on the test set, then tabulate the
# predictions against the true labels.
dummy_majority = DummyClassifier(strategy='most_frequent')
dummy_majority.fit(X_train, y_train)
y_dummy_predictions = dummy_majority.predict(X_test)

# Rows of the confusion matrix are the true classes, columns the predicted
# classes (negative class first).
confusion = confusion_matrix(y_true=y_test, y_pred=y_dummy_predictions)
print('most frequent class (dummy classifier)\n', confusion)

most frequent class (dummy classifier)
 [[407   0]
 [ 43   0]]


In [33]:
# Random baseline: strategy='stratified' draws each prediction at random,
# following the class proportions observed in y_train. The draw is seeded with
# random_state so the confusion matrix is the same on every re-run; without a
# seed the counts change each time the cell is executed.
dummy_classprop = DummyClassifier(strategy='stratified', random_state=0).fit(X_train, y_train)
y_classprop_predicted = dummy_classprop.predict(X_test)

# Rows are the true classes, columns the predicted classes.
confusion = confusion_matrix(y_test, y_classprop_predicted)
print('random class-proportional prediction (dummy classifier)\n', confusion)

random class-proportional prediction (dummy classifier)
 [[372  35]
 [ 38   5]]


In [34]:
# Linear-kernel SVM: fit on the training split, predict the test split and
# tabulate true (rows) versus predicted (columns) classes.
svm = SVC(kernel='linear', C=1)
svm.fit(X_train, y_train)
svm_predicted = svm.predict(X_test)

confusion = confusion_matrix(y_true=y_test, y_pred=svm_predicted)
print('SVM classifier (linear kernal, C=1)\n', confusion)

SVM classifier (linear kernal, C=1)
 [[402   5]
 [  5  38]]


In [35]:
from sklearn.linear_model import LogisticRegression
# Logistic regression with the library's default hyper-parameters, evaluated
# on the same test split as the other classifiers.
lr = LogisticRegression()
lr.fit(X_train, y_train)
lr_predicted = lr.predict(X_test)

# Rows are the true classes, columns the predicted classes.
confusion = confusion_matrix(y_true=y_test, y_pred=lr_predicted)
print('logistic regression (defalt settings)\n', confusion)

logistic regression (defalt settings)
 [[401   6]
 [  6  37]]




In [36]:
from sklearn.tree import DecisionTreeClassifier
# Shallow decision tree (at most two levels of splits). random_state is fixed
# because the tree learner permutes the features at random when searching for
# a split, so equally good splits could otherwise be resolved differently from
# one run to the next.
dt = DecisionTreeClassifier(max_depth=2, random_state=0).fit(X_train, y_train)
tree_predicted = dt.predict(X_test)

# Rows are the true classes, columns the predicted classes.
confusion = confusion_matrix(y_test, tree_predicted)
print('Decision Tree Classifier(max_depth=2)\n', confusion)

Decision Tree Classifier(max_depth=2)
 [[400   7]
 [ 17  26]]
