In [1]:
from sklearn.datasets import load_iris 
dataset = load_iris() 
X = dataset.data 
y = dataset.target 

In [2]:
print(dataset.DESCR)

.. _iris_dataset:

Iris plants dataset
--------------------

**Data Set Characteristics:**

    :Number of Instances: 150 (50 in each of three classes)
    :Number of Attributes: 4 numeric, predictive attributes and the class
    :Attribute Information:
        - sepal length in cm
        - sepal width in cm
        - petal length in cm
        - petal width in cm
        - class:
                - Iris-Setosa
                - Iris-Versicolour
                - Iris-Virginica
                
    :Summary Statistics:

                    Min  Max   Mean    SD   Class Correlation
    sepal length:   4.3  7.9   5.84   0.83    0.7826
    sepal width:    2.0  4.4   3.05   0.43   -0.4194
    petal length:   1.0  6.9   3.76   1.76    0.9490  (high!)
    petal width:    0.1  2.5   1.20   0.76    0.9565  (high!)

    :Missing Attribute Values: None
    :Class Distribution: 33.3% for each of 3 classes.
    :Creator: R.A. Fisher
    :Donor: Michael Marshall (MARSHALL%PLU@io.arc.nasa.gov)
    :

In [3]:
X.shape

(150, 4)

In [4]:
X[:5]

array([[5.1, 3.5, 1.4, 0.2],
       [4.9, 3. , 1.4, 0.2],
       [4.7, 3.2, 1.3, 0.2],
       [4.6, 3.1, 1.5, 0.2],
       [5. , 3.6, 1.4, 0.2]])

In [5]:
X[-5:]

array([[6.7, 3. , 5.2, 2.3],
       [6.3, 2.5, 5. , 1.9],
       [6.5, 3. , 5.2, 2. ],
       [6.2, 3.4, 5.4, 2.3],
       [5.9, 3. , 5.1, 1.8]])

In [6]:
import numpy as np
# Compute the mean for each attribute计算列平均值
attribute_means = X.mean(axis=0)
X_d = np.array(X >= attribute_means, dtype='int')

In [7]:
attribute_means

array([5.84333333, 3.05733333, 3.758     , 1.19933333])

In [8]:
X_d.shape

(150, 4)

In [9]:
X_d[:5]

array([[0, 1, 0, 0],
       [0, 0, 0, 0],
       [0, 1, 0, 0],
       [0, 1, 0, 0],
       [0, 1, 0, 0]])

In [10]:
# 划分训练集和测试集
from sklearn.model_selection import train_test_split

# 设置随机数种子以便复现书里的内容
random_state = 14
# sklearn.model_selection.train_test_split \
# (*arrays, test_size=None, train_size=None, random_state=None, shuffle=True, stratify=None)
# train_size defaut None = 25% for testing
X_train, X_test, y_train, y_test = train_test_split(X_d, y, random_state=random_state)
print("训练集数据有 {} 条".format(y_train.shape))
print("测试集数据有 {} 条".format(y_test.shape))

训练集数据有 (112,) 条
测试集数据有 (38,) 条


In [11]:
from collections import defaultdict
from operator import itemgetter


def train(X, y_true, feature):
    """Computes the predictors and error for a given feature using the OneR algorithm
    
    Parameters
    ----------
    X: array [n_samples, n_features]
        The two dimensional array that holds the dataset. Each row is a sample, each column
        is a feature.
    
    y_true: array [n_samples,]
        The one dimensional array that holds the class values. Corresponds to X, such that
        y_true[i] is the class value for sample X[i].
    
    feature: int
        An integer corresponding to the index of the variable we wish to test.
        0 <= variable < n_features
        
    Returns
    -------
    predictors: dictionary of tuples: (value, prediction)
        For each item in the array, if the variable has a given value, make the given prediction.
    
    error: float
        The ratio of training data that this rule incorrectly predicts.
    """
    # 1.一些等下要用的变量（数据的形状如上）
    n_samples, n_features = X.shape
    assert 0 <= feature < n_features
    values = set(X[:,feature])
    predictors = dict()
    errors = []
    
    # 2.算法（对照上面的算法流程）
    # 已经给定特征 feature，作为函数参数传过来了
    for current_value in values: 
    # For 该特征对应的真值（即植物是哪一类）
    
        most_frequent_class, error = train_feature_value(X, y_true, feature, current_value) 
        # 预测值：基于该特征预测的次数最多的类，即在所有样本里该特征 10 次有 6 次预测了 A 类，那我们对所有样本都预测为 A 类
        
        predictors[current_value] = most_frequent_class
        errors.append(error)
        # 计算预测值与真值的误差
    
    total_error = sum(errors)
    # 对上面计算的误差求和
    # python里求和函数 sum([1, 2, 3]) == 1 + 2 + 3 == 6
    
    return predictors, total_error

# Compute what our predictors say each sample is based on its value
#y_predicted = np.array([predictors[sample[feature]] for sample in X])
    
#下面创建函数声明，参数分别是数据集、类别数组、选好的特征索引值、特征值
def train_feature_value(X, y_true, feature, value):
    # 预测值：基于该特征预测的次数最多的类，即在所有样本里该特征 10 次有 6 次预测了 A 类，那我们对所有样本都预测为 A 类
    # 我们需要一个字典型变量存每个变量预测正确的次数
    class_counts = defaultdict(int)
    # 对每个二元组（类别，真值）迭代计数
    for sample, y in zip(X, y_true):
        if sample[feature] == value:
            class_counts[y] += 1
    # 现在选被预测最多的类别，需要排序。（我们认为被预测最多的类别就是正确的）
    sorted_class_counts = sorted(class_counts.items(), key=itemgetter(1), reverse=True)
    most_frequent_class = sorted_class_counts[0][0]
    # 误差定义为分类“错误”的次数，这里“错误”指样本中没有分类为我们预测的值，即样本的真实类别不是“被预测最多的类别”
    n_samples = X.shape[1]
    error = sum([class_count for class_value, class_count in class_counts.items()
                 if class_value != most_frequent_class])
    return most_frequent_class, error

In [12]:
X[:,0]

array([5.1, 4.9, 4.7, 4.6, 5. , 5.4, 4.6, 5. , 4.4, 4.9, 5.4, 4.8, 4.8,
       4.3, 5.8, 5.7, 5.4, 5.1, 5.7, 5.1, 5.4, 5.1, 4.6, 5.1, 4.8, 5. ,
       5. , 5.2, 5.2, 4.7, 4.8, 5.4, 5.2, 5.5, 4.9, 5. , 5.5, 4.9, 4.4,
       5.1, 5. , 4.5, 4.4, 5. , 5.1, 4.8, 5.1, 4.6, 5.3, 5. , 7. , 6.4,
       6.9, 5.5, 6.5, 5.7, 6.3, 4.9, 6.6, 5.2, 5. , 5.9, 6. , 6.1, 5.6,
       6.7, 5.6, 5.8, 6.2, 5.6, 5.9, 6.1, 6.3, 6.1, 6.4, 6.6, 6.8, 6.7,
       6. , 5.7, 5.5, 5.5, 5.8, 6. , 5.4, 6. , 6.7, 6.3, 5.6, 5.5, 5.5,
       6.1, 5.8, 5. , 5.6, 5.7, 5.7, 6.2, 5.1, 5.7, 6.3, 5.8, 7.1, 6.3,
       6.5, 7.6, 4.9, 7.3, 6.7, 7.2, 6.5, 6.4, 6.8, 5.7, 5.8, 6.4, 6.5,
       7.7, 7.7, 6. , 6.9, 5.6, 7.7, 6.3, 6.7, 7.2, 6.2, 6.1, 6.4, 7.2,
       7.4, 7.9, 6.4, 6.3, 6.1, 7.7, 6.3, 6.4, 6. , 6.9, 6.7, 6.9, 5.8,
       6.8, 6.7, 6.7, 6.3, 6.5, 6.2, 5.9])

In [13]:
set(X_train[:,0])

{0, 1}

In [14]:
# For 给定的每个特征，计算所有预测值（这里 for 写到 list 里面是 python 的语法糖）
all_predictors = {variable: train(X_train, y_train, variable) for variable in range(X_train.shape[1])}
errors = {variable: error for variable, (mapping, error) in all_predictors.items()}
# 现在选择最佳模型并保存为 "model"
# 按误差排序
best_variable, best_error = sorted(errors.items(), key=itemgetter(1))[0]
print("最佳模型基于第 {0} 个变量，误差为 {1:.2f}".format(best_variable, best_error))

# 选最好的模型，也就是误差最小的模型
model = {'variable': best_variable,
         'predictor': all_predictors[best_variable][0]}
print(model)

最佳模型基于第 2 个变量，误差为 37.00
{'variable': 2, 'predictor': {0: 0, 1: 2}}


In [15]:
def predict(X_test, model):
    variable = model['variable']
    predictor = model['predictor']
    y_predicted = np.array([predictor[int(sample[variable])] for sample in X_test])
    return y_predicted

In [16]:
y_predicted = predict(X_test, model)
print(y_predicted)
accuracy = np.mean(y_predicted == y_test) * 100
print("在测试集上的准确率 {:.1f}%".format(accuracy))
from sklearn.metrics import classification_report
print(classification_report(y_test, y_predicted))

[0 0 0 2 2 2 0 2 0 2 2 0 2 2 0 2 0 2 2 2 0 0 0 2 0 2 0 2 2 0 0 0 2 0 2 0 2
 2]
在测试集上的准确率 65.8%
              precision    recall  f1-score   support

           0       0.94      1.00      0.97        17
           1       0.00      0.00      0.00        13
           2       0.40      1.00      0.57         8

    accuracy                           0.66        38
   macro avg       0.45      0.67      0.51        38
weighted avg       0.51      0.66      0.55        38



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [17]:
X.shape[0]

150

In [18]:
errors

{0: 41, 1: 58, 2: 37, 3: 37}

In [19]:
all_predictors

{0: ({0: 0, 1: 2}, 41),
 1: ({0: 1, 1: 0}, 58),
 2: ({0: 0, 1: 2}, 37),
 3: ({0: 0, 1: 2}, 37)}

In [20]:
train(X_train, y_train, 0)

({0: 0, 1: 2}, 41)

In [21]:
len(set(X[:,0]))

35

In [22]:
train_feature_value(X_train, y_train, 0, 0)

(0, 19)

In [23]:
train_feature_value(X_train, y_train, 0, 1)

(2, 22)

In [24]:
train_feature_value(X_train, y_train, 1, 0)

(1, 35)

In [68]:
X_train2, X_test2, y_train2, y_test2 = train_test_split(X, y, random_state=random_state)

In [26]:
# For 给定的每个特征，计算所有预测值（这里 for 写到 list 里面是 python 的语法糖）
all_predictors2 = {variable: train(X_train2, y_train2, variable) for variable in range(X_train2.shape[1])}
errors2 = {variable: error for variable, (mapping, error) in all_predictors2.items()}
# 现在选择最佳模型并保存为 "model"
# 按误差排序
best_variable2, best_error2 = sorted(errors2.items(), key=itemgetter(1))[0]
print("最佳模型基于第 {0} 个变量，误差为 {1:.2f}".format(best_variable2, best_error2))

# 选最好的模型，也就是误差最小的模型
model2 = {'variable': best_variable2,
         'predictor': all_predictors2[best_variable2][0]}
print(model2)

最佳模型基于第 3 个变量，误差为 5.00
{'variable': 3, 'predictor': {0.2: 0, 1.5: 1, 1.2: 1, 1.8: 2, 0.4: 0, 1.9: 2, 1.4: 1, 1.3: 1, 2.0: 2, 2.3: 2, 2.4: 2, 1.0: 1, 2.5: 2, 0.6: 0, 1.6: 1, 1.1: 1, 2.1: 2, 1.7: 1, 0.1: 0, 2.2: 2, 0.3: 0}}


In [27]:
len(model2['predictor'])

21

In [28]:
len(set(X[:,3]))

22

In [29]:
set(X_train2[:,3])

{0.1,
 0.2,
 0.3,
 0.4,
 0.6,
 1.0,
 1.1,
 1.2,
 1.3,
 1.4,
 1.5,
 1.6,
 1.7,
 1.8,
 1.9,
 2.0,
 2.1,
 2.2,
 2.3,
 2.4,
 2.5}

In [30]:
set(X_test2[:,3])

{0.2, 0.3, 0.4, 0.5, 1.0, 1.1, 1.2, 1.3, 1.5, 1.7, 1.8, 2.0, 2.1, 2.3}

In [31]:
test2modify = np.column_stack((X_test2, y_test2))[np.column_stack((X_test2, y_test2))[:,3]!=0.5]

In [32]:
X_test2 = test2modify[:,:4]

In [33]:
set(X_test2[:,3])

{0.2, 0.3, 0.4, 1.0, 1.1, 1.2, 1.3, 1.5, 1.7, 1.8, 2.0, 2.1, 2.3}

In [34]:
y_test2 = test2modify[:,4]

In [35]:
X_test2.shape

(37, 4)

In [36]:
def predict2(X_test2, model2):
    variable = model2['variable']
    predictor = model2['predictor']
    y_predicted = np.array([predictor[(sample[variable])] for sample in X_test2])
    return y_predicted

In [37]:
y_predicted2 = predict2(X_test2, model2)
print(y_predicted2)
accuracy2 = np.mean(y_predicted2 == y_test2) * 100
print("在测试集上的准确率 {:.2f}%".format(accuracy2))
from sklearn.metrics import classification_report
print(classification_report(y_test2, y_predicted2))

[0 0 0 1 2 1 0 1 1 1 0 2 2 0 1 0 2 2 1 0 0 0 1 0 2 0 1 1 0 0 1 1 0 1 0 2 1]
在测试集上的准确率 97.30%
              precision    recall  f1-score   support

         0.0       1.00      1.00      1.00        16
         1.0       0.93      1.00      0.96        13
         2.0       1.00      0.88      0.93         8

    accuracy                           0.97        37
   macro avg       0.98      0.96      0.97        37
weighted avg       0.97      0.97      0.97        37



In [38]:
print(type(errors2))

<class 'dict'>


In [41]:
errors

{0: 41, 1: 58, 2: 37, 3: 37}

In [43]:
all_predictors.items()

dict_items([(0, ({0: 0, 1: 2}, 41)), (1, ({0: 1, 1: 0}, 58)), (2, ({0: 0, 1: 2}, 37)), (3, ({0: 0, 1: 2}, 37))])

In [85]:
x = np.column_stack((X_test2, y_test2))

In [107]:
y = [x[i] for i in range(len(x[:,3])) if x[:,3][i] in X_train2[:,3]]

In [111]:
y = np.array(y)