In [1]:
# 引入工具库
import warnings
warnings.filterwarnings("ignore")
%matplotlib inline
import matplotlib as mpl
import numpy as np
from matplotlib import pyplot as plt
mpl.rcParams['legend.numpoints'] = 1

## 粗略流程

In [3]:
# 基本建模流程
from sklearn.datasets import make_blobs
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split

# 构建数据集
X, y = make_blobs(random_state=0)
# 切分train和test data
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0)
# 初始化模型对象并拟合
logreg = LogisticRegression().fit(X_train, y_train)
# 模型评估
logreg.score(X_test, y_test)

0.88

## k折交叉验证

In [4]:
from sklearn.datasets import load_iris
iris = load_iris()
print(iris.target)

[0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 2 2 2 2 2 2 2 2 2 2 2
 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2
 2 2]


In [5]:
from sklearn.model_selection import cross_val_score
from sklearn.datasets import load_iris
from sklearn.linear_model import LogisticRegression

iris = load_iris()
logreg = LogisticRegression()

scores = cross_val_score(logreg, iris.data, iris.target)
print("cross-validation scores: ", scores)

cross-validation scores:  [0.96078431 0.92156863 0.95833333]


In [9]:
scores = cross_val_score(logreg, iris.data, iris.target, cv=5)
scores

array([1.        , 0.96666667, 0.93333333, 0.9       , 1.        ])

In [10]:
scores.mean()

0.9600000000000002

### 手动指定k折切分

In [11]:
from sklearn.model_selection import KFold
kfold = KFold(n_splits=5)
cross_val_score(logreg, iris.data, iris.target, cv=kfold)


array([1.        , 0.93333333, 0.43333333, 0.96666667, 0.43333333])

In [18]:
kfold = KFold(n_splits=3)
cross_val_score(logreg, iris.data, iris.target, cv=kfold)

array([0., 0., 0.])

In [19]:
kfold = KFold(n_splits=3, shuffle=True, random_state=0)
cross_val_score(logreg, iris.data, iris.target, cv=kfold)

array([0.9 , 0.96, 0.96])

### 留一法交叉验证

In [20]:
from sklearn.model_selection import LeaveOneOut
loo = LeaveOneOut()
scores = cross_val_score(logreg, iris.data, iris.target, cv=loo)
print("number of cv iterations: ", len(scores))
print("mean accuracy: ", scores.mean())

number of cv iterations:  150
mean accuracy:  0.9533333333333334


### 乱序切分

In [22]:
from sklearn.model_selection import ShuffleSplit
shuffle_split = ShuffleSplit(test_size=.5, train_size=.5, n_splits=10)
cross_val_score(logreg, iris.data, iris.target, cv=shuffle_split)

array([0.78666667, 0.96      , 0.92      , 0.85333333, 0.93333333,
       0.90666667, 0.97333333, 0.90666667, 0.90666667, 0.97333333])

### 分层抽样，保证每一折上的各类样本的比例一致

In [25]:
from sklearn.model_selection import StratifiedKFold
from data.tools.datasets import make_blobs
# 构建数据集
X, y = make_blobs(n_samples=12, random_state=0)
# 敲定一组label，做分层抽样交叉验证
labels = [0, 0, 0, 1, 1, 1, 1, 2, 2, 3, 3, 3]
cross_val_score(logreg, X, y, labels, cv=StratifiedKFold(n_splits=3))

ModuleNotFoundError: No module named 'data'

## 网格搜索

### 手动遍历超参数（训练集 + 验证集）

In [26]:
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC
# 训练集+测试集
X_trainval, X_test, y_trainval, y_test = train_test_split(iris.data, iris.target, random_state=0)
# 真正的训练集+验证集
X_train, X_valid, y_train, y_valid = train_test_split(X_trainval, y_trainval, random_state=1)

print("训练集数据量: %d，验证集数据量: %d，测试集数据量: %d" % (X_train.shape[0], X_valid.shape[0], X_test.shape[0]))
best_score = 0

for gamma in [0.001, 0.01, 0.1, 1, 10, 100]:
    for C in [0.001, 0.01, 0.1, 1, 10, 100]:
        svm = SVC(gamma=gamma, C=C)
        svm.fit(X_train, y_train)
        # 评估
        score = svm.score(X_valid, y_valid)
        # 保留最高得分
        if score > best_score:
            best_score = score
            best_parameters = {'C': C, 'gamma': gamma}

# 在测试数据上评估
svm = SVC(**best_parameters)
svm.fit(X_trainval, y_trainval)
test_score = svm.score(X_test, y_test)
print("验证集上最高得分: ", best_score)
print("最佳参数: ", best_parameters)
print("验证集选出最好的参数上测试集的得分为: ", test_score)

训练集数据量: 84，验证集数据量: 28，测试集数据量: 38
验证集上最高得分:  0.9642857142857143
最佳参数:  {'C': 10, 'gamma': 0.001}
验证集选出最好的参数上测试集的得分为:  0.9210526315789473


### GridSearchCV = grid_search(产出候选超参数) + cross_validation(评估方式)

In [27]:
param_grid = {'C': [0.001, 0.01, 0.1, 1, 10, 100],
              'gamma': [0.001, 0.01, 0.1, 1, 10, 100]}
param_grid

# 超参数1：5种取值
# 超参数2：3种取值
# 超参数3：6种取值
# 5-fold交叉验证，要建多少次模型用于评估？
# 5*3*6*5 + 1

# 有加速的方法吗？ （并行化、加资源...）

# depth:[3,5,7,10]
# min_child:[20,50,100]
# lr:[0.01,0.1,1,10]

# 7 50 0.1
# 周边搜索

# [10,20,10]

{'C': [0.001, 0.01, 0.1, 1, 10, 100], 'gamma': [0.001, 0.01, 0.1, 1, 10, 100]}

In [28]:
# param_grid是参数列表
# GridSearchCV是网格搜索交叉验证对象，fit之后可以对参数列表中的参数组进行拟合和交叉验证评估
from sklearn.model_selection import GridSearchCV
from sklearn.svm import SVC
grid_search = GridSearchCV(SVC(), param_grid, cv=5)

In [29]:
X_train, X_test, y_train, y_test = train_test_split(iris.data, iris.target, random_state=0)

import warnings
warnings.filterwarnings("ignore")

grid_search.fit(X_train, y_train)

GridSearchCV(cv=5, error_score='raise-deprecating',
       estimator=SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape='ovr', degree=3, gamma='auto_deprecated',
  kernel='rbf', max_iter=-1, probability=False, random_state=None,
  shrinking=True, tol=0.001, verbose=False),
       fit_params=None, iid='warn', n_jobs=None,
       param_grid={'C': [0.001, 0.01, 0.1, 1, 10, 100], 'gamma': [0.001, 0.01, 0.1, 1, 10, 100]},
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring=None, verbose=0)

In [30]:
# 最好的超参数 和 最高得分
print(grid_search.best_params_)
print(grid_search.best_score_)

{'C': 100, 'gamma': 0.01}
0.9732142857142857


In [31]:
grid_search.score(X_test, y_test)

0.9736842105263158

在有一些问题中，我们不能直接对数据进行随机切分，比如分类问题中，如果类别是不均衡的(非1:1)，我们不能直接随机切分，更多的情况下，我们会手动切分，并且保证每个fold中的样本比例一致。