In [1]:
import scipy
import scipy.stats
from sklearn.datasets import load_digits
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV
from sklearn.model_selection import train_test_split

## 1 网格搜索
穷举（暴力）搜索：遍历参数网格中的所有组合，适用于取值离散的超参数。

In [2]:
def test_GridSearchCV():
    '''
    Tune a LogisticRegression classifier on the digits dataset with an
    exhaustive grid search over C, penalty, solver and multi_class.

    Prints the best parameter combination, the mean cross-validated score
    (+/- two standard deviations) of every candidate in the grid, the score
    of the search object on the held-out test split, and a per-class
    classification report. Takes no arguments and returns None.
    '''
    # Load the data and hold out a stratified 25% test split.
    digits = load_digits()
    X_train, X_test, y_train, y_test = train_test_split(digits.data,
                                                        digits.target,
                                                        test_size=0.25,
                                                        random_state=0,
                                                        stratify=digits.target)

    # Parameter grid. Two separate sub-grids are needed because lbfgs
    # supports only l2 penalties and liblinear does not support a
    # multinomial backend, so the full cross product would contain
    # invalid combinations.
    # NOTE(review): `multi_class` is deprecated in recent scikit-learn
    # releases (1.5+) -- confirm against the installed version.
    c_values = [0.01, 0.1, 1, 10, 100]
    tuned_parameters = [{'penalty': ['l1', 'l2'],
                         'C': c_values,
                         'solver': ['liblinear'],
                         'multi_class': ['ovr']
                         },
                        {'penalty': ['l2'],
                         'C': c_values,
                         'solver': ['lbfgs'],
                         'multi_class': ['ovr', 'multinomial']
                         }
                        ]
    clf = GridSearchCV(LogisticRegression(tol=1e-6), tuned_parameters, cv=10)
    clf.fit(X_train, y_train)

    print("最优的参数时:", clf.best_params_)
    print("参数网格选择过程:")
    # `grid_scores_` was removed in scikit-learn 0.20; `cv_results_` exposes
    # the same per-candidate mean / std of the cross-validation scores.
    results = clf.cv_results_
    for mean_score, std_score, params in zip(results['mean_test_score'],
                                             results['std_test_score'],
                                             results['params']):
        print("%0.3f (+/-%0.03f) for %s" % (mean_score,
                                              std_score * 2,
                                              params))

    print("\n最优的结果:", clf.score(X_test, y_test))
    y_true, y_pred = y_test, clf.predict(X_test)
    print("\n详细的分类报告:")
    print(classification_report(y_true, y_pred))

test_GridSearchCV()

最优的参数时: {'C': 0.01, 'multi_class': 'multinomial', 'penalty': 'l2', 'solver': 'lbfgs'}
参数网格选择过程:
0.930 (+/-0.031) for {'C': 0.01, 'multi_class': 'ovr', 'penalty': 'l1', 'solver': 'liblinear'}
0.965 (+/-0.022) for {'C': 0.01, 'multi_class': 'ovr', 'penalty': 'l2', 'solver': 'liblinear'}
0.965 (+/-0.024) for {'C': 0.1, 'multi_class': 'ovr', 'penalty': 'l1', 'solver': 'liblinear'}
0.964 (+/-0.023) for {'C': 0.1, 'multi_class': 'ovr', 'penalty': 'l2', 'solver': 'liblinear'}
0.951 (+/-0.025) for {'C': 1, 'multi_class': 'ovr', 'penalty': 'l1', 'solver': 'liblinear'}
0.953 (+/-0.028) for {'C': 1, 'multi_class': 'ovr', 'penalty': 'l2', 'solver': 'liblinear'}
0.950 (+/-0.020) for {'C': 10, 'multi_class': 'ovr', 'penalty': 'l1', 'solver': 'liblinear'}
0.951 (+/-0.023) for {'C': 10, 'multi_class': 'ovr', 'penalty': 'l2', 'solver': 'liblinear'}
0.944 (+/-0.018) for {'C': 100, 'multi_class': 'ovr', 'penalty': 'l1', 'solver': 'liblinear'}
0.941 (+/-0.028) for {'C': 100, 'multi_class': 'ovr', 'penalty



## 2 随机搜索

In [5]:
def test_RandomizedSearchCV():
    '''
    Tune a LogisticRegression classifier on the digits dataset with a
    randomized search over C and multi_class.

    C is sampled from an exponential distribution (scale=100); multi_class
    is drawn from a fixed list. Prints the best parameter set, the mean
    cross-validated accuracy (+/- two standard deviations) of each of the
    10 sampled candidates, the score of the search object on the held-out
    test split, and a per-class classification report. Takes no arguments
    and returns None.
    '''
    # Load the data and hold out a stratified 25% test split.
    digits = load_digits()
    X_train, X_test, y_train, y_test = train_test_split(digits.data,
                                                        digits.target,
                                                        test_size=0.25,
                                                        random_state=0,
                                                        stratify=digits.target)

    # Search space: a continuous distribution for C, a discrete list for
    # multi_class.
    # NOTE(review): `multi_class` is deprecated in recent scikit-learn
    # releases (1.5+) -- confirm against the installed version.
    tuned_parameters = {'C': scipy.stats.expon(scale=100),  # exponential distribution
                        'multi_class': ['ovr', 'multinomial']}

    clf = RandomizedSearchCV(LogisticRegression(penalty='l2', solver='lbfgs',
                                                tol=1e-6),
                             tuned_parameters,
                             cv=10,
                             scoring="accuracy",
                             n_iter=10,
                             # Fixed seed so the sampled candidates (and the
                             # reported best parameters) are reproducible.
                             random_state=0)
    clf.fit(X_train, y_train)
    print("最好的参数集是:", clf.best_params_)
    print("\n随机搜索得分:")
    # `grid_scores_` was removed in scikit-learn 0.20; `cv_results_` exposes
    # the same per-candidate mean / std of the cross-validation scores.
    results = clf.cv_results_
    for mean_score, std_score, params in zip(results['mean_test_score'],
                                             results['std_test_score'],
                                             results['params']):
        print("\t%.3f(+/-%.3f)for %s" % (mean_score, std_score * 2, params))

    print("最优得分:", clf.score(X_test, y_test))

    y_true, y_pred = y_test, clf.predict(X_test)
    print("\n详细的报告如下:")
    print(classification_report(y_true, y_pred))

test_RandomizedSearchCV()

最好的参数集是: {'C': 91.94221825304501, 'multi_class': 'multinomial'}

随机搜索得分:
	0.959(+/-0.028)for {'C': 91.94221825304501, 'multi_class': 'multinomial'}
	0.958(+/-0.027)for {'C': 22.04092403303305, 'multi_class': 'multinomial'}
	0.948(+/-0.026)for {'C': 22.640872194884558, 'multi_class': 'ovr'}
	0.940(+/-0.028)for {'C': 83.6915216023639, 'multi_class': 'ovr'}
	0.936(+/-0.031)for {'C': 146.85132991341237, 'multi_class': 'ovr'}
	0.959(+/-0.028)for {'C': 2.917238704497741, 'multi_class': 'multinomial'}
	0.937(+/-0.029)for {'C': 186.57941258547095, 'multi_class': 'ovr'}
	0.957(+/-0.024)for {'C': 187.84857609167884, 'multi_class': 'multinomial'}
	0.941(+/-0.027)for {'C': 50.021436034070945, 'multi_class': 'ovr'}
	0.938(+/-0.030)for {'C': 102.94859913914553, 'multi_class': 'ovr'}
最优得分: 0.9644444444444444

详细的报告如下:
             precision    recall  f1-score   support

          0       1.00      1.00      1.00        45
          1       0.88      0.98      0.93        46
          2       1.00   



In [4]:
# Display the frozen exponential distribution object (scale=100) -- the same
# expression used as the sampling distribution for C in the randomized search.
scipy.stats.expon(scale=100)

<scipy.stats._distn_infrastructure.rv_frozen at 0x1375f74a358>