In [83]:
import numpy as np
import urllib.request
import pandas as pd
from sklearn import preprocessing
from sklearn import metrics
from sklearn.linear_model import LogisticRegression
from sklearn.linear_model import Ridge
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import RandomizedSearchCV
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC
from scipy.stats import uniform as sp_rand

In [41]:
#代表性函数使用介绍
#(1)加载数据
url = "http://archive.ics.uci.edu/ml/machine-learning-databases/pima-indians-diabetes/pima-indians-diabetes.data"
#下载数据
#语法没问题，此网站无法访问
#raw_data = urllib.request.urlopen(url)
#直接读入鸢尾花的数据
dataset = pd.read_csv('iris.data', header=None)
dataset[4] = pd.Categorical(dataset[4]).codes
print(dataset)
x = dataset.values[:,:4]
y = dataset.values[:,4]
print(x)
print(y)
#数据归一化
normalized_X = preprocessing.normalize(x)
standardized_X = preprocessing.scale(x)
#特征选择
model = ExtraTreesClassifier()
model.fit(x,y)
print(model.feature_importances_)

       0    1    2    3  4
0    5.1  3.5  1.4  0.2  0
1    4.9  3.0  1.4  0.2  0
2    4.7  3.2  1.3  0.2  0
3    4.6  3.1  1.5  0.2  0
4    5.0  3.6  1.4  0.2  0
5    5.4  3.9  1.7  0.4  0
6    4.6  3.4  1.4  0.3  0
7    5.0  3.4  1.5  0.2  0
8    4.4  2.9  1.4  0.2  0
9    4.9  3.1  1.5  0.1  0
10   5.4  3.7  1.5  0.2  0
11   4.8  3.4  1.6  0.2  0
12   4.8  3.0  1.4  0.1  0
13   4.3  3.0  1.1  0.1  0
14   5.8  4.0  1.2  0.2  0
15   5.7  4.4  1.5  0.4  0
16   5.4  3.9  1.3  0.4  0
17   5.1  3.5  1.4  0.3  0
18   5.7  3.8  1.7  0.3  0
19   5.1  3.8  1.5  0.3  0
20   5.4  3.4  1.7  0.2  0
21   5.1  3.7  1.5  0.4  0
22   4.6  3.6  1.0  0.2  0
23   5.1  3.3  1.7  0.5  0
24   4.8  3.4  1.9  0.2  0
25   5.0  3.0  1.6  0.2  0
26   5.0  3.4  1.6  0.4  0
27   5.2  3.5  1.5  0.2  0
28   5.2  3.4  1.4  0.2  0
29   4.7  3.2  1.6  0.2  0
..   ...  ...  ...  ... ..
120  6.9  3.2  5.7  2.3  2
121  5.6  2.8  4.9  2.0  2
122  7.7  2.8  6.7  2.0  2
123  6.3  2.7  4.9  1.8  2
124  6.7  3.3  5.7  2.1  2
1



In [48]:
#机器学习算法的使用
#(1)逻辑回归
model = LogisticRegression()
model.fit(x,y)
print(model)
#进行预测
expected = y
predicted = model.predict(x)
print("分类", metrics.classification_report(expected,predicted))
print("结果", metrics.confusion_matrix(expected,predicted))

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='warn',
          n_jobs=None, penalty='l2', random_state=None, solver='warn',
          tol=0.0001, verbose=0, warm_start=False)
分类               precision    recall  f1-score   support

         0.0       1.00      1.00      1.00        50
         1.0       0.98      0.90      0.94        50
         2.0       0.91      0.98      0.94        50

   micro avg       0.96      0.96      0.96       150
   macro avg       0.96      0.96      0.96       150
weighted avg       0.96      0.96      0.96       150

结果 [[50  0  0]
 [ 0 45  5]
 [ 0  1 49]]




In [51]:
#(2)朴素贝叶斯
model = GaussianNB()
model.fit(x,y)
print(model)
expected = y
predicted = model.predict(x)
print("分类", metrics.classification_report(expected,predicted))
print("结果", metrics.confusion_matrix(expected,predicted))

GaussianNB(priors=None, var_smoothing=1e-09)
分类               precision    recall  f1-score   support

         0.0       1.00      1.00      1.00        50
         1.0       0.94      0.94      0.94        50
         2.0       0.94      0.94      0.94        50

   micro avg       0.96      0.96      0.96       150
   macro avg       0.96      0.96      0.96       150
weighted avg       0.96      0.96      0.96       150

结果 [[50  0  0]
 [ 0 47  3]
 [ 0  3 47]]


In [54]:
#(3)K邻近
model = KNeighborsClassifier()
model.fit(x,y)
print(model)
expected = y
predicted = model.predict(x)
print("分类", metrics.classification_report(expected,predicted))
print("结果", metrics.confusion_matrix(expected,predicted))

KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
           metric_params=None, n_jobs=None, n_neighbors=5, p=2,
           weights='uniform')
分类               precision    recall  f1-score   support

         0.0       1.00      1.00      1.00        50
         1.0       0.96      0.94      0.95        50
         2.0       0.94      0.96      0.95        50

   micro avg       0.97      0.97      0.97       150
   macro avg       0.97      0.97      0.97       150
weighted avg       0.97      0.97      0.97       150

结果 [[50  0  0]
 [ 0 47  3]
 [ 0  2 48]]


In [58]:
#(4)决策树
model = DecisionTreeClassifier()
model.fit(x,y)
print(model)
expected = y
predicted = model.predict(x)
print("分类", metrics.classification_report(expected,predicted))
print("结果", metrics.confusion_matrix(expected,predicted))

DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=None,
            max_features=None, max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, presort=False, random_state=None,
            splitter='best')
分类               precision    recall  f1-score   support

         0.0       1.00      1.00      1.00        50
         1.0       1.00      1.00      1.00        50
         2.0       1.00      1.00      1.00        50

   micro avg       1.00      1.00      1.00       150
   macro avg       1.00      1.00      1.00       150
weighted avg       1.00      1.00      1.00       150

结果 [[50  0  0]
 [ 0 50  0]
 [ 0  0 50]]


In [61]:
#(5)支持向量机
model = SVC()
model.fit(x,y)
print(model)
expected = y
predicted = model.predict(x)
print("分类", metrics.classification_report(expected,predicted))
print("结果", metrics.confusion_matrix(expected,predicted))

SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape='ovr', degree=3, gamma='auto_deprecated',
  kernel='rbf', max_iter=-1, probability=False, random_state=None,
  shrinking=True, tol=0.001, verbose=False)
分类               precision    recall  f1-score   support

         0.0       1.00      1.00      1.00        50
         1.0       1.00      0.96      0.98        50
         2.0       0.96      1.00      0.98        50

   micro avg       0.99      0.99      0.99       150
   macro avg       0.99      0.99      0.99       150
weighted avg       0.99      0.99      0.99       150

结果 [[50  0  0]
 [ 0 48  2]
 [ 0  0 50]]




In [91]:
#优化算法参数
aplhas = np.array([1,0.1,0.01,0.001,0.0001,0])
model = Ridge()
#GridSearchCV网络搜索交叉验证
grid = GridSearchCV(estimator=model,param_grid=dict(aplha=aplhas))
#grid.fit(x, y)
#print(grid)
#best_score_已经没有了，
# print(grid.best_score_)
print(grid.estimator.alpha)

1.0


In [89]:
#有时随机从给定区间中选择参数是很有效的方法，然后根据这些参数来评估算法的效果进而选择最佳的那个
model = Ridge()
param_grid = {'alpha': sp_rand()}
research = RandomizedSearchCV(estimator=model, param_distributions=param_grid, n_iter=100)
research.fit(x,y)
print(research)
print(research.best_score_)
print(research.best_estimator_.alpha)



RandomizedSearchCV(cv='warn', error_score='raise-deprecating',
          estimator=Ridge(alpha=1.0, copy_X=True, fit_intercept=True, max_iter=None,
   normalize=False, random_state=None, solver='auto', tol=0.001),
          fit_params=None, iid='warn', n_iter=100, n_jobs=None,
          param_distributions={'alpha': <scipy.stats._distn_infrastructure.rv_frozen object at 0x00000211047070F0>},
          pre_dispatch='2*n_jobs', random_state=None, refit=True,
          return_train_score='warn', scoring=None, verbose=0)
0.0
0.8179383210617166
