---
title: Sklearn基本语法
date: 2019-08-28
categories: [基本语法, Sklearn]
mathjax: false
---

## 简单范例

In [59]:
import warnings
warnings.filterwarnings('ignore')  # silence every warning for the rest of the session
from sklearn import neighbors, datasets, preprocessing
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

iris = datasets.load_iris()
X = iris.data[:, :2]  # keep only the first two feature columns
y = iris.target

X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=33)

scaler = preprocessing.StandardScaler()
X_train = scaler.fit_transform(X_train)  # fit on the training split only, then scale it
X_test = scaler.transform(X_test)  # reuse the statistics learned from the training split

knn = neighbors.KNeighborsClassifier(n_neighbors=5).fit(X_train, y_train)
y_pred = knn.predict(X_test)

accuracy_score(y_test, y_pred)

0.631578947368421

## 数据分割

In [5]:
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=33)  # fixed random_state makes the split reproducible; rebinds all four split variables

## 数据处理

### Standardization

In [12]:
from sklearn.preprocessing import StandardScaler

scaler = StandardScaler().fit(X_train)  # learn the scaling statistics from the training split only
scaler.transform(X_train)  # result is displayed, not stored: X_train itself is left unchanged

array([[-0.91090798, -1.59775374],
       [-1.0271058 ,  0.08448757],
       [ 0.59966379, -1.59775374],
       [ 0.01867465, -0.96691325],
       [ 0.48346596, -0.33607276],
       [-1.25950146,  0.29476773],
       [-1.37569929,  0.71532806],
       [-0.79471015, -1.17719341],
       [-1.14330363,  0.71532806],
       [ 2.45882905,  1.55644871],
       [-0.79471015,  0.71532806],
       [-0.79471015,  1.34616854],
       [-0.21372101, -0.33607276],
       [ 0.83205945, -0.1257926 ],
       [-0.44611666,  1.76672887],
       [ 1.41304859,  0.29476773],
       [ 0.01867465, -0.54635292],
       [ 2.22643339, -0.96691325],
       [-0.32991883, -1.17719341],
       [ 0.13487248,  0.29476773],
       [-1.0271058 ,  1.13588838],
       [-1.49189712, -1.59775374],
       [ 0.59966379, -0.54635292],
       [-1.60809495, -0.33607276],
       [-0.91090798,  1.13588838],
       [ 1.64544425, -0.1257926 ],
       [ 0.25107031,  0.71532806],
       [ 0.48346596, -1.8080339 ],
       [ 1.8778399 ,

### Normalization

In [11]:
from sklearn.preprocessing import Normalizer

scaler = Normalizer().fit(X_train)  # rebinds `scaler` to the Normalizer instance
scaler.transform(X_train)  # each row of the displayed result has unit Euclidean length

array([[0.90849045, 0.41790561],
       [0.84507884, 0.53464171],
       [0.93935732, 0.34293997],
       [0.91250932, 0.4090559 ],
       [0.90580954, 0.42368511],
       [0.82659925, 0.56279098],
       [0.80417614, 0.59439106],
       [0.89792072, 0.44015722],
       [0.81602448, 0.57801734],
       [0.90116674, 0.43347261],
       [0.83205029, 0.5547002 ],
       [0.80942185, 0.58722762],
       [0.88799441, 0.45985425],
       [0.90795938, 0.41905818],
       [0.81067923, 0.58549055],
       [0.90947448, 0.41575976],
       [0.90055164, 0.43474907],
       [0.94744567, 0.31991672],
       [0.91036648, 0.41380294],
       [0.87903186, 0.47676304],
       [0.80588181, 0.59207643],
       [0.89043468, 0.45511106],
       [0.91381155, 0.40613847],
       [0.8349582 , 0.55031336],
       [0.81153434, 0.58430473],
       [0.92307692, 0.38461538],
       [0.87002219, 0.49301257],
       [0.94242775, 0.33440985],
       [0.93528626, 0.3538921 ],
       [0.9149178 , 0.40364021],
       [0.

### Binarization

In [13]:
from sklearn.preprocessing import Binarizer

binarizer = Binarizer(threshold=0.0).fit(X)  # applied to the full feature matrix X, not the training split
binarizer.transform(X)  # every entry shown is 1., i.e. all visible values of X exceed the 0.0 threshold

array([[1., 1.],
       [1., 1.],
       [1., 1.],
       [1., 1.],
       [1., 1.],
       [1., 1.],
       [1., 1.],
       [1., 1.],
       [1., 1.],
       [1., 1.],
       [1., 1.],
       [1., 1.],
       [1., 1.],
       [1., 1.],
       [1., 1.],
       [1., 1.],
       [1., 1.],
       [1., 1.],
       [1., 1.],
       [1., 1.],
       [1., 1.],
       [1., 1.],
       [1., 1.],
       [1., 1.],
       [1., 1.],
       [1., 1.],
       [1., 1.],
       [1., 1.],
       [1., 1.],
       [1., 1.],
       [1., 1.],
       [1., 1.],
       [1., 1.],
       [1., 1.],
       [1., 1.],
       [1., 1.],
       [1., 1.],
       [1., 1.],
       [1., 1.],
       [1., 1.],
       [1., 1.],
       [1., 1.],
       [1., 1.],
       [1., 1.],
       [1., 1.],
       [1., 1.],
       [1., 1.],
       [1., 1.],
       [1., 1.],
       [1., 1.],
       [1., 1.],
       [1., 1.],
       [1., 1.],
       [1., 1.],
       [1., 1.],
       [1., 1.],
       [1., 1.],
       [1., 1.],
       [1., 1.

### Encoding Categorical Features

In [15]:
from sklearn.preprocessing import LabelEncoder

enc = LabelEncoder()
enc.fit_transform(y)  # the displayed result contains only the integer codes 0, 1 and 2

array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
       2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
       2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2])

### Imputing Missing Values

In [62]:
from sklearn.impute import SimpleImputer  # replaces preprocessing.Imputer (deprecated in 0.20, removed in 0.22)

imp = SimpleImputer(missing_values=0, strategy='mean')  # treats 0 as missing; always imputes per column, i.e. the old axis=0
imp.fit_transform(X_train)  # result is displayed, not stored

array([[-0.91090798, -1.59775374],
       [-1.0271058 ,  0.08448757],
       [ 0.59966379, -1.59775374],
       [ 0.01867465, -0.96691325],
       [ 0.48346596, -0.33607276],
       [-1.25950146,  0.29476773],
       [-1.37569929,  0.71532806],
       [-0.79471015, -1.17719341],
       [-1.14330363,  0.71532806],
       [ 2.45882905,  1.55644871],
       [-0.79471015,  0.71532806],
       [-0.79471015,  1.34616854],
       [-0.21372101, -0.33607276],
       [ 0.83205945, -0.1257926 ],
       [-0.44611666,  1.76672887],
       [ 1.41304859,  0.29476773],
       [ 0.01867465, -0.54635292],
       [ 2.22643339, -0.96691325],
       [-0.32991883, -1.17719341],
       [ 0.13487248,  0.29476773],
       [-1.0271058 ,  1.13588838],
       [-1.49189712, -1.59775374],
       [ 0.59966379, -0.54635292],
       [-1.60809495, -0.33607276],
       [-0.91090798,  1.13588838],
       [ 1.64544425, -0.1257926 ],
       [ 0.25107031,  0.71532806],
       [ 0.48346596, -1.8080339 ],
       [ 1.8778399 ,

### Generating Polynomial Features

In [21]:
from sklearn.preprocessing import PolynomialFeatures

poly = PolynomialFeatures(5)  # 5 is passed positionally as the polynomial degree
poly.fit_transform(X_train)  # displayed rows start with a constant 1 followed by the two original feature values

array([[1.00000000e+00, 5.00000000e+00, 2.30000000e+00, ...,
        3.04175000e+02, 1.39920500e+02, 6.43634300e+01],
       [1.00000000e+00, 4.90000000e+00, 3.10000000e+00, ...,
        7.15281910e+02, 4.52525290e+02, 2.86291510e+02],
       [1.00000000e+00, 6.30000000e+00, 2.30000000e+00, ...,
        4.82908230e+02, 1.76299830e+02, 6.43634300e+01],
       ...,
       [1.00000000e+00, 5.60000000e+00, 3.00000000e+00, ...,
        8.46720000e+02, 4.53600000e+02, 2.43000000e+02],
       [1.00000000e+00, 7.70000000e+00, 3.00000000e+00, ...,
        1.60083000e+03, 6.23700000e+02, 2.43000000e+02],
       [1.00000000e+00, 5.40000000e+00, 3.40000000e+00, ...,
        1.14610464e+03, 7.21621440e+02, 4.54354240e+02]])

## 模型

### 监督学习

#### Linear Regression

In [31]:
from sklearn.linear_model import LinearRegression

lr = LinearRegression()  # `normalize` dropped: deprecated in 1.0 and removed in 1.2; plain least-squares predictions do not depend on it
lr.fit(X, y)  # trained on the full X, y rather than the training split
y_pred = lr.predict(X_test)  # rebinds the shared name y_pred
y_pred

array([ 0.99503595,  1.60221544,  0.06895097,  1.6466104 ,  1.48027593,
        1.18075657,  0.01486294,  0.17150435,  1.38179294,  1.68538268,
        1.61190851, -0.02390934,  1.11697547,  1.28893263,  1.74916378,
        1.50935514,  1.15167736,  0.24497852,  1.18075657,  1.90580519,
        0.27968041,  0.42100606,  1.51904821,  0.26998734,  1.66192615,
        1.4261879 ,  1.22515153,  1.35271373,  1.51904821,  1.07820319,
        1.04912398,  1.60221544,  0.98534288,  1.31801184,  1.60221544,
        1.66599654,  1.19607232,  0.93125485])

#### Support Vector Machines (SVM)

In [32]:
from sklearn.svm import SVC

svc = SVC(kernel='linear')
svc.fit(X, y)  # trained on the full X, y rather than the training split
y_pred = svc.predict(X_test)  # rebinds the shared name y_pred
y_pred

array([1, 2, 0, 2, 1, 2, 0, 0, 2, 2, 2, 0, 2, 1, 2, 2, 1, 0, 2, 2, 0, 0,
       2, 0, 1, 1, 1, 1, 2, 1, 1, 2, 1, 2, 2, 2, 1, 1])

#### Naive Bayes

In [33]:
from sklearn.naive_bayes import GaussianNB

gnb = GaussianNB()
gnb.fit(X, y)  # trained on the full X, y rather than the training split
y_pred = gnb.predict(X_test)  # rebinds the shared name y_pred
y_pred

array([1, 2, 0, 2, 1, 2, 0, 0, 2, 2, 2, 0, 2, 1, 2, 2, 1, 0, 2, 2, 0, 0,
       2, 0, 1, 1, 1, 1, 2, 1, 1, 2, 1, 2, 2, 2, 1, 1])

#### KNN

In [34]:
from sklearn import neighbors

knn = neighbors.KNeighborsClassifier(n_neighbors=5)
knn.fit(X, y)  # trained on the full X, y rather than the training split
y_pred = knn.predict(X_test)  # rebinds the shared name y_pred
y_pred

array([1, 1, 0, 2, 1, 2, 0, 0, 2, 2, 2, 0, 2, 1, 2, 2, 2, 0, 2, 2, 0, 0,
       2, 0, 1, 1, 1, 1, 2, 1, 1, 1, 1, 2, 1, 1, 2, 1])

### 无监督学习

#### Principal Component Analysis (PCA)

In [28]:
from sklearn.decomposition import PCA

pca = PCA(n_components=0.95)  # a float in (0, 1) requests enough components to explain that fraction of the variance
pca.fit_transform(X_train)  # the displayed result has two columns; it is shown, not stored

array([[-0.70718738, -0.83172028],
       [-0.88369354, -0.0450529 ],
       [ 0.58678167, -0.70664389],
       [ 0.06023749, -0.45614195],
       [ 0.42951803, -0.11904867],
       [-1.09238696,  0.03524066],
       [-1.21116556,  0.22469156],
       [-0.62689382, -0.62302686],
       [-1.0120934 ,  0.24393409],
       [ 2.03504005,  0.94033749],
       [-0.71348516,  0.27279787],
       [-0.74234894,  0.57140611],
       [-0.16769846, -0.17677623],
       [ 0.71850501,  0.00935119],
       [-0.46298322,  0.79934205],
       [ 1.19694289,  0.25652966],
       [ 0.04099497, -0.25706979],
       [ 1.95142302, -0.273338  ],
       [-0.22874949, -0.58454182],
       [ 0.10204601,  0.15069579],
       [-0.93179984,  0.45262751],
       [-1.20486778, -0.87982658],
       [ 0.53867537, -0.20896349],
       [-1.36213142, -0.29223136],
       [-0.83226376,  0.46224877],
       [ 1.41525757,  0.07670002],
       [ 0.18233956,  0.35938921],
       [ 0.49686685, -0.81580123],
       [ 1.63357226,

#### K Means

In [35]:
from sklearn.cluster import KMeans

k_means = KMeans(n_clusters=3, random_state=0)  # fixed random_state keeps the clustering reproducible
k_means.fit(X)  # unsupervised: KMeans ignores labels, so y is no longer passed
y_pred = k_means.predict(X_test)  # cluster ids, not class labels; rebinds the shared name y_pred
y_pred

array([0, 2, 1, 2, 0, 2, 1, 1, 2, 2, 2, 1, 2, 0, 2, 2, 0, 1, 2, 2, 1, 1,
       2, 1, 0, 0, 0, 0, 2, 0, 0, 2, 0, 2, 2, 2, 0, 0], dtype=int32)

## 评估

### 分类

#### Accuracy Score

In [36]:
knn.score(X_test, y_test)  # not the cell's last expression, so this value is not displayed

from sklearn.metrics import accuracy_score

accuracy_score(y_test, y_pred)  # NOTE(review): y_pred is whatever the most recently run cell assigned; the execution counts suggest the KMeans cell (In [35]), whose cluster ids are not class labels. Confirm

0.3157894736842105

#### Classification Report

In [39]:
from sklearn.metrics import classification_report

classification_report(y_test, y_pred)  # returns a str; as a bare expression it is displayed as a repr with literal \n (wrap in print() to render the table)

'              precision    recall  f1-score   support\n\n           0       0.00      0.00      0.00         8\n           1       0.00      0.00      0.00        11\n           2       0.71      0.63      0.67        19\n\n   micro avg       0.32      0.32      0.32        38\n   macro avg       0.24      0.21      0.22        38\nweighted avg       0.35      0.32      0.33        38\n'

#### Confusion Matrix

In [40]:
from sklearn.metrics import confusion_matrix

confusion_matrix(y_test, y_pred)  # rows are true labels, columns are predicted labels

array([[ 0,  8,  0],
       [ 6,  0,  5],
       [ 7,  0, 12]])

### 回归

#### Mean Absolute Error

In [42]:
from sklearn.metrics import mean_absolute_error

mean_absolute_error(y_test, y_pred)  # NOTE(review): y_test and y_pred hold integer labels from earlier cells, not continuous regression targets. Presumably a syntax demo only; confirm

0.868421052631579

#### Mean Squared Error

In [43]:
from sklearn.metrics import mean_squared_error

mean_squared_error(y_test, y_pred)  # NOTE(review): computed on the same integer-label y_test and y_pred as the other metric cells. Confirm this is intended

1.236842105263158

#### R² Score

In [45]:
from sklearn.metrics import r2_score

r2_score(y_test, y_pred)  # a negative value, as displayed here, means a worse fit than always predicting the mean of y_test

-0.9734806629834258

### 聚类

#### Adjusted Rand Index

In [46]:
from sklearn.metrics import adjusted_rand_score

adjusted_rand_score(y_test, y_pred)  # compares two labelings irrespective of how the label ids are numbered

0.3273680853325774

#### V-measure

In [51]:
from sklearn.metrics import v_measure_score

v_measure_score(y_test, y_pred)  # scores agreement between the true labels and the assigned labels in y_pred

0.5040766075368869

### Cross-Validation

In [54]:
from sklearn.model_selection import cross_val_score

print(cross_val_score(knn, X_train, y_train, cv=4))  # one score per fold: 4 folds on the training split
print(cross_val_score(lr, X, y, cv=2))  # one score per fold: 2 folds on the full X, y

[0.82758621 0.82758621 0.82142857 0.88461538]
[-4.31567384 -1.89773191]


## 调试

### Grid Search

In [61]:
from sklearn.model_selection import GridSearchCV

params = {"n_neighbors": list(range(1, 3)), "metric": ["euclidean", "cityblock"]}  # plain range instead of np.arange: no `import numpy as np` appears in the visible cells
grid = GridSearchCV(estimator=knn, param_grid=params)  # exhaustively tries all 2 x 2 parameter combinations
grid.fit(X_train, y_train)

print(grid.best_score_)  # best mean cross-validated score
print(grid.best_estimator_.n_neighbors)  # n_neighbors of the winning estimator

0.8303571428571429
2


### Randomized Parameter Optimization

In [60]:
from sklearn.model_selection import RandomizedSearchCV

params = {"n_neighbors": [1, 2, 3, 4], "weights": ["uniform", "distance"]}
rsearch = RandomizedSearchCV(
    estimator=knn,
    param_distributions=params,
    cv=4,
    n_iter=8,  # 4 x 2 = 8 combinations exist, so 8 iterations cover the whole grid
    random_state=5,
)
rsearch.fit(X_train, y_train)
print(rsearch.best_score_)

0.8214285714285714


> 参考：

1. [Scikit-Learn Cheat Sheet: Python Machine Learning](https://www.datacamp.com/community/blog/scikit-learn-cheat-sheet)