In [None]:
# Sample Generation

%matplotlib inline

import matplotlib.pyplot as plt
import numpy as np

from sklearn.datasets.samples_generator import make_blobs


# X为样本特征，y为样本簇类别，共1000个样本，每个样本2个特征，共2个簇
x_all, y_all = make_blobs(n_samples=1000, n_features=2, 
                          centers=[[5,5], [6,6]],
                          cluster_std=[0.3, 0.4],
                          random_state=9)

plt.figure()
colors = ['r', 'b']
for y, c in zip(np.unique(y_all), colors):
    plt.scatter(X_all[y_all==y, 0], x_all[y_all==y, 1], c=c, label=y, marker='o')

In [None]:
# First Example
# Fits an SVM classifier on a train split and compares train/test accuracy.

# `sklearn.datasets.samples_generator` was removed from scikit-learn;
# importing from `sklearn.datasets` works on both old and new releases.
from sklearn.datasets import make_blobs
from sklearn.model_selection import train_test_split
from sklearn import svm


x_all, y_all = make_blobs(n_samples=1000, n_features=2,
                          centers=[[5, 5], [6, 6]],
                          cluster_std=[0.3, 0.4],
                          random_state=9)

# No test_size is given, so train_test_split uses its default split proportions.
x_train, x_test, y_train, y_test = train_test_split(x_all, y_all, random_state=1)

clf = svm.SVC()
clf.fit(x_train, y_train)

# Show the first ten (predicted, actual) pairs of each split for a quick eyeball check.
prediction_train = clf.predict(x_train)
prediction_test = clf.predict(x_test)
print('[Train Set] prediction and real:')
print(list(zip(prediction_train, y_train))[:10])
print('[Test Set] prediction and real:')
print(list(zip(prediction_test, y_test))[:10])

# Score each split with the classifier's own score() method.
score_train = clf.score(x_train, y_train)
score_test = clf.score(x_test, y_test)
print('[Train Set] Data Number: {}, Score {}'.format(len(y_train), score_train))
print('[Test  Set] Data Number: {}, Score {}'.format(len(y_test), score_test))


In [None]:
# Example with evaluation
# Trains a logistic regression and reports label balance, accuracy,
# confusion matrices and classification reports for both splits.

import pandas as pd

# `sklearn.datasets.samples_generator` was removed from scikit-learn;
# importing from `sklearn.datasets` works on both old and new releases.
from sklearn.datasets import make_blobs
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
# These metrics (and pandas above) were used without being imported, so the
# cell failed with NameError on a fresh kernel.
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix


x_all, y_all = make_blobs(n_samples=1000, n_features=2,
                          centers=[[5, 5], [6, 6]],
                          cluster_std=[0.3, 0.4],
                          random_state=9)

x_train, x_test, y_train, y_test = train_test_split(x_all, y_all, random_state=1)

clf = LogisticRegression(C=1, random_state=1)
clf.fit(x_train, y_train)

y_pred_test = clf.predict(x_test)
y_pred_train = clf.predict(x_train)

# Labels are 0/1, so the sum of a label vector is the number of positive samples.
print('Value Y Distribution:')
total_num = y_test.shape[0]
true_num = y_test.sum()
true_ratio = round(true_num / total_num, 2)
print('[Test  Set] Total Number: {}, [1] Number: {}, Ratio: {}'.format(total_num, true_num, true_ratio))
total_num = y_train.shape[0]
true_num = y_train.sum()
true_ratio = round(true_num / total_num, 2)
print('[Train Set] Total Number: {}, [1] Number: {}, Ratio: {}'.format(total_num, true_num, true_ratio))

print('\nAccuracy Score:')
score_test = accuracy_score(y_test, y_pred_test)
print('[Test  Set] Data Number: {}, Accuracy Score {}'.format(len(y_test), score_test))
score_train = accuracy_score(y_train, y_pred_train)
print('[Train Set] Data Number: {}, Accuracy Score {}'.format(len(y_train), score_train))

# Legend for reading the matrices printed below: rows are actual classes
# (0 then 1), columns are predicted classes.
ConfusionMatrix = pd.DataFrame({'Predict: 0 [N]': ['TN', 'FN'], 'Predict: 1 [P]': ['FP', 'TP']})
print('\nConfusion Matrix:\n{}'.format(ConfusionMatrix))
print('[Test  Set] Confusion Matrix:')
print(confusion_matrix(y_test, y_pred_test))
print('[Train Set] Confusion Matrix:')
print(confusion_matrix(y_train, y_pred_train))

print('\nClassification Report:')
print('[Test  Set] Report: ')
print(classification_report(y_test, y_pred_test))
print('[Train Set] Report: ')
print(classification_report(y_train, y_pred_train))


In [3]:
# Cross Validation
# Scores a linear SVM with 5-fold cross validation and with out-of-fold predictions.

# `sklearn.datasets.samples_generator` was removed from scikit-learn;
# importing from `sklearn.datasets` works on both old and new releases.
from sklearn.datasets import make_blobs
from sklearn.metrics import accuracy_score
from sklearn.model_selection import cross_val_predict, cross_val_score
from sklearn import svm


x_all, y_all = make_blobs(n_samples=1000, n_features=2,
                          centers=[[5, 5], [6, 6]],
                          cluster_std=[0.3, 0.4],
                          random_state=9)

clf = svm.SVC(kernel='linear', C=1)

print('Cross Validation Scores')
scores = cross_val_score(clf, x_all, y_all, cv=5)
print(scores)
# Mean score with a +/- two-standard-deviation band; with only five folds this
# is a rough spread indicator rather than a rigorous 95% confidence interval.
print("Accuracy: %0.2f (+/- %0.2f)" % (scores.mean(), scores.std() * 2))

# cross_val_predict returns one out-of-fold prediction per input sample, so the
# result has the same length as y_all and can be scored against it directly.
print('Cross Validation Prediction')
predicted = cross_val_predict(clf, x_all, y_all, cv=5)
print(accuracy_score(y_all, predicted))
print(len(predicted))


Cross Validation Scores
[ 0.95   0.99   0.97   0.975  0.99 ]
Accuracy: 0.97 (+/- 0.03)
Cross Validation Prediction
0.975
1000


In [None]:
# Cross validation iterators
# Prints the train/test index arrays produced by three splitting strategies
# on a tiny 10-row matrix.

import numpy as np
from sklearn.model_selection import KFold, LeaveOneOut, LeavePOut

# Ten samples with two features each, holding the values 0..19.
X = np.linspace(0, 19, 20).reshape(10, 2)
print(X)

kf = KFold(n_splits=5)
loo = LeaveOneOut()
lpo = LeavePOut(p=2)

# Every splitter exposes the same split() API, so they can share one loop.
for title, splitter in (('K Fold', kf),
                        ('Leave One Out (LOO)', loo),
                        ('Leave P Out (LPO)', lpo)):
    print(title)
    for train, test in splitter.split(X):
        print('{} {}'.format(train, test))


In [None]:
# Grid Search
# Tunes SVC hyper-parameters with 5-fold cross validation on the training
# split, then evaluates the refitted best model on the held-out split.

# `sklearn.datasets.samples_generator` was removed from scikit-learn;
# importing from `sklearn.datasets` works on both old and new releases.
from sklearn.datasets import make_blobs
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import classification_report
from sklearn import svm


x_all, y_all = make_blobs(n_samples=1000, n_features=2,
                          centers=[[5, 5], [6, 6]],
                          cluster_std=[0.3, 0.4],
                          random_state=9)

# NOTE(review): test_size=0.8 leaves only 20% of the samples for the search
# itself -- confirm this is intentional.
x_train, x_test, y_train, y_test = train_test_split(x_all, y_all, test_size=0.8, random_state=0)

# Candidate grids: an RBF kernel over (gamma, C) and a linear kernel over C.
tuned_parameters = [{'kernel': ['rbf'],
                     'gamma': [1e-3, 1e-4],
                     'C': [1, 10, 100, 1000]},
                    {'kernel': ['linear'],
                     'C': [1, 10, 100, 1000]}]

# Tuning hyper-parameters
clf = GridSearchCV(svm.SVC(), tuned_parameters, cv=5)
clf.fit(x_train, y_train)

# Everything reported up to the final section comes from cross validation on
# the training ("development") split; the search never sees x_test. The labels
# previously said "test set", and the estimator was labelled as a score.
print("Best score found on development set:")
print(clf.best_score_)
print()

print("Best estimator found on development set:")
print(clf.best_estimator_)
print()

print("Best parameters set found on development set:")
print(clf.best_params_)
print(clf.best_estimator_.get_params())
print()

print("Grid scores on development set:")
means = clf.cv_results_['mean_test_score']
stds = clf.cv_results_['std_test_score']
params = clf.cv_results_['params']
# A distinct loop name keeps `params` bound to the full candidate list.
for mean, std, candidate in zip(means, stds, params):
    print("%0.3f (+/-%0.03f) for %r" % (mean, std * 2, candidate))
print()

# Final check on the held-out split.
print("Detailed classification report:")
y_true, y_pred = y_test, clf.predict(x_test)
print('Score of best_estimator_: {}'.format(clf.score(x_test, y_test)))
print(classification_report(y_true, y_pred))


In [None]:
# Model evaluation
# Walks through the common classification metrics on two tiny hand-written
# examples: one binary, one multiclass.


# Explicit names instead of `from sklearn.metrics import *`, so it is clear
# where each metric comes from and the notebook namespace is not flooded.
from sklearn.metrics import (
    accuracy_score,
    classification_report,
    confusion_matrix,
    f1_score,
    fbeta_score,
    precision_score,
    recall_score,
)


print('--- Binary classification ---')
y_true = [0, 1, 0, 1]
y_pred = [0, 1, 0, 0]
print('y_true: {}'.format(y_true))
print('y_pred: {}'.format(y_pred))

ratio = accuracy_score(y_true, y_pred)
print('Ratio of correct predictions: {}'.format(ratio))

# normalize=False returns the number of correct predictions instead of the fraction.
count = accuracy_score(y_true, y_pred, normalize=False)
print('Count of correct predictions: {}'.format(count))

# http://scikit-learn.org/stable/modules/model_evaluation.html#precision-recall-and-f-measures
P = precision_score(y_true, y_pred)
print('查准率P(Precision) = 预测为某分类且正确的数量 / 预测为某分类的总数量: {}'.format(P))

R = recall_score(y_true, y_pred)
print('查全率R(Recall) = 预测为某分类且正确的数量 / 该分类在样本中的总数量: {}'.format(R))

# F1 is the harmonic mean of precision and recall; the label previously
# printed the formula with the factor of 2 in the denominator.
F1 = f1_score(y_true, y_pred)
print('F1 Score = 2 * (P * R) / (P + R): {}'.format(F1))

# beta < 1 weights precision more heavily, beta > 1 weights recall more heavily;
# beta = 1 is the plain F1 score.
F_05 = fbeta_score(y_true, y_pred, beta=0.5)
F_10 = fbeta_score(y_true, y_pred, beta=1)
F_20 = fbeta_score(y_true, y_pred, beta=2)
print('F_05: {}, F_10: {}, F_20: {}'.format(F_05, F_10, F_20))

matrix = confusion_matrix(y_true, y_pred)
print('Confusion Matrix:')
print(matrix)

print('# sklearn.metrics.classification_report')
target_names = ['class 0', 'class 1']
report = classification_report(y_true, y_pred, target_names=target_names)
print('Classification Report:')
print(report)
print()


print('--- Multiclass and multilabel classification ---')
y_true = [2, 0, 2, 2, 0, 1, 0, 0, 1]
y_pred = [0, 0, 2, 2, 0, 2, 0, 1, 0]
print('y_true: {}'.format(y_true))
print('y_pred: {}'.format(y_pred))

ratio = accuracy_score(y_true, y_pred)
print('Ratio of correct predictions: {}'.format(ratio))

count = accuracy_score(y_true, y_pred, normalize=False)
print('Count of correct predictions: {}'.format(count))

matrix = confusion_matrix(y_true, y_pred)
print('Confusion Matrix:')
print(matrix)

target_names = ['class 0', 'class 1', 'class 2']
report = classification_report(y_true, y_pred, target_names=target_names)
print('Classification Report:')
print(report)


In [None]:
# One-hot coding
# DictVectorizer turns the string-valued 'city' field into one indicator
# column per distinct value and passes the numeric 'temperature' through.

from sklearn.feature_extraction import DictVectorizer


measurements = [
    {'city': 'Dubai', 'temperature': 33.},
    {'city': 'London', 'temperature': 12.},
    {'city': 'San Fransisco', 'temperature': 18.}
]

vec = DictVectorizer()

# fit_transform returns a sparse matrix; densify it for printing.
print(vec.fit_transform(measurements).toarray())

# get_feature_names() was removed in newer scikit-learn releases in favour of
# get_feature_names_out(); use the new API when it exists and fall back
# otherwise so the cell runs on either side of the change.
if hasattr(vec, 'get_feature_names_out'):
    feature_names = vec.get_feature_names_out().tolist()
else:
    feature_names = vec.get_feature_names()
print(feature_names)


In [None]:
# Standardization, or mean removal and variance scaling
# Applies four scalers to the same small matrix and prints, for each one, the
# transformed matrix followed by its per-column mean and standard deviation.

from sklearn import preprocessing
import numpy as np


X = np.array([
    [1., -10., 2.],
    [2., 0., 0.],
    [0., 10., -1.],
])
print('Original Matrix:', X, sep='\n')

# Function form of standardization.
print('preprocessing.scale')
x_scaled = preprocessing.scale(X)
print(x_scaled, x_scaled.mean(axis=0), x_scaled.std(axis=0), sep='\n')

# Estimator form of the same transform.
print('preprocessing.StandardScaler')
standard_scaler = preprocessing.StandardScaler()
x_train_standard = standard_scaler.fit_transform(X)
print(x_train_standard,
      x_train_standard.mean(axis=0),
      x_train_standard.std(axis=0),
      sep='\n')

# Min-max scaling; the fitted per-column scale_ and min_ are shown first.
print('preprocessing.MinMaxScaler')
min_max_scaler = preprocessing.MinMaxScaler()
x_train_minmax = min_max_scaler.fit_transform(X)
print(min_max_scaler.scale_,
      min_max_scaler.min_,
      x_train_minmax,
      x_train_minmax.mean(axis=0),
      x_train_minmax.std(axis=0),
      sep='\n')

# Max-abs scaling.
print('preprocessing.MaxAbsScaler')
max_abs_scaler = preprocessing.MaxAbsScaler()
x_train_maxabs = max_abs_scaler.fit_transform(X)
print(x_train_maxabs,
      x_train_maxabs.mean(axis=0),
      x_train_maxabs.std(axis=0),
      sep='\n')


In [None]:
# preprocessing.scale versus manual standardization
# Shows side by side that preprocessing.scale (A2) matches subtracting the
# mean and dividing by the standard deviation by hand (A3).

from sklearn import preprocessing
import numpy as np
import pandas as pd


# A dedicated, seeded generator keeps the displayed table identical across
# Restart & Run All without touching NumPy's global random state.
rng = np.random.RandomState(42)

A1 = rng.normal(loc=10.0, scale=5.0, size=5000)
A2 = preprocessing.scale(A1)
A3 = (A1 - A1.mean()) / A1.std()

# Last expression of the cell, so the notebook renders the frame.
pd.DataFrame({'A1': A1, 'A2': A2, 'A3': A3})

In [None]:
# Normalization
# Normalizes the matrix with the L1 norm, then prints the result together
# with its per-column mean and standard deviation.

from sklearn import preprocessing
import numpy as np


X = np.array([
    [1., -10., 2.],
    [2., 0., 0.],
    [0., 10., -1.],
])

x_normalized = preprocessing.normalize(X, norm='l1')
print(x_normalized,
      x_normalized.mean(axis=0),
      x_normalized.std(axis=0),
      sep='\n')


In [None]:
# Binarization
# Thresholds every entry of the matrix to 0 or 1.

from sklearn import preprocessing
import numpy as np


X = np.array([
    [1., -10., 2.],
    [2., 0., 0.],
    [0., 10., -1.],
])

# threshold=0.0 is Binarizer's documented default, spelled out here for clarity.
# Try e.g. threshold=1.1 to binarize against a different cut-off.
binarizer = preprocessing.Binarizer(threshold=0.0)

x_binarizered = binarizer.transform(X)
print(x_binarizered)


In [None]:
# Standardization, or mean removal and variance scaling
# Same scaler walkthrough as before, this time on a 7x3 matrix whose columns
# increase steadily, plus a final preprocessing.normalize step.

from sklearn import preprocessing
import numpy as np


X = np.array([
    [1., 10., 51.],
    [2., 20., 52.],
    [3., 30., 53.],
    [4., 40., 54.],
    [5., 50., 55.],
    [6., 60., 56.],
    [7., 70., 57.],
])
print('Original Matrix:', X, sep='\n')

# Function form of standardization.
print('preprocessing.scale')
x_scaled = preprocessing.scale(X)
print(x_scaled, x_scaled.mean(axis=0), x_scaled.std(axis=0), sep='\n')

# Estimator form of the same transform.
print('preprocessing.StandardScaler')
standard_scaler = preprocessing.StandardScaler()
x_train_standard = standard_scaler.fit_transform(X)
print(x_train_standard,
      x_train_standard.mean(axis=0),
      x_train_standard.std(axis=0),
      sep='\n')

# Min-max scaling; the fitted per-column scale_ and min_ are shown first.
print('preprocessing.MinMaxScaler')
min_max_scaler = preprocessing.MinMaxScaler()
x_train_minmax = min_max_scaler.fit_transform(X)
print(min_max_scaler.scale_,
      min_max_scaler.min_,
      x_train_minmax,
      x_train_minmax.mean(axis=0),
      x_train_minmax.std(axis=0),
      sep='\n')

# Max-abs scaling.
print('preprocessing.MaxAbsScaler')
max_abs_scaler = preprocessing.MaxAbsScaler()
x_train_maxabs = max_abs_scaler.fit_transform(X)
print(x_train_maxabs,
      x_train_maxabs.mean(axis=0),
      x_train_maxabs.std(axis=0),
      sep='\n')

# normalize() called with its default arguments.
print('preprocessing.normalize')
x_normalized = preprocessing.normalize(X)
print(x_normalized,
      x_normalized.mean(axis=0),
      x_normalized.std(axis=0),
      sep='\n')


In [None]:
# PCA(Principal Component Analysis)
%matplotlib inline

import numpy as np
import matplotlib.pyplot as plt
from mpl_toolkits.mplot3d import Axes3D
from sklearn.datasets.samples_generator import make_blobs
from sklearn.decomposition import PCA


# X为样本特征，Y为样本簇类别，共1000个样本，每个样本3个特征，共4个簇
X, y = make_blobs(n_samples=10000, n_features=3, 
                  centers=[[5,5,5], [6,6,6], [7,7,7], [8,8,8]],
                  cluster_std=[0.1, 0.2, 0.2, 0.2],
                  random_state=9)

fig = plt.figure()
ax = Axes3D(fig, rect=[0, 0, 1, 1], elev=30, azim=20)
plt.scatter(X[:, 0], X[:, 1], X[:, 2], marker='o')

# 不降维，只观察三个特征的方差
pca = PCA(n_components=3)
pca.fit(X)
print(pca.explained_variance_ratio_)
print(pca.explained_variance_)

# 特征冲三维降到二维
pca = PCA(n_components=2, whiten=False)
pca.fit(X)
print(pca.explained_variance_ratio_)
print(pca.explained_variance_)

# 观察降维后的数据
x_new = pca.transform(X)
plt.figure()
plt.scatter(x_new[:, 0], x_new[:, 1], marker='o')
plt.show()

# 指定主成分方差和比例
pca = PCA(n_components=0.95)
pca.fit(X)
print(pca.explained_variance_ratio_)
print(pca.explained_variance_)
print(pca.n_components_)

# 使用MLE算法自动选择降维维度
pca = PCA(n_components='mle', svd_solver='full')
pca.fit(X)
print(pca.explained_variance_ratio_)
print(pca.explained_variance_)
print(pca.n_components_)