In [None]:
import numpy as np
import matplotlib.pyplot as plt
from sklearn.manifold import TSNE
from sklearn import datasets
from sklearn.preprocessing import MinMaxScaler
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
# Binary iris classification (setosa vs. versicolor) with a t-SNE preview.
# A single seed drives both the split and t-SNE so that Restart & Run All
# reproduces the same figure and the same scores.
SEED = 2

# 1. Load the data and drop class 2, leaving a two-class problem (labels 0 and 1).
iris = datasets.load_iris()
binary_mask = iris.target != 2
X_raw = iris.data[binary_mask]
y = iris.target[binary_mask]

# 2. Split into train/test sets BEFORE any fitting of preprocessing steps.
#    Splitting the raw rows with the same random_state selects the same samples
#    as splitting already-scaled rows would.
X_train_raw, X_test_raw, y_train, y_test = train_test_split(
    X_raw, y, test_size=0.3, random_state=SEED
)

# 3. Feature scaling: learn min/max from the training rows only, then apply the
#    same transform everywhere. Fitting the scaler on all rows would leak
#    test-set statistics into the model's inputs.
transformer = MinMaxScaler()
X_train = transformer.fit_transform(X_train_raw)
X_test = transformer.transform(X_test_raw)
X = transformer.transform(X_raw)  # all rows, scaled; used for the visualisation below

# 4. t-SNE projection of the scaled features to 2-D for display.
#    random_state makes the embedding repeatable between runs.
tnse = TSNE(n_components=2, random_state=SEED)
tnse_X = tnse.fit_transform(X)
plt.figure()
for class_id, fmt, class_name in ((0, 'ro', 'setosa'), (1, 'b^', 'versicolor')):
    points = tnse_X[y == class_id]
    plt.plot(points[:, 0], points[:, 1], fmt, label=class_name)
plt.legend()
plt.show()

# 5. Fit the logistic-regression model and predict on both subsets.
#    n_jobs is omitted: it only parallelises across classes and does nothing
#    for a two-class problem.
lr = LogisticRegression()
lr.fit(X_train, y_train)
y_train_pred = lr.predict(X_train)
y_test_pred = lr.predict(X_test)

# Combined tables for inspection: feature columns | true label | predicted label.
train_dataset = np.column_stack((X_train, y_train, y_train_pred))
test_dataset = np.column_stack((X_test, y_test, y_test_pred))

# Accuracy = fraction of rows whose prediction equals the true label.
train_right_rate = float(np.mean(y_train_pred == y_train))
test_right_rate = float(np.mean(y_test_pred == y_test))
# %.2f instead of %d: %d truncates, so e.g. 98.57 would be reported as 98.
print("训练集正确率：%.2f %%" % (train_right_rate * 100))
print("测试集正确率：%.2f %%" % (test_right_rate * 100))

In [None]:
# 分类算法
# 逻辑回归算例
from sklearn.datasets import load_iris
import matplotlib.pyplot as plt
import numpy as np
from sklearn.linear_model import LogisticRegression

# Decision regions of a 3-class logistic regression on two iris features.

# 1. Load the dataset.
iris = load_iris()
X = iris.data[:, :2]  # first two columns only (plotted as sepal length / sepal width)
Y = iris.target

# 2. Fit the logistic-regression model on the two selected features.
clf = LogisticRegression()
clf.fit(X, Y)

# 3. Build a dense grid covering the data range (padded by 0.5 on each side)
#    and predict the class of every grid node.
h = 0.01  # grid step
x_min, x_max = X[:, 0].min() - .5, X[:, 0].max() + .5
y_min, y_max = X[:, 1].min() - .5, X[:, 1].max() + .5
xx, yy = np.meshgrid(np.arange(x_min, x_max, h), np.arange(y_min, y_max, h))
grid_points = np.c_[xx.ravel(), yy.ravel()]
Z = clf.predict(grid_points).reshape(xx.shape)

# Draw the predicted class of each grid node as a coloured background.
plt.figure(1, figsize=(8, 6))
plt.pcolormesh(xx, yy, Z, cmap=plt.cm.Paired)

# 4. Overlay the samples, one scatter call per class.
#    Rows are selected by their label rather than by fixed row ranges, so the
#    plot stays correct even if the rows are shuffled or filtered.
class_styles = (
    (0, 'red', 'o', 'Setosa'),
    (1, 'blue', 'x', 'Versicolor'),
    (2, 'green', 's', 'Virginica'),
)
for class_id, color, marker, class_name in class_styles:
    members = Y == class_id
    plt.scatter(X[members, 0], X[members, 1], color=color, marker=marker, label=class_name)

plt.xlabel('Sepal length')
plt.ylabel('Sepal width')
plt.xlim(xx.min(), xx.max())
plt.ylim(yy.min(), yy.max())
plt.xticks(())  # hide tick marks on both axes
plt.yticks(())
plt.legend(loc=2)
plt.show()

In [None]:
# 决策树分类
import numpy as np
import pandas as pd
import matplotlib as mpl
import matplotlib.pyplot as plt
from sklearn.tree import DecisionTreeClassifier
from sklearn.datasets import load_iris
# Decision-tree classification of iris on two features, with decision regions.

# Column names: sepal length, sepal width, petal length, petal width, class.
iris_feature = u'花萼长度', u'花萼宽度', u'花瓣长度', u'花瓣宽度', u'类别'
iris = load_iris()
# reshape(-1, 1) infers the row count instead of hard-coding 150.
data = np.concatenate((iris.data, iris.target.reshape(-1, 1)), axis=1)
data = pd.DataFrame(data, columns=iris_feature)
# The concatenation turned the labels into floats; convert them back to integer codes.
data['类别'] = pd.Categorical(data['类别']).codes

# Train on two features only (sepal length and petal length) so the result is plottable.
x_train = data[['花萼长度', '花瓣长度']]
y_train = data['类别']
# random_state fixes the tie-breaking between equally good splits, so the
# fitted tree (and the figure) is the same on every run.
model = DecisionTreeClassifier(criterion='entropy', min_samples_leaf=3, random_state=0)
model.fit(x_train, y_train)

# Build an N x M grid spanning the observed range of both features.
N, M = 500, 500  # number of sample values along the horizontal / vertical axis
x1_min, x2_min = x_train.min(axis=0)
x1_max, x2_max = x_train.max(axis=0)
t1 = np.linspace(x1_min, x1_max, N)
t2 = np.linspace(x2_min, x2_max, M)
x1, x2 = np.meshgrid(t1, t2)  # grid sample points
x_show = np.column_stack((x1.ravel(), x2.ravel()))  # one row per grid point
# The model was fitted on a DataFrame, so predict on a frame with the same
# column names; passing the bare array triggers scikit-learn's
# feature-name mismatch warning.
y_predict = model.predict(pd.DataFrame(x_show, columns=x_train.columns))

# Configure fonts before any figure/text is created. SimHei stays first; the
# remaining entries are fallbacks for machines where it is not installed.
mpl.rcParams['font.sans-serif'] = ['SimHei', 'Microsoft YaHei', 'Noto Sans CJK SC',
                                   'Arial Unicode MS', 'DejaVu Sans']
mpl.rcParams['axes.unicode_minus'] = False
cm_light = mpl.colors.ListedColormap(['#A0FFA0', '#FFA0A0', '#A0A0FF'])
cm_dark = mpl.colors.ListedColormap(['g', 'r', 'b'])

plt.figure(figsize=(8, 6))
plt.xlim(x1_min, x1_max)
plt.ylim(x2_min, x2_max)
# vmin/vmax pin class code 0/1/2 to the 1st/2nd/3rd colour of each colormap,
# so background and points always agree, whichever classes are present.
plt.pcolormesh(x1, x2, y_predict.reshape(x1.shape), cmap=cm_light, vmin=0, vmax=2)
plt.scatter(x_train['花萼长度'], x_train['花瓣长度'], c=y_train, cmap=cm_dark,
            vmin=0, vmax=2, marker='o', edgecolors='k')
plt.xlabel('花萼长度')
plt.ylabel('花瓣长度')
plt.title('鸢尾花分类')
plt.grid(True, ls=':')
plt.show()