In [1]:
from sklearn.datasets import load_boston
from sklearn.linear_model import LinearRegression, SGDRegressor, Ridge, LogisticRegression, Lasso
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_squared_error, classification_report, roc_auc_score
# jblib用来保存机器学习的模型，把各个参数保存到磁盘上，使用时加载
import joblib
import pandas as pd
import numpy as np

In [2]:
"""
线性回归直接预测房子价格
:return: None
"""
# 获取数据,直接网上下载，数据量不多
lb = load_boston()

print("获取特征值")
# print(lb.data)
# print("目标值")
# print(lb.target)
# print(lb.DESCR)
# print(lb.feature_names)
print('-' * 50)
# 分割数据集到训练集和测试集
x_train, x_test, y_train, y_test = train_test_split(lb.data, lb.target, test_size=0.25, random_state=1)
#
print(x_train.shape)
#
# # 进行标准化处理(?) 目标值处理？
# # 特征值和目标值是都必须进行标准化处理, 实例化两个标准化API
std_x = StandardScaler()
# 对数据进行处理
x_train = std_x.fit_transform(x_train)
x_test = std_x.transform(x_test)

# 目标值进行了标准化
std_y = StandardScaler()
# 不管原来的是几维都变为一个只有一列的二维数组
temp = y_train.reshape(-1, 1)
# 输出还没标准化的房价
print(temp)
# 目标值是一维的，这里需要传进去2维的，标准化的fit_transform需要是二维的
y_train = std_y.fit_transform(y_train.reshape(-1, 1))


# transform不再进行计算，使用上面的直接标准化
y_test = std_y.transform(y_test.reshape(-1, 1))
# 输出标准化后的目标房价
y_train
# 以上都是数据处理

获取特征值
--------------------------------------------------
(379, 13)
[[20.6]
 [23.1]
 [28. ]
 [20. ]
 [23.1]
 [25. ]
 [ 9.7]
 [23.9]
 [36.1]
 [13.4]
 [12.7]
 [39.8]
 [10.4]
 [20.6]
 [17.8]
 [19.5]
 [23.7]
 [28.5]
 [24.3]
 [23.8]
 [19.1]
 [28.4]
 [20.5]
 [33.8]
 [14.5]
 [20.4]
 [16. ]
 [13.3]
 [30.8]
 [27.5]
 [24.4]
 [24.4]
 [25.1]
 [43.8]
 [21.9]
 [26.2]
 [14.2]
 [20.8]
 [20.1]
 [23.1]
 [13.1]
 [16.2]
 [24.8]
 [20.2]
 [22.5]
 [14.8]
 [28.7]
 [20.1]
 [23.4]
 [32. ]
 [19.1]
 [50. ]
 [20.9]
 [21.7]
 [22. ]
 [17.2]
 [30.3]
 [12.3]
 [21.4]
 [20.5]
 [35.2]
 [19.6]
 [22. ]
 [21.7]
 [14.1]
 [21.1]
 [15. ]
 [11.9]
 [20. ]
 [41.3]
 [18.7]
 [50. ]
 [50. ]
 [18.4]
 [17.9]
 [28.1]
 [16.1]
 [17.2]
 [28.6]
 [23.6]
 [20.4]
 [19.6]
 [18.8]
 [22.6]
 [17.7]
 [30.5]
 [18.2]
 [20.6]
 [24.4]
 [17.3]
 [13.3]
 [22.8]
 [20.5]
 [21.2]
 [18.8]
 [18.9]
 [18.2]
 [23.1]
 [32.7]
 [24. ]
 [10.2]
 [19.5]
 [33.1]
 [13.4]
 [15.2]
 [24.8]
 [24.3]
 [ 9.5]
 [24.2]
 [18.5]
 [44. ]
 [50. ]
 [24.7]
 [21.5]
 [ 8.4]
 [21.8]
 [50.


    The Boston housing prices dataset has an ethical problem. You can refer to
    the documentation of this function for further details.

    The scikit-learn maintainers therefore strongly discourage the use of this
    dataset unless the purpose of the code is to study and educate about
    ethical issues in data science and machine learning.

    In this special case, you can fetch the dataset from the original
    source::

        import pandas as pd
        import numpy as np


        data_url = "http://lib.stat.cmu.edu/datasets/boston"
        raw_df = pd.read_csv(data_url, sep="\s+", skiprows=22, header=None)
        data = np.hstack([raw_df.values[::2, :], raw_df.values[1::2, :2]])
        target = raw_df.values[1::2, 2]

    Alternative datasets include the California housing dataset (i.e.
    :func:`~sklearn.datasets.fetch_california_housing`) and the Ames housing
    dataset. You can load the datasets as follows::

        from sklearn.datasets import fetch_california_h

array([[-0.19582006],
       [ 0.0847902 ],
       [ 0.63478631],
       [-0.26316652],
       [ 0.0847902 ],
       [ 0.298054  ],
       [-1.41928078],
       [ 0.17458549],
       [ 1.54396354],
       [-1.0039776 ],
       [-1.08254847],
       [ 1.95926673],
       [-1.34070991],
       [-0.19582006],
       [-0.51010354],
       [-0.31928857],
       [ 0.15213666],
       [ 0.69090836],
       [ 0.21948313],
       [ 0.16336107],
       [-0.36418621],
       [ 0.67968395],
       [-0.20704447],
       [ 1.28580211],
       [-0.88050909],
       [-0.21826888],
       [-0.71214293],
       [-1.01520201],
       [ 0.9490698 ],
       [ 0.57866426],
       [ 0.23070754],
       [ 0.23070754],
       [ 0.30927841],
       [ 2.40824314],
       [-0.04990272],
       [ 0.43274692],
       [-0.91418232],
       [-0.17337123],
       [-0.25194211],
       [ 0.0847902 ],
       [-1.03765083],
       [-0.68969411],
       [ 0.27560518],
       [-0.2407177 ],
       [ 0.01744374],
       [-0

In [9]:
# # estimator预测
# # # 正规方程求解方式预测结果，正规方程进行线性回归，这是一个正规方程的接口
lr = LinearRegression()
# #
lr.fit(x_train, y_train)
#
print('回归系数', lr.coef_)  # 回归系数可以看特征与目标之间的相关性
#
y_predict = lr.predict(x_test)
# 预测测试集的房子价格，通过inverse得到真正的房子价格，因为房价经过了标准化
y_lr_predict = std_y.inverse_transform(y_predict)
# 保存训练好的模型,模型参数就在lr里面
joblib.dump(lr, "./tmp/test.pkl")
print("正规方程测试集里面每个房子的预测价格：", y_lr_predict)
print("正规方程的均方误差：", mean_squared_error(y_test, y_predict))

回归系数 [[-0.12026411  0.15044778  0.02951803  0.07470354 -0.28043353  0.22170939
   0.02190624 -0.35275513  0.29939558 -0.2028089  -0.23911894  0.06305081
  -0.45259462]]
正规方程测试集里面每个房子的预测价格： [[32.37816533]
 [27.95684437]
 [18.07213891]
 [21.63166556]
 [18.93029508]
 [19.96277202]
 [32.2834674 ]
 [18.06715668]
 [24.72989076]
 [26.85359369]
 [27.23326816]
 [28.57021239]
 [21.18778302]
 [26.94393815]
 [23.37892579]
 [20.89176865]
 [17.11746934]
 [37.73997945]
 [30.51980066]
 [ 8.44489436]
 [20.86557977]
 [16.21989418]
 [25.13605925]
 [24.77658813]
 [31.40497629]
 [11.02741407]
 [13.82097563]
 [16.80208261]
 [35.94637198]
 [14.7155729 ]
 [21.23939821]
 [14.15079469]
 [42.72492585]
 [17.83887162]
 [21.84610225]
 [20.40178099]
 [17.50287927]
 [27.00093206]
 [ 9.80760408]
 [20.00288662]
 [24.27066782]
 [21.06719021]
 [29.47089776]
 [16.48482565]
 [19.38852695]
 [14.54778282]
 [39.39838319]
 [18.09810655]
 [26.22164983]
 [20.60676525]
 [25.09994066]
 [24.48366723]
 [25.02297948]
 [26.84986898]
 

# 2 加载保存的模型

In [10]:
model = joblib.load("./tmp/test.pkl")
# # 因为目标值进行了标准化，一定要把预测后的值逆向转换回来
y_predict = model.predict(x_test)

#
print("保存的模型预测的结果：", y_predict)
print("正规方程的均方误差：", mean_squared_error(y_test, y_predict))

print("正规方程inverse后的均方误差：", mean_squared_error(std_y.inverse_transform(y_test),
                                               std_y.inverse_transform(y_predict)))

保存的模型预测的结果： [[ 1.12620955]
 [ 0.62994234]
 [-0.47955756]
 [-0.08002168]
 [-0.38323459]
 [-0.26734514]
 [ 1.11558027]
 [-0.48011678]
 [ 0.26773583]
 [ 0.50610896]
 [ 0.54872518]
 [ 0.69878929]
 [-0.12984488]
 [ 0.51624959]
 [ 0.11609798]
 [-0.16307075]
 [-0.58671359]
 [ 1.72804157]
 [ 0.91761907]
 [-1.56015899]
 [-0.16601029]
 [-0.68746111]
 [ 0.31332585]
 [ 0.27297733]
 [ 1.01697482]
 [-1.27028638]
 [-0.95672557]
 [-0.62211389]
 [ 1.5267197 ]
 [-0.8563123 ]
 [-0.12405138]
 [-0.91970532]
 [ 2.28757241]
 [-0.50574043]
 [-0.05595243]
 [-0.21806897]
 [-0.54345359]
 [ 0.52264682]
 [-1.40720286]
 [-0.26284251]
 [ 0.21619076]
 [-0.14338071]
 [ 0.79988591]
 [-0.65772411]
 [-0.33180076]
 [-0.87514574]
 [ 1.91418761]
 [-0.47664284]
 [ 0.43517699]
 [-0.1950607 ]
 [ 0.30927175]
 [ 0.24009869]
 [ 0.30063331]
 [ 0.50569088]
 [-1.94512422]
 [ 0.20018782]
 [-1.30384514]
 [ 0.50366068]
 [-0.6220835 ]
 [ 1.47453167]
 [-0.31823582]
 [ 0.57109939]
 [-0.64702253]
 [-0.35840699]
 [-1.27347275]
 [ 1.08939349

In [11]:
y_true = [3, -0.5, 2, 7]
y_pred = [2.5, 0.0, 2, 8]
mean_squared_error(y_true, y_pred)

0.375

In [14]:
#人工求均方误差
(np.square(3 - 2.5) + np.square(0.5) + 1) / 4

0.375

In [10]:
# 梯度下降去进行房价预测,数据量大要用这个
# 默认可以去调 eta0 = 0.008，会改变learning_rate,默认0.01
# learning_rate='invscaling'(eta=eta0/pow(t,power_t)),='constant'(学习率就是eta0)
# 各个参数之间是有关联的
# alpha会影响学习率的值，由alpha来算学习率
# 下面学习率为0.008，正则化选择l1,正则化力度选择0.005
sgd = SGDRegressor(eta0=0.008, penalty='l1', alpha=0.005)
# # 训练
sgd.fit(x_train, y_train)
#
print('梯度下降的回归系数', sgd.coef_)
#
# 预测测试集的房子价格，预测出的y是一维的，需要reshape
y_sgd_predict = std_y.inverse_transform(sgd.predict(x_test).reshape(-1, 1))
y_predict = sgd.predict(x_test)
print("梯度下降测试集里面每个房子的预测价格：", y_sgd_predict)
print("梯度下降的均方误差：", mean_squared_error(y_test, y_predict))
print("梯度下降的原始房价量纲均方误差：", mean_squared_error(std_y.inverse_transform(y_test), y_sgd_predict))

梯度下降的回归系数 [-0.09161381  0.07894594 -0.01997965  0.07736127 -0.18054122  0.26622108
  0.         -0.23891603  0.09441201 -0.02523685 -0.22153748  0.06690733
 -0.4268276 ]
梯度下降测试集里面每个房子的预测价格： [[30.32788625]
 [28.2472966 ]
 [18.30943245]
 [22.59556785]
 [18.35680256]
 [20.78684621]
 [30.24536495]
 [18.66209012]
 [23.78774605]
 [26.97567026]
 [26.37424819]
 [29.42246868]
 [21.60787196]
 [25.88755575]
 [23.02476327]
 [19.47432779]
 [16.92218956]
 [37.85566623]
 [29.94252206]
 [ 9.68559596]
 [20.93595383]
 [17.3629154 ]
 [25.40769352]
 [25.21312443]
 [30.55409234]
 [10.62699034]
 [14.44340854]
 [19.41367655]
 [35.63512925]
 [13.89827052]
 [23.99610575]
 [14.80961808]
 [40.63030223]
 [17.96567407]
 [24.16781208]
 [20.94382202]
 [17.53984421]
 [28.18178196]
 [ 8.09947878]
 [19.45922413]
 [26.43987001]
 [22.05015656]
 [28.68910453]
 [15.49251056]
 [18.56596167]
 [14.90704937]
 [39.74153394]
 [17.56879259]
 [25.86973592]
 [20.91672203]
 [24.75347158]
 [24.58762127]
 [25.75260948]
 [26.56115866]


  y = column_or_1d(y, warn=True)


#3 岭回归

In [16]:
# # # 岭回归去进行房价预测
rd = Ridge(alpha=0.05)

rd.fit(x_train, y_train)

print(rd.coef_)
#
# # 预测测试集的房子价格
y_rd_predict = std_y.inverse_transform(rd.predict(x_test))
y_predict = rd.predict(x_test)
# print("岭回归里面每个房子的预测价格：", y_rd_predict)
#
print("岭回归的均方误差：", mean_squared_error(y_test, y_predict))
print("岭回归的均方误差：", mean_squared_error(std_y.inverse_transform(y_test), y_rd_predict))

[[-0.12019408  0.15027489  0.02932631  0.07472724 -0.28019156  0.22179958
   0.0218258  -0.35250679  0.29879635 -0.20224632 -0.23906031  0.06305591
  -0.45246484]]
岭回归的均方误差： 0.27588055100713926
岭回归的均方误差： 21.897473825960407


In [19]:
np.log(0.3)

-1.2039728043259361

In [23]:
"""
逻辑回归做二分类进行癌症预测（根据细胞的属性特征）
:return: NOne
"""
# 构造列标签名字
column = ['Sample code number', 'Clump Thickness', 'Uniformity of Cell Size', 'Uniformity of Cell Shape',
          'Marginal Adhesion', 'Single Epithelial Cell Size', 'Bare Nuclei', 'Bland Chromatin', 'Normal Nucleoli',
          'Mitoses', 'Class']

# 读取数据
data = pd.read_csv(
    "https://archive.ics.uci.edu/ml/machine-learning-databases/breast-cancer-wisconsin/breast-cancer-wisconsin.data",
    names=column)

print(data)

# 缺失值进行处理
data = data.replace(to_replace='?', value=np.nan)
#直接删除，哪一行有空值，就删除对应的样本，也可以通过决策树特征推断特征
data = data.dropna()
print('-' * 50)
print(data)
# 进行数据的分割，左闭右开
x_train, x_test, y_train, y_test = train_test_split(data[column[1:10]], data[column[10]], test_size=0.25,
                                                    random_state=1)

# 进行标准化处理
std = StandardScaler()

x_train = std.fit_transform(x_train)
x_test = std.transform(x_test)
#
# # 逻辑回归预测
# C正则化力度，正则化强度的倒数，值越小正则化越强，高阶项系数越小
# solver = 'liblinear'
lg = LogisticRegression(C=0.8, solver='newton-cg')
#
lg.fit(x_train, y_train)
# 逻辑回归的权重参数，了解
print(lg.coef_)

y_predict = lg.predict(x_test)
print(y_predict)
print("准确率：", lg.score(x_test, y_test))
print(lg.predict_proba(x_test))  #得出对应分类的概率
# 为什么还要看下召回率，labels和target_names对应
# macro avg 平均值  weighted avg 加权平均值
print("召回率：", classification_report(y_test, y_predict, labels=[2, 4], target_names=["良性", "恶性"]))
#AUC计算要求是二分类，不需要是0和1
print("AUC指标：", roc_auc_score(y_test, y_predict))

     Sample code number  Clump Thickness  Uniformity of Cell Size  \
0               1000025                5                        1   
1               1002945                5                        4   
2               1015425                3                        1   
3               1016277                6                        8   
4               1017023                4                        1   
..                  ...              ...                      ...   
694              776715                3                        1   
695              841769                2                        1   
696              888820                5                       10   
697              897471                4                        8   
698              897471                4                        8   

     Uniformity of Cell Shape  Marginal Adhesion  Single Epithelial Cell Size  \
0                           1                  1                            2   
1        

In [21]:
print(x_train.shape)
print(x_test.shape)


(512, 9)
(171, 9)
