In [1]:
import time

from sklearn.datasets import load_iris, fetch_20newsgroups, fetch_california_housing
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.neighbors import KNeighborsClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import classification_report
from sklearn.feature_extraction import DictVectorizer
from sklearn.tree import DecisionTreeClassifier, export_graphviz
from sklearn.ensemble import RandomForestClassifier
import pandas as pd
import numpy as np
from sklearn.metrics import roc_auc_score

load直接加载的内存的，数据集比较小，并不会保存到本地磁盘
fetch数据集比较大，下载下来后会存在本地磁盘，下一次就不会再连接sklearn的服务器


# 鸢尾花数据集

In [2]:
#鸢尾花数据集，查看特征，目标，样本量

li = load_iris() # 使用load_iris()函数加载鸢尾花数据集

print("获取特征值")
print(type(li.data))
print('-' * 50)
print(li.data.shape) # 150个样本，4个特征,一般看shape
li.data

获取特征值
<class 'numpy.ndarray'>
--------------------------------------------------
(150, 4)


array([[5.1, 3.5, 1.4, 0.2],
       [4.9, 3. , 1.4, 0.2],
       [4.7, 3.2, 1.3, 0.2],
       [4.6, 3.1, 1.5, 0.2],
       [5. , 3.6, 1.4, 0.2],
       [5.4, 3.9, 1.7, 0.4],
       [4.6, 3.4, 1.4, 0.3],
       [5. , 3.4, 1.5, 0.2],
       [4.4, 2.9, 1.4, 0.2],
       [4.9, 3.1, 1.5, 0.1],
       [5.4, 3.7, 1.5, 0.2],
       [4.8, 3.4, 1.6, 0.2],
       [4.8, 3. , 1.4, 0.1],
       [4.3, 3. , 1.1, 0.1],
       [5.8, 4. , 1.2, 0.2],
       [5.7, 4.4, 1.5, 0.4],
       [5.4, 3.9, 1.3, 0.4],
       [5.1, 3.5, 1.4, 0.3],
       [5.7, 3.8, 1.7, 0.3],
       [5.1, 3.8, 1.5, 0.3],
       [5.4, 3.4, 1.7, 0.2],
       [5.1, 3.7, 1.5, 0.4],
       [4.6, 3.6, 1. , 0.2],
       [5.1, 3.3, 1.7, 0.5],
       [4.8, 3.4, 1.9, 0.2],
       [5. , 3. , 1.6, 0.2],
       [5. , 3.4, 1.6, 0.4],
       [5.2, 3.5, 1.5, 0.2],
       [5.2, 3.4, 1.4, 0.2],
       [4.7, 3.2, 1.6, 0.2],
       [4.8, 3.1, 1.6, 0.2],
       [5.4, 3.4, 1.5, 0.4],
       [5.2, 4.1, 1.5, 0.1],
       [5.5, 4.2, 1.4, 0.2],
       [4.9, 3

In [3]:
print("目标值")
print(li.target) # 目标值，目标值是0,1,2，分别代表山鸢尾、变色鸢尾、维吉尼亚鸢尾
print('-' * 50)
print(li.DESCR) # 目标值的描述
print('-' * 50)
print(li.feature_names)  # 重点,特征名字
print('-' * 50)
print(li.target_names) # 目标名字

目标值
[0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 2 2 2 2 2 2 2 2 2 2 2
 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2
 2 2]
--------------------------------------------------
.. _iris_dataset:

Iris plants dataset
--------------------

**Data Set Characteristics:**

:Number of Instances: 150 (50 in each of three classes)
:Number of Attributes: 4 numeric, predictive attributes and the class
:Attribute Information:
    - sepal length in cm
    - sepal width in cm
    - petal length in cm
    - petal width in cm
    - class:
            - Iris-Setosa
            - Iris-Versicolour
            - Iris-Virginica

:Summary Statistics:

                Min  Max   Mean    SD   Class Correlation
sepal length:   4.3  7.9   5.84   0.83    0.7826
sepal width:    2.0  4.4   3.05   0.43   -0.4194
petal length:   1.0  

In [4]:
# 注意返回值, 训练集x_train, y_train测试集  test   x_test, y_test，顺序千万别搞错了
# 默认是乱序的,random_state为了确保两次的随机策略一致，就会得到相同的随机数据，往往会带上
# x_trains是特征值，y_trains是目标值，x_tests是特征值，y_tests是目标值
x_train, x_test, y_train, y_test = train_test_split(li.data, li.target, test_size=0.25, random_state=1)

print("训练集特征值shape", x_train.shape)
print('-'*50)
print("测试集特征值shape", x_test.shape)

训练集特征值shape (112, 4)
--------------------------------------------------
测试集特征值shape (38, 4)


# 20league新闻数据集

In [5]:
# 下面是比较大的数据，需要下载一会，20类新闻
#subset代表下载的数据集类型，默认是train，只有训练集
news = fetch_20newsgroups(subset='all', data_home='data')
print(news.feature_names)  #这个数据集是没有的，因为没有特征，只有文本数据
print('-'*50)
print(news.DESCR)
print('第一个样本')
print(news.data[0])

AttributeError: feature_names

In [6]:
print('特征类型')
print(type(news.data))
print('-' * 50)
print(news.target[0:15])
from pprint import pprint
pprint(list(news.target_names))
len(news.target_names)
print('-' * 50)
print(len(news.data))
print('新闻所有的标签')
print(news.target)
print('-' * 50)
print(min(news.target), max(news.target))

特征类型
<class 'list'>
--------------------------------------------------
[10  3 17  3  4 12  4 10 10 19 19 11 19 13  0]
['alt.atheism',
 'comp.graphics',
 'comp.os.ms-windows.misc',
 'comp.sys.ibm.pc.hardware',
 'comp.sys.mac.hardware',
 'comp.windows.x',
 'misc.forsale',
 'rec.autos',
 'rec.motorcycles',
 'rec.sport.baseball',
 'rec.sport.hockey',
 'sci.crypt',
 'sci.electronics',
 'sci.med',
 'sci.space',
 'soc.religion.christian',
 'talk.politics.guns',
 'talk.politics.mideast',
 'talk.politics.misc',
 'talk.religion.misc']
--------------------------------------------------
18846
新闻所有的标签
[10  3 17 ...  3  1  7]
--------------------------------------------------
0 19


# 加州房价数据集

In [7]:
house=fetch_california_housing(data_home='data')
print("获取特征值")
print(house.data[0])  #第一个样本特征值
print('样本的形状')
print(house.data.shape)
print('-' * 50)
type(house.data)

获取特征值
[   8.3252       41.            6.98412698    1.02380952  322.
    2.55555556   37.88       -122.23      ]
样本的形状
(20640, 8)
--------------------------------------------------


numpy.ndarray

In [8]:
print("目标值")
# target是房价，是连续的，所以是float64
print(house.target[0:10])
print('-' * 50)
print(house.DESCR)
print('-' * 50)
print(house.feature_names)
print('-' * 50)

目标值
[4.526 3.585 3.521 3.413 3.422 2.697 2.992 2.414 2.267 2.611]
--------------------------------------------------
.. _california_housing_dataset:

California Housing dataset
--------------------------

**Data Set Characteristics:**

:Number of Instances: 20640

:Number of Attributes: 8 numeric, predictive attributes and the target

:Attribute Information:
    - MedInc        median income in block group
    - HouseAge      median house age in block group
    - AveRooms      average number of rooms per household
    - AveBedrms     average number of bedrooms per household
    - Population    block group population
    - AveOccup      average number of household members
    - Latitude      block group latitude
    - Longitude     block group longitude

:Missing Attribute Values: None

This dataset was obtained from the StatLib repository.
https://www.dcc.fc.up.pt/~ltorgo/Regression/cal_housing.html

The target variable is the median house value for California districts,
expressed in h

# 2 分类估计器

## 1 K近邻分类器

### 预处理

In [10]:
# K近邻
"""
K-近邻预测用户签到位置
:return:None
"""
# 读取数据
data = pd.read_csv("./data/FBlocation/train.csv")

print(data.head(10))
print(data.shape)
print(data.info())
# 处理数据
# 1、缩小数据,查询数据,为了减少计算时间
data = data.query("x > 1.0 &  x < 1.25 & y > 2.5 & y < 2.75")
print(data.shape)
data.describe()

   row_id       x       y  accuracy    time    place_id
0       0  0.7941  9.0809        54  470702  8523065625
1       1  5.9567  4.7968        13  186555  1757726713
2       2  8.3078  7.0407        74  322648  1137537235
3       3  7.3665  2.5165        65  704587  6567393236
4       4  4.0961  1.1307        31  472130  7440663949
5       5  3.8099  1.9586        75  178065  6289802927
6       6  6.3336  4.3720        13  666829  9931249544
7       7  5.7409  6.7697        85  369002  5662813655
8       8  4.3114  6.9410         3  166384  8471780938
9       9  6.3414  0.0758        65  400060  1253803156
(29118021, 6)
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 29118021 entries, 0 to 29118020
Data columns (total 6 columns):
 #   Column    Dtype  
---  ------    -----  
 0   row_id    int64  
 1   x         float64
 2   y         float64
 3   accuracy  int64  
 4   time      int64  
 5   place_id  int64  
dtypes: float64(2), int64(4)
memory usage: 1.3 GB
None
(17710, 6)


Unnamed: 0,row_id,x,y,accuracy,time,place_id
count,17710.0,17710.0,17710.0,17710.0,17710.0,17710.0
mean,14505690.0,1.122538,2.632309,82.482101,397551.263128,5129895000.0
std,8353805.0,0.077086,0.070144,113.613227,234601.097883,2357399000.0
min,600.0,1.0001,2.5001,1.0,119.0,1012024000.0
25%,7327816.0,1.0492,2.5738,25.0,174069.75,3312464000.0
50%,14430710.0,1.1233,2.6423,62.0,403387.5,5261906000.0
75%,21634630.0,1.1905,2.6878,75.0,602111.75,6766325000.0
max,29112150.0,1.2499,2.7499,1004.0,786218.0,9980711000.0


In [12]:
# 处理时间的数据，unit是秒，把秒转换成日期格式
time_value = pd.to_datetime(data['time'], unit='s')

print(time_value.head(10))  #最大时间是1月10号
# 把日期格式转换成 字典格式，把年，月，日，时，分，秒转换为字典格式，
time_value = pd.DatetimeIndex(time_value)
print('-' * 50)
print(time_value[0:10])
type(time_value)

# 构造一些特征
print(type(data))

#日期，是否是周末，小时对于个人行为的影响是较大的(例如吃饭时间去饭店，看电影时间去电影院等),所以才做下面的处理
data.insert(data.shape[1], 'day', time_value.day) #data.shape[1]是代表插入到最后的意思,一个月的哪一天
data.insert(data.shape[1], 'hour', time_value.hour)#是否去一个地方打卡，早上，中午，晚上是有影响的
data.insert(data.shape[1], 'weekday', time_value.weekday) #0代表周一，6代表周日，星期几


# 把时间戳特征删除
data = data.drop(['time'], axis=1)
print('-' * 50)
data.head()

600    1970-01-01 18:09:40
957    1970-01-10 02:11:10
4345   1970-01-05 15:08:02
4735   1970-01-06 23:03:03
5580   1970-01-09 11:26:50
6090   1970-01-02 16:25:07
6234   1970-01-04 15:52:57
6350   1970-01-01 10:13:36
7468   1970-01-09 15:26:06
8478   1970-01-08 23:52:02
Name: time, dtype: datetime64[ns]
--------------------------------------------------
DatetimeIndex(['1970-01-01 18:09:40', '1970-01-10 02:11:10',
               '1970-01-05 15:08:02', '1970-01-06 23:03:03',
               '1970-01-09 11:26:50', '1970-01-02 16:25:07',
               '1970-01-04 15:52:57', '1970-01-01 10:13:36',
               '1970-01-09 15:26:06', '1970-01-08 23:52:02'],
              dtype='datetime64[ns]', name='time', freq=None)
<class 'pandas.core.frame.DataFrame'>
--------------------------------------------------


Unnamed: 0,row_id,x,y,accuracy,place_id,day,hour,weekday
600,600,1.2214,2.7023,17,6683426742,1,18,3
957,957,1.1832,2.6891,58,6683426742,10,2,5
4345,4345,1.1935,2.655,11,6889790653,5,15,0
4735,4735,1.1452,2.6074,49,6822359752,6,23,1
5580,5580,1.0089,2.7287,19,1527921905,9,11,4


In [13]:
# 把签到数量少于n个目标位置删除，place_id是标签，即目标值
# groupby是按照place_id分组，count是统计每组的数量
place_count = data.groupby('place_id').count()
place_count
place_count['x'].describe() #打卡地点总计805个，50%打卡小于2次

count     805.000000
mean       22.000000
std        88.955632
min         1.000000
25%         1.000000
50%         2.000000
75%         5.000000
max      1044.000000
Name: x, dtype: float64

In [14]:
# 把index变为0,1,2，3,4,5,6这种效果，从零开始排，原来的index是row_id
#只选择去的人大于3的数据，认为1,2,3的是噪音，这个地方去的人很少，不用推荐给其他人
tf = place_count[place_count.row_id > 3].reset_index()
tf  #剩余的签到地点

Unnamed: 0,place_id,row_id,x,y,accuracy,day,hour,weekday
0,1097200869,1044,1044,1044,1044,1044,1044,1044
1,1228935308,120,120,120,120,120,120,120
2,1267801529,58,58,58,58,58,58,58
3,1278040507,15,15,15,15,15,15,15
4,1285051622,21,21,21,21,21,21,21
...,...,...,...,...,...,...,...,...
234,9741307878,5,5,5,5,5,5,5
235,9753855529,21,21,21,21,21,21,21
236,9806043737,6,6,6,6,6,6,6
237,9809476069,23,23,23,23,23,23,23


In [15]:
# 根据设定的地点目标值，对原本的样本进行过滤
#isin可以过滤某一列要在一组值
data = data[data['place_id'].isin(tf.place_id)]
data.shape

(16918, 8)

In [16]:
# # 取出数据当中的特征值和目标值
y = data['place_id']
# 删除目标值，保留特征值，
x = data.drop(['place_id'], axis=1)
# 删除无用的特征值，row_id是索引,这就是噪音
x = x.drop(['row_id'], axis=1)
print(x.shape)
print(x.columns)

(16918, 6)
Index(['x', 'y', 'accuracy', 'day', 'hour', 'weekday'], dtype='object')


### 训练模型

In [17]:
# 进行数据的分割训练集合测试集
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.25, random_state=1)

# 特征工程（标准化）
std = StandardScaler()

# 对测试集和训练集的特征值进行标准化,服务于knn fit
# fit_transform是先fit，然后transform，fit_transform是先fit，然后transform，fit_transform是先fit，然后transform,fit的目的是计算均值和方差，transform的目的是对数据进行标准化
x_train = std.fit_transform(x_train)
# transform返回的是copy，不在原有的输入对象中去修改
print(std.mean_)
print(std.var_)
x_test = std.transform(x_test)  #transfrom不再进行均值和方差的计算，是在原有的基础上去标准化
print('-' * 50)
print(id(x_test))
print(std.mean_)
print(std.var_)

[ 1.12295735  2.63237278 81.34938525  5.10064628 11.44293821  3.10135561]
[5.98489138e-03 4.86857391e-03 1.19597480e+04 7.32837915e+00
 4.83742660e+01 2.81838404e+00]
--------------------------------------------------
1867224433072
[ 1.12295735  2.63237278 81.34938525  5.10064628 11.44293821  3.10135561]
[5.98489138e-03 4.86857391e-03 1.19597480e+04 7.32837915e+00
 4.83742660e+01 2.81838404e+00]


In [18]:
# 进行算法流程, 选择KNN分类器，设置超参数，可以通过设置n_neighbors=5，来调整结果好坏
knn = KNeighborsClassifier(n_neighbors=3)

# fit， predict,score，训练，knn的fit是不训练的，只是把训练集的特征值和目标值放入到内存中
knn.fit(x_train, y_train)

# 得出预测结果
y_predict = knn.predict(x_test)

print("预测的目标签到位置为：", y_predict[0:10])
# 得出准确率,是评估指标
print("预测的准确率:", knn.score(x_test, y_test))

预测的目标签到位置为： [5689129232 1097200869 6097504486 9632980559 6424972551 1097200869
 3952821602 3533177779 1435128522 3312463746]
预测的准确率: 0.46430260047281324


# 插入补充，调超参的方法，网格搜索

In [19]:
# 构造一些参数（超参）的值进行搜索
# weights有uniform和distance两种，uniform是每个点都一样权重，distance是距离远近的点权重不同
param = {"n_neighbors": [3, 5, 10, 12, 15],'weights':['uniform', 'distance']}

# 进行网格搜索，cv=3是3折交叉验证，用其中2折训练，1折验证
gc = GridSearchCV(knn, param_grid=param, cv=3)

gc.fit(x_train, y_train)  #你给它的x_train，它又分为训练集，验证集

# 预测准确率，为了给大家看看
print("在测试集上准确率：", gc.score(x_test, y_test))

print("在交叉验证当中最好的结果：", gc.best_score_) #最好的结果

print("选择最好的模型是：", gc.best_estimator_) #最好的模型,告诉你用了哪些参数

print("每个超参数每次交叉验证的结果：")
gc.cv_results_



在测试集上准确率： 0.49763593380614657
在交叉验证当中最好的结果： 0.4816362349278435
选择最好的模型是： KNeighborsClassifier(n_neighbors=12, weights='distance')
每个超参数每次交叉验证的结果：


{'mean_fit_time': array([0.00866604, 0.00866787, 0.00865507, 0.0083197 , 0.00766516,
        0.00800093, 0.00836881, 0.00834497, 0.00895206, 0.00834004]),
 'std_fit_time': array([4.70865085e-04, 3.13579643e-04, 9.51254238e-04, 4.80465292e-04,
        4.71540157e-04, 2.84996368e-06, 5.21553283e-04, 5.11948827e-04,
        9.29992742e-04, 4.66723976e-04]),
 'mean_score_time': array([0.19175196, 0.08292969, 0.20697196, 0.07231291, 0.20692142,
        0.09487422, 0.21400436, 0.10170929, 0.22245328, 0.11865489]),
 'std_score_time': array([0.00473896, 0.0121411 , 0.0236096 , 0.00341085, 0.00419473,
        0.00206359, 0.00347667, 0.00310893, 0.00509918, 0.00424007]),
 'param_n_neighbors': masked_array(data=[3, 3, 5, 5, 10, 10, 12, 12, 15, 15],
              mask=[False, False, False, False, False, False, False, False,
                    False, False],
        fill_value=999999),
 'param_weights': masked_array(data=['uniform', 'distance', 'uniform', 'distance',
                    'uniform',

## 2 朴素贝叶斯分类器

In [20]:
"""
朴素贝叶斯进行文本分类
:return: None
"""
news = fetch_20newsgroups(subset='all', data_home='data')


In [21]:
# 进行数据分割
x_train, x_test, y_train, y_test = train_test_split(news.data, news.target, test_size=0.25, random_state=1)

# 对数据集进行特征抽取
tf = TfidfVectorizer()

# 以训练集当中的词的列表进行每篇文章重要性统计['a','b','c','d']
x_train = tf.fit_transform(x_train)
#针对特征内容，可以自行打印，下面的打印可以得到特征数目，总计有15万特征
print(len(tf.get_feature_names_out()))

153196


In [22]:
import time
# 进行朴素贝叶斯算法的预测,alpha是拉普拉斯平滑系数，分子和分母加上一个系数，分母加alpha*特征词数目
mlt = MultinomialNB(alpha=1.0)

# 训练
start=time.time()
mlt.fit(x_train, y_train) #训练模型
end=time.time()
end-start #统计训练时间

0.13283467292785645

In [23]:
x_transform_test = tf.transform(x_test)  #特征数目不发生改变
print(len(tf.get_feature_names_out())) #查看特征数目

153196


In [24]:
start=time.time()
y_predict = mlt.predict(x_transform_test)

print("预测的前面10篇文章类别为：", y_predict[0:10])

# 得出准确率,这个是很难提高准确率，为什么呢？
print("准确率为：", mlt.score(x_transform_test, y_test))
end=time.time()
end-start #预测时间

预测的前面10篇文章类别为： [16 19 18  1  9 15  1  2 16 13]
准确率为： 0.8518675721561969


0.06492996215820312

In [25]:
# 把0-19总计20个分类，变为0和1
# 5是可以改为0到19的
y_test1 = np.where(y_test == 5, 1, 0)
print(y_test1.sum()) #label为5的样本数
y_predict1 = np.where(y_predict == 5, 1, 0)
print(y_predict1.sum())
# roc_auc_score的y_test只能是二分类,针对多分类如何计算AUC
print("AUC指标：", roc_auc_score(y_test1, y_predict1))

230
214
AUC指标： 0.924078924393225


## 3 决策树分类器

In [26]:
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.feature_extraction import DictVectorizer
from sklearn.tree import DecisionTreeClassifier, export_graphviz
from sklearn.ensemble import RandomForestClassifier
import pandas as pd
import numpy as np

In [28]:
"""
决策树对泰坦尼克号进行预测生死
:return: None
"""
# 获取数据
titan = pd.read_csv("./data/titanic.txt")
titan.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1313 entries, 0 to 1312
Data columns (total 11 columns):
 #   Column     Non-Null Count  Dtype  
---  ------     --------------  -----  
 0   row.names  1313 non-null   int64  
 1   pclass     1313 non-null   object 
 2   survived   1313 non-null   int64  
 3   name       1313 non-null   object 
 4   age        633 non-null    float64
 5   embarked   821 non-null    object 
 6   home.dest  754 non-null    object 
 7   room       77 non-null     object 
 8   ticket     69 non-null     object 
 9   boat       347 non-null    object 
 10  sex        1313 non-null   object 
dtypes: float64(1), int64(2), object(8)
memory usage: 113.0+ KB


In [29]:
# 处理数据，找出特征值和目标值
x = titan[['pclass', 'age', 'sex']]

y = titan['survived']
print(x.info())  # 用来判断是否有空值
x.describe(include='all')

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1313 entries, 0 to 1312
Data columns (total 3 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   pclass  1313 non-null   object 
 1   age     633 non-null    float64
 2   sex     1313 non-null   object 
dtypes: float64(1), object(2)
memory usage: 30.9+ KB
None


Unnamed: 0,pclass,age,sex
count,1313,633.0,1313
unique,3,,2
top,3rd,,male
freq,711,,850
mean,,31.194181,
std,,14.747525,
min,,0.1667,
25%,,21.0,
50%,,30.0,
75%,,41.0,


In [30]:
# 分割数据集到训练集合测试集
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.25, random_state=4)
print(x_train.head())

#性别是女性的数量
x_train[x_train['sex'] == 'female'].count()

    pclass   age     sex
598    2nd  30.0    male
246    1st  62.0    male
905    3rd   NaN  female
300    1st   NaN  female
509    2nd  64.0    male


pclass    341
age       180
sex       341
dtype: int64

In [31]:
#女性中存活的情况对比
z=x_train.copy() #z是为了把特征和目标存储到一起
z['survived'] = y_train #把目标值存储到z中
z[z['sex'] == 'female']['survived'].value_counts()  #男性中存活的情况

survived
1    230
0    111
Name: count, dtype: int64

In [32]:
x_train.to_dict(orient="records") #把df变为字典，样本变为一个一个的字典，字典中列名变为键

[{'pclass': '2nd', 'age': 30.0, 'sex': 'male'},
 {'pclass': '1st', 'age': 62.0, 'sex': 'male'},
 {'pclass': '3rd', 'age': nan, 'sex': 'female'},
 {'pclass': '1st', 'age': nan, 'sex': 'female'},
 {'pclass': '2nd', 'age': 64.0, 'sex': 'male'},
 {'pclass': '1st', 'age': nan, 'sex': 'female'},
 {'pclass': '3rd', 'age': 24.0, 'sex': 'female'},
 {'pclass': '3rd', 'age': nan, 'sex': 'male'},
 {'pclass': '2nd', 'age': nan, 'sex': 'male'},
 {'pclass': '3rd', 'age': nan, 'sex': 'male'},
 {'pclass': '3rd', 'age': 21.0, 'sex': 'male'},
 {'pclass': '3rd', 'age': nan, 'sex': 'male'},
 {'pclass': '3rd', 'age': nan, 'sex': 'male'},
 {'pclass': '2nd', 'age': 23.0, 'sex': 'female'},
 {'pclass': '3rd', 'age': nan, 'sex': 'male'},
 {'pclass': '3rd', 'age': nan, 'sex': 'female'},
 {'pclass': '3rd', 'age': nan, 'sex': 'female'},
 {'pclass': '1st', 'age': 44.0, 'sex': 'female'},
 {'pclass': '3rd', 'age': nan, 'sex': 'female'},
 {'pclass': '3rd', 'age': nan, 'sex': 'male'},
 {'pclass': '3rd', 'age': nan, 'sex

In [33]:
# 进行处理（特征工程）特征-》类别-》one_hot编码
dict = DictVectorizer(sparse=False)

# 这一步是对字典进行特征抽取,to_dict可以把df变为字典，records代表列名变为键
x_train = dict.fit_transform(x_train.to_dict(orient="records"))
print(type(x_train))
print(dict.get_feature_names_out())
print('-' * 50)
x_test = dict.transform(x_test.to_dict(orient="records"))
print(x_train)

<class 'numpy.ndarray'>
['age' 'pclass=1st' 'pclass=2nd' 'pclass=3rd' 'sex=female' 'sex=male']
--------------------------------------------------
[[30.  0.  1.  0.  0.  1.]
 [62.  1.  0.  0.  0.  1.]
 [nan  0.  0.  1.  1.  0.]
 ...
 [34.  0.  1.  0.  0.  1.]
 [46.  1.  0.  0.  0.  1.]
 [nan  0.  0.  1.  0.  1.]]


In [34]:
# 用决策树进行预测，修改max_depth试试,修改criterion为entropy
#树过于复杂，就会产生过拟合
dec = DecisionTreeClassifier()

#训练
dec.fit(x_train, y_train)

# 预测准确率
print("预测的准确率：", dec.score(x_test, y_test))

# 导出决策树的结构
export_graphviz(dec, out_file="tree.dot",
                feature_names=['age', 'pclass=1st', 'pclass=2nd', 'pclass=3rd', 'female', 'male'])


预测的准确率： 0.8085106382978723


# 对决策树进行参数调优

In [39]:
#调整决策树的参数
# 分割数据集到训练集合测试集
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.25, random_state=4)
# 进行处理（特征工程）特征-》类别-》one_hot编码
dict = DictVectorizer(sparse=False)

# 这一步是对字典进行特征抽取
x_train = dict.fit_transform(x_train.to_dict(orient="records"))
x_test = dict.transform(x_test.to_dict(orient="records"))

# print(x_train)
# # 用决策树进行预测，修改max_depth为10，发现提升了,min_impurity_decrease带来的增益要大于0.01才会进行划分
dec = DecisionTreeClassifier(max_depth=10,min_impurity_decrease=0.01,min_samples_split=20)

dec.fit(x_train, y_train)
#
# # 预测准确率
print("预测的准确率：", dec.score(x_test, y_test))
#
# # 导出决策树的结构
export_graphviz(dec, out_file="tree1.dot",
                feature_names=dict.get_feature_names_out())

预测的准确率： 0.8206686930091185


## 4 随机森林分类器

In [None]:
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.25, random_state=4)
# 进行处理（特征工程）特征-》类别-》one_hot编码
dict = DictVectorizer(sparse=False)

# 这一步是对字典进行特征抽取
x_train = dict.fit_transform(x_train.to_dict(orient="records"))
x_test = dict.transform(x_test.to_dict(orient="records"))

In [None]:
# 随机森林进行预测 （超参数调优），n_jobs充分利用多核的一个参数
rf = RandomForestClassifier(n_jobs=-1)
# 120, 200, 300, 500, 800, 1200,n_estimators森林中决策树的数目，也就是分类器的数目
# max_samples  是最大样本数
#bagging类型
param = {"n_estimators": [1500,2000, 5000], "max_depth": [2, 3, 5, 8, 15, 25]}

# 网格搜索与交叉验证
gc = GridSearchCV(rf, param_grid=param, cv=3)

gc.fit(x_train, y_train)

print("准确率：", gc.score(x_test, y_test))

print("查看选择的参数模型：", gc.best_params_)

print("选择最好的模型是：", gc.best_estimator_)

print("每个超参数每次交叉验证的结果：", gc.cv_results_)