In [9]:
import pandas as pd
import numpy as np
import sklearn
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction import DictVectorizer
from sklearn.tree import DecisionTreeClassifier
from sklearn.tree import export_graphviz

In [2]:
# 导入数据
titanic = pd.read_csv("data/titanic.csv")
titanic.head(2)

Unnamed: 0,survived,pclass,sex,age,sibsp,parch,fare,embarked,class,who,adult_male,deck,embark_town,alive,alone
0,0,3,male,22.0,1,0,7.25,S,Third,man,True,,Southampton,no,False
1,1,1,female,38.0,1,0,71.2833,C,First,woman,False,C,Cherbourg,yes,False


In [3]:
# 数据处理
    # 筛选对target(生存率)有影响的features
features_pre = titanic[["sex","age","class"]]
targets_pre = titanic[["alive"]]
    # 把"age"特征值中的nan项用平均值填充
print(features_pre.info())
features_pre["age"].fillna(int(features_pre["age"].mean()),inplace=True)
print(features_pre.info())
features_pre.head(2)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 3 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   sex     891 non-null    object 
 1   age     714 non-null    float64
 2   class   891 non-null    object 
dtypes: float64(1), object(2)
memory usage: 21.0+ KB
None
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 3 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   sex     891 non-null    object 
 1   age     891 non-null    float64
 2   class   891 non-null    object 
dtypes: float64(1), object(2)
memory usage: 21.0+ KB
None


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  return super().fillna(


Unnamed: 0,sex,age,class
0,male,22.0,Third
1,female,38.0,First


In [4]:
# train_test_split
x_train,x_test,y_train,y_test = train_test_split(features_pre,targets_pre,train_size=0.75)

In [5]:
# sklearn.feature_extration.DictVectorizer 的教学
fruits=[{"fruit":"苹果", "price":5}, {"fruit":"橘子","price":5.9}, {"fruit":"波萝","price":9.9}]
vect = sklearn.feature_extraction.DictVectorizer()
fruits_transformed = vect.fit_transform(fruits)
print(vect.get_feature_names())
print(type(fruits_transformed))
fruits_transformed.toarray()

['fruit=橘子', 'fruit=波萝', 'fruit=苹果', 'price']
<class 'scipy.sparse.csr.csr_matrix'>


array([[0. , 0. , 1. , 5. ],
       [1. , 0. , 0. , 5.9],
       [0. , 1. , 0. , 9.9]])

In [6]:
# sklearn.feature_extration.DictVectorizer 的应用
    # 首先先将x_train,x_test转变为Dict格式
x_train = x_train.to_dict(orient="records")
x_test = x_test.to_dict(orient="records")
    # 将train和test的data用DictVectorizer处理
vect = DictVectorizer()
vect.fit(x_train)
x_train_transform = vect.transform(x_train)
x_test_transform = vect.transform(x_test)
print(vect.get_feature_names())
x_test_transform.toarray()

['age', 'class=First', 'class=Second', 'class=Third', 'sex=female', 'sex=male']


array([[61.,  0.,  0.,  1.,  0.,  1.],
       [39.,  1.,  0.,  0.,  1.,  0.],
       [19.,  0.,  0.,  1.,  0.,  1.],
       ...,
       [36.,  0.,  1.,  0.,  1.,  0.],
       [42.,  0.,  1.,  0.,  1.,  0.],
       [40.,  0.,  1.,  0.,  1.,  0.]])

In [7]:
# 决策树训练及评分
dtc = DecisionTreeClassifier()
dtc.fit(x_train_transform,y_train)
dtc.score(x_test_transform,y_test)

0.8251121076233184

# DecisionTreeClassifier()参数
1. criterion:
gini 或者entropy(信息熵) 的方式。
2. splitter :
best 或 random , best 是在所有特种中找最好的切分点， random 是随机的找一些特征来进行切分（数据量大的时候用 random)
3. max_depth：树的最大深度。当特征或者数据量比较小的时候可以不用管这个值。特征比较多的时候可以尝试限制一下。
4. min_samples_split：决策树中某个叶子节点的样本最小个数。如果数据量不大，不需要管这个值，如果样本量比较大，则推荐增大这个值。
5. min_ weight_ fraction_ 1eaf：叶子节点所有样本权重和的最小值。如果小于这个值，则会和兄弟节点一起被剪枝，默认是0，也就是不考志权重的问题。一般来说，如果我们有较多样有缺失值，或者分类树样本的分布类别偏差很大，就会引入样本权重，这时我们就要注意这个值了。
6. max_leaf_nodes：最大的叶子节点的个数。默认是None，即不限制叶子节点的个数。如果设買了这个值，那么在决策树建立的过程中优化叶子节点的个数。如果特征不多，可以不考虑这个值，但是如果特征分多的话，可以加以限制。
7. class weieht指定样木名特征的权重，主要是为了方式某此特征的样木过多导致偏向这些特征默认是 balance ，也就是算法会自动的调节权重。
8. min imouritv_decrease：最小的不纯度（基尼系数、信息增益等）。如果小于这个数，那么就不会再往下生成叶子节点了


In [12]:
# 决策树的绘制 class_names从小到大排列
export_graphviz(dtc,"tree.dot",feature_names=['age', 'class=First', 'class=Second', 'class=Third', 'sex=female', 'sex=male'],class_names=["no","yes"])

In [8]:
# 用K近邻方法训练
    # 标准化
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()
x_train_st = scaler.fit_transform(x_train_transform.toarray())
x_test_st = scaler.fit_transform(x_test_transform.toarray())
    #  训练
knn = sklearn.neighbors.KNeighborsClassifier()
# knn.fit(x_train_st,y_train)
knn.fit(x_train_st,np.array(y_train.reset_index()["alive"]))
knn.score(x_test_st,np.array(y_test.reset_index()["alive"]))

0.8116591928251121