In [2]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import urllib.request
import os
%matplotlib inline

# 1.读取数据集

In [3]:
 # 读取数据文件，结果为DataFrame格式
df_data = pd.read_excel("./data/titanic3.xls")


# 2.筛选提取字段

survival（是否生存）是标签字段，其他是候选特征字段

筛选提取需要的特征字段，去掉ticket，cabin等

![image-2.png](attachment:image-2.png)

In [4]:
# 筛选提取字段
selected_cols=['survived','name','pclass','sex','age','sibsp','parch','fare','embarked']
selected_df_data=df_data[selected_cols]

# 3找出有null值的字段
Pandas判断缺失值一般采用 isnull()，生成所有数据的True／False矩阵

In [6]:
# 找出存在缺失值的"列"
selected_df_data.isnull().any()
selected_df_data.isnull().sum()
selected_df_data[selected_df_data.isnull().values==True]

Unnamed: 0,survived,name,pclass,sex,age,sibsp,parch,fare,embarked
15,0,"Baumann, Mr. John D",1,male,,0,0,25.9250,S
37,1,"Bradley, Mr. George (""George Arthur Brayton"")",1,male,,0,0,26.5500,S
40,0,"Brewe, Dr. Arthur Jackson",1,male,,0,0,39.6000,C
46,0,"Cairns, Mr. Alexander",1,male,,0,0,31.0000,S
59,1,"Cassebeer, Mrs. Henry Arthur Jr (Eleanor Genev...",1,female,,0,0,27.7208,C
...,...,...,...,...,...,...,...,...,...
1293,0,"Williams, Mr. Howard Hugh ""Harry""",3,male,,0,0,8.0500,S
1297,0,"Wiseman, Mr. Phillippe",3,male,,0,0,7.2500,S
1302,0,"Yousif, Mr. Wazli",3,male,,0,0,7.2250,C
1303,0,"Yousseff, Mr. Gerious",3,male,,0,0,14.4583,C


# 4.填充null值

In [7]:
# 为缺失age记录填充值 设置为平均值
age_mean_value= selected_df_data['age'].mean()
selected_df_data['age'] = selected_df_data['age'].fillna(age_mean_value)
# 为缺失fare记录填充值
fare_mean_value=selected_df_data['fare'].mean()
selected_df_data['fare'] = selected_df_data['fare'].fillna(fare_mean_value)
#  为缺失embarked记录填充值
selected_df_data['embarked']=selected_df_data['embarked'].fillna('S')

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  selected_df_data['age'] = selected_df_data['age'].fillna(age_mean_value)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  selected_df_data['fare'] = selected_df_data['fare'].fillna(fare_mean_value)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  selected_df_data['embarked']=selected_df_data['embarked'

# 5.转换编码

In [8]:
#  性别sex由字符串转换为数字编码
selected_df_data['sex'] = selected_df_data['sex'].map({'female':0,'male':1}).astype(int)
#  港口embarked由字母表示转换为数字编码
selected_df_data['embarked'] = selected_df_data['embarked'].map({'C':0,'Q':1,'S':2}).astype(int)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  selected_df_data['sex'] = selected_df_data['sex'].map({'female':0,'male':1}).astype(int)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  selected_df_data['embarked'] = selected_df_data['embarked'].map({'C':0,'Q':1,'S':2}).astype(int)


# 6.删除name字段

drop不改变原有的df中的数据，而是返回另一个DataFrame来存放删除后的数据

axis = 1 表示删除列


In [9]:
#  删除name字段
selected_df_data = selected_df_data.drop(['name'],axis=1)   # axis=1表示删除列
# 显示前3行数据
selected_df_data[:3]

Unnamed: 0,survived,pclass,sex,age,sibsp,parch,fare,embarked
0,1,1,0,29.0,0,0,211.3375,2
1,1,1,1,0.9167,1,2,151.55,2
2,0,1,0,2.0,1,2,151.55,2


# 7.分离特征值和标签值

In [10]:
# 转换为ndarray数组
ndarray_data = selected_df_data.values
# 后7列是特征值
features = ndarray_data[:,1:]
# 第0列是标签值
label = ndarray_data[:,0]
features[:3]
label[:3]

array([1., 1., 0.])

# 8.特征值标准化处理

In [11]:
from sklearn import preprocessing
minmax_scale = preprocessing.MinMaxScaler(feature_range=(0,1))
norm_features = minmax_scale.fit_transform(features)
norm_features[:3]

array([[0.        , 0.        , 0.36116884, 0.        , 0.        ,
        0.41250333, 1.        ],
       [0.        , 1.        , 0.00939458, 0.125     , 0.22222222,
        0.2958059 , 1.        ],
       [0.        , 0.        , 0.0229641 , 0.125     , 0.22222222,
        0.2958059 , 1.        ]])

# 9. 定义数据预处理函数

把前面数据预处理的命令全部封装到prepare_data函数中，方便后面调用

In [45]:
# 定义数据预处理函数
from sklearn import preprocessing
def prepare_data(df_data):
    df = df_data.drop(['name'],axis=1) # 删除姓名列
    age_mean = df['age'].mean()
    df['age'] = df['age'].fillna(age_mean) # 为缺失age记录填充值
    fare_mean = df['fare'].mean()
    df['fare'] = df['fare'].fillna(fare_mean) # 为缺失fare记录填充值
    df['sex'] = df['sex'].map({'female':0,'male':1}).astype(int) # 把sex值由字符串转换为数值
    df['embarked'] = df['embarked'].fillna('S') # 为缺失embarked记录填充值
    df['embarked'] = df['embarked'].map({'C':0,'Q':1,'S':2}).astype(int) # 把embarked值由字符串转换为数值
    
    ndarray_data = df.values # 转换为ndarray数组
    features = ndarray_data[:,1:] # 后7列是特征值
    label = ndarray_data[:,0] # 第0列是标签值
    # 特征值标准化
    minmax_scale = preprocessing.MinMaxScaler(feature_range=(0,1))
    norm_features = minmax_scale.fit_transform(features)
    
    return norm_features,label