# 清除示例 II
我们先练习癌症数据缺失值和重复值的处理。

In [21]:
import pandas as pd
import numpy as np

## 清除缺失值的具体步骤：

## 1 获取概览

In [22]:
# Read `cancer_data.csv` into a DataFrame
df=pd.read_csv('cancer_data.csv')
# Use info() to check which columns have missing values
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 569 entries, 0 to 568
Data columns (total 32 columns):
id                        569 non-null int64
diagnosis                 569 non-null object
radius_mean               569 non-null float64
texture_mean              548 non-null float64
perimeter_mean            569 non-null float64
area_mean                 569 non-null float64
smoothness_mean           521 non-null float64
compactness_mean          569 non-null float64
concavity_mean            569 non-null float64
concave_points_mean       569 non-null float64
symmetry_mean             504 non-null float64
fractal_dimension_mean    569 non-null float64
radius_SE                 569 non-null float64
texture_SE                548 non-null float64
perimeter_SE              569 non-null float64
area_SE                   569 non-null float64
smoothness_SE             521 non-null float64
compactness_SE            569 non-null float64
concavity_SE              569 non-null float64
conca

## 2 使用`mean()`的方式获取均值，并且对每一个存在空值的列使用`fillna`的方式用均值进行填充

使用`df['列名'].mean()`的方式获取该列的均值  
并通过`df['xx'].fillna(mean)`的方式向该列中的空值填充平均值  
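注意`.fillna`默认返回一个新的列（Series），不会修改原有的df  
可以把结果赋值回原列：`df['xx'] = df['xx'].fillna(mean)`；也可以使用`fillna(xx,inplace=True)`原地替换，但在较新版本的pandas中，对`df['xx']`这样选取出来的列使用`inplace=True`已被弃用，可能不会更新原df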

### 解决办法1：

In [None]:
# Fill missing values with each column's mean.
# NOTE: spelling out every column like this is repetitive and error-prone;
# it is kept only for comparison and is not the recommended approach.
# Each result is assigned back to df: calling fillna(..., inplace=True) on
# df['col'] acts on a selected column, which newer pandas deprecates and
# which leaves df unchanged once Copy-on-Write is enabled.
mean_texture = df['texture_mean'].mean()
df['texture_mean'] = df['texture_mean'].fillna(mean_texture)
mean_smoothness = df['smoothness_mean'].mean()
df['smoothness_mean'] = df['smoothness_mean'].fillna(mean_smoothness)
mean_symmetry = df['symmetry_mean'].mean()
df['symmetry_mean'] = df['symmetry_mean'].fillna(mean_symmetry)

df['texture_SE'] = df['texture_SE'].fillna(df['texture_SE'].mean())
df['symmetry_SE'] = df['symmetry_SE'].fillna(df['symmetry_SE'].mean())
df['texture_max'] = df['texture_max'].fillna(df['texture_max'].mean())

df['smoothness_SE'] = df['smoothness_SE'].fillna(df['smoothness_SE'].mean())
df['smoothness_max'] = df['smoothness_max'].fillna(df['smoothness_max'].mean())
df['symmetry_max'] = df['symmetry_max'].fillna(df['symmetry_max'].mean())

# Use info() to confirm the change
df.info()

### 解决办法2

下面是更推荐的方式：  
* 统计每一列中null值的个数  
* 得到表示每一列是否包含null的bool序列  
* 用该bool序列筛选出包含null的列名，并转成list  
* 对每一个包含null值的列进行均值填充

In [15]:
# Count the null values in each column (only the first five columns are shown)
df.isnull().sum().head()

id                 0
diagnosis          0
radius_mean        0
texture_mean      21
perimeter_mean     0
dtype: int64

In [17]:
# Flag, per column, whether it contains at least one null (first five shown)
(df.isnull().sum()>0).head()

id                False
diagnosis         False
radius_mean       False
texture_mean       True
perimeter_mean    False
dtype: bool

In [18]:
# Use the boolean mask to select the labels of the columns that contain nulls
df.columns[df.isnull().sum()>0]

Index(['texture_mean', 'smoothness_mean', 'symmetry_mean', 'texture_SE',
       'smoothness_SE', 'symmetry_SE', 'texture_max', 'smoothness_max',
       'symmetry_max'],
      dtype='object')

In [19]:
# Convert the selected column labels (an Index) into a plain Python list
list(df.columns[df.isnull().sum()>0])

['texture_mean',
 'smoothness_mean',
 'symmetry_mean',
 'texture_SE',
 'smoothness_SE',
 'symmetry_SE',
 'texture_max',
 'smoothness_max',
 'symmetry_max']

In [23]:
# Fill every column that contains nulls with that column's own mean.
for column in list(df.columns[df.isnull().sum()>0]):
    mean_value = df[column].mean()
    # Assign the filled column back to df instead of calling
    # df[column].fillna(..., inplace=True): an in-place fill on a selected
    # column is deprecated chained assignment in newer pandas and leaves df
    # unchanged once Copy-on-Write is enabled.
    df[column] = df[column].fillna(mean_value)
# Confirm that no column still reports missing values
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 569 entries, 0 to 568
Data columns (total 32 columns):
id                        569 non-null int64
diagnosis                 569 non-null object
radius_mean               569 non-null float64
texture_mean              569 non-null float64
perimeter_mean            569 non-null float64
area_mean                 569 non-null float64
smoothness_mean           569 non-null float64
compactness_mean          569 non-null float64
concavity_mean            569 non-null float64
concave_points_mean       569 non-null float64
symmetry_mean             569 non-null float64
fractal_dimension_mean    569 non-null float64
radius_SE                 569 non-null float64
texture_SE                569 non-null float64
perimeter_SE              569 non-null float64
area_SE                   569 non-null float64
smoothness_SE             569 non-null float64
compactness_SE            569 non-null float64
concavity_SE              569 non-null float64
conca

## 消除重复值

## 消除重复值的步骤：  
### 1 通过`duplicated()`获取重复项

在pandas中可以通过`duplicated()`获取重复的项  
使用`drop_duplicates(inplace=True)`的方式剔除掉重复的项

In [5]:
# Check the data for duplicate rows
df.duplicated()

0      False
1      False
2      False
3      False
4      False
5      False
6      False
7      False
8      False
9      False
10     False
11     False
12     False
13     False
14     False
15     False
16     False
17     False
18     False
19     False
20     False
21     False
22     False
23     False
24     False
25     False
26     False
27     False
28     False
29     False
       ...  
539    False
540    False
541    False
542    False
543    False
544    False
545    False
546    False
547    False
548    False
549    False
550    False
551    False
552    False
553    False
554    False
555    False
556    False
557    False
558     True
559    False
560    False
561    False
562    False
563    False
564    False
565    False
566    False
567    False
568    False
Length: 569, dtype: bool

In [6]:
# Drop the duplicate rows, modifying df in place
df.drop_duplicates(inplace=True)

In [7]:
# Check for duplicates again to confirm the change
df.duplicated()

0      False
1      False
2      False
3      False
4      False
5      False
6      False
7      False
8      False
9      False
10     False
11     False
12     False
13     False
14     False
15     False
16     False
17     False
18     False
19     False
20     False
21     False
22     False
23     False
24     False
25     False
26     False
27     False
28     False
29     False
       ...  
538    False
539    False
540    False
541    False
542    False
543    False
544    False
545    False
546    False
547    False
548    False
549    False
550    False
551    False
552    False
553    False
554    False
555    False
556    False
557    False
559    False
560    False
561    False
562    False
563    False
564    False
565    False
566    False
567    False
568    False
Length: 564, dtype: bool

## 3 重命名列
由于之前修改了数据集，使其仅包括肿瘤特征的均值，因此每个特征名末尾似乎不再需要 "_mean" 后缀；而且保留它会让之后分析时输入列名更费时间。下面我们来确定要分配给各列的新标签。

In [8]:
# Remove the "_mean" suffix from the column names.
suffix = '_mean'
new_labels = [
    # Only strip when the name actually ends with the suffix, so a name that
    # merely contains "_mean" elsewhere is not truncated by mistake.
    # len(suffix) is 5, i.e. the trailing five characters are dropped.
    col[:-len(suffix)] if col.endswith(suffix) else col
    for col in df.columns
]

# New labels for the columns
new_labels

['id',
 'diagnosis',
 'radius',
 'texture',
 'perimeter',
 'area',
 'smoothness',
 'compactness',
 'concavity',
 'concave_points',
 'symmetry',
 'fractal_dimension',
 'radius_SE',
 'texture_SE',
 'perimeter_SE',
 'area_SE',
 'smoothness_SE',
 'compactness_SE',
 'concavity_SE',
 'concave_points_SE',
 'symmetry_SE',
 'fractal_dimension_SE',
 'radius_max',
 'texture_max',
 'perimeter_max',
 'area_max',
 'smoothness_max',
 'compactness_max',
 'concavity_max',
 'concave_points_max',
 'symmetry_max',
 'fractal_dimension_max']

In [9]:
# Assign the new labels to the DataFrame's columns
df.columns = new_labels

# Show the first few rows of the DataFrame to confirm the change
df.head()

Unnamed: 0,id,diagnosis,radius,texture,perimeter,area,smoothness,compactness,concavity,concave_points,...,radius_max,texture_max,perimeter_max,area_max,smoothness_max,compactness_max,concavity_max,concave_points_max,symmetry_max,fractal_dimension_max
0,842302,M,17.99,19.293431,122.8,1001.0,0.1184,0.2776,0.3001,0.1471,...,25.38,25.660803,184.6,2019.0,0.1622,0.6656,0.7119,0.2654,0.4601,0.1189
1,842517,M,20.57,17.77,132.9,1326.0,0.08474,0.07864,0.0869,0.07017,...,24.99,23.41,158.8,1956.0,0.1238,0.1866,0.2416,0.186,0.275,0.08902
2,84300903,M,19.69,21.25,130.0,1203.0,0.1096,0.1599,0.1974,0.1279,...,23.57,25.53,152.5,1709.0,0.1444,0.4245,0.4504,0.243,0.3613,0.08758
3,84348301,M,11.42,20.38,77.58,386.1,0.096087,0.2839,0.2414,0.1052,...,14.91,26.5,98.87,567.7,0.13209,0.8663,0.6869,0.2575,0.6638,0.173
4,84358402,M,20.29,14.34,135.1,1297.0,0.1003,0.1328,0.198,0.1043,...,22.54,16.67,152.2,1575.0,0.1374,0.205,0.4,0.1625,0.2364,0.07678


In [10]:
# Save the result for later use; index=False leaves the row index out of the file
df.to_csv('cancer_data_edited.csv', index=False)

### 备注

使用`new_df=pd.to_datetime(df['xx'])`的方式将指定列转为datetime

注意：对于已经转换为datetime的列，用`to_csv`保存之后再次读取csv文件时，**该列仍会被读成str类型（object）**  
需要再次转换（或在`read_csv`中通过`parse_dates`参数指定该列）