In [1]:
# 先导入数据和库
import numpy as np
import pandas as pd

In [3]:
df = pd.read_csv('train.csv')
df.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


### 一、数据清洗简述
####  我们拿到的数据通常是不干净的，所谓的不干净，就是数据中有缺失值，有一些异常点等，需要经过一定的处理才能继续做后面的分析或建模，所以拿到数据的第一步是进行数据清洗，本章我们将学习缺失值、重复值、字符串和数据转换等操作，将数据清洗成可以分析或建模的样子。

### 1.1 缺失值观察与处理
#### 我们拿到的数据经常会有很多缺失值，比如我们可以看到Cabin列存在NaN，那其他列还有没有缺失值，这些缺失值要怎么处理呢

#### 1.1.1 缺失值观察
##### (1) 请查看每个特征缺失值个数
##### (2) 请查看Age， Cabin， Embarked列的数据 

In [5]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 12 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   PassengerId  891 non-null    int64  
 1   Survived     891 non-null    int64  
 2   Pclass       891 non-null    int64  
 3   Name         891 non-null    object 
 4   Sex          891 non-null    object 
 5   Age          714 non-null    float64
 6   SibSp        891 non-null    int64  
 7   Parch        891 non-null    int64  
 8   Ticket       891 non-null    object 
 9   Fare         891 non-null    float64
 10  Cabin        204 non-null    object 
 11  Embarked     889 non-null    object 
dtypes: float64(2), int64(5), object(5)
memory usage: 83.7+ KB


In [6]:
df.isnull().head()  #判断是否有缺失值  True 则为 缺失值

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,False,False,False,False,False,False,False,False,False,False,True,False
1,False,False,False,False,False,False,False,False,False,False,False,False
2,False,False,False,False,False,False,False,False,False,False,True,False
3,False,False,False,False,False,False,False,False,False,False,False,False
4,False,False,False,False,False,False,False,False,False,False,True,False


In [9]:
df.isnull().sum()   # 统计缺失值的个数

PassengerId      0
Survived         0
Pclass           0
Name             0
Sex              0
Age            177
SibSp            0
Parch            0
Ticket           0
Fare             0
Cabin          687
Embarked         2
dtype: int64

In [13]:
# 结合 any（） 来判断某列是否有缺失值  默认 axis = 0  某列   any() 若有一个 缺失值 NaN 则返回 True
df.isnull().any()    # 可与  df.info()  df.isnull().sum() 共同观察

PassengerId    False
Survived       False
Pclass         False
Name           False
Sex            False
Age             True
SibSp          False
Parch          False
Ticket         False
Fare           False
Cabin           True
Embarked        True
dtype: bool

In [15]:
df.isnull().any(axis = 1)  # 判断某行是否有缺失值， 猜测使用较少

0       True
1      False
2       True
3      False
4       True
       ...  
886     True
887    False
888     True
889    False
890     True
Length: 891, dtype: bool

In [17]:
df.isnull().values.any()

True

In [20]:
# 2）查看Age， Cabin， Embarked列的数据
df[['Age','Cabin','Embarked']].head()

Unnamed: 0,Age,Cabin,Embarked
0,22.0,,S
1,38.0,C85,C
2,26.0,,S
3,35.0,C123,S
4,35.0,,S


#### 1.1.2 对缺失值进行处理   （常见的有 丢弃dropna( )和填充 fillna( )）

In [23]:
df.Age.isnull().any()

True

In [25]:
#  定位缺失值所在
df[df.isnull().values == True]

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.2500,,S
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.9250,,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.0500,,S
5,6,0,3,"Moran, Mr. James",male,,0,0,330877,8.4583,,Q
5,6,0,3,"Moran, Mr. James",male,,0,0,330877,8.4583,,Q
...,...,...,...,...,...,...,...,...,...,...,...,...
885,886,0,3,"Rice, Mrs. William (Margaret Norton)",female,39.0,0,5,382652,29.1250,,Q
886,887,0,2,"Montvila, Rev. Juozas",male,27.0,0,0,211536,13.0000,,S
888,889,0,3,"Johnston, Miss. Catherine Helen ""Carrie""",female,,1,2,W./C. 6607,23.4500,,S
888,889,0,3,"Johnston, Miss. Catherine Helen ""Carrie""",female,,1,2,W./C. 6607,23.4500,,S


In [26]:
#  找出某列数据是缺失值 的整体数据
df[df.Age.isnull().values == True]

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
5,6,0,3,"Moran, Mr. James",male,,0,0,330877,8.4583,,Q
17,18,1,2,"Williams, Mr. Charles Eugene",male,,0,0,244373,13.0000,,S
19,20,1,3,"Masselmani, Mrs. Fatima",female,,0,0,2649,7.2250,,C
26,27,0,3,"Emir, Mr. Farred Chehab",male,,0,0,2631,7.2250,,C
28,29,1,3,"O'Dwyer, Miss. Ellen ""Nellie""",female,,0,0,330959,7.8792,,Q
...,...,...,...,...,...,...,...,...,...,...,...,...
859,860,0,3,"Razi, Mr. Raihed",male,,0,0,2629,7.2292,,C
863,864,0,3,"Sage, Miss. Dorothy Edith ""Dolly""",female,,8,2,CA. 2343,69.5500,,S
868,869,0,3,"van Melkebeke, Mr. Philemon",male,,0,0,345777,9.5000,,S
878,879,0,3,"Laleff, Mr. Kristo",male,,0,0,349217,7.8958,,S


In [27]:
#尝试课程例子
df[df['Age']==None]=0
df.head(6)

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S
5,6,0,3,"Moran, Mr. James",male,,0,0,330877,8.4583,,Q


In [28]:
df[df['Age'].isnull()]=0   #将 Age 是缺失值的所在行数据  全都赋值为0 
df.head(6)

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S
5,0,0,0,0,0,0.0,0,0,0,0.0,0,0


In [29]:
df[df['Age'] == np.nan]=0    # 同上，  三个例子 发现  在numpy pandas中  None 和 缺失值 是不同的数据类型
df.head(6)

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S
5,0,0,0,0,0,0.0,0,0,0,0.0,0,0


In [31]:
# 丢弃缺失值
df.dropna().head(6) # 将有缺失值的 整行数据直接丢弃,默认 inplace = False 没有修改 原数据 

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
5,0,0,0,0,0,0.0,0,0,0,0.0,0,0
6,7,0,1,"McCarthy, Mr. Timothy J",male,54.0,0,0,17463,51.8625,E46,S
10,11,1,3,"Sandstrom, Miss. Marguerite Rut",female,4.0,1,1,PP 9549,16.7,G6,S
11,12,1,1,"Bonnell, Miss. Elizabeth",female,58.0,0,0,113783,26.55,C103,S


In [32]:
df.head(6)

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S
5,0,0,0,0,0,0.0,0,0,0,0.0,0,0


In [33]:
df.fillna(0).head(6)  #  fillna填充：将缺失值填充为 0 

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,0,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,0,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,0,S
5,0,0,0,0,0,0.0,0,0,0,0.0,0,0


In [5]:
df_e = pd.DataFrame([[np.nan, 2, np.nan, 0],
                     [3, 4, np.nan, 1],
                     [np.nan, np.nan, np.nan, 5],
                     [1, 2, 3, 4]],
                     columns=list('ABCD'))
df_e

Unnamed: 0,A,B,C,D
0,,2.0,,0
1,3.0,4.0,,1
2,,,,5
3,1.0,2.0,3.0,4


In [6]:
df_e.dropna()

Unnamed: 0,A,B,C,D
3,1.0,2.0,3.0,4


In [7]:
df_e.dropna(axis=1)

Unnamed: 0,D
0,0
1,1
2,5
3,4


In [8]:
df_e.dropna(how='all') #当某行全是NaN时，才会被删除

Unnamed: 0,A,B,C,D
0,,2.0,,0
1,3.0,4.0,,1
2,,,,5
3,1.0,2.0,3.0,4


In [9]:
# thresh 参数
df_e.dropna(thresh=2)  # 保留至少有 2 个 非NaN 的行 

Unnamed: 0,A,B,C,D
0,,2.0,,0
1,3.0,4.0,,1
3,1.0,2.0,3.0,4


In [10]:
df_e.dropna(thresh=3)

Unnamed: 0,A,B,C,D
1,3.0,4.0,,1
3,1.0,2.0,3.0,4


In [12]:
df_e

Unnamed: 0,A,B,C,D
0,,2.0,,0
1,3.0,4.0,,1
2,,,,5
3,1.0,2.0,3.0,4


In [11]:
# subset 参数
df_e.dropna(subset=['B','D'])   # 只查看 'B'、'D'列，若有缺失值NA则drop所在行

Unnamed: 0,A,B,C,D
0,,2.0,,0
1,3.0,4.0,,1
3,1.0,2.0,3.0,4


##### fillna( ) 函数

In [13]:
df_e

Unnamed: 0,A,B,C,D
0,,2.0,,0
1,3.0,4.0,,1
2,,,,5
3,1.0,2.0,3.0,4


In [14]:
# value = None
df_e.fillna(0)  # 将缺失值 填充为 0

Unnamed: 0,A,B,C,D
0,0.0,2.0,0.0,0
1,3.0,4.0,0.0,1
2,0.0,0.0,0.0,5
3,1.0,2.0,3.0,4


In [15]:
# value = None 针对性填充  
# 将“ A”，“ B”，“ C”和“ D”列中的所有NaN元素分别替换为0、1、2和3
values = {'A':0,'B':1,'C':2,'D':3}
df_e.fillna(value=values)

Unnamed: 0,A,B,C,D
0,0.0,2.0,2.0,0
1,3.0,4.0,2.0,1
2,0.0,1.0,2.0,5
3,1.0,2.0,3.0,4


In [16]:
# method = ffill / bfill
df_e

Unnamed: 0,A,B,C,D
0,,2.0,,0
1,3.0,4.0,,1
2,,,,5
3,1.0,2.0,3.0,4


In [17]:
df_e.fillna(method='ffill')    # 前一个有效数值来填充缺失值  即 缺失值向前取一个有效值  

Unnamed: 0,A,B,C,D
0,,2.0,,0
1,3.0,4.0,,1
2,3.0,4.0,,5
3,1.0,2.0,3.0,4


In [18]:
df_e.fillna(method='bfill') # 后一个有效值来填充缺失值，即 缺失值向后取有效值 

Unnamed: 0,A,B,C,D
0,3.0,2.0,3.0,0
1,3.0,4.0,3.0,1
2,1.0,2.0,3.0,5
3,1.0,2.0,3.0,4


In [20]:
# limit 参数 
df_e

Unnamed: 0,A,B,C,D
0,,2.0,,0
1,3.0,4.0,,1
2,,,,5
3,1.0,2.0,3.0,4


In [19]:
# 沿 列轴 仅 替换 第1个缺失值
df_e.fillna(0,limit=1)

Unnamed: 0,A,B,C,D
0,0.0,2.0,0.0,0
1,3.0,4.0,,1
2,,0.0,,5
3,1.0,2.0,3.0,4


In [21]:
df_e.fillna(99,limit=2)

Unnamed: 0,A,B,C,D
0,99.0,2.0,99.0,0
1,3.0,4.0,99.0,1
2,99.0,99.0,,5
3,1.0,2.0,3.0,4


### 2.2 重复值观察与处理
##### 2.2.1  请查看数据中的重复值

In [22]:
df = pd.read_csv('train.csv')
df.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [24]:
df.duplicated().head(6)  #查看是否有重复值 还有 df.duplicated(subset=[])的用法，查看某列是否有重复值

0    False
1    False
2    False
3    False
4    False
5    False
dtype: bool

In [25]:
df[df.duplicated()]

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked


In [26]:
#  举例：
df_e2 = pd.DataFrame([[1,2,3,4],
                      [1,4,5,6],
                      [1,2,3,4],
                      [1,5,6,7]],
                     columns=list('ABCD'))
df_e2    # 例子既要有 行有重复值 又要有列有重复值，便于区分

Unnamed: 0,A,B,C,D
0,1,2,3,4
1,1,4,5,6
2,1,2,3,4
3,1,5,6,7


In [27]:
df_e2.duplicated()  # 结果说明 行索引为2的是与其他重复  即行索引的数值相同的为重复值

0    False
1    False
2     True
3    False
dtype: bool

In [28]:
df_e2[df_e2.duplicated()]   # 找出重复行！！ 所以 df.duplicated( )函数 返回的是行的重复情况

Unnamed: 0,A,B,C,D
2,1,2,3,4


#### 2.2.2 对重复值进行处理  （重复值指的是 行数值相同，谨记！！）
#####  (1)重复值有哪些处理方式呢？ 
#####  (2)处理我们数据的重复值

In [29]:
# 举例处理
df_e2

Unnamed: 0,A,B,C,D
0,1,2,3,4
1,1,4,5,6
2,1,2,3,4
3,1,5,6,7


In [30]:
df_e2.drop_duplicates()  
# drop_duplicates() 函数 发现丢弃了与之前出现过的重复值，即保留的第一个 (keep参数，默认keep = first)

Unnamed: 0,A,B,C,D
0,1,2,3,4
1,1,4,5,6
3,1,5,6,7


In [32]:
# drop_duplicates( ) 函数 及参数
df_e2

Unnamed: 0,A,B,C,D
0,1,2,3,4
1,1,4,5,6
2,1,2,3,4
3,1,5,6,7


In [33]:
df_e2.drop_duplicates(['B'])  # 以'B'列为基准， 根据'B'列来过滤重复值，发现 过滤了第二次出现的'2'所在行

Unnamed: 0,A,B,C,D
0,1,2,3,4
1,1,4,5,6
3,1,5,6,7


In [34]:
# keep 参数  first or  last  默认是 first 即保留第一次出现的重复值
df_e2

Unnamed: 0,A,B,C,D
0,1,2,3,4
1,1,4,5,6
2,1,2,3,4
3,1,5,6,7


In [35]:
df_e2.drop_duplicates(keep='first') # 默认

Unnamed: 0,A,B,C,D
0,1,2,3,4
1,1,4,5,6
3,1,5,6,7


In [36]:
df_e2.drop_duplicates(keep='last')

Unnamed: 0,A,B,C,D
1,1,4,5,6
2,1,2,3,4
3,1,5,6,7


#### 2.2.3 保存数据 --- df.to_csv('文件名')

### 2.3 特征观察与处理