In [2]:
import numpy as np
import pandas as pd

In [3]:
# 创建随机数生成器
rng = np.random.default_rng(seed=1234)

# 7.1 处理缺失值，哨兵值

In [4]:
float_data = pd.Series([1.2, -3.5, np.nan, 0])

float_data

0    1.2
1   -3.5
2    NaN
3    0.0
dtype: float64

In [5]:
float_data.isna()

0    False
1    False
2     True
3    False
dtype: bool

In [6]:
string_data = pd.Series(['aardvark', np.nan, None, 'avocado'])

string_data

0    aardvark
1         NaN
2        None
3     avocado
dtype: object

In [7]:
string_data.isna()

0    False
1     True
2     True
3    False
dtype: bool

In [8]:
float_data = pd.DataFrame(rng.standard_normal((3,3)), 
                          dtype='float64', 
                          columns=['a', 'b', 'c'])

float_data

Unnamed: 0,a,b,c
0,-1.603837,0.0641,0.740891
1,0.152619,0.863744,2.913099
2,-1.478823,0.945473,-1.666135


In [9]:
float_data.iloc[:2, 2] = np.nan
float_data.isna()

Unnamed: 0,a,b,c
0,False,False,True
1,False,False,True
2,False,False,False


In [10]:
float_data.loc[[0,2] , ['a', 'b']] = np.nan # loc() 根据标签索引
float_data.isna()

Unnamed: 0,a,b,c
0,True,True,True
1,False,False,True
2,True,True,False


## 过滤缺失值

In [11]:
data = pd.Series([1, np.nan, 3.5, np.nan, 7])

data.dropna() # 过滤缺失值

0    1.0
2    3.5
4    7.0
dtype: float64

In [12]:
data[data.notna()] # =data.dropna()

0    1.0
2    3.5
4    7.0
dtype: float64

In [13]:
dataf = pd.DataFrame(rng.standard_normal((4,3)))
dataf.iloc[1:3, 1:] = np.nan
dataf.iloc[2:, 0] =np.nan
dataf

Unnamed: 0,0,1,2
0,0.343745,-0.512444,1.323759
1,-0.86028,,
2,,,
3,,-1.002166,0.268346


In [14]:
dataf.dropna()

Unnamed: 0,0,1,2
0,0.343745,-0.512444,1.323759


In [15]:
dataf.dropna(how='all')
#    - `'any'` (默认)："零容忍"策略 - 确保每个保留的观测都是完整的
#   - `'all'`："最大限度保留"策略 - 只删除完全没有信息的行/列

Unnamed: 0,0,1,2
0,0.343745,-0.512444,1.323759
1,-0.86028,,
3,,-1.002166,0.268346


In [16]:
dataf[4] = np.nan
dataf

Unnamed: 0,0,1,2,4
0,0.343745,-0.512444,1.323759,
1,-0.86028,,,
2,,,,
3,,-1.002166,0.268346,


In [17]:
dataf.dropna(axis='columns', how='all')

Unnamed: 0,0,1,2
0,0.343745,-0.512444,1.323759
1,-0.86028,,
2,,,
3,,-1.002166,0.268346


In [20]:
df = pd.DataFrame(rng.standard_normal((7, 3)))
df.iloc[:4, 1] = np.nan
df.iloc[:2, 2] = np.nan

df

Unnamed: 0,0,1,2
0,-0.163806,,
1,-1.17415,,
2,-0.136644,,-0.088218
3,1.557242,,0.522726
4,0.93715,-0.836909,0.098068
5,-1.570553,-1.779878,0.918836
6,-0.149068,1.005636,0.131018


In [21]:
df.dropna()

Unnamed: 0,0,1,2
4,0.93715,-0.836909,0.098068
5,-1.570553,-1.779878,0.918836
6,-0.149068,1.005636,0.131018


In [None]:
df.dropna(thresh=2) # threshold = 2 设置一个阈值

Unnamed: 0,0,1,2
2,-0.136644,,-0.088218
3,1.557242,,0.522726
4,0.93715,-0.836909,0.098068
5,-1.570553,-1.779878,0.918836
6,-0.149068,1.005636,0.131018


## 补充缺失值

In [24]:
df.fillna(0)

Unnamed: 0,0,1,2
0,-0.163806,0.0,0.0
1,-1.17415,0.0,0.0
2,-0.136644,0.0,-0.088218
3,1.557242,0.0,0.522726
4,0.93715,-0.836909,0.098068
5,-1.570553,-1.779878,0.918836
6,-0.149068,1.005636,0.131018


用字典调用 `fillna` 时，你可以为每列使用不同的填充值：

In [25]:
df.fillna({1: 0.5, 2: 0})

Unnamed: 0,0,1,2
0,-0.163806,0.5,0.0
1,-1.17415,0.5,0.0
2,-0.136644,0.5,-0.088218
3,1.557242,0.5,0.522726
4,0.93715,-0.836909,0.098068
5,-1.570553,-1.779878,0.918836
6,-0.149068,1.005636,0.131018


用于`.reindex()`的插值方法（见表 5.3）也可以用于 `fillna`：

In [26]:
df = pd.DataFrame(rng.standard_normal((6,3)))
df.iloc[2:, 1] = np.nan
df.iloc[4:, 2] = np.nan

df

Unnamed: 0,0,1,2
0,-0.773047,2.894307,1.377078
1,0.171456,0.022242,1.652686
2,-0.321875,,0.654605
3,-1.321826,,1.117381
4,0.546459,,
5,-0.491283,,


In [None]:
df.fillna(method='ffill') # 将过时

  df.fillna(method='ffill')


Unnamed: 0,0,1,2
0,-0.773047,2.894307,1.377078
1,0.171456,0.022242,1.652686
2,-0.321875,0.022242,0.654605
3,-1.321826,0.022242,1.117381
4,0.546459,0.022242,1.117381
5,-0.491283,0.022242,1.117381


In [28]:
df.ffill()

Unnamed: 0,0,1,2
0,-0.773047,2.894307,1.377078
1,0.171456,0.022242,1.652686
2,-0.321875,0.022242,0.654605
3,-1.321826,0.022242,1.117381
4,0.546459,0.022242,1.117381
5,-0.491283,0.022242,1.117381


In [None]:
df.ffill(limit=1) # 填充数 = limit value

Unnamed: 0,0,1,2
0,-0.773047,2.894307,1.377078
1,0.171456,0.022242,1.652686
2,-0.321875,0.022242,0.654605
3,-1.321826,,1.117381
4,0.546459,,1.117381
5,-0.491283,,


In [54]:
print('dataframe 切片成 Series：')
print(df.median())
print(df.median().shape)
print('-------')
print('Series 按广播机制默认匹配 dataframe 的 colums => col1 = 1.458275， col2 = 1.247230')
print(df.fillna(df.median())) # 默认 axis =0

dataframe 切片成 Series：
0   -0.406579
1    1.458275
2    1.247230
dtype: float64
(3,)
-------
Series 按广播机制默认匹配 dataframe 的 colums => col1 = 1.458275， col2 = 1.247230
          0         1         2
0 -0.773047  2.894307  1.377078
1  0.171456  0.022242  1.652686
2 -0.321875  1.458275  0.654605
3 -1.321826  1.458275  1.117381
4  0.546459  1.458275  1.247230
5 -0.491283  1.458275  1.247230


In [None]:
print(df.median(axis=1))
print(df.median(axis=1).shape) # 压缩掉了 axis=1，剩下 shape （6， ）
print('-------')
print(df.fillna(df.median(axis=1), axis=0)) # 默认 axis=0 对齐，fillna 暂不支持 axis >0 高维运算

0    1.377078
1    0.171456
2    0.166365
3   -0.102223
4    0.546459
5   -0.491283
dtype: float64
(6,)
-------
          0         1         2
0 -0.773047  2.894307  1.377078
1  0.171456  0.022242  1.652686
2 -0.321875  0.171456  0.654605
3 -1.321826  0.171456  1.117381
4  0.546459  0.171456  0.166365
5 -0.491283  0.171456  0.166365


# 数据转换 data transformation

## 删除重复内容

In [40]:
data = pd.DataFrame({'k1': ['one', 'two']*3+['two'],
                     'k2': [1, 1, 2, 3, 3, 4, 4]})

data

Unnamed: 0,k1,k2
0,one,1
1,two,1
2,one,2
3,two,3
4,one,3
5,two,4
6,two,4


In [53]:
data.duplicated() 
# 因为row 5, 6 的 k1, k2 列相同

0    False
1    False
2    False
3    False
4    False
5    False
6     True
dtype: bool

In [52]:
data.drop_duplicates()

Unnamed: 0,k1,k2
0,one,1
1,two,1
2,one,2
3,two,3
4,one,3
5,two,4


In [56]:
data['v1'] = range(7)
data

Unnamed: 0,k1,k2,v1
0,one,1,0
1,two,1,1
2,one,2,2
3,two,3,3
4,one,3,4
5,two,4,5
6,two,4,6


In [None]:
data.drop_duplicates(subset=['k1']) # 默认保留第一个观察到的值组合

Unnamed: 0,k1,k2,v1
0,one,1,0
1,two,1,1


In [None]:
data.drop_duplicates(subset=['k1'], keep='last') # keep="last" will return the last one

Unnamed: 0,k1,k2,v1
4,one,3,4
6,two,4,6


In [59]:
data.drop_duplicates(subset=['k1', 'k2'])

Unnamed: 0,k1,k2,v1
0,one,1,0
1,two,1,1
2,one,2,2
3,two,3,3
4,one,3,4
5,two,4,5


## 使用函数或映射mapping转换数据