In [1]:
import numpy as np
import pandas as pd

## 一、缺失值的统计和删除
## 1.缺失信息的统计

In [3]:
# Load only the six columns used in this notebook and preview the first rows.
df = pd.read_csv(
    'learn_pandas.csv',
    usecols=['Grade', 'Name', 'Gender', 'Height', 'Weight', 'Transfer'],
)
df.head()

Unnamed: 0,Grade,Name,Gender,Height,Weight,Transfer
0,Freshman,Gaopeng Yang,Female,158.9,46.0,N
1,Freshman,Changqiang You,Male,166.5,70.0,N
2,Senior,Mei Sun,Male,188.9,89.0,N
3,Sophomore,Xiaojuan Sun,Female,,41.0,N
4,Sophomore,Gaojuan You,Male,174.0,74.0,N


In [4]:
# Boolean mask of the first rows: True marks a missing value.
df.head().isna()

Unnamed: 0,Grade,Name,Gender,Height,Weight,Transfer
0,False,False,False,False,False,False
1,False,False,False,False,False,False
2,False,False,False,False,False,False
3,False,False,False,True,False,False
4,False,False,False,False,False,False


In [5]:
# Fraction of missing values in each column (mean of the boolean mask).
df.isna().mean()

Grade       0.000
Name        0.000
Gender      0.000
Height      0.085
Weight      0.055
Transfer    0.060
dtype: float64

In [6]:
# First rows whose Height value is missing.
df.loc[df['Height'].isna()].head()

Unnamed: 0,Grade,Name,Gender,Height,Weight,Transfer
3,Sophomore,Xiaojuan Sun,Female,,41.0,N
12,Senior,Peng You,Female,,48.0,
26,Junior,Yanli You,Female,,48.0,N
36,Freshman,Xiaojuan Qin,Male,,79.0,Y
60,Freshman,Yanpeng Lv,Male,,65.0,N


In [9]:
# Rows in which Height, Weight and Transfer are all missing at the same time.
sub_set = df[['Height', 'Weight', 'Transfer']]
# The axis is passed by keyword so the row-wise reduction is explicit and
# matches DataFrame.any, whose arguments are keyword-only in pandas 2.0+.
df[sub_set.isna().all(axis=1)]

Unnamed: 0,Grade,Name,Gender,Height,Weight,Transfer
102,Junior,Chengli Zhao,Male,,,


In [10]:
# Rows in which at least one of Height, Weight or Transfer is missing.
# axis must be given by keyword: DataFrame.any accepts keyword arguments only
# in pandas 2.0+, so the positional form any(1) raises TypeError there.
df[sub_set.isna().any(axis=1)]

Unnamed: 0,Grade,Name,Gender,Height,Weight,Transfer
3,Sophomore,Xiaojuan Sun,Female,,41.0,N
9,Junior,Juan Xu,Female,164.8,,N
12,Senior,Peng You,Female,,48.0,
21,Senior,Xiaopeng Shen,Male,166.0,62.0,
26,Junior,Yanli You,Female,,48.0,N
36,Freshman,Xiaojuan Qin,Male,,79.0,Y
60,Freshman,Yanpeng Lv,Male,,65.0,N
61,Sophomore,Xiaopeng Qin,Male,172.8,,N
69,Junior,Chunquan Xu,Female,162.1,54.0,
76,Sophomore,Yanquan Lv,Male,174.6,,N


In [11]:
# Rows with no missing value in any of the sub_set columns.
# The axis is passed by keyword so the row-wise reduction is explicit.
df[sub_set.notna().all(axis=1)]

Unnamed: 0,Grade,Name,Gender,Height,Weight,Transfer
0,Freshman,Gaopeng Yang,Female,158.9,46.0,N
1,Freshman,Changqiang You,Male,166.5,70.0,N
2,Senior,Mei Sun,Male,188.9,89.0,N
4,Sophomore,Gaojuan You,Male,174.0,74.0,N
5,Freshman,Xiaoli Qian,Female,158.0,51.0,N
...,...,...,...,...,...,...
195,Junior,Xiaojuan Sun,Female,153.9,46.0,N
196,Senior,Li Zhao,Female,160.9,50.0,N
197,Senior,Chengqiang Chu,Female,153.9,45.0,N
198,Senior,Chengmei Shen,Male,175.3,71.0,N


## 2.缺失信息的删除

In [14]:
# Drop every row that is missing Height or Weight, then report the new shape.
res = df.dropna(subset=['Height', 'Weight'], how='any')
res.shape

(174, 6)

In [15]:
# Keep a column only if it has at least len(df) - 15 non-missing values
# (noted as 185 for this dataset); columns below that threshold are dropped.
# In this dataset the Height column is the one removed.
df.dropna(axis='columns', thresh=len(df) - 15)

Unnamed: 0,Grade,Name,Gender,Weight,Transfer
0,Freshman,Gaopeng Yang,Female,46.0,N
1,Freshman,Changqiang You,Male,70.0,N
2,Senior,Mei Sun,Male,89.0,N
3,Sophomore,Xiaojuan Sun,Female,41.0,N
4,Sophomore,Gaojuan You,Male,74.0,N
...,...,...,...,...,...
195,Junior,Xiaojuan Sun,Female,46.0,N
196,Senior,Li Zhao,Female,50.0,N
197,Senior,Chengqiang Chu,Female,45.0,N
198,Senior,Chengmei Shen,Male,71.0,N


In [16]:
# Boolean-indexing equivalent of dropna(how='any', subset=['Height', 'Weight']):
# keep the rows where both columns are present.
# The axis is passed by keyword so the row-wise reduction is explicit.
res = df.loc[df[['Height', 'Weight']].notna().all(axis=1)]
res.shape

(174, 6)

In [18]:
# Boolean-indexing equivalent of the thresh-based column drop:
# keep the columns that have at most 15 missing values.
res = df.loc[:, df.isna().sum() <= 15]
res.head()

Unnamed: 0,Grade,Name,Gender,Weight,Transfer
0,Freshman,Gaopeng Yang,Female,46.0,N
1,Freshman,Changqiang You,Male,70.0,N
2,Senior,Mei Sun,Male,89.0,N
3,Sophomore,Xiaojuan Sun,Female,41.0,N
4,Sophomore,Gaojuan You,Male,74.0,N


## 二、缺失值的填充和插值
## 1.利用 fillna 进行填充

In [19]:
# Small example Series with repeated index labels and several missing values.
s = pd.Series(
    [np.nan, 1, np.nan, np.nan, 2, np.nan],
    index=['a', 'a', 'a', 'b', 'c', 'd'],
)
s

a    NaN
a    1.0
a    NaN
b    NaN
c    2.0
d    NaN
dtype: float64

In [20]:
# ffill = forward fill: each NaN takes the most recent preceding non-missing
# value; the leading NaN has nothing before it and stays NaN.
# Series.ffill() replaces fillna(method='ffill'), whose `method` argument is
# deprecated in pandas 2.1+.
s.ffill()

a    NaN
a    1.0
a    1.0
b    1.0
c    2.0
d    2.0
dtype: float64

In [21]:
# Forward fill, but fill at most one NaN in each run of consecutive NaNs.
# Series.ffill(limit=...) replaces fillna(method='ffill', limit=...), whose
# `method` argument is deprecated in pandas 2.1+.
s.ffill(limit=1)

a    NaN
a    1.0
a    1.0
b    NaN
c    2.0
d    2.0
dtype: float64

In [22]:
# Fill every NaN with the scalar mean of the non-missing values: (1 + 2) / 2.
s.fillna(value=s.mean())

a    1.5
a    1.0
a    1.5
b    1.5
c    2.0
d    1.5
dtype: float64

In [23]:
# Fill by index label: NaNs labelled 'a' become 100 and those labelled 'd'
# become 200; labels missing from the mapping (here 'b') stay NaN.
s.fillna(value=dict(a=100, d=200))

a    100.0
a      1.0
a    100.0
b      NaN
c      2.0
d    200.0
dtype: float64

In [24]:
# Fill each missing Height with the mean Height of that row's Grade group.
(
    df.groupby('Grade')['Height']
    .transform(lambda heights: heights.fillna(heights.mean()))
    .head()
)

0    158.900000
1    166.500000
2    188.900000
3    163.075862
4    174.000000
Name: Height, dtype: float64

## 2.插值函数

## 三、Nullable类型
## 1.缺失记号及其缺陷

In [25]:
# None compares equal to itself under ==.
None == None

True

In [26]:
# None does not compare equal to False.
None == False 

False

In [27]:
# None does not compare equal to an empty list.
None == []

False

In [28]:
# None does not compare equal to an empty string.
None == ''

False

In [29]:
# NaN does not compare equal even to itself.
np.nan == np.nan

False

In [30]:
# NaN and None are distinct missing-value markers and do not compare equal.
np.nan == None

False

In [31]:
# NaN does not compare equal to False.
np.nan == False

False

In [32]:
# Two-element Series with one missing value; the NaN makes the dtype float64.
s1 = pd.Series(data=[1, np.nan])
s1

0    1.0
1    NaN
dtype: float64

In [33]:
# Two-element integer Series with no missing values.
s2 = pd.Series(data=[1, 2])
s2

0    1
1    2
dtype: int64

In [34]:
# Built from the same values as s1, for the equals() comparison.
s3 = pd.Series(data=[1, np.nan])
s3

0    1.0
1    NaN
dtype: float64

In [35]:
# Element-wise comparison with 1; the NaN entry compares as False.
s1.eq(1)

0     True
1    False
dtype: bool

In [36]:
# equals() compares the two Series as a whole; s1 holds NaN where s2 holds 2,
# so the result is False.
s1.equals(s2)

False

In [37]:
# s1 and s3 hold the same values with NaN in the same position; unlike ==,
# equals() treats NaNs in matching positions as equal, so the result is True.
s1.equals(s3)

True

In [38]:
# In timedelta data the missing entry is represented as NaT instead of NaN.
pd.to_timedelta(['30s',np.nan])

TimedeltaIndex(['0 days 00:00:30', NaT], dtype='timedelta64[ns]', freq=None)

In [39]:
# In datetime data the missing entry is likewise represented as NaT.
pd.to_datetime(['20200101',np.nan])

DatetimeIndex(['2020-01-01', 'NaT'], dtype='datetime64[ns]', freq=None)

In [40]:
# A list of integers yields an integer-dtype Series.
pd.Series([1,2,3])

0    1
1    2
2    3
dtype: int64

In [41]:
pd.Series(['a','b'])  # a Series of plain strings is also reported with object dtype

0    a
1    b
dtype: object

In [42]:
# Mixing an integer and a string yields object dtype.
pd.Series([1,'b'])

0    1
1    b
dtype: object