In [1]:
import numpy as np
import pandas as pd

import datetime
from datetime import datetime, date

In [19]:
# Build a 5x3 frame holding the integers 0..14 in row-major order,
# with rows labelled a-e and columns labelled c1-c3.
df = pd.DataFrame(
    data=np.arange(15).reshape(5, 3),
    index=list('abcde'),
    columns=[f'c{i}' for i in (1, 2, 3)],
)
df

Unnamed: 0,c1,c2,c3
a,0,1,2
b,3,4,5
c,6,7,8
d,9,10,11
e,12,13,14


In [20]:
# NOTE: this binds a second name to the SAME DataFrame (no copy is made),
# so every mutation below is also visible through `df`.
df2 = df
# add an all-NaN column
df2['c4'] = np.nan
# append row 'f' with the values 15..18 and an all-NaN row 'g'
df2.loc['f'] = np.arange(15, 19)
df2.loc['g'] = np.nan
# add a second column that contains only NaN
df2['c5'] = np.nan
# set a single cell with one .loc call; the chained form df2['c4']['a'] = 20
# raises SettingWithCopyWarning and does nothing under Copy-on-Write
df2.loc['a', 'c4'] = 20
df2

Unnamed: 0,c1,c2,c3,c4,c5
a,0.0,1.0,2.0,20.0,
b,3.0,4.0,5.0,,
c,6.0,7.0,8.0,,
d,9.0,10.0,11.0,,
e,12.0,13.0,14.0,,
f,15.0,16.0,17.0,18.0,
g,,,,,


In [21]:
# boolean mask: True wherever a value is missing (isna is the alias of isnull)
df2.isna()

Unnamed: 0,c1,c2,c3,c4,c5
a,False,False,False,False,True
b,False,False,False,True,True
c,False,False,False,True,True
d,False,False,False,True,True
e,False,False,False,True,True
f,False,False,False,False,True
g,True,True,True,True,True


In [22]:
# number of missing values in each column
df2.isna().sum()

c1    1
c2    1
c3    1
c4    5
c5    7
dtype: int64

In [23]:
# total number of missing values in the whole frame
df2.isna().sum().sum()

15

In [24]:
# number of non-missing values in each column
df2.count(axis='index')

c1    6
c2    6
c3    6
c4    2
c5    0
dtype: int64

In [25]:
# drop every row that has at least one NaN; c5 is all-NaN, so nothing survives
df2.dropna(axis='index', how='any')

Unnamed: 0,c1,c2,c3,c4,c5


In [27]:
# keep only the non-missing entries of column c4
df2['c4'].dropna()

a    20.0
f    18.0
Name: c4, dtype: float64

In [33]:
# drop only the rows in which every value is NaN (removes row 'g')
df2.dropna(axis='index', how='all')

Unnamed: 0,c1,c2,c3,c4,c5
a,0.0,1.0,2.0,20.0,
b,3.0,4.0,5.0,,
c,6.0,7.0,8.0,,
d,9.0,10.0,11.0,,
e,12.0,13.0,14.0,,
f,15.0,16.0,17.0,18.0,


In [34]:
# drop only the columns in which every value is NaN (removes column c5)
df2.dropna(axis='columns', how='all')

Unnamed: 0,c1,c2,c3,c4
a,0.0,1.0,2.0,20.0
b,3.0,4.0,5.0,
c,6.0,7.0,8.0,
d,9.0,10.0,11.0,
e,12.0,13.0,14.0,
f,15.0,16.0,17.0,18.0
g,,,,


In [35]:
# thresh=5 keeps only the columns that have at least 5 non-NaN values;
# c4 (2 non-NaN) and c5 (0 non-NaN) fall below that and are dropped
df2.dropna(thresh=5, axis=1)

Unnamed: 0,c1,c2,c3
a,0.0,1.0,2.0
b,3.0,4.0,5.0
c,6.0,7.0,8.0
d,9.0,10.0,11.0
e,12.0,13.0,14.0
f,15.0,16.0,17.0
g,,,


In [36]:
# NumPy array containing a single NaN
a = np.array([1, 2, np.nan, 3])
# the same data wrapped in a pandas Series
s = pd.Series(a)
# NumPy propagates the NaN into the mean; pandas skips it
np.mean(a), s.mean()

(nan, 2.0)

In [39]:
# replace every NaN with the constant 100 (returns a new frame; df2 is unchanged)
df2.fillna(value=100)

Unnamed: 0,c1,c2,c3,c4,c5
a,0.0,1.0,2.0,20.0,100.0
b,3.0,4.0,5.0,100.0,100.0
c,6.0,7.0,8.0,100.0,100.0
d,9.0,10.0,11.0,100.0,100.0
e,12.0,13.0,14.0,100.0,100.0
f,15.0,16.0,17.0,18.0,100.0
g,100.0,100.0,100.0,100.0,100.0


In [40]:
# column c4 as it currently stands, NaNs included
df2['c4']

a    20.0
b     NaN
c     NaN
d     NaN
e     NaN
f    18.0
g     NaN
Name: c4, dtype: float64

In [41]:
# forward fill: carry the last valid value forward into the following NaNs.
# Series.ffill() replaces the deprecated fillna(method='ffill') spelling.
df2['c4'].ffill()

a    20.0
b    20.0
c    20.0
d    20.0
e    20.0
f    18.0
g    18.0
Name: c4, dtype: float64

In [42]:
# backward fill: pull the next valid value back into the preceding NaNs;
# the trailing NaN at 'g' stays because nothing valid follows it.
# Series.bfill() replaces the deprecated fillna(method='bfill') spelling.
df2['c4'].bfill()

a    20.0
b    18.0
c    18.0
d    18.0
e    18.0
f    18.0
g     NaN
Name: c4, dtype: float64

In [43]:
# per-label replacement values, matched to c4 by index label
fill_values = pd.Series({'a': 100, 'e': 101, 'g': 102})
# only NaN entries with a matching label are filled: 'a' already holds a value
# and is left alone, while b/c/d have no entry in fill_values and stay NaN
df2['c4'].fillna(fill_values)

a     20.0
b      NaN
c      NaN
d      NaN
e    101.0
f     18.0
g    102.0
Name: c4, dtype: float64

In [44]:
# fill each column's NaNs with that column's mean; c5 has no valid values,
# so its mean is NaN and the column stays empty.
# Refer to df2 explicitly, like the other cells, instead of depending on
# `df` having been mutated through the df2 alias.
df2.fillna(df2.mean())

Unnamed: 0,c1,c2,c3,c4,c5
a,0.0,1.0,2.0,20.0,
b,3.0,4.0,5.0,19.0,
c,6.0,7.0,8.0,19.0,
d,9.0,10.0,11.0,19.0,
e,12.0,13.0,14.0,19.0,
f,15.0,16.0,17.0,18.0,
g,7.5,8.5,9.5,19.0,


In [45]:
# Interpolation demo: three consecutive NaNs between the end points 1 and 2
# are filled in equal steps by the default (linear) method.
s = pd.Series([1, np.nan, np.nan, np.nan, 2])
s.interpolate(method='linear')

0    1.00
1    1.25
2    1.50
3    1.75
4    2.00
dtype: float64

In [46]:
# time series with unevenly spaced dates (Jan 1, Feb 1, Apr 1 of 2014)
# and a missing value in the middle
ts = pd.Series(
    [1, np.nan, 2],
    index=[datetime(2014, month, 1) for month in (1, 2, 4)],
)
ts

2014-01-01    1.0
2014-02-01    NaN
2014-04-01    2.0
dtype: float64

In [47]:
# linear interpolation treats the points as equally spaced and ignores the
# dates, so the gap is filled with the midpoint 1.5
ts.interpolate(method='linear')

2014-01-01    1.0
2014-02-01    1.5
2014-04-01    2.0
dtype: float64

In [48]:
# time-aware interpolation weights by the elapsed time between the dates:
# Feb 1 is 31 of the 90 days from Jan 1 to Apr 1, giving 1 + 31/90
ts.interpolate(method='time', axis=0)

2014-01-01    1.000000
2014-02-01    1.344444
2014-04-01    2.000000
dtype: float64

In [49]:
# series whose integer index labels (0, 1, 10) are unevenly spaced
s = pd.Series(data=[0, np.nan, 100], index=[0, 1, 10])
s

0       0.0
1       NaN
10    100.0
dtype: float64

In [50]:
# linear interpolation ignores the index labels, so the gap gets the midpoint 50
s.interpolate(method='linear')

0       0.0
1      50.0
10    100.0
dtype: float64

In [51]:
# interpolate relative to the numeric index labels ('index' and 'values' are
# the same method): label 1 lies 1/10 of the way from 0 to 10, giving 10
s.interpolate(method='index')

0       0.0
1      10.0
10    100.0
dtype: float64