In [4]:
import numpy as np

In [2]:
import pandas as pd


In [5]:
df = pd.DataFrame(np.random.randn(5, 3), index=['a', 'c', 'e', 'f', 'h'],
                      columns=['one', 'two', 'three'])

In [6]:
df

Unnamed: 0,one,two,three
a,-1.728668,-0.224033,1.029731
c,-0.523587,-0.458819,-0.730857
e,0.528344,0.918918,1.002231
f,-0.322356,0.856316,-1.764641
h,1.191995,0.8717,-0.094862


In [7]:
df['four'] = 'bar'
df['five'] = df['one'] > 0
df

Unnamed: 0,one,two,three,four,five
a,-1.728668,-0.224033,1.029731,bar,False
c,-0.523587,-0.458819,-0.730857,bar,False
e,0.528344,0.918918,1.002231,bar,True
f,-0.322356,0.856316,-1.764641,bar,False
h,1.191995,0.8717,-0.094862,bar,True


In [8]:
df2 = df.reindex(['a', 'b', 'c', 'd', 'e', 'f', 'g', 'h'])
df2

Unnamed: 0,one,two,three,four,five
a,-1.728668,-0.224033,1.029731,bar,False
b,,,,,
c,-0.523587,-0.458819,-0.730857,bar,False
d,,,,,
e,0.528344,0.918918,1.002231,bar,True
f,-0.322356,0.856316,-1.764641,bar,False
g,,,,,
h,1.191995,0.8717,-0.094862,bar,True


In [9]:
# pandas method
pd.isnull(df2['one'])

a    False
b     True
c    False
d     True
e    False
f    False
g     True
h    False
Name: one, dtype: bool

In [10]:
# series
df2['four'].notnull()

a     True
b    False
c     True
d    False
e     True
f     True
g    False
h     True
Name: four, dtype: bool

In [11]:
# dataframe
df2.isnull()

Unnamed: 0,one,two,three,four,five
a,False,False,False,False,False
b,True,True,True,True,True
c,False,False,False,False,False
d,True,True,True,True,True
e,False,False,False,False,False
f,False,False,False,False,False
g,True,True,True,True,True
h,False,False,False,False,False


In [13]:
None == None

True

In [14]:
np.nan == np.nan

False

In [15]:
# therefore, it might be misleading, only shows false
df['one'] == np.nan

a    False
c    False
e    False
f    False
h    False
Name: one, dtype: bool

In [16]:
# Datatimes
df2 = df.copy()
df2['timestamp'] = pd.Timestamp('20120101')
df2

Unnamed: 0,one,two,three,four,five,timestamp
a,-1.728668,-0.224033,1.029731,bar,False,2012-01-01
c,-0.523587,-0.458819,-0.730857,bar,False,2012-01-01
e,0.528344,0.918918,1.002231,bar,True,2012-01-01
f,-0.322356,0.856316,-1.764641,bar,False,2012-01-01
h,1.191995,0.8717,-0.094862,bar,True,2012-01-01


In [17]:
df2.loc[['a','c','h'],['one','timestamp']] = np.nan
df2

Unnamed: 0,one,two,three,four,five,timestamp
a,,-0.224033,1.029731,bar,False,NaT
c,,-0.458819,-0.730857,bar,False,NaT
e,0.528344,0.918918,1.002231,bar,True,2012-01-01
f,-0.322356,0.856316,-1.764641,bar,False,2012-01-01
h,,0.8717,-0.094862,bar,True,NaT


In [18]:
df2.get_dtype_counts()

bool              1
datetime64[ns]    1
float64           3
object            1
dtype: int64

In [19]:
# Inserting missing data
# NsN for numbers
s = pd.Series([1,2,3])
s.loc[0] = None
s

0    NaN
1    2.0
2    3.0
dtype: float64

In [21]:
# for objects the value given
s = pd.Series(["a", "b", "c"])
s.loc[0] = None
s.loc[1] = np.nan
s

0    None
1     NaN
2       c
dtype: object

In [24]:
# Calculations with missing data
df[['one','two']]

Unnamed: 0,one,two
a,-1.728668,-0.224033
c,-0.523587,-0.458819
e,0.528344,0.918918
f,-0.322356,0.856316
h,1.191995,0.8717


In [25]:
df[['one','two','three']]

Unnamed: 0,one,two,three
a,-1.728668,-0.224033,1.029731
c,-0.523587,-0.458819,-0.730857
e,0.528344,0.918918,1.002231
f,-0.322356,0.856316,-1.764641
h,1.191995,0.8717,-0.094862


In [26]:
# propagation
df[['one','two']] + df[['one','two','three']]

Unnamed: 0,one,three,two
a,-3.457336,,-0.448065
c,-1.047175,,-0.917637
e,1.056689,,1.837836
f,-0.644712,,1.712631
h,2.38399,,1.7434


In [27]:
df[['one','two','three']]

Unnamed: 0,one,two,three
a,-1.728668,-0.224033,1.029731
c,-0.523587,-0.458819,-0.730857
e,0.528344,0.918918,1.002231
f,-0.322356,0.856316,-1.764641
h,1.191995,0.8717,-0.094862


In [29]:
df.loc[['a','c','h'],['one']] = np.nan

In [30]:
df

Unnamed: 0,one,two,three,four,five
a,,-0.224033,1.029731,bar,False
c,,-0.458819,-0.730857,bar,False
e,0.528344,0.918918,1.002231,bar,True
f,-0.322356,0.856316,-1.764641,bar,False
h,,0.8717,-0.094862,bar,True


In [31]:
df['one'].sum()

0.20598842494478259

In [32]:
df.mean(1)

a    0.268566
c   -0.396558
e    0.862374
f   -0.307670
h    0.592279
dtype: float64

In [33]:
df.cumsum()

Unnamed: 0,one,two,three,four,five
a,,-0.224033,1.02973,bar,False
c,,-0.682851,0.298874,barbar,0
e,0.528344,0.236067,1.30111,barbarbar,1
f,0.205988,1.09238,-0.463535,barbarbarbar,1
h,,1.96408,-0.558397,barbarbarbarbar,2


In [36]:
# NA values in GroupBy
df.groupby('one').mean()

Unnamed: 0_level_0,two,three,five
one,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
-0.322356,0.856316,-1.764641,False
0.528344,0.918918,1.002231,True


In [37]:
# Cleaning/filling missing data
## filling missing values: fillna
df2

Unnamed: 0,one,two,three,four,five,timestamp
a,,-0.224033,1.029731,bar,False,NaT
c,,-0.458819,-0.730857,bar,False,NaT
e,0.528344,0.918918,1.002231,bar,True,2012-01-01
f,-0.322356,0.856316,-1.764641,bar,False,2012-01-01
h,,0.8717,-0.094862,bar,True,NaT


In [38]:
df2.fillna(0)

Unnamed: 0,one,two,three,four,five,timestamp
a,0.0,-0.224033,1.029731,bar,False,1970-01-01
c,0.0,-0.458819,-0.730857,bar,False,1970-01-01
e,0.528344,0.918918,1.002231,bar,True,2012-01-01
f,-0.322356,0.856316,-1.764641,bar,False,2012-01-01
h,0.0,0.8717,-0.094862,bar,True,1970-01-01


In [39]:
df2['four'].fillna('missing')

a    bar
c    bar
e    bar
f    bar
h    bar
Name: four, dtype: object

In [41]:
### Fill gaps forward or backward
df.fillna(method='pad')

Unnamed: 0,one,two,three,four,five
a,,-0.224033,1.029731,bar,False
c,,-0.458819,-0.730857,bar,False
e,0.528344,0.918918,1.002231,bar,True
f,-0.322356,0.856316,-1.764641,bar,False
h,-0.322356,0.8717,-0.094862,bar,True


In [43]:
# limit the amount 
df.loc[['e','f'],['two','three']] = np.nan

In [44]:
df

Unnamed: 0,one,two,three,four,five
a,,-0.224033,1.029731,bar,False
c,,-0.458819,-0.730857,bar,False
e,0.528344,,,bar,True
f,-0.322356,,,bar,False
h,,0.8717,-0.094862,bar,True


In [45]:
df.fillna(method='pad',limit=1)

Unnamed: 0,one,two,three,four,five
a,,-0.224033,1.029731,bar,False
c,,-0.458819,-0.730857,bar,False
e,0.528344,-0.458819,-0.730857,bar,True
f,-0.322356,,,bar,False
h,-0.322356,0.8717,-0.094862,bar,True


In [46]:
# pad/ffill forward fill
# bfill/backfill  backward fill
# alignable
dff = pd.DataFrame(np.random.randn(10,3), columns=list('ABC'))
dff.iloc[3:5,0] = np.nan
dff.iloc[4:6,1] = np.nan
dff.iloc[5:8,2] = np.nan
dff

Unnamed: 0,A,B,C
0,0.237461,-1.482179,0.142737
1,-1.916896,0.048031,0.661916
2,-0.851745,-1.169174,0.809534
3,,0.099137,0.416297
4,,,0.477101
5,0.294362,,
6,-1.41235,1.007052,
7,2.451519,0.415144,
8,-0.949253,-1.26715,-0.764022
9,-0.210458,-1.428479,-0.075075


In [47]:
dff.fillna(dff.mean())

Unnamed: 0,A,B,C
0,0.237461,-1.482179,0.142737
1,-1.916896,0.048031,0.661916
2,-0.851745,-1.169174,0.809534
3,-0.29467,0.099137,0.416297
4,-0.29467,-0.472202,0.477101
5,0.294362,-0.472202,0.238355
6,-1.41235,1.007052,0.238355
7,2.451519,0.415144,0.238355
8,-0.949253,-1.26715,-0.764022
9,-0.210458,-1.428479,-0.075075


In [48]:
dff.fillna(dff.mean()['B':'C'])

Unnamed: 0,A,B,C
0,0.237461,-1.482179,0.142737
1,-1.916896,0.048031,0.661916
2,-0.851745,-1.169174,0.809534
3,,0.099137,0.416297
4,,-0.472202,0.477101
5,0.294362,-0.472202,0.238355
6,-1.41235,1.007052,0.238355
7,2.451519,0.415144,0.238355
8,-0.949253,-1.26715,-0.764022
9,-0.210458,-1.428479,-0.075075


In [49]:
dff.where(pd.notnull(dff),dff.mean(),axis='columns')

Unnamed: 0,A,B,C
0,0.237461,-1.482179,0.142737
1,-1.916896,0.048031,0.661916
2,-0.851745,-1.169174,0.809534
3,-0.29467,0.099137,0.416297
4,-0.29467,-0.472202,0.477101
5,0.294362,-0.472202,0.238355
6,-1.41235,1.007052,0.238355
7,2.451519,0.415144,0.238355
8,-0.949253,-1.26715,-0.764022
9,-0.210458,-1.428479,-0.075075


In [53]:
# Dropping axis labels with missing data
df.loc[['e','f'],['two','three']] = 0
df

Unnamed: 0,one,two,three,four,five
a,,-0.224033,1.029731,bar,False
c,,-0.458819,-0.730857,bar,False
e,0.528344,0.0,0.0,bar,True
f,-0.322356,0.0,0.0,bar,False
h,,0.8717,-0.094862,bar,True


In [54]:
df.dropna(axis=0)

Unnamed: 0,one,two,three,four,five
e,0.528344,0.0,0.0,bar,True
f,-0.322356,0.0,0.0,bar,False


In [55]:
df.dropna(axis=1)

Unnamed: 0,two,three,four,five
a,-0.224033,1.029731,bar,False
c,-0.458819,-0.730857,bar,False
e,0.0,0.0,bar,True
f,0.0,0.0,bar,False
h,0.8717,-0.094862,bar,True


In [56]:
df['one'].dropna()

e    0.528344
f   -0.322356
Name: one, dtype: float64

In [68]:
# Interpolation
#ts = pd.DataFrame([['2000-01-31', 0.469112], ['2000-02-29',np.NaN]])
date = ['2000-01-31','2000-02-29','2000-03-31','2000-04-28']
record = [0.469112,np.NaN,np.NaN,np.NaN]

In [69]:
date
record

[0.469112, nan, nan, nan]

In [75]:
ts = pd.DataFrame(date,record)
pd.to_datetime(date)
date

['2000-01-31', '2000-02-29', '2000-03-31', '2000-04-28']