# 15.1 Missing data basics

## 15.1.1 When / why does data become missing?

In [1]:
%matplotlib inline
import pandas as pd
import numpy as np

  return f(*args, **kwds)


In [4]:
# Build a 5x3 frame of standard-normal draws. The row labels skip 'b', 'd'
# and 'g'; the reindex further down fills those labels in as missing rows.
df = pd.DataFrame(
    np.random.randn(5, 3),
    index=['a', 'c', 'e', 'f', 'h'],
    columns=['one', 'two', 'three'],
)
df

Unnamed: 0,one,two,three
a,1.801788,-0.147131,-1.035582
c,0.844024,0.262872,0.694794
e,-0.112501,-0.121731,-1.021901
f,2.075975,-0.854873,-1.011352
h,-0.304782,-1.229473,1.052662


In [7]:
# Add a constant string column and a boolean column derived from 'one',
# so the frame holds float, string and boolean data side by side.
df['four'] = 'bar'
df['five'] = df['one'].gt(0)
df

Unnamed: 0,one,two,three,four,five
a,1.801788,-0.147131,-1.035582,bar,True
c,0.844024,0.262872,0.694794,bar,True
e,-0.112501,-0.121731,-1.021901,bar,False
f,2.075975,-0.854873,-1.011352,bar,True
h,-0.304782,-1.229473,1.052662,bar,False


In [9]:
# Reindex onto a label set that also contains 'b', 'd' and 'g'. Those labels
# do not exist in df, so their rows come back entirely missing.
df2 = df.reindex(
    index=['a', 'b', 'c', 'd', 'e', 'f', 'g', 'h'],
)
df2

Unnamed: 0,one,two,three,four,five
a,1.801788,-0.147131,-1.035582,bar,True
b,,,,,
c,0.844024,0.262872,0.694794,bar,True
d,,,,,
e,-0.112501,-0.121731,-1.021901,bar,False
f,2.075975,-0.854873,-1.011352,bar,True
g,,,,,
h,-0.304782,-1.229473,1.052662,bar,False


## 15.1.2 Values considered “missing”

In [10]:
# Select the 'one' column; the reindexed labels b, d and g show up as NaN.
df2.loc[:, 'one']

a    1.801788
b         NaN
c    0.844024
d         NaN
e   -0.112501
f    2.075975
g         NaN
h   -0.304782
Name: one, dtype: float64

In [11]:
# Boolean mask that is True wherever 'one' is missing.
df2['one'].isna()

a    False
b     True
c    False
d     True
e    False
f    False
g     True
h    False
Name: one, dtype: bool

In [12]:
# Boolean mask that is True wherever 'four' holds a value (the inverse of isna).
pd.notna(df2['four'])

a     True
b    False
c     True
d    False
e     True
f     True
g    False
h     True
Name: four, dtype: bool

In [13]:
# Element-wise missing-value mask over the whole frame.
pd.isna(df2)

Unnamed: 0,one,two,three,four,five
a,False,False,False,False,False
b,True,True,True,True,True
c,False,False,False,False,False
d,True,True,True,True,True
e,False,False,False,False,False
f,False,False,False,False,False
g,True,True,True,True,True
h,False,False,False,False,False


# 15.2 Datetimes

In [14]:
# Rebind df2 (previously the reindexed frame) to an independent copy of df,
# so the edits below leave df untouched.
df2 = df.copy(deep=True)

In [16]:
# Broadcast a single Timestamp to every row, adding a datetime column to df2.
df2['timestamp'] = pd.Timestamp('20120101')
df2

Unnamed: 0,one,two,three,four,five,timestamp
a,1.801788,-0.147131,-1.035582,bar,True,2012-01-01
c,0.844024,0.262872,0.694794,bar,True,2012-01-01
e,-0.112501,-0.121731,-1.021901,bar,False,2012-01-01
f,2.075975,-0.854873,-1.011352,bar,True,2012-01-01
h,-0.304782,-1.229473,1.052662,bar,False,2012-01-01


In [17]:
# Blank out 'one' and 'timestamp' for rows a, c and h with a single NaN.
# The float column stores it as NaN, while the datetime column shows NaT.
df2.loc[
    ['a', 'c', 'h'],
    ['one', 'timestamp'],
] = np.nan

In [18]:
# Display the frame: rows a, c and h are blank (NaN) in 'one' and NaT in 'timestamp'.
df2

Unnamed: 0,one,two,three,four,five,timestamp
a,,-0.147131,-1.035582,bar,True,NaT
c,,0.262872,0.694794,bar,True,NaT
e,-0.112501,-0.121731,-1.021901,bar,False,2012-01-01
f,2.075975,-0.854873,-1.011352,bar,True,2012-01-01
h,,-1.229473,1.052662,bar,False,NaT


# 15.3 Inserting missing data

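You can insert missing values by simply assigning to containers. The actual missing value used will be chosen based
on the dtype: numeric containers always store `NaN`, whereas object containers keep the value that was assigned
(`None` or `NaN`).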

In [19]:
# Numeric example Series. It is built as float64 up front because the next
# cell assigns None into it: doing that to an int64 Series relies on silent
# upcasting, which recent pandas releases deprecate with a FutureWarning.
# The displayed result (NaN, 2.0, 3.0 / float64) is the same either way.
s = pd.Series([1.0, 2.0, 3.0])

In [21]:
# Assign None into the numeric Series; the output below shows it stored as NaN
# in a float64 Series.
s.loc[0] = None
s

0    NaN
1    2.0
2    3.0
dtype: float64

In [25]:
# Same experiment on a Series of strings. As the output below shows, the
# object container keeps the marker it was given: None stays None and
# np.nan stays NaN.
s = pd.Series(
    data=["a", "b", "c"],
)
s.loc[0] = None
s.loc[1] = np.nan
s

0    None
1     NaN
2       c
dtype: object

# 15.4 Calculations with missing data