In [1]:
import numpy as np
import pandas as pd

In [2]:
# Time summing a one-million-element array for two dtypes: generic Python
# objects ('object') versus integers ('int').
for dtype in ['object', 'int']:
    print("dtype =", dtype)
    # %timeit is an IPython line magic, so this cell only runs under IPython/Jupyter.
    %timeit np.arange(1E6, dtype=dtype).sum()
    # Empty line to separate the timing reports.
    print()

dtype = object
48.4 ms ± 1.27 ms per loop (mean ± std. dev. of 7 runs, 10 loops each)

dtype = int
1.8 ms ± 213 µs per loop (mean ± std. dev. of 7 runs, 1,000 loops each)



What is the best way to represent missing data in other data types, such as strings? A missing string is not the same thing as an empty string (`''`), so the two need distinct representations.

In [3]:
# Sample Series mixing an int, a float NaN, a string and None; the two
# missing markers (NaN and None) sit at positions 1 and 3.
data = pd.Series(
    data=[1, np.nan, "hello", None],
)

In [4]:
# Keep only the entries pandas treats as missing (both NaN and None qualify).
data.loc[data.isna()]

1     NaN
3    None
dtype: object

In [5]:
# Complement of the missing-value mask: keep only the entries that are present.
data.loc[data.notna()]

0        1
2    hello
dtype: object

In [6]:
# Return a new Series without the missing entries; `data` itself is not reassigned here.
data.dropna()

0        1
2    hello
dtype: object

In [7]:
# Replace every missing entry with a sentinel. Note the sentinel is the
# string '-999', not the integer -999.
data.fillna(value='-999')

0        1
1     -999
2    hello
3     -999
dtype: object

In [8]:
# Nullable integer dtype: the NaN in the input is stored as <NA> and the
# remaining values stay integers instead of being upcast to float.
pd.Series([1, 2, np.nan, 4], dtype=pd.Int64Dtype())

0       1
1       2
2    <NA>
3       4
dtype: Int64

In [9]:
# Categorical dtype: NaN is kept as a missing entry and is not turned into
# a category of its own (only 1, 2 and 4 become categories).
pd.Series([1, 2, np.nan, 4], dtype=pd.CategoricalDtype())

0      1
1      2
2    NaN
3      4
dtype: category
Categories (3, int64): [1, 2, 4]

In [10]:
# Arithmetic on a float Series propagates the missing value: NaN + 4 stays NaN.
pd.Series([1, 2, np.nan, 4], dtype=np.float64) + 4

0    5.0
1    6.0
2    NaN
3    8.0
dtype: float64

In [11]:
# With the string extension dtype the values are stored as text, so `+`
# concatenates; the missing entry stays <NA> instead of becoming a string.
pd.Series([1, 2, np.nan, 4], dtype=pd.StringDtype()) + ' hello'

0    1 hello
1    2 hello
2       <NA>
3    4 hello
dtype: string

In [12]:
# Build a 5x3 frame of standard-normal draws. The generator is seeded so that
# Restart & Run All reproduces the same values every time; outputs stored from
# an earlier unseeded run will differ until the notebook is re-executed.
rng = np.random.default_rng(42)
df = pd.DataFrame(
    rng.standard_normal((5, 3)),
    index=["a", "c", "e", "f", "h"],
    columns=["one", "two", "three"],
)

In [13]:
# Display the frame as the cell's output (bare last expression).
df

Unnamed: 0,one,two,three
a,0.759102,0.434146,0.30659
c,0.559493,-0.141612,-0.361495
e,-1.65889,-0.458098,0.099869
f,-0.266971,-0.354658,-0.710631
h,0.146599,1.645907,-0.009718


In [14]:
# Overwrite column 'one' in place with group keys; the first and last rows
# get NaN so the column now contains missing keys.
df['one'] = [
    np.nan,
    1.0,
    2.0,
    2.0,
    np.nan,
]

In [15]:
# Display the frame again; column 'one' now holds the keys assigned above, including NaN.
df

Unnamed: 0,one,two,three
a,,0.434146,0.30659
c,1.0,-0.141612,-0.361495
e,2.0,-0.458098,0.099869
f,2.0,-0.354658,-0.710631
h,,1.645907,-0.009718


In [16]:
# dropna=True is groupby's default, spelled out here because it is the point
# of the cell: rows whose key in 'one' is NaN are left out of the result.
df.groupby('one', dropna=True).mean()

Unnamed: 0_level_0,two,three
one,Unnamed: 1_level_1,Unnamed: 2_level_1
1.0,-0.141612,-0.361495
2.0,-0.406378,-0.305381


In [17]:
# Label the NaN keys with the string 'missing'. This returns a new Series and
# does not modify `df`; mixing floats with a string makes the result object dtype.
df['one'].fillna(value='missing')

a    missing
c        1.0
e        2.0
f        2.0
h    missing
Name: one, dtype: object

In [18]:
# Derive a separate frame whose 'one' column has the NaN keys replaced by the
# label 'missing'; `assign` works on a copy, so `df` keeps its NaN values.
df2 = df.assign(one=df['one'].fillna('missing'))
df2

Unnamed: 0,one,two,three
a,missing,0.434146,0.30659
c,1.0,-0.141612,-0.361495
e,2.0,-0.458098,0.099869
f,2.0,-0.354658,-0.710631
h,missing,1.645907,-0.009718


In [19]:
# Group on the relabelled key: rows that previously had NaN now fall into
# their own 'missing' group instead of being dropped.
df2.groupby(by='one').mean()

Unnamed: 0_level_0,two,three
one,Unnamed: 1_level_1,Unnamed: 2_level_1
1.0,-0.141612,-0.361495
2.0,-0.406378,-0.305381
missing,1.040027,0.148436
