In [1]:
import numpy as np
import pandas as pd

In [2]:
# Compare how long it takes to build and sum a one-million-element array
# when its elements are generic Python objects versus native integers.
for dtype in ['object', 'int']:
    print("dtype =", dtype)
    # NOTE: the timed statement includes np.arange() as well as .sum(), so
    # each measurement covers array construction plus the reduction.
    %timeit np.arange(1E6, dtype=dtype).sum()
    print()

dtype = object
52.7 ms ± 529 µs per loop (mean ± std. dev. of 7 runs, 10 loops each)

dtype = int
1.44 ms ± 33.5 µs per loop (mean ± std. dev. of 7 runs, 1,000 loops each)



What is the best way to represent missing data in other data types, for example strings? There is a difference between a missing string and a string that is empty.

In [3]:
# Mixed-content Series holding a number, a string, and two different
# missing-value markers (np.nan and None); stored with object dtype.
data = pd.Series(
    [1, np.nan, 'hello', None],
    dtype=object,
)

In [4]:
# Boolean-mask selection: keep only the entries that isnull() flags as
# missing (both the NaN and the None).
data[data.isnull()]

1     NaN
3    None
dtype: object

In [5]:
# Complementary mask: keep only the entries that hold an actual value.
data[data.notnull()]

0        1
2    hello
dtype: object

In [6]:
# Drop every missing entry in a single call; the result is displayed only,
# `data` itself is not reassigned.
data.dropna()

0        1
2    hello
dtype: object

In [7]:
# Replace each missing entry with a sentinel. Note that the fill value is the
# string '-999', not the number -999.
data.fillna('-999')

0        1
1     -999
2    hello
3     -999
dtype: object

In [8]:
# Nullable integer dtype requested via a pd.Int64Dtype() instance: the np.nan
# is shown as <NA> and the result keeps an integer dtype (Int64).
pd.Series([1, 2, np.nan, 4], dtype=pd.Int64Dtype())

0       1
1       2
2    <NA>
3       4
dtype: Int64

In [9]:
# Same data, with the nullable integer dtype selected by its string alias
# 'Int64' (capital "I").
pd.Series([1, 2, np.nan, 4], dtype='Int64')

0       1
1       2
2    <NA>
3       4
dtype: Int64

In [10]:
# Small demo frame of standard-normal draws. A seeded Generator is used so
# that Restart & Run All reproduces exactly the same values every time
# (the previous np.random.randn call produced a different frame on each run).
df = pd.DataFrame(
    np.random.default_rng(seed=0).standard_normal((5, 3)),
    index=["a", "c", "e", "f", "h"],
    columns=["one", "two", "three"],
)

In [11]:
# Display the freshly built frame.
df

Unnamed: 0,one,two,three
a,-0.047729,0.163495,0.373096
c,-1.470597,0.915313,0.598639
e,0.167673,0.51418,-2.346499
f,-0.736491,-0.549931,-0.020876
h,0.941915,1.522123,0.747976


In [12]:
# Overwrite column 'one' with values that are NaN in the first and last rows.
# NOTE: this mutates `df` in place, so the frame displayed by the earlier
# cell no longer reflects the current state.
df['one'] = [np.nan, 1, 2, 2, np.nan]

In [13]:
# Display the frame again to show the NaNs now present in column 'one'.
df

Unnamed: 0,one,two,three
a,,0.163495,0.373096
c,1.0,0.915313,0.598639
e,2.0,0.51418,-2.346499
f,2.0,-0.549931,-0.020876
h,,1.522123,0.747976


In [14]:
# Group by 'one' and average the remaining columns. Rows whose key is NaN
# (index 'a' and 'h') do not appear in the result.
# NOTE(review): pandas' groupby has a `dropna` option that controls this;
# confirm it is available in the pandas version this notebook targets.
df.groupby('one').mean()

Unnamed: 0_level_0,two,three
one,Unnamed: 1_level_1,Unnamed: 2_level_1
1.0,0.915313,0.598639
2.0,-0.017875,-1.183687


In [15]:
# Fill the NaN keys with a string label. The result is an object-dtype Series
# and is only displayed here; `df` itself is left unchanged.
df['one'].fillna('missing')

a    missing
c        1.0
e        2.0
f        2.0
h    missing
Name: one, dtype: object

In [16]:
# Build a separate frame in which the NaN keys of 'one' carry the label
# 'missing'; `df` is left untouched because assign() returns a new frame.
df2 = df.assign(one=df['one'].fillna('missing'))
df2

Unnamed: 0,one,two,three
a,missing,0.163495,0.373096
c,1.0,0.915313,0.598639
e,2.0,0.51418,-2.346499
f,2.0,-0.549931,-0.020876
h,missing,1.522123,0.747976


In [17]:
# With the NaN keys relabelled, the rows that were excluded before now form
# their own 'missing' group alongside the 1.0 and 2.0 groups.
df2.groupby('one').mean()

Unnamed: 0_level_0,two,three
one,Unnamed: 1_level_1,Unnamed: 2_level_1
1.0,0.915313,0.598639
2.0,-0.017875,-1.183687
missing,0.842809,0.560536
