Data Types
==========



In [1]:
import decimal
import timeit

import numpy as np
import pandas as pd
import pyarrow as pa

## Integral types



### How to do it



In [2]:
# Build a Series using pandas' nullable 64-bit integer extension dtype
# (displayed as "Int64", capitalised, unlike NumPy's "int64").
pd.Series(range(3), dtype=pd.Int64Dtype())

0    0
1    1
2    2
dtype: Int64

In [3]:
# Same values stored with the 8-bit nullable integer dtype.
pd.Series(range(3), dtype=pd.Int8Dtype())

0    0
1    1
2    2
dtype: Int8

In [4]:
# pd.NA marks a missing entry; the Series still reports the Int64 dtype.
pd.Series([1, pd.NA, 2], dtype=pd.Int64Dtype())

0       1
1    <NA>
2       2
dtype: Int64

In [5]:
# None is accepted as a missing-value marker as well and is displayed as <NA>.
pd.Series([1, None, 2], dtype=pd.Int64Dtype())

0       1
1    <NA>
2       2
dtype: Int64

### There's more



In [6]:
# Values 555-557 stored with the 16-bit nullable integer dtype.
pd.Series(range(555, 558), dtype=pd.Int16Dtype())

0    555
1    556
2    557
dtype: Int16

In [7]:
# Unsigned 8-bit nullable integer dtype.
pd.Series(range(3), dtype=pd.UInt8Dtype())

0    0
1    1
2    2
dtype: UInt8

## Floating point types



### How to do it



In [8]:
# Nullable 64-bit floating point extension dtype (displayed as "Float64").
pd.Series([3.14, .333333333, -123.456], dtype=pd.Float64Dtype())

0        3.14
1    0.333333
2    -123.456
dtype: Float64

In [9]:
# Both None and pd.NA are displayed as <NA> under Float64.
pd.Series([3.14, None, pd.NA], dtype=pd.Float64Dtype())

0    3.14
1    <NA>
2    <NA>
dtype: Float64

### There's more



In [10]:
# The two literals differ in the third decimal place, yet they compare
# equal once both are stored as Float32.
ser1 = pd.Series([1_000_000.123], dtype=pd.Float32Dtype())
ser2 = pd.Series([1_000_000.124], dtype=pd.Float32Dtype())
ser1.eq(ser2)

0    True
dtype: boolean

## Boolean types



### How to do it



In [11]:
# Nullable boolean extension dtype (displayed as "boolean").
pd.Series([True, False, True], dtype=pd.BooleanDtype())

0     True
1    False
2     True
dtype: boolean

In [12]:
# The integers 1 and 0 are converted to True and False.
pd.Series([1, 0, 1], dtype=pd.BooleanDtype())

0     True
1    False
2     True
dtype: boolean

In [13]:
# pd.NA and None are both displayed as <NA> under the boolean dtype.
pd.Series([1, pd.NA, None], dtype=pd.BooleanDtype())

0    True
1    <NA>
2    <NA>
dtype: boolean

## String types



### How to do it



In [14]:
# Dedicated string extension dtype (displayed as "string").
pd.Series(["foo", "bar", "baz"], dtype=pd.StringDtype())

0    foo
1    bar
2    baz
dtype: string

In [15]:
# pd.NA and None are both displayed as <NA> under the string dtype.
pd.Series(["foo", pd.NA, None], dtype=pd.StringDtype())

0     foo
1    <NA>
2    <NA>
dtype: string

In [16]:
# The .str accessor exposes string methods; len() returns the length of
# each string as a nullable Int64 Series.
ser = pd.Series(["xx", "YyY", "zZzZ"], dtype=pd.StringDtype())
ser.str.len()

0    2
1    3
2    4
dtype: Int64

In [17]:
# Upper-case every string.
ser.str.upper()

0      XX
1     YYY
2    ZZZZ
dtype: string

In [18]:
# Lower-case every string.
ser.str.lower()

0      xx
1     yyy
2    zzzz
dtype: string

In [19]:
# Title-case every string (first letter upper, the rest lower).
ser.str.title()

0      Xx
1     Yyy
2    Zzzz
dtype: string

In [20]:
# Substring search; the result is a nullable boolean Series.
ser = pd.Series(["foo", "bar", "baz"], dtype=pd.StringDtype())
ser.str.contains("o")

0     True
1    False
2    False
dtype: boolean

In [21]:
# Regular-expression search, ignoring case: matches "bar" and "baz".
ser.str.contains(r"^ba[rz]$", case=False, regex=True)

0    False
1     True
2     True
dtype: boolean

## Missing value handling



### How to do it



In [22]:
# With no dtype given, the integer range is stored as NumPy int64.
ser = pd.Series(range(3))
ser

0    0
1    1
2    2
dtype: int64

In [23]:
# Assigning None into the int64 Series turns it into float64, with NaN
# standing in for the missing value.
ser.iloc[1] = None
ser

0    0.0
1    NaN
2    2.0
dtype: float64

In [24]:
# pd.isna flags the NaN entry.
pd.isna(pd.Series([1, np.nan, 2]))

0    False
1     True
2    False
dtype: bool

In [25]:
# pd.isna also flags pd.NA in an extension-typed Series; the result is a
# plain bool Series.
pd.isna(pd.Series([1, pd.NA, 2], dtype=pd.Int64Dtype()))

0    False
1     True
2    False
dtype: bool

### There's more



In [26]:
# Indexing with a nullable boolean mask: the <NA> position selects nothing,
# so only row 0 comes back.
ser = pd.Series(range(3), dtype=pd.Int64Dtype())
mask = pd.Series([True, pd.NA, False], dtype=pd.BooleanDtype())
ser[mask]

0    0
dtype: Int64

In [27]:
# Without an explicit dtype the mask is object-typed, and boolean indexing
# rejects it because it contains a missing value.  The error is the point of
# this cell, so it is caught and printed instead of being left to abort a
# "Restart Kernel -> Run All".
mask = pd.Series([True, None, False])
try:
    ser[mask]
except ValueError as err:
    print(f"{type(err).__name__}: {err}")

ValueError: Cannot mask with non-boolean array containing NA / NaN values

In [28]:
# Workaround for an object-dtype mask: cast to the nullable boolean dtype
# first so that fillna() works on a typed array, then convert to a plain
# NumPy bool mask.  Calling fillna() directly on the object Series relies on
# pandas' deprecated silent downcasting and emits a FutureWarning.
mask = pd.Series([True, None, False])
mask = mask.astype(pd.BooleanDtype()).fillna(False).astype(bool)
ser[mask]


0    0
dtype: Int64

## Categorical types



### How to do it



In [29]:
# String-typed source values used to build the categorical examples below.
values = ["foo", "bar", "baz"]
values_ser = pd.Series(values, dtype=pd.StringDtype())

In [30]:
# Converting with a default CategoricalDtype infers the categories from the
# data; they are listed as bar, baz, foo.
ser = values_ser.astype(pd.CategoricalDtype())
ser

0    foo
1    bar
2    baz
dtype: category
Categories (3, string): [bar, baz, foo]

In [31]:
# Passing the categories explicitly keeps them in the supplied order
# (foo, bar, baz).
cat = pd.CategoricalDtype(values_ser)
ser = pd.Series(values, dtype=cat)
ser

0    foo
1    bar
2    baz
dtype: category
Categories (3, string): [foo, bar, baz]

In [32]:
# Assigning a value that is already one of the categories is allowed.
ser.iloc[2] = "foo"
ser

0    foo
1    bar
2    foo
dtype: category
Categories (3, string): [foo, bar, baz]

In [33]:
# "qux" is not one of the declared categories, so the assignment is rejected.
# The error is the demonstration; it is caught and printed so that the
# notebook still runs top to bottom under "Restart Kernel -> Run All".
try:
    ser.iloc[2] = "qux"
except TypeError as err:
    print(f"{type(err).__name__}: {err}")

TypeError: Cannot setitem on a Categorical with a new category (qux), set the categories first

In [34]:
# ordered=True makes comparisons follow the category order S < M < L < XL,
# so `ser < "L"` is True only for the S and M rows.
shirt_sizes = pd.Series(["S", "M", "L", "XL"], dtype=pd.StringDtype())
cat = pd.CategoricalDtype(shirt_sizes, ordered=True)
ser = pd.Series(["XL", "L", "S", "L", "S", "M"], dtype=cat)
ser < "L"

0    False
1    False
2     True
3    False
4     True
5     True
dtype: bool

In [35]:
# Categorical dtype restricted to two accepted values, foo and bar.
accepted_values = pd.Series(["foo", "bar"], dtype=pd.StringDtype())
cat = pd.CategoricalDtype(accepted_values)
ser = pd.Series(["foo", "bar", "foo"], dtype=cat)
ser

0    foo
1    bar
2    foo
dtype: category
Categories (2, string): [foo, bar]

In [36]:
# Integer codes: each row's position within the categories (foo -> 0, bar -> 1).
ser.cat.codes

0    0
1    1
2    0
dtype: int8

In [37]:
# The category labels themselves, as an Index.
ser.cat.categories

Index(['foo', 'bar'], dtype='string')

In [38]:
# Memory reported for 300 values stored with the string dtype.
pd.Series(["foo", "bar", "baz"] * 100, dtype=pd.StringDtype()).memory_usage()

2528

In [39]:
# The same 300 values stored with the categorical dtype `cat`; the reported
# figure is far smaller than for the string dtype.
# NOTE(review): `cat` was last built from ["foo", "bar"] only, so "baz" is not
# one of its categories here - confirm that is intended for this comparison.
pd.Series(["foo", "bar", "baz"] * 100, dtype=cat).memory_usage()

552

### There's more



In [40]:
# With inferred (object) categories the missing entry is displayed as NaN.
pd.Series(["foo", "bar", pd.NA], dtype=pd.CategoricalDtype())

0    foo
1    bar
2    NaN
dtype: category
Categories (2, object): ['bar', 'foo']

In [41]:
# With string-typed categories the missing entry is displayed as <NA>.
values = pd.Series(["foo", "bar"], dtype=pd.StringDtype())
cat = pd.CategoricalDtype(values)
pd.Series(["foo", "bar", pd.NA], dtype=cat)

0     foo
1     bar
2    <NA>
dtype: category
Categories (2, string): [foo, bar]

## Temporal types - datetime



### How to do it



In [42]:
# Date-time strings parsed into nanosecond-resolution datetimes.
ser = pd.Series([
    "2024-01-01 00:00:00",
    "2024-01-02 00:00:01",
    "2024-01-03 00:00:02"
], dtype="datetime64[ns]")
ser

0   2024-01-01 00:00:00
1   2024-01-02 00:00:01
2   2024-01-03 00:00:02
dtype: datetime64[ns]

In [43]:
# Date-only strings; no time component is displayed.
ser = pd.Series([
    "2024-01-01",
    "2024-01-02",
    "2024-01-03"
], dtype="datetime64[ns]")
ser

0   2024-01-01
1   2024-01-02
2   2024-01-03
dtype: datetime64[ns]

In [44]:
# Once one element carries a time of day, every row is displayed with its
# time component.
ser.iloc[1] = "2024-01-04 00:00:42"
ser

0   2024-01-01 00:00:00
1   2024-01-04 00:00:42
2   2024-01-03 00:00:00
dtype: datetime64[ns]

In [45]:
# The .dt accessor exposes datetime components; here, the year.
ser.dt.year

0    2024
1    2024
2    2024
dtype: int32

In [46]:
# Month number of each timestamp.
ser.dt.month

0    1
1    1
2    1
dtype: int32

In [47]:
# Day of the month of each timestamp.
ser.dt.day

0    1
1    4
2    3
dtype: int32

In [48]:
# Day of the week as an integer; 2024-01-01 is reported as 0.
ser.dt.day_of_week

0    0
1    3
2    2
dtype: int32

In [49]:
# Time-zone-aware datetimes in UTC; each value is shown with a +00:00 offset.
pd.Series([
    "2024-01-01 00:00:01",
    "2024-01-02 00:00:01",
    "2024-01-03 00:00:01"
], dtype=pd.DatetimeTZDtype(tz="UTC"))

0   2024-01-01 00:00:01+00:00
1   2024-01-02 00:00:01+00:00
2   2024-01-03 00:00:01+00:00
dtype: datetime64[ns, UTC]

In [50]:
# A named time zone; these January timestamps are shown with a -05:00 offset.
pd.Series([
    "2024-01-01 00:00:01",
    "2024-01-02 00:00:01",
    "2024-01-03 00:00:01"
], dtype=pd.DatetimeTZDtype(tz="America/New_York"))

0   2024-01-01 00:00:01-05:00
1   2024-01-02 00:00:01-05:00
2   2024-01-03 00:00:01-05:00
dtype: datetime64[ns, America/New_York]

In [51]:
# A fixed UTC offset can be given instead of a named zone; the dtype is
# reported as datetime64[ns, UTC-05:00].
pd.Series([
    "2024-01-01 00:00:01",
    "2024-01-02 00:00:01",
    "2024-01-03 00:00:01"
], dtype=pd.DatetimeTZDtype(tz="-05:00"))

0   2024-01-01 00:00:01-05:00
1   2024-01-02 00:00:01-05:00
2   2024-01-03 00:00:01-05:00
dtype: datetime64[ns, UTC-05:00]

In [52]:
# tz_localize attaches a time zone to naive timestamps; the displayed
# wall-clock times are unchanged and gain a -05:00 offset.
ser_no_tz = pd.Series([
    "2024-01-01 00:00:00",
    "2024-01-01 00:01:10",
    "2024-01-01 00:02:42"
], dtype="datetime64[ns]")
ser_et = ser_no_tz.dt.tz_localize("America/New_York")
ser_et

0   2024-01-01 00:00:00-05:00
1   2024-01-01 00:01:10-05:00
2   2024-01-01 00:02:42-05:00
dtype: datetime64[ns, America/New_York]

In [53]:
# tz_convert re-expresses the same instants in another zone; the displayed
# times move three hours earlier, onto the previous calendar day.
ser_pt = ser_et.dt.tz_convert("America/Los_Angeles")
ser_pt

0   2023-12-31 21:00:00-08:00
1   2023-12-31 21:01:10-08:00
2   2023-12-31 21:02:42-08:00
dtype: datetime64[ns, America/Los_Angeles]

In [54]:
# normalize() resets the time of day to midnight and keeps the time zone.
ser_pt.dt.normalize()

0   2023-12-31 00:00:00-08:00
1   2023-12-31 00:00:00-08:00
2   2023-12-31 00:00:00-08:00
dtype: datetime64[ns, America/Los_Angeles]

In [55]:
# A missing datetime is displayed as NaT.
ser = pd.Series([
    "2024-01-01",
    None,
    "2024-01-03"
], dtype="datetime64[ns]")
ser

0   2024-01-01
1          NaT
2   2024-01-03
dtype: datetime64[ns]

In [56]:
# pd.isna flags the NaT entry.
pd.isna(ser)

0    False
1     True
2    False
dtype: bool

### There's more



In [57]:
# The year 1500 cannot be represented at nanosecond resolution, so
# construction fails.  The error is the demonstration; it is caught and
# printed so the notebook still runs under "Restart Kernel -> Run All".
try:
    pd.Series([
        "1500-01-01 00:00:01",
        "2500-01-01 00:00:01",
    ], dtype="datetime64[ns]")
except pd.errors.OutOfBoundsDatetime as err:
    print(f"{type(err).__name__}: {err}")

OutOfBoundsDatetime: Out of bounds nanosecond timestamp: 1500-01-01 00:00:01, at position 0

In [58]:
# At microsecond resolution the years 1500 and 2500 are both representable.
pd.Series([
    "1500-01-01 00:00:01",
    "2500-01-01 00:00:01",
], dtype="datetime64[us]")

0   1500-01-01 00:00:01
1   2500-01-01 00:00:01
dtype: datetime64[us]

## Temporal types - Timedelta



### How to do it



In [59]:
# Subtracting a Timestamp from datetimes yields timedelta64 values.
ser = pd.Series([
    "2024-01-01",
    "2024-01-02",
    "2024-01-03"
], dtype="datetime64[ns]")
ser - pd.Timestamp("2023-12-31 12:00:00")

0   0 days 12:00:00
1   1 days 12:00:00
2   2 days 12:00:00
dtype: timedelta64[ns]

In [60]:
# Adding a Timedelta shifts every datetime forward by three days.
ser + pd.Timedelta("3 days")

0   2024-01-04
1   2024-01-05
2   2024-01-06
dtype: datetime64[ns]

### There's more



In [61]:
# Timedelta strings using units from days down to nanoseconds.
pd.Series([
    "-1 days",
    "6 hours",
    "42 minutes",
    "12 seconds",
    "8 milliseconds",
    "4 microseconds",
    "300 nanoseconds",
], dtype="timedelta64[ns]")

0           -1 days +00:00:00
1             0 days 06:00:00
2             0 days 00:42:00
3             0 days 00:00:12
4      0 days 00:00:00.008000
5      0 days 00:00:00.000004
6   0 days 00:00:00.000000300
dtype: timedelta64[ns]

In [62]:
# "months" is not accepted as a timedelta unit, so construction fails.  The
# error is the demonstration; it is caught and printed so the notebook still
# runs under "Restart Kernel -> Run All".
try:
    pd.Series([
        "1 months",
    ], dtype="timedelta64[ns]")
except ValueError as err:
    print(f"{type(err).__name__}: {err}")

ValueError: invalid unit abbreviation: months

## Temporal PyArrow types



### How to do it



In [63]:
# PyArrow date32 dtype: values are displayed as calendar dates with no time
# component.
ser = pd.Series([
    "2024-01-01",
    "2024-01-02",
    "2024-01-03",
], dtype=pd.ArrowDtype(pa.date32()))
ser

0    2024-01-01
1    2024-01-02
2    2024-01-03
dtype: date32[day][pyarrow]

In [64]:
# Dates at the very end of year 9999 are representable with date32.
ser = pd.Series([
    "9999-12-29",
    "9999-12-30",
    "9999-12-31",
], dtype=pd.ArrowDtype(pa.date32()))
ser

0    9999-12-29
1    9999-12-30
2    9999-12-31
dtype: date32[day][pyarrow]

## PyArrow List types



In [65]:
# Small employee table used for the list-type example.
df = pd.DataFrame({
    "name": ["Alice", "Bob", "Janice", "Jim", "Michael"],
    "years_exp": [10, 2, 4, 8, 6],
})
df

Unnamed: 0,name,years_exp
0,Alice,10
1,Bob,2
2,Janice,4
3,Jim,8
4,Michael,6


### How to do it



In [66]:
# Each row holds either a list of strings or a missing value, stored with a
# PyArrow list-of-string dtype and attached to the frame as a new column.
ser = pd.Series([
    ["Bob", "Michael"],
    None,
    None,
    ["Janice"],
    None,
], dtype=pd.ArrowDtype(pa.list_(pa.string())))
df["direct_reports"] = ser
df

Unnamed: 0,name,years_exp,direct_reports
0,Alice,10,['Bob' 'Michael']
1,Bob,2,
2,Janice,4,
3,Jim,8,['Janice']
4,Michael,6,


\*



### There's more



In [67]:
# Length of each list; rows without a list stay <NA>.
ser.list.len()

0       2
1    <NA>
2    <NA>
3       1
4    <NA>
dtype: int32[pyarrow]

In [68]:
# First element of each list; rows without a list stay <NA>.
ser.list[0]

0       Bob
1      <NA>
2      <NA>
3    Janice
4      <NA>
dtype: string[pyarrow]

In [69]:
# Concatenate all list elements into a single Series; rows without a list
# contribute nothing.
ser.list.flatten()

0        Bob
1    Michael
2     Janice
dtype: string[pyarrow]

## PyArrow Decimal types



### How to do it



In [70]:
# String inputs are stored digit-for-digit as decimal128 with precision 19
# and scale 10.
pd.Series([
    "123456789.123456789",
    "-987654321.987654321",
    "99999999.9999999999",
], dtype=pd.ArrowDtype(pa.decimal128(19, 10)))

0     123456789.1234567890
1    -987654321.9876543210
2      99999999.9999999999
dtype: decimal128(19, 10)[pyarrow]

In [71]:
# The same numbers written as float literals: the stored decimals no longer
# match what was typed (note the changed trailing digits and the last value
# becoming 100000000).
pd.Series([
    123456789.123456789,
    -987654321.987654321,
    99999999.9999999999,
], dtype=pd.ArrowDtype(pa.decimal128(19, 10)))

0     123456789.1234567910
1    -987654321.9876543283
2     100000000.0000000000
dtype: decimal128(19, 10)[pyarrow]

In [72]:
# Python's decimal module compares the two values exactly, so they are
# reported as different.  (`decimal` is imported in the notebook's top
# import cell.)
decimal.Decimal("99999999.9999999999") == decimal.Decimal("100000000.0")

False

In [73]:
# Decimal addition keeps every digit of the operands.
decimal.Decimal("99999999.9999999999") + decimal.Decimal("100000000.0")

Decimal('199999999.9999999999')

In [74]:
# decimal.Decimal inputs are stored digit-for-digit, like the string inputs.
pd.Series([
    decimal.Decimal("123456789.123456789"),
    decimal.Decimal("-987654321.987654321"),
    decimal.Decimal("99999999.9999999999"),
], dtype=pd.ArrowDtype(pa.decimal128(19, 10)))

0     123456789.1234567890
1    -987654321.9876543210
2      99999999.9999999999
dtype: decimal128(19, 10)[pyarrow]

### There's more



In [75]:
# decimal256 with precision 76 holds a value with 36 integer digits.
ser = pd.Series([
    "123456789123456789123456789123456789.123456789"
], dtype=pd.ArrowDtype(pa.decimal256(76, 10)))
ser

0    123456789123456789123456789123456789.1234567890
dtype: decimal256(76, 10)[pyarrow]

## NumPy type system, the `object` type, and pitfalls



### How to do it



In [76]:
# Default inference for integers: NumPy int64.
pd.Series([0, 1, 2])

0    0
1    1
2    2
dtype: int64

In [77]:
# A missing value makes default inference fall back to float64 with NaN.
pd.Series([0, None, 2])

0    0.0
1    NaN
2    2.0
dtype: float64

In [78]:
# The built-in int dtype cannot hold a missing value, so construction fails.
# The error is the demonstration; it is caught and printed so the notebook
# still runs under "Restart Kernel -> Run All".
try:
    pd.Series([0, None, 2], dtype=int)
except TypeError as err:
    print(f"{type(err).__name__}: {err}")

TypeError: int() argument must be a string, a bytes-like object or a number, not 'NoneType'

In [79]:
# Common workaround: replace the missing value with 0, then cast to int.
ser = pd.Series([0, None, 2])
ser.fillna(0).astype(int)

0    0
1    0
2    2
dtype: int64

In [80]:
# The filled-in 0 counts towards the mean: (0 + 0 + 2) / 3.
pd.Series([0, None, 2]).fillna(0).astype(int).mean()

0.6666666666666666

In [81]:
# With the nullable dtype the missing value is left out: mean of 0 and 2.
pd.Series([0, None, 2], dtype=pd.Int64Dtype()).mean()

1.0

In [82]:
# Time 10,000 runs of the fillna/astype workaround.  The figure shown is
# elapsed seconds and depends on the machine.  (`timeit` is imported in the
# notebook's top import cell.)
func = lambda: pd.Series([0, None, 2]).fillna(0).astype(int).mean()
timeit.timeit(func, number=10_000)

1.1126371330028633

In [83]:
# Time 10,000 runs of the same calculation using the nullable Int64 dtype.
# The figure shown is elapsed seconds and depends on the machine.
func = lambda: pd.Series([0, None, 2], dtype=pd.Int64Dtype()).mean()
timeit.timeit(func, number=10_000)

0.6666043780060136

In [84]:
# Default inference for booleans: NumPy bool.
pd.Series([True, False])

0     True
1    False
dtype: bool

In [85]:
# Adding None makes default inference fall back to the object dtype.
pd.Series([True, False, None])

0     True
1    False
2     None
dtype: object

In [86]:
# The object dtype accepts any mix of Python objects: booleans, None,
# strings and lists.
pd.Series([True, False, None, "one of these things", ["is not like"], ["the other"]])

0                   True
1                  False
2                   None
3    one of these things
4          [is not like]
5            [the other]
dtype: object

In [87]:
# The nullable boolean dtype keeps the column boolean and shows None as <NA>.
pd.Series([True, False, None], dtype=pd.BooleanDtype())

0     True
1    False
2     <NA>
dtype: boolean

In [88]:
# Default inference for strings here is the object dtype.
pd.Series(["foo", "bar", "baz"])

0    foo
1    bar
2    baz
dtype: object

In [89]:
# The object dtype lets a non-string value be assigned into a column of
# strings without complaint.
ser = pd.Series(["foo", "bar", "baz"])
ser.iloc[2] = 42
ser

0    foo
1    bar
2     42
dtype: object

In [90]:
# The string dtype rejects a non-string assignment.  The error is the
# demonstration; it is caught and printed so the notebook still runs under
# "Restart Kernel -> Run All".
ser = pd.Series(["foo", "bar", "baz"], dtype=pd.StringDtype())
try:
    ser.iloc[2] = 42
except TypeError as err:
    print(f"{type(err).__name__}: {err}")

TypeError: Cannot set non-string value '42' into a StringArray.

### There's more



In [91]:
# A list mixing an int, a string, a list and a dict becomes an object Series.
alist = [42, "foo", ["sub", "list"], {"key": "value"}]
ser = pd.Series(alist)
ser

0                  42
1                 foo
2         [sub, list]
3    {'key': 'value'}
dtype: object

In [92]:
# Three-column frame (string, integer, float) built with default dtype
# inference.
df = pd.DataFrame([
    ["foo", 1, 123.45],
    ["bar", 2, 333.33],
    ["baz", 3, 999.99],
], columns=list("abc"))
df

Unnamed: 0,a,b,c
0,foo,1,123.45
1,bar,2,333.33
2,baz,3,999.99


In [93]:
# Inferred dtypes: object, int64 and float64.
df.dtypes

a     object
b      int64
c    float64
dtype: object

In [94]:
# Convert each column to its pandas extension dtype through an explicit
# column-to-dtype mapping.
df.astype({
    "a": pd.StringDtype(),
    "b": pd.Int64Dtype(),
    "c": pd.Float64Dtype(),
}).dtypes

a    string[python]
b             Int64
c           Float64
dtype: object

In [95]:
# convert_dtypes arrives at the same three extension dtypes without an
# explicit mapping.
df.convert_dtypes(dtype_backend="numpy_nullable").dtypes

a    string[python]
b             Int64
c           Float64
dtype: object