Selection and Assignment
========================



In [5]:
import timeit

import numpy as np
import pandas as pd
import pyarrow as pa

## Basic selection from a pd.Series



### How to do it



In [13]:
# Nine-element Series of repeating letters with the default 0..8 integer index.
ser = pd.Series(list("abc") * 3)
ser

0    a
1    b
2    c
3    a
4    b
5    c
6    a
7    b
8    c
dtype: object

In [3]:
# A scalar key returns the single matching value.
ser[3]

'a'

In [7]:
# A one-element list key returns a Series instead of a scalar.
ser[[3]]

3    a
dtype: object

In [5]:
# A list of keys selects several elements at once.
ser[[0, 2]]

0    a
2    c
dtype: object

In [14]:
# Slice: the first three elements.
ser[:3]

0    a
1    b
2    c
dtype: object

In [7]:
# Negative slice start: the last four elements.
ser[-4:]

5    c
6    a
7    b
8    c
dtype: object

In [8]:
# Slice with start and stop; the element at the stop position (6) is excluded.
ser[2:6]

2    c
3    a
4    b
5    c
dtype: object

In [None]:
# Slice with a step: every third element starting at position 1 and stopping before 8.
ser[1:8:3]

1    b
4    b
7    b
dtype: object

In [10]:
# Series whose index holds string labels instead of the default integers.
ser = pd.Series(range(3), index=["Jack", "Jill", "Jayne"])
ser

Jack     0
Jill     1
Jayne    2
dtype: int64

In [11]:
# With a string index, [] looks the key up by label.
ser["Jill"]

1

In [12]:
# List key: the same label lookup, but the result stays a Series.
ser[["Jill"]]

Jill    1
dtype: int64

### There's more…



In [8]:
# Integer index whose labels (2, 42, 21) differ from the positions 0..2.
ser = pd.Series(list("abc"), index=[2, 42, 21])
ser

2     a
42    b
21    c
dtype: object

In [11]:
# An integer scalar is matched against the index labels here: label 2 holds 'a'
# (position 2 would be 'c').
ser[2]

'a'

In [10]:
# An integer slice, by contrast, is positional: the first two elements are returned,
# not "everything up to label 2".
ser[:2]

2     a
42    b
dtype: object

In [16]:
# Index with a duplicated label: 1 appears twice.
ser = pd.Series(["apple", "banana", "orange"], index=[0, 1, 1])
ser

0     apple
1    banana
1    orange
dtype: object

In [17]:
# Label 1 matches two rows, so a Series comes back rather than a scalar.
ser[1]

1    banana
1    orange
dtype: object

## Basic selection from a pd.DataFrame



### How to do it



In [18]:
# 3x3 frame holding 0..8, with columns "a", "b" and "c".
df = pd.DataFrame(np.arange(9).reshape(3, -1), columns=["a", "b", "c"])
df

Unnamed: 0,a,b,c
0,0,1,2
1,3,4,5
2,6,7,8


In [19]:
# A single column label returns that column as a Series.
df["a"]

0    0
1    3
2    6
Name: a, dtype: int64

In [20]:
# A one-element list of labels returns a one-column DataFrame.
df[["a"]]

Unnamed: 0,a
0,0
1,3
2,6


In [21]:
# A list of labels selects several columns.
df[["a", "b"]]

Unnamed: 0,a,b
0,0,1
1,3,4
2,6,7


In [22]:
# A slice inside [] selects rows (here the first two), not columns.
df[:2]

Unnamed: 0,a,b,c
0,0,1,2
1,3,4,5


### There's more…



In [23]:
# Columns are returned in the order given in the list.
df[["a", "b"]]

Unnamed: 0,a,b
0,0,1
1,3,4
2,6,7


In [24]:
# Listing the labels in a different order returns the columns in that order.
df[["b", "a"]]

Unnamed: 0,b,a
0,1,0
1,4,3
2,7,6


## Position-based selection of a pd.Series



### How to do it



In [25]:
# Series whose index contains the label 1 twice.
ser = pd.Series(["apple", "banana", "orange"], index=[0, 1, 1])
ser

0     apple
1    banana
1    orange
dtype: object

In [26]:
# .iloc selects by position: the element at position 1, whatever the index labels are.
ser.iloc[1]

'banana'

In [27]:
# A list of positions returns a Series.
ser.iloc[[1]]

1    banana
dtype: object

In [28]:
# Positions 0 and 2.
ser.iloc[[0, 2]]

0     apple
1    orange
dtype: object

In [29]:
# Positional slice: the first two elements (the stop position is excluded).
ser.iloc[:2]

0     apple
1    banana
dtype: object

## Position-based selection of a pd.DataFrame



### How to do it



In [30]:
# 5x4 frame holding 0..19, with columns "a" through "d".
df = pd.DataFrame(np.arange(20).reshape(5, -1), columns=list("abcd"))
df

Unnamed: 0,a,b,c,d
0,0,1,2,3
1,4,5,6,7
2,8,9,10,11
3,12,13,14,15
4,16,17,18,19


In [31]:
# (row position, column position) returns a single value.
df.iloc[2, 2]

10

In [32]:
# All rows of the first column, returned as a Series.
df.iloc[:, 0]

0     0
1     4
2     8
3    12
4    16
Name: a, dtype: int64

In [33]:
# The first row across all columns, returned as a Series.
df.iloc[0, :]

a    0
b    1
c    2
d    3
Name: 0, dtype: int64

In [34]:
# A list for the column selector keeps the result a DataFrame.
df.iloc[:, [0]]

Unnamed: 0,a
0,0
1,4
2,8
3,12
4,16


In [35]:
# A list for the row selector keeps the result a DataFrame.
df.iloc[[0], :]

Unnamed: 0,a,b,c,d
0,0,1,2,3


In [36]:
# Lists on both axes; negative positions count from the end and the listed
# order is kept (column "d" first, then "c").
df.iloc[[0, 1], [-1, -2]]

Unnamed: 0,d,c
0,3,2
1,7,6


### There's more…



## Label-based selection from a pd.Series



### How to do it



In [37]:
# Series whose index contains the label 1 twice.
ser = pd.Series(["apple", "banana", "orange"], index=[0, 1, 1])
ser

0     apple
1    banana
1    orange
dtype: object

In [38]:
# .loc selects by label: both rows labelled 1 are returned.
ser.loc[1]

1    banana
1    orange
dtype: object

In [39]:
# Named Series with string labels.
ser = pd.Series([2, 2, 4], index=["dog", "cat", "human"], name="num_legs")
ser

dog      2
cat      2
human    4
Name: num_legs, dtype: int64

In [40]:
# A scalar label returns a scalar.
ser.loc["dog"]

2

In [41]:
# A list of labels returns a Series.
ser.loc[["dog", "cat"]]

dog    2
cat    2
Name: num_legs, dtype: int64

In [42]:
# Label slices include the stop label: "cat" is part of the result.
ser.loc[:"cat"]

dog    2
cat    2
Name: num_legs, dtype: int64

### There's more…



In [43]:
# The same three names as a plain Python list and as a Series with the default integer index.
values = ["Jack", "Jill", "Jayne"]
ser = pd.Series(values)
ser

0     Jack
1     Jill
2    Jayne
dtype: object

In [44]:
# Python list slicing excludes the stop position.
values[:2]

['Jack', 'Jill']

In [45]:
# .iloc slicing likewise excludes the stop position: only positions 0 and 1.
ser.iloc[:2]

0    Jack
1    Jill
dtype: object

In [46]:
# .loc slicing is by label and includes the stop label 2.
ser.loc[:2]

0     Jack
1     Jill
2    Jayne
dtype: object

In [47]:
# The stop label 2 appears twice in a row: both rows are included and the slice
# ends there, so the trailing row labelled 0 is not picked up.
repeats_2 = pd.Series(range(5), index=[0, 1, 2, 2, 0])
repeats_2.loc[:2]

0    0
1    1
2    2
2    3
dtype: int64

In [48]:
# Duplicate stop labels that sit next to each other: the slice extends through both.
ser = pd.Series(range(4), index=["zzz", "xxx", "xxx", "yyy"])
ser.loc[:"xxx"]

zzz    0
xxx    1
xxx    2
dtype: int64

In [49]:
# Duplicate stop labels that are NOT adjacent: the end of the slice is ambiguous
# and pandas raises KeyError (the error is the point of this cell).
ser = pd.Series(range(4), index=["zzz", "xxx", "yyy", "xxx"])
ser.loc[:"xxx"]

KeyError: "Cannot get right slice bound for non-unique label: 'xxx'"

## Label-based selection from a pd.DataFrame



### How to do it



In [50]:
# Three people, one attribute per column; the names serve as row labels.
df = pd.DataFrame(
    {
        "age": [24, 42, 22],
        "height_cm": [180, 166, 160],
        "eye_color": ["blue", "brown", "green"],
    },
    index=["Jack", "Jill", "Jayne"],
)
df

Unnamed: 0,age,height_cm,eye_color
Jack,24,180,blue
Jill,42,166,brown
Jayne,22,160,green


In [51]:
# (row label, column label) returns a single value.
df.loc["Jayne", "eye_color"]

'green'

In [52]:
# All rows of the "age" column, returned as a Series.
df.loc[:, "age"]

Jack     24
Jill     42
Jayne    22
Name: age, dtype: int64

In [53]:
# The "Jack" row across all columns, returned as a Series
# (dtype object: the row mixes numbers and a string).
df.loc["Jack", :]

age            24
height_cm     180
eye_color    blue
Name: Jack, dtype: object

In [54]:
# A list for the column selector keeps the result a DataFrame.
df.loc[:, ["age"]]

Unnamed: 0,age
Jack,24
Jill,42
Jayne,22


In [55]:
# A list for the row selector keeps the result a DataFrame.
df.loc[["Jack"], :]

Unnamed: 0,age,height_cm,eye_color
Jack,24,180,blue


In [56]:
# Lists of labels on both axes.
df.loc[["Jack", "Jill"], ["age", "eye_color"]]

Unnamed: 0,age,eye_color
Jack,24,blue
Jill,42,brown


## Mixing position-based and label-based selection



### How to do it



In [57]:
# Same people data, this time with the default integer row index.
df = pd.DataFrame(
    {
        "age": [24, 42, 22],
        "height_cm": [180, 166, 160],
        "eye_color": ["blue", "brown", "green"],
    }
)

df

Unnamed: 0,age,height_cm,eye_color
0,24,180,blue
1,42,166,brown
2,22,160,green


In [58]:
# Translate column labels into their integer positions.
col_idxer = df.columns.get_indexer(["age", "eye_color"])
col_idxer

array([0, 2])

In [59]:
# Row positions and the looked-up column positions combined in a single .iloc call.
df.iloc[[0, 1], col_idxer]

Unnamed: 0,age,eye_color
0,24,blue
1,42,brown


### There's more…



In [60]:
# Alternative: select the columns by label first, then the rows by position.
df[["age", "eye_color"]].iloc[[0, 1]]

Unnamed: 0,age,eye_color
0,24,blue
1,42,brown


In [61]:
def get_indexer_approach():
    """Select rows 0-1 of the "age" and "eye_color" columns with one .iloc call.

    Column labels are first translated to integer positions with
    ``df.columns.get_indexer``. Reads the notebook-level ``df``; the selection
    result is discarded because the function exists only to be timed.
    """
    col_idxer = df.columns.get_indexer(["age", "eye_color"])
    df.iloc[[0, 1], col_idxer]

# Total seconds taken by 10,000 calls.
timeit.timeit(get_indexer_approach, number=10_000)

1.8550945819952176

In [62]:
def two_step_approach():
    """Select the two columns by label, then rows 0-1 by position.

    Reads the notebook-level ``df`` and returns the selected frame.
    """
    return df[["age", "eye_color"]].iloc[[0, 1]]

# Total seconds taken by 10,000 calls.
timeit.timeit(two_step_approach, number=10_000)

2.0871516990009695

## pd.DataFrame.filter



### How to do it



In [63]:
# People data keyed by name: one column per attribute.
df = pd.DataFrame(
    {
        "age": [24, 42, 22],
        "height_cm": [180, 166, 160],
        "eye_color": ["blue", "brown", "green"],
    },
    index=["Jack", "Jill", "Jayne"],
)
df

Unnamed: 0,age,height_cm,eye_color
Jack,24,180,blue
Jill,42,166,brown
Jayne,22,160,green


In [64]:
# With no axis given, the list is matched against the column labels.
df.filter(["age", "eye_color"])

Unnamed: 0,age,eye_color
Jack,24,blue
Jill,42,brown
Jayne,22,green


In [65]:
# axis=0 matches the list against the row labels instead.
df.filter(["Jack", "Jill"], axis=0)

Unnamed: 0,age,height_cm,eye_color
Jack,24,180,blue
Jill,42,166,brown


In [66]:
# like= keeps the labels that contain the given substring.
df.filter(like="_")

Unnamed: 0,height_cm,eye_color
Jack,180,blue
Jill,166,brown
Jayne,160,green


In [67]:
# regex= keeps the row labels matching the pattern: names that start with "Ja"
# and do not end in "e" (the negative lookbehind excludes "Jayne").
df.filter(regex=r"^Ja.*(?<!e)$", axis=0)

Unnamed: 0,age,height_cm,eye_color
Jack,24,180,blue


## Selection by data type



### How to do it



In [68]:
# One column per data type: integers, floats and strings.
df = pd.DataFrame(
    {
        "int_col": [0, 4],
        "float_col": [1.0, 8.0],
        "string_col": ["2", "16"],
    }
)
df

Unnamed: 0,int_col,float_col,string_col
0,0,1.0,2
1,4,8.0,16


In [69]:
# Keep only the integer columns.
df.select_dtypes("int")

Unnamed: 0,int_col
0,0
1,4


In [70]:
# include= accepts several dtypes at once.
df.select_dtypes(include=["int", "float"])

Unnamed: 0,int_col,float_col
0,0,1.0
1,4,8.0


In [71]:
# exclude= drops the listed dtypes and keeps everything else.
df.select_dtypes(exclude=["int", "float"])

Unnamed: 0,string_col
0,2
1,16


## Selection / filtering via Boolean arrays



### How to do it



In [72]:
# A Boolean mask with one entry per element of the Series built below.
mask = [True, False, True]
ser = pd.Series(range(3))
ser

0    0
1    1
2    2
dtype: int64

In [73]:
# [] keeps the elements where the mask is True.
ser[mask]

0    0
2    2
dtype: int64

In [74]:
# .loc accepts the same mask and gives the same result.
ser.loc[mask]

0    0
2    2
dtype: int64

In [75]:
# 3x2 frame with default integer labels; a Boolean list inside [] filters its rows.
df = pd.DataFrame(np.arange(6).reshape(3, -1))
df[mask]

Unnamed: 0,0,1
0,0,1
2,4,5


In [76]:
# .loc takes one mask per axis: rows first, then columns.
col_mask = [True, False]
df.loc[mask, col_mask]

Unnamed: 0,0
0,0
2,4


### There's more…



In [77]:
# People data keyed by name, used to build Boolean masks from column comparisons.
df = pd.DataFrame(
    {
        "age": [24, 42, 22],
        "height_cm": [180, 166, 160],
        "eye_color": ["blue", "brown", "green"],
    },
    index=["Jack", "Jill", "Jayne"],
)
df

Unnamed: 0,age,height_cm,eye_color
Jack,24,180,blue
Jill,42,166,brown
Jayne,22,160,green


In [78]:
# Comparing a column with a value yields a Boolean Series indexed like the frame.
blue_eyes = df["eye_color"] == "blue"
blue_eyes

Jack      True
Jill     False
Jayne    False
Name: eye_color, dtype: bool

In [79]:
# A second mask: True where the eye color is "green".
green_eyes = df["eye_color"] == "green"
green_eyes

Jack     False
Jill     False
Jayne     True
Name: eye_color, dtype: bool

In [80]:
# | combines two masks element-wise (logical OR).
mask = blue_eyes | green_eyes
mask

Jack      True
Jill     False
Jayne     True
Name: eye_color, dtype: bool

In [81]:
# Keep the rows where the combined mask is True.
df[mask]

Unnamed: 0,age,height_cm,eye_color
Jack,24,180,blue
Jayne,22,160,green


In [82]:
# Mask: True where age is below 40.
age_lt_40 = df["age"] < 40
age_lt_40

Jack      True
Jill     False
Jayne     True
Name: age, dtype: bool

In [83]:
# Mask: True where height exceeds 170 cm.
height_gt_170 = df["height_cm"] > 170
height_gt_170

Jack      True
Jill     False
Jayne    False
Name: height_cm, dtype: bool

In [84]:
# & combines masks element-wise (logical AND): rows that satisfy both conditions.
df[age_lt_40 & height_gt_170]

Unnamed: 0,age,height_cm,eye_color
Jack,24,180,blue


In [85]:
# ~ inverts a mask; the parentheses make it apply to the whole AND expression.
df[~(age_lt_40 & height_gt_170)]

Unnamed: 0,age,height_cm,eye_color
Jill,42,166,brown
Jayne,22,160,green


## Selection with a pd.MultiIndex - single level



### How to do it



In [86]:
# Two-level index (first_name, last_name) built from one array per level.
index = pd.MultiIndex.from_arrays(
    [
        ["John", "John", "Jane", "Stephen"],
        ["Smith", "Doe", "Doe", "Smith"],
    ],
    names=["first_name", "last_name"],
)
ser = pd.Series(range(4), index=index)
ser

first_name  last_name
John        Smith        0
            Doe          1
Jane        Doe          2
Stephen     Smith        3
dtype: int64

In [87]:
# A first-level label selects its rows and drops that level from the result.
ser.loc["John"]

last_name
Smith    0
Doe      1
dtype: int64

In [88]:
# A list key keeps the first level in the result.
ser.loc[["John"]]

first_name  last_name
John        Smith        0
            Doe          1
dtype: int64

## Selection with a pd.MultiIndex - multiple levels



### How to do it



In [89]:
# Series indexed by (first_name, last_name); each level is given as its own array.
index = pd.MultiIndex.from_arrays(
    [
        ["John", "John", "Jane", "Stephen"],
        ["Smith", "Doe", "Doe", "Smith"],
    ],
    names=["first_name", "last_name"],
)
ser = pd.Series(range(4), index=index)
ser

first_name  last_name
John        Smith        0
            Doe          1
Jane        Doe          2
Stephen     Smith        3
dtype: int64

In [90]:
# A tuple with one label per level selects a single value.
ser.loc[("Jane", "Doe")]

2

In [91]:
# Wrapping a tuple member in a list returns a Series that keeps both index levels.
ser.loc[(["Jane"], "Doe")]

first_name  last_name
Jane        Doe          2
dtype: int64

In [92]:
# A list of tuples selects several exact (first_name, last_name) pairs.
ser.loc[[("John", "Smith"), ("Jane", "Doe")]]

first_name  last_name
John        Smith        0
Jane        Doe          2
dtype: int64

In [93]:
# slice(None) selects everything on the first level; the scalar "Doe" on the
# second level drops that level from the result.
ser.loc[(slice(None), "Doe")]

first_name
John    1
Jane    2
dtype: int64

In [94]:
# With a list for the second level, both levels are kept in the result.
ser.loc[(slice(None), ["Doe"])]

first_name  last_name
John        Doe          1
Jane        Doe          2
dtype: int64

In [95]:
# For comparison with plain Python: [:] on a list selects every element.
alist = list("abc")
alist[:]

['a', 'b', 'c']

In [96]:
# slice(None) produces the same result as [:].
alist[slice(None)]

['a', 'b', 'c']

### There's more…



In [97]:
# Every first name with last name "Doe", written with an explicit slice(None).
ser.loc[(slice(None), ["Doe"])]

first_name  last_name
John        Doe          1
Jane        Doe          2
dtype: int64

In [98]:
# pd.IndexSlice allows the colon syntax inside .loc; the result matches the slice(None) form.
ixsl = pd.IndexSlice
ser.loc[ixsl[:, ["Doe"]]]

first_name  last_name
John        Doe          1
Jane        Doe          2
dtype: int64

## Selection with a pd.MultiIndex - pd.DataFrame



### How to do it



In [99]:
# Rows are indexed by (first_name, last_name) and columns by (art_type, category);
# each MultiIndex is built from one array per level.
row_index = pd.MultiIndex.from_arrays(
    [
        ["John", "John", "Jane", "Stephen"],
        ["Smith", "Doe", "Doe", "Smith"],
    ],
    names=["first_name", "last_name"],
)
col_index = pd.MultiIndex.from_arrays(
    [
        ["music", "music", "art"],
        ["favorite", "last_seen_live", "favorite"],
    ],
    names=["art_type", "category"],
)
df = pd.DataFrame(
    [
        ["Swift", "Swift", "Matisse"],
        ["Mozart", "T. Swift", "Van Gogh"],
        ["Beatles", "Wonder", "Warhol"],
        ["Jackson", "Dylan", "Picasso"],
    ],
    index=row_index,
    columns=col_index,
)
df

Unnamed: 0_level_0,art_type,music,music,art
Unnamed: 0_level_1,category,favorite,last_seen_live,favorite
first_name,last_name,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2
John,Smith,Swift,Swift,Matisse
John,Doe,Mozart,T. Swift,Van Gogh
Jane,Doe,Beatles,Wonder,Warhol
Stephen,Smith,Jackson,Dylan,Picasso


In [100]:
# One selector tuple per axis: rows whose last_name is "Smith" and
# columns whose category is "favorite", with any value on the first level.
row_idxer = (slice(None), "Smith")
col_idxer = (slice(None), "favorite")
df.loc[row_idxer, col_idxer]

Unnamed: 0_level_0,art_type,music,art
Unnamed: 0_level_1,category,favorite,favorite
first_name,last_name,Unnamed: 2_level_2,Unnamed: 3_level_2
John,Smith,Swift,Matisse
Stephen,Smith,Jackson,Picasso


In [101]:
# The same selection with both selector tuples written inline.
df.loc[(slice(None), "Smith"), (slice(None), "favorite")]

Unnamed: 0_level_0,art_type,music,art
Unnamed: 0_level_1,category,favorite,favorite
first_name,last_name,Unnamed: 2_level_2,Unnamed: 3_level_2
John,Smith,Swift,Matisse
Stephen,Smith,Jackson,Picasso


## Item Assignment with .loc and .iloc



### How to do it



In [102]:
# Fresh Series with the labels "a", "b" and "c".
ser = pd.Series(range(3), index=list("abc"))

In [103]:
# Assign through .loc by label; ser is modified in place.
ser.loc["b"] = 42
ser

a     0
b    42
c     2
dtype: int64

In [104]:
# Assign through .iloc by position; ser is modified in place.
ser.iloc[2] = -42
ser

a     0
b    42
c   -42
dtype: int64

### There's more…



## pd.DataFrame column assignment



### How to do it



In [105]:
# Single-column frame to which new columns are added in the following cells.
df = pd.DataFrame({"col1": [1, 2, 3]})
df

Unnamed: 0,col1
0,1
1,2
2,3


In [106]:
# A scalar is broadcast to every row of the new column.
df["new_column1"] = 42
df

Unnamed: 0,col1,new_column1
0,1,42
1,2,42
2,3,42


In [107]:
# A list supplies one value per row.
df["new_column2"] = list("abc")
df

Unnamed: 0,col1,new_column1,new_column2
0,1,42,a
1,2,42,b
2,3,42,c


In [108]:
# A Series works as well; its default 0..2 index matches the frame's index here.
df["new_column3"] = pd.Series(["dog", "cat", "human"])
df

Unnamed: 0,col1,new_column1,new_column2,new_column3
0,1,42,a,dog
1,2,42,b,cat
2,3,42,c,human


In [109]:
# A list of the wrong length is rejected with ValueError (the error is the point of this cell).
df["should_fail"] = ["too few", "rows"]

ValueError: Length of values (2) does not match length of index (3)

In [110]:
# Frame with a two-level index on both axes, used to demonstrate assigning a
# new column under a MultiIndex. Each index is built from one array per level.
row_index = pd.MultiIndex.from_arrays(
    [
        ["John", "John", "Jane", "Stephen"],
        ["Smith", "Doe", "Doe", "Smith"],
    ],
    names=["first_name", "last_name"],
)
col_index = pd.MultiIndex.from_arrays(
    [
        ["music", "music", "art"],
        ["favorite", "last_seen_live", "favorite"],
    ],
    names=["art_type", "category"],
)
df = pd.DataFrame(
    [
        ["Swift", "Swift", "Matisse"],
        ["Mozart", "T. Swift", "Van Gogh"],
        ["Beatles", "Wonder", "Warhol"],
        ["Jackson", "Dylan", "Picasso"],
    ],
    index=row_index,
    columns=col_index,
)
df

Unnamed: 0_level_0,art_type,music,music,art
Unnamed: 0_level_1,category,favorite,last_seen_live,favorite
first_name,last_name,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2
John,Smith,Swift,Swift,Matisse
John,Doe,Mozart,T. Swift,Van Gogh
Jane,Doe,Beatles,Wonder,Warhol
Stephen,Smith,Jackson,Dylan,Picasso


In [111]:
# Assigning to a (level-0, level-1) tuple that does not exist yet adds a new
# column under the existing "art" group.
# NOTE(review): "museuems_seen" looks like a typo for "museums_seen" - confirm,
# and rename it together with the stored output if so.
df.loc[:, ("art", "museuems_seen")] = [1, 2, 4, 8]
df

Unnamed: 0_level_0,art_type,music,music,art,art
Unnamed: 0_level_1,category,favorite,last_seen_live,favorite,museuems_seen
first_name,last_name,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2
John,Smith,Swift,Swift,Matisse,1
John,Doe,Mozart,T. Swift,Van Gogh,2
Jane,Doe,Beatles,Wonder,Warhol,4
Stephen,Smith,Jackson,Dylan,Picasso,8


### There's more…



In [112]:
# Small 2x2 numeric frame used by the method-chaining cells that follow.
df = pd.DataFrame([[0, 1], [2, 4]], columns=list("ab"))
df

Unnamed: 0,a,b
0,0,1
1,2,4


In [113]:
# Method chain: multiply every value by 2, then add 42. The result is only
# displayed; df itself is not reassigned.
(
    df
    .mul(2)
    .add(42)
)

Unnamed: 0,a,b
0,42,44
1,46,50


In [114]:
# The same two steps stored in df2 so that a column can be added by plain
# assignment; the new column is computed from the already-transformed "b".
df2 = (
    df
    .mul(2)
    .add(42)
)
df2["assigned_c"] = df2["b"] - 3
df2

Unnamed: 0,a,b,assigned_c
0,42,44,41
1,46,50,47


In [115]:
# .assign adds the column without leaving the chain. The lambda is called with
# the frame produced by the preceding steps (not the original df), so its
# parameter gets its own name to avoid shadowing the outer variable.
(
    df
    .mul(2)
    .add(42)
    .assign(chained_c=lambda transformed: transformed["b"] - 3)
)

Unnamed: 0,a,b,chained_c
0,42,44,41
1,46,50,47
