In [1]:
import numpy as np
import pandas as pd

In [2]:
# Draw the random columns from a seeded local generator so that this frame, and
# every result derived from it, is identical on each Restart & Run All.
# (Outputs recorded before the seed was added will differ until re-run.)
rng = np.random.default_rng(12345)

# key1 contains missing labels (None) and key2 is a nullable integer column
# with one missing value, so the grouping examples can show NA handling.
df = pd.DataFrame({"key1": ["a", "a", None, "b", "b", "a", None],
                   "key2": pd.Series([1, 2, 1, 2, 1, None, 1], dtype="Int64"),
                   "data1": rng.standard_normal(7),
                   "data2": rng.standard_normal(7)})

df

Unnamed: 0,key1,key2,data1,data2
0,a,1.0,-1.661697,-1.853987
1,a,2.0,-1.157542,-0.714435
2,,1.0,0.47605,-0.568912
3,b,2.0,-0.025858,-0.518657
4,b,1.0,-0.668069,1.258287
5,a,,-0.565034,2.367741
6,,1.0,-1.448914,0.059057


In [3]:
# Split the data1 column into groups labelled by the matching key1 values.
# Nothing is computed yet; the result is a lazy SeriesGroupBy object.
grouped = df["data1"].groupby(by=df["key1"])
grouped

<pandas.core.groupby.generic.SeriesGroupBy object at 0x000001F2E0AB53D0>

In [4]:
# Average data1 within each key1 group. Rows whose key1 is None belong to no
# group, so only "a" and "b" appear in the result.
grouped.mean()

key1
a   -1.128091
b   -0.346963
Name: data1, dtype: float64

In [6]:
# Group data1 by the (key1, key2) pair and average each group; the result is a
# Series carrying a two-level (key1, key2) index.
means = (
    df["data1"]
    .groupby([df["key1"], df["key2"]])
    .mean()
)
means

key1  key2
a     1      -1.661697
      2      -1.157542
b     1      -0.668069
      2      -0.025858
Name: data1, dtype: float64

In [7]:
# Pivot the inner index level (key2) into columns, leaving key1 as the rows.
means.unstack()

key2,1,2
key1,Unnamed: 1_level_1,Unnamed: 2_level_1
a,-1.661697,-1.157542
b,-0.668069,-0.025858


In [8]:
# Group keys do not have to come from the frame itself: any arrays or lists
# with one label per row of df work. Here each row gets a (state, year) label.
states = np.array(
    ["OH", "CA", "CA", "OH", "OH", "CA", "OH"]
)
years = [
    2005, 2005, 2006, 2005, 2006, 2005, 2006,
]

(
    df["data1"]
    .groupby([states, years])
    .mean()
)

CA  2005   -0.861288
    2006    0.476050
OH  2005   -0.843777
    2006   -1.058491
Name: data1, dtype: float64

In [10]:
# Group the whole frame by the key1 column and average every remaining column
# (key2, data1 and data2 are all numeric).
df.groupby("key1").mean()

Unnamed: 0_level_0,key2,data1,data2
key1,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
a,1.5,-1.128091,-0.066893
b,1.5,-0.346963,0.369815


In [11]:
# key1 holds strings, which cannot be averaged, so restrict the mean to the
# numeric columns when grouping by key2.
df.groupby("key2").mean(numeric_only=True)

Unnamed: 0_level_0,data1,data2
key2,Unnamed: 1_level_1,Unnamed: 2_level_1
1,-0.825657,-0.276389
2,-0.5917,-0.616546


In [12]:
# Group by the (key1, key2) pair and average the data columns; rows with a
# missing value in either key are left out of the result.
df.groupby(["key1", "key2"]).mean()

Unnamed: 0_level_0,Unnamed: 1_level_0,data1,data2
key1,key2,Unnamed: 2_level_1,Unnamed: 3_level_1
a,1,-1.661697,-1.853987
a,2,-1.157542,-0.714435
b,1,-0.668069,1.258287
b,2,-0.025858,-0.518657


In [13]:
# Number of rows in each (key1, key2) group. Groups with a missing key are
# not reported by default.
df.groupby(["key1", "key2"]).size()

key1  key2
a     1       1
      2       1
b     1       1
      2       1
dtype: int64

In [15]:
# dropna=False keeps the rows whose key1 is missing as their own NaN group.
df.groupby("key1", dropna=False).size()

key1
a      3
b      2
NaN    2
dtype: int64

In [16]:
# With two keys, dropna=False also keeps groups where either key is missing.
df.groupby(["key1", "key2"], dropna=False).size()

key1  key2
a     1       1
      2       1
      <NA>    1
b     1       1
      2       1
NaN   1       2
dtype: int64

In [17]:
# Count the non-null values per column within each key1 group. Unlike size(),
# missing entries are skipped: key2 has one missing value in group "a", so its
# count there is 2 while data1 and data2 are 3.
df.groupby("key1").count()

Unnamed: 0_level_0,key2,data1,data2
key1,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
a,2,3,3
b,2,2,2


In [18]:
# Iterating over a GroupBy yields (group key, sub-frame) pairs, one pair per
# distinct key1 value. Print the key on one line and its rows underneath.
for name, group in df.groupby("key1"):
    print(name, group, sep="\n")

a
  key1  key2     data1     data2
0    a     1 -1.661697 -1.853987
1    a     2 -1.157542 -0.714435
5    a  <NA> -0.565034  2.367741
b
  key1  key2     data1     data2
3    b     2 -0.025858 -0.518657
4    b     1 -0.668069  1.258287


In [19]:
# With several grouping keys, the first element of each pair is a tuple of key
# values, unpacked here into k1 and k2.
for (k1, k2), group in df.groupby(["key1", "key2"]):
    print((k1, k2), group, sep="\n")

('a', 1)
  key1  key2     data1     data2
0    a     1 -1.661697 -1.853987
('a', 2)
  key1  key2     data1     data2
1    a     2 -1.157542 -0.714435
('b', 1)
  key1  key2     data1     data2
4    b     1 -0.668069  1.258287
('b', 2)
  key1  key2     data1     data2
3    b     2 -0.025858 -0.518657


In [20]:
# Materialise the groups as a {key1 value: sub-frame} dictionary by feeding the
# (key, frame) pairs produced by iteration straight into dict().
pieces = dict(iter(df.groupby("key1")))
pieces["b"]

Unnamed: 0,key1,key2,data1,data2
3,b,2,-0.025858,-0.518657
4,b,1,-0.668069,1.258287


In [24]:
# Transpose so the original column labels become the row index, then group
# those rows through a mapping from column label to group name.
grouped = df.T.groupby(
    {
        "key1": "key",
        "key2": "key",
        "data1": "data",
        "data2": "data",
    }
)

<pandas.core.groupby.generic.DataFrameGroupBy object at 0x000001F2E15F2360>

In [25]:
# Show each mapped group name followed by the transposed rows assigned to it.
for group_key, group_values in grouped:
    print(group_key, group_values, sep="\n")

data
              0         1         2         3         4         5         6
data1 -1.661697 -1.157542   0.47605 -0.025858 -0.668069 -0.565034 -1.448914
data2 -1.853987 -0.714435 -0.568912 -0.518657  1.258287  2.367741  0.059057
key
      0  1     2  3  4     5     6
key1  a  a  None  b  b     a  None
key2  1  2     1  2  1  <NA>     1


In [28]:
# A GroupBy can be indexed with a column name or a list of column names to
# restrict which columns are aggregated. The first two expressions only
# illustrate the syntax; their results are discarded because a cell displays
# just its last expression.
df.groupby("key1")["data1"]
df.groupby("key1")[["data2"]]

# Selecting with a list keeps a DataFrame-shaped result with a data2 column.
(
    df.groupby(["key1", "key2"])[["data2"]]
    .mean()
)

Unnamed: 0_level_0,Unnamed: 1_level_0,data2
key1,key2,Unnamed: 2_level_1
a,1,-1.853987
a,2,-0.714435
b,1,1.258287
b,2,-0.518657


In [31]:
# Named aggregation: each keyword names an output column and maps it to a
# (source column, function) pair. reset_index() then turns the key2 group
# labels back into an ordinary column.
(
    df.groupby("key2")
    .agg(
        mean1=("data1", "mean"),
        mean2=("data2", "mean"),
    )
    .reset_index()
)

Unnamed: 0,key2,mean1,mean2
0,1,-0.825657,-0.276389
1,2,-0.5917,-0.616546
