# 10.1 GroupBy Mechanics

In [53]:
import pandas as pd
import numpy as np

In [2]:
# Example frame: two string key columns and two random numeric columns.
# The numbers come from a locally seeded generator so the frame (and every
# groupby result derived from it) is identical on each Restart & Run All,
# without touching NumPy's global random state.
# Dict entries are evaluated in order, so 'd1' takes the first six draws and
# 'd2' the next six.
df_rng = np.random.RandomState(0)
df = pd.DataFrame({
    'key1': ['a','a','b','b','a','a'],
    'key2': ['one','two','one','two','three','one'],
    'd1': df_rng.randn(6),
    'd2': df_rng.randn(6)
})
df

Unnamed: 0,key1,key2,d1,d2
0,a,one,-0.201784,0.367019
1,a,two,-0.140575,0.237594
2,b,one,-2.734993,0.662153
3,b,two,-0.994827,1.475758
4,a,three,-1.53866,-1.06128
5,a,one,-0.07644,-0.345571


In [3]:
# Group by passing the key columns as Series objects (not column names).
by_key1 = df['key1']
by_key2 = df['key2']

# d1 split by key1, and d2 split by key2 (both still un-aggregated GroupBy objects).
grouped = df['d1'].groupby(by_key1)
grouped1 = df['d2'].groupby(by_key2)
# Unlike its neighbours, grouped2 is already aggregated: it holds the mean of
# d1 for every (key1, key2) pair.
grouped2 = df['d1'].groupby([by_key1, by_key2]).mean()
# The whole frame split by key1.
grouped4 = df.groupby(by_key1)


In [4]:
# Mean of d1 within each key1 group.
grouped.mean()

key1
a   -0.489365
b   -1.864910
Name: d1, dtype: float64

In [5]:
# Mean of d2 within each key2 group.
grouped1.mean()

key2
one      0.227867
three   -1.061280
two      0.856676
Name: d2, dtype: float64

In [6]:
grouped2

key1  key2 
a     one     -0.139112
      three   -1.538660
      two     -0.140575
b     one     -2.734993
      two     -0.994827
Name: d1, dtype: float64

In [7]:
# Average the numeric columns per key1 group.
# The columns are selected explicitly: a bare grouped4.mean() also tries to
# average the string column key2, which pandas 2.x rejects with a TypeError
# instead of silently dropping it. The result is the same d1/d2 table.
grouped4[['d1', 'd2']].mean()

Unnamed: 0_level_0,d1,d2
key1,Unnamed: 1_level_1,Unnamed: 2_level_1
a,-0.489365,-0.200559
b,-1.86491,1.068955


In [8]:
# Show the full frame, then every (key1, key2) sub-frame in turn.
# The end string reproduces the spacing of the separate newline prints:
# the frame's own line terminator followed by four blank lines.
print(df, end='\n' * 5)
for _keys, group in df.groupby(['key1', 'key2']):
    # _keys is the (key1, key2) tuple identifying this group; only the
    # sub-frame itself is printed.
    print(group)

  key1   key2        d1        d2
0    a    one -0.201784  0.367019
1    a    two -0.140575  0.237594
2    b    one -2.734993  0.662153
3    b    two -0.994827  1.475758
4    a  three -1.538660 -1.061280
5    a    one -0.076440 -0.345571




  key1 key2        d1        d2
0    a  one -0.201784  0.367019
5    a  one -0.076440 -0.345571
  key1   key2       d1       d2
4    a  three -1.53866 -1.06128
  key1 key2        d1        d2
1    a  two -0.140575  0.237594
  key1 key2        d1        d2
2    b  one -2.734993  0.662153
  key1 key2        d1        d2
3    b  two -0.994827  1.475758


In [9]:
# Materialise the groups as a plain {key1 value: sub-frame} dictionary.
pieces = {label: frame for label, frame in df.groupby('key1')}

In [10]:
# The sub-frame of rows whose key1 is 'a'.
pieces['a']

Unnamed: 0,key1,key2,d1,d2
0,a,one,-0.201784,0.367019
1,a,two,-0.140575,0.237594
4,a,three,-1.53866,-1.06128
5,a,one,-0.07644,-0.345571


In [11]:
# The sub-frame of rows whose key1 is 'b'.
pieces['b']

Unnamed: 0,key1,key2,d1,d2
2,b,one,-2.734993,0.662153
3,b,two,-0.994827,1.475758


In [12]:
# we could group the columns of our example df here by dtype

In [13]:
df.dtypes

key1     object
key2     object
d1      float64
d2      float64
dtype: object

In [14]:
# Split the columns (axis=1) into groups keyed by each column's dtype.
# NOTE(review): DataFrame.groupby(axis=1) is deprecated in recent pandas
# releases — confirm the target pandas version before relying on this.
groupByType = df.groupby(df.dtypes, axis=1)

In [15]:
groupByType

<pandas.core.groupby.groupby.DataFrameGroupBy object at 0x7f41bbd22550>

In [16]:
# One print call per group: the dtype on the first line, its columns below.
for col_dtype, sub_frame in groupByType:
    print(col_dtype, sub_frame, sep='\n')

float64
         d1        d2
0 -0.201784  0.367019
1 -0.140575  0.237594
2 -2.734993  0.662153
3 -0.994827  1.475758
4 -1.538660 -1.061280
5 -0.076440 -0.345571
object
  key1   key2
0    a    one
1    a    two
2    b    one
3    b    two
4    a  three
5    a    one


In [17]:
# Group the frame by key1, select column d1 from the groups, then average it.
df.groupby('key1')['d1'].mean()

key1
a   -0.489365
b   -1.864910
Name: d1, dtype: float64

In [18]:
# Select several columns from a GroupBy with a list (double brackets).
# The bare tuple form ['d1','d2'] was deprecated and is rejected by
# pandas 2.x ("Cannot subset columns with a tuple ... Use a list instead").
df.groupby(['key1','key2'])[['d1','d2']].mean()

Unnamed: 0_level_0,Unnamed: 1_level_0,d1,d2
key1,key2,Unnamed: 2_level_1,Unnamed: 3_level_1
a,one,-0.139112,0.010724
a,three,-1.53866,-1.06128
a,two,-0.140575,0.237594
b,one,-2.734993,0.662153
b,two,-0.994827,1.475758


In [19]:
# Without a column selection, every non-key column (d1 and d2) is averaged
# per (key1, key2) pair.
df.groupby(['key1','key2']).mean()

Unnamed: 0_level_0,Unnamed: 1_level_0,d1,d2
key1,key2,Unnamed: 2_level_1,Unnamed: 3_level_1
a,one,-0.139112,0.010724
a,three,-1.53866,-1.06128
a,two,-0.140575,0.237594
b,one,-2.734993,0.662153
b,two,-0.994827,1.475758


In [20]:
df

Unnamed: 0,key1,key2,d1,d2
0,a,one,-0.201784,0.367019
1,a,two,-0.140575,0.237594
2,b,one,-2.734993,0.662153
3,b,two,-0.994827,1.475758
4,a,three,-1.53866,-1.06128
5,a,one,-0.07644,-0.345571


# Use loc[] to choose rows and columns by label.
 

# Use iloc[] to choose rows and columns by position.

In [21]:
# iloc[rows, columns] selects by integer position.
# Rows: the slice 2:3 picks only position 2 (the end is exclusive); the list [2] would pick the same row.
# Columns: the list [0,1,2,3] names the column positions to keep, in that order.

df.iloc[2:3,[0,1,2,3]]

Unnamed: 0,key1,key2,d1,d2
2,b,one,-2.734993,0.662153


In [22]:
# Rows at positions 0-1; the column slice 1:2 keeps only position 1 (key2),
# and the result is still a DataFrame.
df.iloc[0:2,1:2]

Unnamed: 0,key2
0,one
1,two


In [23]:
# Rows at positions 0-1; columns at positions 1 and 2 (key2 and d1).
df.iloc[0:2,[1,2]]

Unnamed: 0,key2,d1
0,one,-0.201784
1,two,-0.140575


In [24]:
df

Unnamed: 0,key1,key2,d1,d2
0,a,one,-0.201784,0.367019
1,a,two,-0.140575,0.237594
2,b,one,-2.734993,0.662153
3,b,two,-0.994827,1.475758
4,a,three,-1.53866,-1.06128
5,a,one,-0.07644,-0.345571


In [25]:
# Make key1 the row index so the .loc examples that follow can select rows
# by label.
# Rebinding df (instead of inplace=True) and guarding on the column's presence
# keeps this cell safe to re-run: a second set_index('key1') would raise
# KeyError because key1 is no longer a column.
if 'key1' in df.columns:
    df = df.set_index('key1')
df

Unnamed: 0_level_0,key2,d1,d2
key1,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
a,one,-0.201784,0.367019
a,two,-0.140575,0.237594
b,one,-2.734993,0.662153
b,two,-0.994827,1.475758
a,three,-1.53866,-1.06128
a,one,-0.07644,-0.345571


In [26]:
# Every row, column d1 only — a single column label returns a Series.
df.loc[:,'d1']

key1
a   -0.201784
a   -0.140575
b   -2.734993
b   -0.994827
a   -1.538660
a   -0.076440
Name: d1, dtype: float64

In [27]:
# All columns for the rows labelled 'b' in the key1 index.
df.loc['b',:]

Unnamed: 0_level_0,key2,d1,d2
key1,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
b,one,-2.734993,0.662153
b,two,-0.994827,1.475758


In [28]:
# Column d1 for the rows labelled 'b'. The label occurs more than once in the
# index, so this is a Series rather than a single value.
df.loc['b','d1']

key1
b   -2.734993
b   -0.994827
Name: d1, dtype: float64

In [29]:
# Lists select several row labels and several columns at once; rows and
# columns come back in the order the lists give them.
df.loc[['a','b'],['d1','key2']]

Unnamed: 0_level_0,d1,key2
key1,Unnamed: 1_level_1,Unnamed: 2_level_1
a,-0.201784,one
a,-0.140575,two
a,-1.53866,three
a,-0.07644,one
b,-2.734993,one
b,-0.994827,two


In [46]:
# Select rows based on a column value: keep those whose d1 is greater than -1.0.
d1_above_minus_one = df.loc[:, 'd1'] > -1.0
df.loc[d1_above_minus_one, :]

Unnamed: 0_level_0,key2,d1,d2
key1,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
a,one,-0.201784,0.367019
a,two,-0.140575,0.237594
b,two,-0.994827,1.475758
a,one,-0.07644,-0.345571


In [30]:
#Grouping with Dicts and Series

In [31]:
# 5x5 frame of random values: rows are people (J, B, K, L, T), columns a-e.
# A locally seeded generator keeps the values identical on every
# Restart & Run All without touching NumPy's global random state.
people_rng = np.random.RandomState(1)
people = pd.DataFrame(people_rng.randn(5,5),
                     columns=['a','b','c','d','e'],
                     index=['J','B','K','L','T'])

In [32]:
people

Unnamed: 0,a,b,c,d,e
J,-1.147828,2.350837,-0.130697,0.56855,-0.114935
B,1.203096,-1.406303,0.012428,0.621577,-0.943408
K,0.506795,-1.127157,0.030486,0.432494,-0.862098
L,-0.373488,0.419038,0.356795,1.048905,-1.031176
T,0.321091,-1.766021,0.285328,1.411598,1.012981


In [33]:
# Blank out two cells by position: the row at position 2, columns at
# positions 1 and 2 (the column slice 1:3 excludes its end).
people.iloc[2:3, 1:3] = np.nan

In [34]:
people

Unnamed: 0,a,b,c,d,e
J,-1.147828,2.350837,-0.130697,0.56855,-0.114935
B,1.203096,-1.406303,0.012428,0.621577,-0.943408
K,0.506795,,,0.432494,-0.862098
L,-0.373488,0.419038,0.356795,1.048905,-1.031176
T,0.321091,-1.766021,0.285328,1.411598,1.012981


In [35]:
# Now, suppose I have a group correspondence for the columns and want to sum
# together the columns by group

In [36]:
# Column label -> colour group used to combine the columns of people.
# 'f' has no matching column in people (its columns are a-e).
mapping = {
    'a': 'red',
    'b': 'green',
    'c': 'blue',
    'd': 'yellow',
    'e': 'orange',
    'f': 'maroon',
}

In [37]:
mapping

{'a': 'red',
 'b': 'green',
 'c': 'blue',
 'd': 'yellow',
 'e': 'orange',
 'f': 'maroon'}

In [38]:
# We could construct an array of group labels from this dict and pass that to
# groupby, but the dict can be passed directly. The key 'f' is included to
# show that unused grouping keys are OK.
# NOTE(review): DataFrame.groupby(axis=1) is deprecated in recent pandas
# releases — confirm the target pandas version before relying on this.
byColumn = people.groupby(mapping, axis=1)


In [39]:
# Sum the columns inside each colour group, row by row. Row K's two NaN cells
# show up as 0.0 in the result rather than NaN.
byColumn.sum()

Unnamed: 0,blue,green,orange,red,yellow
J,-0.130697,2.350837,-0.114935,-1.147828,0.56855
B,0.012428,-1.406303,-0.943408,1.203096,0.621577
K,0.0,0.0,-0.862098,0.506795,0.432494
L,0.356795,0.419038,-1.031176,-0.373488,1.048905
T,0.285328,-1.766021,1.012981,0.321091,1.411598


In [40]:
# The same correspondence as a Series: index = column label, value = group.
map_series = pd.Series(mapping)

In [41]:
map_series

a       red
b     green
c      blue
d    yellow
e    orange
f    maroon
dtype: object

In [42]:
# A Series works as a grouping key just like the dict: columns are grouped by
# the value the Series holds for their label.
# NOTE(review): DataFrame.groupby(axis=1) is deprecated in recent pandas
# releases — confirm the target pandas version before relying on this.
bySeries = people.groupby(map_series, axis=1)

In [43]:
# Per-row sums for each colour group; the table matches the one produced by
# grouping with the dict.
bySeries.sum()

Unnamed: 0,blue,green,orange,red,yellow
J,-0.130697,2.350837,-0.114935,-1.147828,0.56855
B,0.012428,-1.406303,-0.943408,1.203096,0.621577
K,0.0,0.0,-0.862098,0.506795,0.432494
L,0.356795,0.419038,-1.031176,-0.373488,1.048905
T,0.285328,-1.766021,1.012981,0.321091,1.411598


In [58]:
# Practice: selecting subsets from a DataFrame.
# Seven people (the row labels) described by six attributes (the columns).
names = ['Jane','Niko','Aaron','Penelope','Dean','Cristina','Cornella']
attributes = {
    'state': ['NY','TX','FL','AL','AK','TX','TX'],
    'color': ['blue','green','red','white','grey','yellow','orange'],
    'food': ['Steak','Dal','Lamb','Pork','Beef','Burger','Pasta'],
    'age': [30,21,12,36,45,90,54],
    'height': [165,170,185,140,190,163,178],
    'score': [5.5,4.3,1.2,9.6,8.3,6.5,7.3],
}
dataFrames = pd.DataFrame(attributes, index=names)

In [60]:
dataFrames

Unnamed: 0,state,color,food,age,height,score
Jane,NY,blue,Steak,30,165,5.5
Niko,TX,green,Dal,21,170,4.3
Aaron,FL,red,Lamb,12,185,1.2
Penelope,AL,white,Pork,36,140,9.6
Dean,AK,grey,Beef,45,190,8.3
Cristina,TX,yellow,Burger,90,163,6.5
Cornella,TX,orange,Pasta,54,178,7.3


In [67]:
# loc with ':' keeps every row; the list picks columns by label.
dataFrames.loc[:,['color','age','height']]

Unnamed: 0,color,age,height
Jane,blue,30,165
Niko,green,21,170
Aaron,red,12,185
Penelope,white,36,140
Dean,grey,45,190
Cristina,yellow,90,163
Cornella,orange,54,178


In [68]:
# Two rows chosen by label, all columns.
dataFrames.loc[['Aaron','Dean'],:]

Unnamed: 0,state,color,food,age,height,score
Aaron,FL,red,Lamb,12,185,1.2
Dean,AK,grey,Beef,45,190,8.3


In [69]:
# Row labels and column labels together: the intersection of the two lists.
dataFrames.loc[['Aaron','Dean'],['color','age','height']]

Unnamed: 0,color,age,height
Aaron,red,12,185
Dean,grey,45,190


In [70]:
# Pull out the frame's three components in one statement: row labels,
# column labels, and the underlying array of values.
index, columns, values = dataFrames.index, dataFrames.columns, dataFrames.values

In [71]:
index

Index(['Jane', 'Niko', 'Aaron', 'Penelope', 'Dean', 'Cristina', 'Cornella'], dtype='object')

In [72]:
columns

Index(['state', 'color', 'food', 'age', 'height', 'score'], dtype='object')

In [73]:
values

array([['NY', 'blue', 'Steak', 30, 165, 5.5],
       ['TX', 'green', 'Dal', 21, 170, 4.3],
       ['FL', 'red', 'Lamb', 12, 185, 1.2],
       ['AL', 'white', 'Pork', 36, 140, 9.6],
       ['AK', 'grey', 'Beef', 45, 190, 8.3],
       ['TX', 'yellow', 'Burger', 90, 163, 6.5],
       ['TX', 'orange', 'Pasta', 54, 178, 7.3]], dtype=object)

In [75]:
dataFrames

Unnamed: 0,state,color,food,age,height,score
Jane,NY,blue,Steak,30,165,5.5
Niko,TX,green,Dal,21,170,4.3
Aaron,FL,red,Lamb,12,185,1.2
Penelope,AL,white,Pork,36,140,9.6
Dean,AK,grey,Beef,45,190,8.3
Cristina,TX,yellow,Burger,90,163,6.5
Cornella,TX,orange,Pasta,54,178,7.3


In [77]:
# iloc: every row; columns at positions 1, 2 and 4 (color, food, height).
dataFrames.iloc[:,[1,2,4]]

Unnamed: 0,color,food,height
Jane,blue,Steak,165
Niko,green,Dal,170
Aaron,red,Lamb,185
Penelope,white,Pork,140
Dean,grey,Beef,190
Cristina,yellow,Burger,163
Cornella,orange,Pasta,178


In [79]:
# Rows at positions 2 and 4 (Aaron and Dean), all columns.
dataFrames.iloc[[2,4],:]

Unnamed: 0,state,color,food,age,height,score
Aaron,FL,red,Lamb,12,185,1.2
Dean,AK,grey,Beef,45,190,8.3


In [80]:
# Rows at positions 2 and 4, columns at positions 1, 2 and 4 — the positional
# counterpart of selecting by row and column labels with loc.
dataFrames.iloc[[2,4],[1,2,4]]

Unnamed: 0,color,food,height
Aaron,red,Lamb,185
Dean,grey,Beef,190
