# Examples in pandas using 'groupby' methods

In [93]:
# Build sample data
import pandas as pd
import numpy as np
from collections import OrderedDict

# Column labels and the raw [id, votes] records of the toy dataset
COLS = ['id', 'votes']
dat = [
    ['a', 4],
    ['b', 2],
    ['b', 3],
    ['a', 2],
    ['a', 9],
]
df1 = pd.DataFrame(data=dat, columns=COLS)

# Show the starting point before any groupby operation is applied
print('Initial dataframe')
print(df1)

Initial dataframe
  id  votes
0  a      4
1  b      2
2  b      3
3  a      2
4  a      9


## Augment existing dataframe using 'groupby' (same number of rows while adding columns)

In [94]:
# Number the rows inside each 'id' group; cumcount() is zero-based, hence the +1
df1['id_cumcount'] = 1 + df1.groupby('id').cumcount()

# Put the counter right after the id, then order rows by group and position within the group
ordered_cols = ['id', 'id_cumcount', 'votes']
df1 = df1[ordered_cols].sort_values(by=['id', 'id_cumcount'])
print(df1)

# The following cells add columns to df, a separate copy of df1
df = df1.copy()

  id  id_cumcount  votes
0  a            1      4
3  a            2      2
4  a            3      9
1  b            1      2
2  b            2      3


## Example use of 'apply' on groups using 'shift' and 'fillna'

In [95]:
# Find the previous value of 'votes' within each 'id' group using the built-in GroupBy.shift.
# Unlike groupby(...).apply(lambda x: x.shift(1)), whose result index can be prefixed with the
# group key, GroupBy.shift always returns a Series on the original row index, so it can be
# assigned straight back to df.
df['votes_prev'] = df.groupby('id')['votes'].shift(1)
# The first row of every group has no predecessor (NaN): treat it as 0 and go back to ints
df['votes_prev'] = df['votes_prev'].fillna(0).astype(int)

def calc_diff(grp):
    """Return a copy of ``grp`` with a ``votes_diff`` column added.

    ``votes_diff`` is ``votes - votes_prev``, computed row by row. The frame passed in is
    left untouched: functions handed to ``groupby(...).apply`` should not mutate their argument.

    Parameters
    ----------
    grp : pandas.DataFrame
        Rows of one group; must contain the ``votes`` and ``votes_prev`` columns.

    Returns
    -------
    pandas.DataFrame
        New frame with the same rows and index as ``grp`` plus the ``votes_diff`` column.
    """
    return grp.assign(votes_diff=grp['votes'] - grp['votes_prev'])

# Build votes_diff as the difference between the current row's votes and the previous row's votes per group.
# group_keys=False keeps the original row index on the apply result (no extra 'id' index level),
# so the selected column aligns with df when it is assigned back.
df['votes_diff'] = df.groupby('id', group_keys=False).apply(calc_diff)['votes_diff']

# Fill missing values with 0 and convert floats to ints
df['votes_diff'] = df['votes_diff'].fillna(0).astype(int)

# Renumber the rows 0..n-1 (the row order itself is not changed here)
df = df.reset_index(drop=True)
print(df)

  id  id_cumcount  votes  votes_prev  votes_diff
0  a            1      4           0           4
1  a            2      2           4          -2
2  a            3      9           2           7
3  b            1      2           0           2
4  b            2      3           2           1


## Use 'transform' method to add new columns based on existing columns

In [96]:
# Compute per-group statistics of 'votes' and broadcast each one back onto every row of its group
COL_STATS = ["sum", "count", "median", "mean", "std"]
votes_by_id = df.groupby('id')['votes']
for stat in COL_STATS:
    df['grp_' + stat] = votes_by_id.transform(stat)

# Standard score of each vote relative to the mean and std of its own group
df['grp_std_score'] = votes_by_id.transform(lambda s: (s - s.mean()) / s.std())

print(df.to_string())

  id  id_cumcount  votes  votes_prev  votes_diff  grp_sum  grp_count  grp_median  grp_mean   grp_std  grp_std_score
0  a            1      4           0           4       15          3         4.0       5.0  3.605551      -0.277350
1  a            2      2           4          -2       15          3         4.0       5.0  3.605551      -0.832050
2  a            3      9           2           7       15          3         4.0       5.0  3.605551       1.109400
3  b            1      2           0           2        5          2         2.5       2.5  0.707107      -0.707107
4  b            2      3           2           1        5          2         2.5       2.5  0.707107       0.707107


In [97]:
# One row per group, holding the first entry of every column within that group
first_per_group = df.groupby('id').first()
print(first_per_group.to_string())

    id_cumcount  votes  votes_prev  votes_diff  grp_sum  grp_count  grp_median  grp_mean   grp_std  grp_std_score
id                                                                                                               
a             1      4           0           4       15          3         4.0       5.0  3.605551      -0.277350
b             1      2           0           2        5          2         2.5       2.5  0.707107      -0.707107


In [98]:
# One row per group, holding the last entry of every column within that group
last_per_group = df.groupby('id').last()
print(last_per_group.to_string())

    id_cumcount  votes  votes_prev  votes_diff  grp_sum  grp_count  grp_median  grp_mean   grp_std  grp_std_score
id                                                                                                               
a             3      9           2           7       15          3         4.0       5.0  3.605551       1.109400
b             2      3           2           1        5          2         2.5       2.5  0.707107       0.707107


## More dataframe augmentation - 'smear' values within a group

In [99]:
# Keep only a subset of the dataframe columns (df is rebound to the narrower frame)
keep_cols = ['id', 'votes', 'votes_prev', 'votes_diff']
df = df[keep_cols]
print(df)

  id  votes  votes_prev  votes_diff
0  a      4           0           4
1  a      2           4          -2
2  a      9           2           7
3  b      2           0           2
4  b      3           2           1


In [100]:
# Get rows that match a single group (group b).
# Group by the scalar key 'id' rather than the one-element list ['id'], so that the plain
# label 'b' is the matching key for get_group, and reuse the same GroupBy object for the
# group count instead of grouping the frame a second time.
gb = df.groupby('id')
print("Number of groups:", gb.ngroups)
print("DataFrame where group id is b:")
print(gb.get_group('b'))

Number of groups: 2
DataFrame where group id is b:
  id  votes  votes_prev  votes_diff
3  b      2           0           2
4  b      3           2           1


In [101]:
# Build new column with the last value of votes_diff per group 'smeared' back to all rows in the corresponding group
def calc_last_diff(grp):
    """Return a copy of ``grp`` with its last ``votes_diff`` value repeated on every row.

    The frame passed in is left untouched: functions handed to ``groupby(...).apply``
    should not mutate their argument.

    Parameters
    ----------
    grp : pandas.DataFrame
        Rows of one group; must contain the ``votes_diff`` column.

    Returns
    -------
    pandas.DataFrame
        New frame with the same rows and index as ``grp`` plus a ``last_diff`` column.
        For an empty ``grp`` the column is added but stays empty.
    """
    diffs = grp['votes_diff']
    if diffs.empty:
        # No last row to read; still add the column so the output has the same columns
        return grp.assign(last_diff=diffs)
    # Index the column, not the row: grp.iloc[-1] builds a row Series with one common dtype,
    # which would turn an int votes_diff into a float whenever the frame has float columns.
    return grp.assign(last_diff=diffs.iloc[-1])

# group_keys=False keeps the original row index on the apply result (no extra 'id' index level),
# so the 'last_diff' column aligns with df when it is assigned back.
df['last_diff'] = df.groupby('id', group_keys=False).apply(calc_last_diff)['last_diff']
print(df.to_string())

  id  votes  votes_prev  votes_diff  last_diff
0  a      4           0           4          7
1  a      2           4          -2          7
2  a      9           2           7          7
3  b      2           0           2          1
4  b      3           2           1          1


In [102]:
# Set series value based on each row within group
def calc_is_last_diff(grp):
    """Return a copy of ``grp`` with a boolean ``is_last_diff`` column added.

    ``is_last_diff`` is True on the rows where ``votes_diff`` equals ``last_diff``. The
    comparison is done on whole columns; a row-wise
    ``grp.apply(lambda r: r['votes_diff'] == r['last_diff'], axis=1)`` is an alternative
    way to write it. The frame passed in is left untouched: functions handed to
    ``groupby(...).apply`` should not mutate their argument.

    Parameters
    ----------
    grp : pandas.DataFrame
        Rows of one group; must contain the ``votes_diff`` and ``last_diff`` columns.

    Returns
    -------
    pandas.DataFrame
        New frame with the same rows and index as ``grp`` plus the ``is_last_diff`` column.
    """
    return grp.assign(is_last_diff=grp['votes_diff'] == grp['last_diff'])

# group_keys=False keeps the original row index on the apply result (no extra 'id' index level),
# so the 'is_last_diff' column aligns with df when it is assigned back.
df['is_last_diff'] = df.groupby('id', group_keys=False).apply(calc_is_last_diff)['is_last_diff']
print(df.to_string())

  id  votes  votes_prev  votes_diff  last_diff  is_last_diff
0  a      4           0           4          7         False
1  a      2           4          -2          7         False
2  a      9           2           7          7          True
3  b      2           0           2          1         False
4  b      3           2           1          1          True


## Iterate over groups and then over rows within each group

In [103]:
# Outer loop: one (group label, sub-frame) pair per 'id'; enumerate supplies the running group index
for igr, (name, gr) in enumerate(df.groupby('id')):
    print("index: %d, group name: %s" % (igr, name))
    # Inner loop: rows of the current group, keyed by their index label in df
    for i, x in gr.iterrows():
        print("row idx: %d, votes_diff: %.3f" % (i, x['votes_diff']))

    # Blank line between groups
    print()

index: 0, group name: a
row idx: 0, votes_diff: 4.000
row idx: 1, votes_diff: -2.000
row idx: 2, votes_diff: 7.000

index: 1, group name: b
row idx: 3, votes_diff: 2.000
row idx: 4, votes_diff: 1.000



## Filter dataframe based on multiple column values

In [104]:
def remove_rows(df):
    """Keep only the rows whose ``votes_diff`` equals ``last_diff``; all other rows are dropped.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame (or one group of it) containing ``votes_diff`` and ``last_diff`` columns.

    Returns
    -------
    pandas.DataFrame
        The matching rows, with their original index labels and all columns.
    """
    keep_mask = df['votes_diff'] == df['last_diff']
    kept = df[keep_mask]
    return kept

# Snapshot of the frame taken before df is overwritten by the filtered result below
df_copy = df.copy()

# group_keys=True: the group label is added as an extra outer index level of the result
keyed_result = df.groupby("id", group_keys=True).apply(remove_rows)
print(keyed_result)
print('---')

# group_keys=False: the result keeps only the original row index
df = df.groupby("id", group_keys=False).apply(remove_rows)
print(df)

     id  votes  votes_prev  votes_diff  last_diff  is_last_diff
id                                                             
a  2  a      9           2           7          7          True
b  4  b      3           2           1          1          True
---
  id  votes  votes_prev  votes_diff  last_diff  is_last_diff
2  a      9           2           7          7          True
4  b      3           2           1          1          True
