## Use 'agg' method to summarize a table

In [2]:
import pandas as pd
import numpy as np
from collections import OrderedDict

# Grade threshold used when deciding pass/fail.
PASS_GRADE = 75

# Sample grades; the np.nan entries stand for grades that are absent.
li_grades = [
    80, 90, 80, 70, np.nan,
    70, 90, 90, 70, np.nan,
]

# A plain list has no value_counts(), so wrap it in a Series first.
# dropna=False keeps the NaN entries in the tally instead of dropping them.
print("Value counts for list of grades:")
print(
    pd.Series(li_grades)
    .value_counts(dropna=False)
)

Value counts for list of grades:
 70.0    3
 90.0    3
NaN      2
 80.0    2
dtype: int64


In [5]:
# Note that 'agg' aggregates each column (feature) for each group,
# so you end up with one value per column per group.

# Per-student table. The columns are passed as a list of pairs so that
# OrderedDict actually receives them in this order (a dict literal only
# guarantees its order on Python 3.7+).
di1 = OrderedDict([
    ('ID', ["x%d" % r for r in range(10)]),
    ('ExamYear', ['2007','2007','2007','2008','2008','2008','2008','2009','2009','2009']),
    ('Tested', ['yes','yes','yes','yes','no','yes','yes','yes','yes','no']),
    # NaN > PASS_GRADE evaluates to False, so a missing grade is recorded as 'no'.
    ('Passed', ['yes' if x > PASS_GRADE else 'no' for x in li_grades]),
    ('Grade', li_grades),
])

# Aggregations to apply to each ExamYear group, keyed by column name.
# Counting with (x == 'yes').sum() gives 0 for a group that has no 'yes' rows,
# whereas x.value_counts()['yes'] raises KeyError in that case.
di2 = OrderedDict([
    ('Tested', lambda x: (x == 'yes').sum()),
    ('Passed', lambda x: (x == 'yes').sum()),
    # NaN grades are left out of both statistics (e.g. 2008's mean is over 3 grades).
    ('Grade', ['mean', 'std']),
])

df1 = pd.DataFrame(di1)
df2 = df1.groupby('ExamYear').agg(di2)
# The result has two-part column labels (source column, function name);
# join each pair into a single flat name such as 'Grade_mean'.
df2.columns = ['_'.join(col) for col in df2.columns]

print("Original table:")
print(df1)

print("---")
print("Summary table:")
print(df2)

Original table:
   ID ExamYear Tested Passed  Grade
0  x0     2007    yes    yes   80.0
1  x1     2007    yes    yes   90.0
2  x2     2007    yes    yes   80.0
3  x3     2008    yes     no   70.0
4  x4     2008     no     no    NaN
5  x5     2008    yes     no   70.0
6  x6     2008    yes    yes   90.0
7  x7     2009    yes    yes   90.0
8  x8     2009    yes     no   70.0
9  x9     2009     no     no    NaN
---
Summary table:
          Tested_<lambda>  Passed_<lambda>  Grade_mean  Grade_std
ExamYear                                                         
2007                    3                3   83.333333   5.773503
2008                    3                1   76.666667  11.547005
2009                    2                1   80.000000  14.142136
