# Aggregating DataFrames

### Summary statistics

#### Summarizing numerical data

In [11]:
import pandas as pd
# Load the homelessness table from CSV. The displayed result contains
# "Unnamed: 0.1" and "Unnamed: 0" columns — presumably index columns that were
# written to the file and read back as data; TODO confirm and drop if unneeded.
df = pd.read_csv("../datasets/homelessness.csv")
# Load the second dataset from a pickle file.
# NOTE(review): unpickling can execute arbitrary code — only load trusted files.
# NOTE(review): this cell is recorded as In [11], while cells In [6]-In [10]
# already use d2; re-run the notebook top to bottom to confirm it works on a
# fresh kernel.
d2 = pd.read_pickle("../datasets/avoplotto.pkl")
# Show the homelessness DataFrame as the cell output.
df

Unnamed: 0.1,Unnamed: 0,region,state,individuals,family_members,state_pop
0,0,East South Central,Alabama,2570.0,864.0,4887681
1,1,Pacific,Alaska,1434.0,582.0,735139
2,2,Mountain,Arizona,7259.0,2606.0,7158024
3,3,West South Central,Arkansas,2280.0,432.0,3009733
4,4,Pacific,California,109008.0,20964.0,39461588
5,5,Mountain,Colorado,7607.0,3250.0,5691287
6,6,New England,Connecticut,2280.0,1696.0,3571520
7,7,South Atlantic,Delaware,708.0,374.0,965479
8,8,South Atlantic,District of Columbia,3770.0,3134.0,701547
9,9,South Atlantic,Florida,21443.0,9587.0,21244317


In [6]:
# Mean of the avg_price column of d2 (a single scalar).
d2["avg_price"].mean()

1.3190236686390533

* `median()`
* `mode()`
* `min()`
* `max()`
* `var()`
* `std()`
* `sum()`
* `quantile()`

#### Summarizing dates
Earliest and latest dates in the `date` column:

In [7]:
# Smallest value in the date column. The recorded output is a quoted string,
# so the comparison is on text, not datetimes.
# NOTE(review): presumably every value is ISO "YYYY-MM-DD", in which case text
# order equals chronological order — confirm, or convert with pd.to_datetime.
d2['date'].min()

'2015-01-04'

In [8]:
# Largest value in the date column. The recorded output is a quoted string,
# so the comparison is on text, not datetimes.
# NOTE(review): presumably every value is ISO "YYYY-MM-DD", in which case text
# order equals chronological order — confirm, or convert with pd.to_datetime.
d2['date'].max()

'2018-03-25'

#### The .agg() method

In [9]:
def pct30(column):
    """Return the 30th percentile of ``column``.

    Parameters
    ----------
    column
        Any object exposing ``quantile(q)``, e.g. a pandas Series.

    Returns
    -------
    The value produced by ``column.quantile(0.3)``.
    """
    fraction = 0.3
    thirtieth_percentile = column.quantile(fraction)
    return thirtieth_percentile

In [10]:
# Apply the custom pct30 summary to the avg_price column through .agg().
d2['avg_price'].agg(pct30)

1.07

#### Summaries on multiple columns

In [12]:
# Select two columns and summarize both at once; the result holds one pct30
# value per selected column.
df[['individuals', 'family_members']].agg(pct30)

individuals       1745.0
family_members     676.0
dtype: float64

#### Multiple summaries

In [13]:
def pct40(column):
    """Return the 40th percentile of ``column``.

    Parameters
    ----------
    column
        Any object exposing ``quantile(q)``, e.g. a pandas Series.

    Returns
    -------
    The value produced by ``column.quantile(0.4)``.
    """
    fraction = 0.4
    fortieth_percentile = column.quantile(fraction)
    return fortieth_percentile

In [14]:
# Pass a list of functions to .agg() to get several summaries in one call;
# each result entry is labelled with the function's name (pct30, pct40).
df['individuals'].agg([pct30, pct40])

pct30    1745.0
pct40    2540.0
Name: individuals, dtype: float64

#### Cumulative sum

In [15]:
# Running total of the individuals column: one cumulative value per row,
# so the output is as long as the column itself.
df['individuals'].cumsum()

0       2570.0
1       4004.0
2      11263.0
3      13543.0
4     122551.0
5     130158.0
6     132438.0
7     133146.0
8     136916.0
9     158359.0
10    165302.0
11    169433.0
12    170730.0
13    177482.0
14    181258.0
15    182969.0
16    184412.0
17    187147.0
18    189687.0
19    191137.0
20    196051.0
21    202862.0
22    208071.0
23    212064.0
24    213088.0
25    216864.0
26    217847.0
27    219592.0
28    226650.0
29    227485.0
30    233533.0
31    235482.0
32    275309.0
33    281760.0
34    282227.0
35    289156.0
36    291979.0
37    303118.0
38    311281.0
39    312028.0
40    315110.0
41    315946.0
42    322085.0
43    341284.0
44    343188.0
45    343968.0
46    347896.0
47    364320.0
48    365341.0
49    368081.0
50    368515.0
Name: individuals, dtype: float64

#### Cumulative statistics

* `.cummax()`
* `.cummin()`
* `.cumprod()`