# Aggregations: Min, Max, and Everything In Between

In [5]:
import numpy as np

In [6]:
L = np.random.random(100)

In [7]:
sum(L) - np.sum(L)

-1.4210854715202004e-14

## Summing the Values in an Array
NumPy's version of the operation is computed much more quickly:

In [8]:
L = np.random.random(100000)

%timeit sum(L)
%timeit np.sum(L) 

25.4 ms ± 1.85 ms per loop (mean ± std. dev. of 7 runs, 10 loops each)
47.7 µs ± 981 ns per loop (mean ± std. dev. of 7 runs, 10000 loops each)


In [9]:
%timeit min(L)
%timeit np.min(L)

11 ms ± 674 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)
46.7 µs ± 4.78 µs per loop (mean ± std. dev. of 7 runs, 10000 loops each)


## Multi dimensional aggregates¶
One common type of aggregation operation is an aggregate along a row or column

In [10]:
M = np.random.random((3,4))
M

array([[0.13304504, 0.21036821, 0.26653586, 0.41499393],
       [0.41609543, 0.86414852, 0.67047399, 0.76684741],
       [0.84513248, 0.89265261, 0.61153354, 0.1287356 ]])

for a bi-dimensional array returns the aggregate over the entire array --> max min

In [11]:
M.max(), '  ', M.sum(), '  ', np.add.reduce(M),'  ', np.add.reduce(M,axis=1)

(0.8926526104494674,
 '  ',
 6.2205626294605985,
 '  ',
 array([1.39427295, 1.96716935, 1.54854339, 1.31057695]),
 '  ',
 array([1.02494304, 2.71756535, 2.47805424]))

#### REMINDER of computations and aggregates (previous notebook)
- How can I reproduce the .sum() of multi-dimensional, aggregating add ?
- add.accumulate(axix=0) adds elements along vertical axis returning **partial** sums
- add.reduce(axix=0) adds elements along vertical axis returning the **final** sums

In [12]:
M.sum(),'  ', np.add.accumulate(M),  '  ', np.add.reduce(M),  '  ', np.add.reduce(M,axis=1)

(6.2205626294605985,
 '  ',
 array([[0.13304504, 0.21036821, 0.26653586, 0.41499393],
        [0.54914046, 1.07451674, 0.93700985, 1.18184134],
        [1.39427295, 1.96716935, 1.54854339, 1.31057695]]),
 '  ',
 array([1.39427295, 1.96716935, 1.54854339, 1.31057695]),
 '  ',
 array([1.02494304, 2.71756535, 2.47805424]))

- you can reproduce the .sum() by reducing twice, along the two axes

In [13]:
%timeit M.sum()
%timeit np.add.reduce(np.add.reduce(M))

3.99 µs ± 681 ns per loop (mean ± std. dev. of 7 runs, 100000 loops each)
4.01 µs ± 73.5 ns per loop (mean ± std. dev. of 7 runs, 100000 loops each)


Aggregation functions take an additional argument specifying the axis along which the aggregate is computed. 

In [14]:
M.sum(axis=0), 'which is the same as', np.add.reduce(M)

(array([1.39427295, 1.96716935, 1.54854339, 1.31057695]),
 'which is the same as',
 array([1.39427295, 1.96716935, 1.54854339, 1.31057695]))

### Other aggregation functions

In [15]:
M.prod(), '   ', M.prod(axis=1)

(3.399183726687367e-05, '   ', array([0.00309582, 0.18487246, 0.05939176]))

In [16]:
M.std()

0.27983664876753495

In [17]:
M.argmin()

11

## president height exercise

In [18]:
import pandas as pd

In [22]:
data = pd.read_csv('data/president_heights.csv')
data.head(2)

(pandas.core.series.Series, ' ', numpy.ndarray)

In [25]:
# extract one column from dataframe into series
heights_series = data['height(cm)']
# extract one column from dataframe into ndarray
heights = np.array(data['height(cm)'])

print(type(heights_series),' ',type(heights))
print()
print(heights)

<class 'pandas.core.series.Series'>   <class 'numpy.ndarray'>

[189 170 189 163 183 171 185 168 173 183 173 173 175 178 183 193 178 173
 174 183 183 168 170 178 182 180 183 178 182 188 175 179 183 193 182 183
 177 185 188 188 182 185]


In [47]:
print("Mean height:       ", heights.mean())
print("Standard deviation:", heights.std())
print("Minimum height:    ", heights.min())
print("Maximum height:    ", heights.max())

# NOOO THIS IDEA OF REMOVING ELEMENTS GIVES WRONG INDICES
def argmax_multiple(u):
    tmp      = list()
    print('initial array is: ',u)
    
    maxindex = u.argmax()
    maxvalue = u[maxindex]
    tmp.append(maxindex)
    print('tmp is: ',tmp)
    v   = np.delete(u,maxindex)
    
    print(v,'(purged) maxvalue was ',maxvalue,' at location: ',maxindex)

    while maxvalue in v:
        maxi = v.argmax()
        maxv = v[maxi]
        tmp.append(maxi)
        print('tmp is: ',tmp)
        v   = np.delete(v,maxi)
        print(v,'(purged) maxvalue was ', maxv,' at location: ',maxi)
        print(tmp)
        
    return tmp

test = np.array([1,2,20,3,4,5,20,6,7,20])
print('indices of max value are: ',argmax_multiple(test))
#print('Tallest presidents:', heights.argmax() )

Mean height:        179.73809523809524
Standard deviation: 6.931843442745892
Minimum height:     163
Maximum height:     193
initial array is:  [ 1  2 20  3  4  5 20  6  7 20]
tmp is:  [2]
[ 1  2  3  4  5 20  6  7 20] (purged) maxvalue was  20  at location:  2
tmp is:  [2, 5]
[ 1  2  3  4  5  6  7 20] (purged) maxvalue was  20  at location:  5
[2, 5]
tmp is:  [2, 5, 7]
[1 2 3 4 5 6 7] (purged) maxvalue was  20  at location:  7
[2, 5, 7]
indices of max value are:  [2, 5, 7]


In [28]:
np.argmax?