# Aggregations: Min, Max, and Everything In Between

In [None]:
import numpy as np

In [None]:
L = np.random.random(100)

In [None]:
sum(L) - np.sum(L)

In [None]:
L = np.random.random(100000)

%timeit sum(L)
%timeit np.sum(L) 

In [None]:
%timeit min(L)
%timeit np.min(L)

## Multi-dimensional aggregates
One common type of aggregation operation is an aggregate along a row or column.

In [None]:
M = np.random.random((3,4))
M

Called without an `axis` argument, the aggregation methods of a two-dimensional array (e.g. `max`, `min`, `sum`) return the aggregate over the entire array.

In [None]:
M.max(), '  ', M.sum(), '  ', np.add.reduce(M),'  ', np.add.reduce(M,axis=1)

#### REMINDER of computations and aggregates (previous notebook)
- How can I reproduce the `.sum()` of a multi-dimensional array by aggregating with `add`?
- `np.add.accumulate(M, axis=0)` adds elements along the vertical axis, returning the **partial** (running) sums
- `np.add.reduce(M, axis=0)` adds elements along the vertical axis, returning the **final** sums

In [None]:
M.sum(),'  ', np.add.accumulate(M),  '  ', np.add.reduce(M),  '  ', np.add.reduce(M,axis=1)

- you can reproduce the .sum() by reducing twice, along the two axes

In [None]:
%timeit M.sum()
%timeit np.add.reduce(np.add.reduce(M))

Aggregation functions take an additional argument specifying the axis along which the aggregate is computed. 

In [None]:
M.sum(axis=0), 'which is the same as', np.add.reduce(M)

### Other aggregation functions

In [None]:
M.prod(), '   ', M.prod(axis=1)

In [None]:
M.std()

In [None]:
M.argmin()

## President heights exercise

In [None]:
import pandas as pd

In [None]:
data = pd.read_csv('data/president_heights.csv')
data.head(2)

In [None]:
# Select the 'height(cm)' column of the DataFrame; plain column selection
# yields a pandas Series.
heights_series = data['height(cm)']
# Copy the same column into a NumPy ndarray so that the ndarray aggregate
# methods (mean, std, min, max, ...) can be called on it directly.
heights = np.array(data['height(cm)'])

# Show the two container types side by side, then the raw height values.
print(type(heights_series),' ',type(heights))
print()
print(heights)

In [None]:
# Summary statistics of the heights array via ndarray aggregate methods.
# NOTE: ndarray.std() uses NumPy's default ddof=0 (population standard deviation).
print("Mean height:       ", heights.mean())
print("Standard deviation:", heights.std())
print("Minimum height:    ", heights.min())
print("Maximum height:    ", heights.max())

# Selectors for ndarrays
Two versions of a given selection: one with the explicit function (`np.equal`) and one with the symbolic operator (`==`).

In [None]:
test = np.array([1,2,20,3,4,5,20,6,7,20])
test==20

In [None]:
np.equal(test,20)

In [None]:
selection = test==20

In [None]:
np.argwhere(selection)

# argmax when there's more than one instance of max element

In [None]:
# NOOO THIS IDEA OF REMOVING ELEMENTS GIVES WRONG INDICES
def argmax_BAD(u):
    """Counter-example: collect max positions by repeatedly deleting the max.

    Kept on purpose as a demonstration of a flawed idea.  After every
    ``np.delete`` the remaining elements shift left, so each index recorded
    after the first one refers to the *shrunken* array and not to ``u``.

    Parameters
    ----------
    u : numpy.ndarray
        Array to scan; it is not modified (``np.delete`` returns a new array).

    Returns
    -------
    list
        One index per occurrence of the maximum, each relative to the array
        as it was at the moment that occurrence was removed.
    """
    print('initial array is: ', u)

    # First occurrence: this is the only index that is valid for ``u`` itself.
    first_pos = u.argmax()
    peak = u[first_pos]
    positions = [first_pos]
    print('tmp is: ', positions)
    remaining = np.delete(u, first_pos)
    print(remaining, '(purged) maxvalue was ', peak, ' at location: ', first_pos)

    # Keep purging for as long as the original maximum is still present.
    while True:
        if peak not in remaining:
            return positions
        pos = remaining.argmax()
        purged_value = remaining[pos]
        # ``pos`` indexes ``remaining``, which is why the final result is wrong.
        positions.append(pos)
        print('tmp is: ', positions)
        remaining = np.delete(remaining, pos)
        print(remaining, '(purged) maxvalue was ', purged_value, ' at location: ', pos)
        print(positions)


# prototype of decorator function
def add_comments(func):
    """Decorator prototype that announces a call and makes it verbose.

    The wrapped function must accept a ``verb`` keyword argument.  The wrapper
    prints a message before and after the call, echoes the first positional
    argument (if any), and invokes ``func`` exactly once with ``verb=True``
    unless the caller passes ``verb`` explicitly as a keyword.

    Parameters
    ----------
    func : callable
        Function to decorate; it has to accept ``verb`` as a keyword.

    Returns
    -------
    callable
        Wrapper with the same call signature that returns whatever ``func``
        returns.

    References
    ----------
    https://realpython.com/primer-on-python-decorators/#decorating-functions-with-arguments
    https://realpython.com/primer-on-python-decorators/#returning-values-from-decorated-functions
    """
    # Function-scope import: the file does not import functools at the top.
    import functools

    # functools.wraps keeps func's __name__ and __doc__ on the wrapper.
    @functools.wraps(func)
    # Generic (*args, **kwargs) interface so functions with arguments can be decorated.
    def wrapper(*args, **kwargs):
        print("Something is happening before the function is called.\n")
        if args:
            # Guarded so that keyword-only calls do not fail on args[0].
            print('dec: initial array is: ', args[0])
        # Default to verbose output, but let an explicit verb=... from the
        # caller win instead of raising "multiple values for keyword".
        kwargs.setdefault('verb', True)
        # Call the decorated function exactly once and keep its result, so
        # its work and side effects are not duplicated.
        result = func(*args, **kwargs)
        print("Something is happening after the function is called.\n")
        # A decorated function that returns a value must have it returned here too.
        return result
    return wrapper

# @add_comments
def argmax_multi_lean(u,verb=False):
    """Return the flat indices of every occurrence of the maximum of ``u``.

    ``ndarray.argmax`` reports only the first maximum.  This function returns
    all of them, as indices into the C-order (row-major) flattening of ``u``,
    which is the same convention ``argmax`` uses when no axis is given.

    Parameters
    ----------
    u : array_like
        Input of any dimensionality; it is converted with ``np.asarray``.
    verb : bool, optional
        When true, print the intermediate values (default ``False``).

    Returns
    -------
    numpy.ndarray
        1-D integer array of flat indices, in increasing order.

    Raises
    ------
    ValueError
        If ``u`` is empty (raised by ``argmax``).
    """
    # Flatten once, in C order.  Flattening with order 'K' follows the memory
    # layout instead, which disagrees with argmax for non-C-contiguous input
    # such as a transposed array.
    flat = np.asarray(u).ravel()

    maxindex = flat.argmax()
    maxvalue = flat[maxindex]
    # flatnonzero gives the positions where the comparison is true, already
    # as a 1-D array.
    toreturn = np.flatnonzero(flat == maxvalue)

    if verb:
        print('\n\ninitial array is: ',u)
        print('max found at position: ',maxindex)
        print('flattened is: %s'%(flat,))
        print('maxvalue is %s at location %s'%(maxvalue,maxindex))
        print('returning: %s\n'%(toreturn,))

    return toreturn

# Apply the decorator by hand (instead of the commented-out @add_comments)
# so that the plain argmax_multi_lean stays available under its own name
# next to the wrapped variant.
argmax_multi = add_comments(argmax_multi_lean)

In [None]:
# The maximum (20) appears three times, at positions 2, 6 and 9.
test = np.array([1,2,20,3,4,5,20,6,7,20])
# argmax_multi is the add_comments-wrapped variant of argmax_multi_lean.
print('indices of max value are: ',argmax_multi(test))

In [None]:
mat = test.reshape(5,2)
mat

In [None]:
# Same search on the 2-D array `mat`: the result is a 1-D array of indices
# into the flattened array, not (row, column) pairs.
print('the outcome is: ')
print(argmax_multi_lean(mat))
print('outcome printed')

In [None]:
# Locate every position holding the maximum height once and reuse the result
# for both the values and the positions printed below.
tallest_positions = argmax_multi_lean(heights)

print("Mean height:       ", heights.mean())
print("Standard deviation:", heights.std())
print("Minimum height:    ", heights.min())
print("Maximum height:    ", heights.max())
# Indexing with the position array returns one value per occurrence of the maximum.
print("Maximum heightS:    ", heights[tallest_positions] )
# These are positions within the heights array.
print("Maximum height president-id's:    ", tallest_positions)

In [None]:
# Quartiles of the heights array: 25th percentile, median, and 75th percentile.
print("25th percentile:   ", np.percentile(heights, 25))
print("Median:            ", np.median(heights))
print("75th percentile:   ", np.percentile(heights, 75))

In [None]:
%matplotlib inline
import matplotlib.pyplot as plt
import seaborn; seaborn.set()  # set plot style
# Histogram of the heights using matplotlib's default binning.
plt.hist(heights)
plt.title('Height Distribution of US Presidents')
plt.xlabel('height (cm)')
# The trailing semicolon keeps the notebook from echoing the call's return value.
plt.ylabel('number');