In [None]:
import numpy as np
import pandas as pd

# Create a fictitious dataset of country exports

In [None]:
# Export categories; each one becomes a column of the synthetic table.
COLUMNS = ['Agriculture', 'Beverages', 'Cosmetics']

# Table dimensions: one column per category and a fixed number of rows.
N_COLS, N_ROWS = len(COLUMNS), 50

# Pool of country/region codes from which the row labels are sampled.
COUNTRIES = np.array(['UK', 'US', 'EU', 'JP', 'RU', 'TK', 'PRC', 'MX'])

In [None]:
np.random.seed(42)  # fixed seed so the generated table is reproducible

# The draw order is significant for reproducibility: cell values are sampled
# first, row labels second.
export_values = np.random.randint(0, 10, size=(N_ROWS, N_COLS))
country_labels = COUNTRIES[np.random.randint(0, len(COUNTRIES), N_ROWS)]

# One row per sample, labelled with a randomly chosen country code.
df = pd.DataFrame(export_values, index=country_labels, columns=COLUMNS)

In [None]:
# Preview the first rows (five by default) of the generated table.
df.head()

# Sales grouped by country
If we are interested in the data at the country level, we can use the `DataFrame.groupby` method ([documentation](https://pandas.pydata.org/docs/reference/api/pandas.DataFrame.groupby.html)) to group the rows by country. This returns a *GroupBy object*, which facilitates further computations. For a nice overview of groupby and aggregation see this [page](https://www.shanelynn.ie/summarising-aggregation-and-grouping-data-in-python-pandas/).

In [None]:
# Group rows by their index label (the country code); no computation is
# performed until an aggregation or transformation is requested.
grouped = df.groupby(df.index)
# Printing shows only the GroupBy object's repr, not any grouped data.
print(grouped)

## groupby sum

In [None]:
# Per-country total of each column (displayed as the cell's output).
grouped.sum()

## groupby max

In [None]:
# Per-country maximum of each column.
grouped.max()

## groupby mean

In [None]:
# Per-country mean of each column.
grouped.mean()

## groupby groups

In [None]:
# Mapping from each group key (country code) to the index labels of its rows.
grouped.groups

## Get a specific group

In [None]:
# Extract the rows belonging to the group keyed 'EU' as a DataFrame.
grouped.get_group('EU')

## grouped size

In [None]:
# Number of rows in each group.
grouped.size()

## grouped describe

In [None]:
# Descriptive statistics (count, mean, std, min, quartiles, max) for every
# column within each group.
grouped.describe()

## Aggregated statistics
To apply multiple functions to a single column of the grouped data, we can use the `.agg` method.

In [None]:
# Minimum, maximum and sum of the 'Cosmetics' column for each group.
# The reducers are given by name rather than as the Python builtins: pandas
# 2.1+ emits a FutureWarning for builtin callables here, while string names
# keep the current behaviour and the same 'min'/'max'/'sum' column labels.
grouped.agg({'Cosmetics': ['min', 'max', 'sum']})

## grouped transform

In [None]:
def zscore(group):
    """Standardize ``group`` by subtracting its mean and dividing by its std.

    Relies on the ``mean()`` and ``std()`` methods of the argument; for pandas
    objects ``std()`` is the sample standard deviation (ddof=1) by default.
    """
    centered = group - group.mean()
    spread = group.std()
    return centered / spread

# Apply zscore to each group separately; transform returns a result indexed
# like the original DataFrame rather than one row per group.
trans = grouped.transform(zscore)
trans.head()

## grouped filter

In [None]:
def filter_out_low_mean(group, column='Cosmetics', threshold=5):
    """Return True when the mean of ``column`` in ``group`` exceeds ``threshold``.

    Intended as a predicate for ``GroupBy.filter``, which keeps the groups for
    which the predicate is true. Called with only ``group`` it checks whether
    the mean of 'Cosmetics' is greater than 5.

    Parameters
    ----------
    group : DataFrame
        Rows of a single group; must contain ``column``.
    column : str, default 'Cosmetics'
        Column whose mean is compared against the threshold.
    threshold : number, default 5
        Strict lower bound for the column mean.

    Returns
    -------
    bool
        Whether the column mean is strictly greater than ``threshold``.
    """
    return group[column].mean() > threshold

# Keep only the rows of countries whose mean 'Cosmetics' value exceeds 5.
# Left as the cell's last expression (instead of print) so the notebook
# renders the resulting DataFrame as a table.
grouped.filter(filter_out_low_mean)

# Exercise
Select those countries with more than 5 samples. For that selection compute the standard deviation over the 'Agriculture' column.

# Exercise
You can loop over a grouped object and perform an operation on each group: `for label, group in grouped:`

Loop over each group and compute the sum of the 'Beverages' column.