In [None]:
import numpy as np
import pandas as pd

import matplotlib.pyplot as plt

# Create a fictitious dataset of country exports

In [None]:
# Export categories; each one becomes a column of the generated table.
COLUMNS = ["Agriculture", "Beverages", "Cosmetics"]
N_COLS = len(COLUMNS)

# Number of rows to generate.
N_ROWS = 50

# Labels from which the row index of the table is drawn.
COUNTRIES = np.array(["UK", "US", "EU", "JP", "RU", "TK", "PRC", "MX"])

In [None]:
# Seed NumPy's global RNG so that "Restart Kernel -> Run All" always produces
# the same table (and therefore the same groups in the cells below).
np.random.seed(42)

# Draw the values first and the row labels second, i.e. in the same order as
# the random calls were made before the seed was introduced.
export_values = np.random.randint(0, 10, size=(N_ROWS, N_COLS))  # integers 0..9
country_labels = COUNTRIES[np.random.randint(0, len(COUNTRIES), N_ROWS)]

# One row per record, indexed by its (repeating) country label.
df = pd.DataFrame(export_values, columns=COLUMNS, index=country_labels)

In [None]:
# Peek at the first five rows of the generated table.
df.head(n=5)

# Sales grouped by country
If we are interested in the data at the country level, we can use the `DataFrame.groupby` method ([documentation](https://pandas.pydata.org/pandas-docs/stable/generated/pandas.DataFrame.groupby.html)) to group the data at that level; it returns a *GroupBy object*. This object facilitates further computations. For a nice overview of groupby and aggregation, see this [page](https://www.shanelynn.ie/summarising-aggregation-and-grouping-data-in-python-pandas/).

In [None]:
# Group the rows by their index label, i.e. by country.
grouped = df.groupby(by=df.index)

# Printing shows the GroupBy object's repr rather than a table of data.
print(grouped)

## groupby sum

In [None]:
# Column-wise sum over the rows of each country group (one result row per country).
grouped.sum()

## groupby max

In [None]:
# Column-wise maximum over the rows of each country group.
grouped.max()

## groupby mean

In [None]:
# Column-wise mean over the rows of each country group.
grouped.mean()

## groupby groups

In [None]:
# Dict-like mapping: group key (country) -> labels of the rows in that group.
grouped.groups

## Get a specific group

In [None]:
# All rows whose index label is 'EU'.
# NOTE(review): the labels are drawn at random, so this presumably raises a
# KeyError in the (unlikely) case that no row was labelled 'EU' — confirm.
grouped.get_group('EU')

## grouped size

In [None]:
# Number of rows in each country group.
grouped.size()

## grouped describe

In [None]:
# Summary statistics of every column, computed separately for each country group.
grouped.describe()

## Aggregated statistics
To apply multiple functions to a single column in your grouped data we can use the `.agg` method.

In [None]:
# Several aggregations of a single column: map the column name to a list of
# aggregation names. String names are used rather than the builtins
# min/max/sum, which newer pandas versions warn about when passed to .agg.
grouped.agg({'Cosmetics': ['min', 'max', 'sum']})

## grouped transform

In [None]:
def zscore(group):
    """Standardise ``group``: subtract its mean, then divide by its ``std()``.

    ``group`` may be any object supporting ``.mean()``, ``.std()`` and
    arithmetic (e.g. a pandas Series or DataFrame).
    """
    centered = group - group.mean()
    spread = group.std()
    return centered / spread

# Apply zscore within each country group; the result is stored in `trans`.
trans = grouped.transform(zscore)
# Show the first rows of the transformed table.
trans.head()

## grouped filter

In [None]:
def filter_out_low_mean(group, column='Cosmetics', threshold=5):
    """Predicate for ``GroupBy.filter``: keep groups whose mean is high enough.

    Parameters
    ----------
    group : DataFrame-like
        Rows of a single group; must support ``group[column].mean()``.
    column : str, default 'Cosmetics'
        Column whose mean is tested.
    threshold : number, default 5
        The group is kept only if the mean is strictly greater than this.

    Returns
    -------
    bool-like
        True when ``group[column].mean() > threshold``, otherwise False.
    """
    return group[column].mean() > threshold

# Keep only the rows of countries whose average 'Cosmetics' value exceeds the
# threshold used by filter_out_low_mean. Left as the last expression of the
# cell (instead of print) so the notebook renders the DataFrame as a table.
grouped.filter(filter_out_low_mean)

# Exercise
Select those countries with more than 5 samples. For that selection compute the standard deviation over the 'Agriculture' column.

# Plotting with a DataFrame

vanilla `.plot` command

In [None]:
# Default plot kind is a line plot: one line per column, drawn over the row order.
df.plot.line()  # what do we see, does it make sense?

In [None]:
# Same call, but on the grouped object instead of the full table.
# NOTE(review): presumably this draws a separate figure per country — confirm.
grouped.plot()  # what do we see, does it make sense?

## histogram
Let's plot the histograms stacked for our columns.

In [None]:
# Histogram of every column, with the bars stacked on top of each other.
df.plot.hist(stacked=True)

## bar plot
Let's plot the averages for our columns per country as stacked bar plots.

In [None]:
# Per-country column means, drawn as one stacked bar per country.
country_means = grouped.mean()
country_means.plot.bar(stacked=True)

## box plot
Let's plot a boxplot for each of our columns.

In [None]:
# One box per column.
df.plot.box()

## scatter plot
Let's scatter two of our columns against each other.

In [None]:
# 'Agriculture' on the x-axis against 'Cosmetics' on the y-axis, one point per row.
df.plot.scatter(x='Agriculture', y='Cosmetics')

# Advanced plotting

## scatter_matrix
A scatter matrix gives quick insight into the distributions of our variables and their pairwise relationships.
[documentation](https://pandas.pydata.org/pandas-docs/stable/visualization.html#visualization-scatter-matrix)

In [None]:
# scatter_matrix lives in pandas.plotting; the old top-level alias
# pd.scatter_matrix has been removed from pandas and raises AttributeError.
pd.plotting.scatter_matrix(df)

## kernel density plot
[documentation](https://pandas.pydata.org/pandas-docs/stable/visualization.html#density-plot)

In [None]:
# Kernel density estimate of every column (`density` is an alias of `kde`).
df.plot.density()

# Exercise
Plot the total sales per column (category, y-axis) for each country (x-axis), with the categories superimposed in a single line plot.