In [None]:
!pip install otter-grader
!git clone https://github.com/chandar-lab/INF8245e-assignments-public.git public

In [None]:
# Initialize Otter
# The grader reads its test cases from the `public` checkout cloned in the
# previous cell; the `grader.check(...)` cells further down depend on it.
import otter
grader = otter.Notebook(colab=True, tests_dir='./public/assignment_0_python/tests')

# Tutorial 3: Python

## Outline:

1. **Basics**
    * Basic data types
    * Containers
        * Lists
        * Dictionaries
        * Sets
    * Functions
2. **Numpy**
    * Arrays
    * Array indexing
    * Data types
    * Array math
3. **Pandas**
    * Creating a dataframe
    * Dataframe subsets
    * Aggregation
    * Long and wide format
4. **Plotnine** 
    * Plotting a single graph
    * Plotting multiple lines
    * Subplotting
5. **Practice problems**





## 1. Basics

Python is an object-oriented general-purpose programming language. It is high-level and presents a relatively simple syntax. The official documentation can be found here: https://docs.python.org/3/. In particular, it offers a [tutorial](https://docs.python.org/3/tutorial/index.html) which is useful for those who are already familiar with other programming languages (you should be!).

In this course you should use [Google Colab](https://colab.research.google.com/notebooks/intro.ipynb), which is a free cloud service that offers a jupyter notebook environment (which is executed on Google's cloud servers) **AND** a GPU! 

### 1.1 Basic data types 



**Numbers**

In [None]:
# The literal decides the numeric type: no decimal point gives an int,
# a decimal point gives a float.
x = 3
print(type(x)) # Prints "<class 'int'>"
y = 2.5
print(type(y)) # Prints "<class 'float'>"

**Booleans**

In [None]:
# Boolean logic uses the English keywords `and`, `or`, `not` rather than
# symbols such as && and ||.
t = True
f = False
print(type(t)) # Prints "<class 'bool'>"
print(t and f) # Logical AND; prints "False"
print(t or f)  # Logical OR; prints "True"
print(not t)   # Logical NOT; prints "False"
print(t != f)  # Logical XOR (for bools, != is True exactly when the operands differ); prints "True"

**Strings**

In [None]:
# Strings: literals, length and concatenation with `+`.
hello = 'hello'    # String literals can use single quotes
world = "world"    # or double quotes; it does not matter.
print(hello)       # Prints "hello"
print(len(hello))  # String length; prints "5"
hw = hello + ' ' + world  # String concatenation
print(hw)  # prints "hello world"

In [None]:
# String methods return a new string; `s` itself is never modified.
s = "hello"
print(s.capitalize())  # Capitalize a string; prints "Hello"
print(s.upper())       # Convert a string to uppercase; prints "HELLO"
print(s.replace('l', '(ell)'))  # Replace all instances of one substring with another; prints "he(ell)(ell)o"

### 1.2 Containers

Python includes several built-in container types: lists, dictionaries and sets.

#### 1.2.1 Lists

A list is the Python equivalent of an array, but is resizeable and can contain elements of different types:

In [None]:
# Lists are mutable: elements can be replaced, appended and removed in place.
xs = [3, 1, 2]    # Create a list
print(xs, xs[2])  # Prints "[3, 1, 2] 2"
print(xs[-1])     # Negative indices count from the end of the list; prints "2"
xs[2] = 'foo'     # Lists can contain elements of different types
print(xs)         # Prints "[3, 1, 'foo']"
xs.append('bar')  # Add a new element to the end of the list
print(xs)         # Prints "[3, 1, 'foo', 'bar']"
x = xs.pop()      # Remove and return the last element of the list
print(x, xs)      # Prints "bar [3, 1, 'foo']"

**Slicing**

In [None]:
# Slices are written start:stop, where stop is exclusive and either side may
# be omitted.
nums = list(range(5))     # range(5) yields the integers 0..4 lazily; list() turns them into a list
print(nums)               # Prints "[0, 1, 2, 3, 4]"
print(nums[2:4])          # Get a slice from index 2 to 4 (exclusive); prints "[2, 3]"
print(nums[2:])           # Get a slice from index 2 to the end; prints "[2, 3, 4]"
print(nums[:])            # Get a slice of the whole list; prints "[0, 1, 2, 3, 4]"
print(nums[:-1])          # Slice indices can be negative; prints "[0, 1, 2, 3]"
nums[2:4] = [8, 9]        # Assign a new sublist to a slice
print(nums)               # Prints "[0, 1, 8, 9, 4]"

**Loops**

In [None]:
# A for loop visits the elements of a list in order; no index is needed.
animals = ['cat', 'dog', 'monkey']
for animal in animals:
    print(animal)
# Prints "cat", "dog", "monkey", each on its own line.

#### 1.2.2 Dictionaries

A dictionary stores (key, value) pairs

In [None]:
# Dictionaries map keys to values; entries can be read, added and deleted.
d = {'cat': 'cute', 'dog': 'furry'}  # Create a new dictionary with some data
print(d['cat'])       # Get an entry from a dictionary; prints "cute"
d['fish'] = 'wet'     # Set an entry in a dictionary
print(d['fish'])      # Prints "wet"
# print(d['monkey'])  # KeyError: 'monkey' not a key of d
# .get() never raises: it returns the fallback (second argument) when the key
# is missing, and the stored value otherwise.
print(d.get('monkey', 'N/A'))  # Get an element with a default; prints "N/A"
print(d.get('fish', 'N/A'))    # Key exists, so the default is ignored; prints "wet"
del d['fish']         # Remove an element from a dictionary
print(d.get('fish', 'N/A'))    # "fish" is no longer a key; prints "N/A"

#### 1.2.3 Sets

A set is an unordered collection of distinct elements

In [None]:
# Sets store each element at most once and support fast membership tests.
animals = {'cat', 'dog'}
print('cat' in animals)   # Check if an element is in a set; prints "True"
print('fish' in animals)  # prints "False"
animals.add('fish')       # Add an element to a set
print('fish' in animals)  # Prints "True"
print(len(animals))       # Number of elements in a set; prints "3"
animals.add('cat')        # Adding an element that is already in the set does nothing
print(len(animals))       # Prints "3"
animals.remove('cat')     # Remove an element from a set
print(len(animals))       # Prints "2"

### 1.3 Functions

In [None]:
def sign(x):
    """Describe the sign of a number.

    Args:
        x: Any value that can be compared with 0.

    Returns:
        'positive' when x > 0, 'negative' when x < 0, and 'zero' for
        everything else.
    """
    # Guard clauses: each comparison returns as soon as it matches.
    if x > 0:
        return 'positive'
    if x < 0:
        return 'negative'
    return 'zero'

# Call sign() once for a negative, a zero and a positive input.
for x in [-1, 0, 1]:
    print(sign(x))
# Prints "negative", "zero", "positive"

## 2. Numpy

Numpy is the de facto library for computing with n-dimensional arrays (e.g. vectors and matrices) in Python. You will typically see it referred to in code as `np`. You can find the documentation at: https://numpy.org

In [None]:
import numpy as np

### 2.1 Arrays


Using numpy, we can create arrays and access their elements

In [None]:
# "Rank" here means the number of dimensions; `shape` gives the size along
# each of them.
a = np.array([1, 2, 3])   # Create a rank 1 array
print(type(a))            # Prints "<class 'numpy.ndarray'>"
print(a.shape)            # Prints "(3,)"
print(a[0], a[1], a[2])   # Prints "1 2 3"
a[0] = 5                  # Change an element of the array
print(a)                  # Prints "[5 2 3]" (arrays print without commas)

b = np.array([[1,2,3],[4,5,6]])    # Create a rank 2 array
print(b.shape)                     # Prints "(2, 3)"
print(b[0, 0], b[0, 1], b[1, 0])   # Prints "1 2 4"

There are also functions for creating the arrays

In [None]:
# Each constructor takes the desired shape as a tuple (np.eye takes the size).
a = np.zeros((2,2))   # Create an array of all zeros
print(a)              # Prints "[[0. 0.]
                      #          [0. 0.]]"

b = np.ones((1,2))    # Create an array of all ones
print(b)              # Prints "[[1. 1.]]"

c = np.full((2,2), 7)  # Create a constant array; the dtype follows the fill value (an integer here)
print(c)               # Prints "[[7 7]
                       #          [7 7]]"

d = np.eye(2)         # Create a 2x2 identity matrix
print(d)              # Prints "[[1. 0.]
                      #          [0. 1.]]"

e = np.random.random((2,2))  # Create an array filled with random values
print(e)                     # Might print "[[0.91940167 0.08143941]
                             #               [0.68744134 0.87236687]]"

### 2.2 Array indexing


Slicing is similar to python lists

In [None]:
# Create the following rank 2 array with shape (3, 4)
# [[ 1  2  3  4]
#  [ 5  6  7  8]
#  [ 9 10 11 12]]
a = np.array([[1,2,3,4], [5,6,7,8], [9,10,11,12]])

# Use slicing to pull out the subarray consisting of the first 2 rows
# and columns 1 and 2; b is the following array of shape (2, 2):
# [[2 3]
#  [6 7]]
# One slice is given per dimension, separated by a comma.
b = a[:2, 1:3]

# A slice of an array is a view into the same data, so modifying it
# will modify the original array.
print(a[0, 1])   # Prints "2"
b[0, 0] = 77     # b[0, 0] is the same piece of data as a[0, 1]
print(a[0, 1])   # Prints "77"

Boolean array indexing

In [None]:
# Boolean indexing selects the elements for which a condition holds.
a = np.array([[1,2], [3, 4], [5, 6]])

bool_idx = (a > 2)   # Find the elements of a that are bigger than 2;
                     # this returns a numpy array of Booleans of the same
                     # shape as a, where each slot of bool_idx tells
                     # whether that element of a is > 2.

print(bool_idx)      # Prints "[[False False]
                     #          [ True  True]
                     #          [ True  True]]"

# We use boolean array indexing to construct a rank 1 array
# consisting of the elements of a corresponding to the True values
# of bool_idx
print(a[bool_idx])  # Prints "[3 4 5 6]"

# We can do all of the above in a single concise statement:
print(a[a > 2])     # Prints "[3 4 5 6]"

### 2.3 Datatypes


In [None]:
# Every array has a single element type (dtype), either inferred from the
# data or requested explicitly.
x = np.array([1, 2])   # Let numpy choose the datatype
print(x.dtype)         # Prints "int64" (the platform's default integer; some setups report "int32")

x = np.array([1.0, 2.0])   # Let numpy choose the datatype
print(x.dtype)             # Prints "float64"

x = np.array([1, 2], dtype=np.int64)   # Force a particular datatype
print(x.dtype)                         # Prints "int64"

### 2.4 Array math


Basic mathematical functions operate elementwise on arrays

In [None]:
# Each operation below is shown twice: as an operator and as the equivalent
# numpy function. Both act element by element on arrays of the same shape.
x = np.array([[1,2],[3,4]], dtype=np.float64)
y = np.array([[5,6],[7,8]], dtype=np.float64)

# Elementwise sum; both produce the array
# [[ 6.0  8.0]
#  [10.0 12.0]]
print(x + y)
print(np.add(x, y))

# Elementwise difference; both produce the array
# [[-4.0 -4.0]
#  [-4.0 -4.0]]
print(x - y)
print(np.subtract(x, y))

# Elementwise product; both produce the array
# [[ 5.0 12.0]
#  [21.0 32.0]]
print(x * y)
print(np.multiply(x, y))

# Elementwise division; both produce the array
# [[ 0.2         0.33333333]
#  [ 0.42857143  0.5       ]]
print(x / y)
print(np.divide(x, y))

# Elementwise square root; produces the array
# [[ 1.          1.41421356]
#  [ 1.73205081  2.        ]]
print(np.sqrt(x))

Note that unlike MATLAB, * is elementwise multiplication, not matrix multiplication. We instead use the dot function to compute inner products of vectors, to multiply a vector by a matrix, and to multiply matrices. dot is available both as a function in the numpy module and as an instance method of array objects:

In [None]:
# x and y are 2x2 matrices, v and w are length-2 vectors. Each product is
# computed twice: with the array method and with the module-level function.
x = np.array([[1,2],[3,4]])
y = np.array([[5,6],[7,8]])

v = np.array([9,10])
w = np.array([11, 12])

# Inner product of vectors; both produce 219
print(v.dot(w))
print(np.dot(v, w))

# Matrix / vector product; both produce the rank 1 array [29 67]
print(x.dot(v))
print(np.dot(x, v))

# Matrix / matrix product; both produce the rank 2 array
# [[19 22]
#  [43 50]]
print(x.dot(y))
print(np.dot(x, y))

Numpy provides many useful functions for performing computations on arrays; one of the most useful is sum:

In [None]:
# `axis` names the dimension that is summed away: axis=0 collapses the rows
# (one result per column), axis=1 collapses the columns (one result per row).
x = np.array([[1,2],[3,4]])

print(np.sum(x))  # Compute sum of all elements; prints "10"
print(np.sum(x, axis=0))  # Compute sum of each column; prints "[4 6]"
print(np.sum(x, axis=1))  # Compute sum of each row; prints "[3 7]"

Apart from computing mathematical functions using arrays, we frequently need to reshape or otherwise manipulate data in arrays. The simplest example of this type of operation is transposing a matrix; to transpose a matrix, simply use the T attribute of an array object:

In [None]:
# `.T` swaps the rows and columns of a rank 2 array.
x = np.array([[1,2], [3,4]])
print(x)    # Prints "[[1 2]
            #          [3 4]]"
print(x.T)  # Prints "[[1 3]
            #          [2 4]]"

# Note that taking the transpose of a rank 1 array does nothing:
v = np.array([1,2,3])
print(v)    # Prints "[1 2 3]"
print(v.T)  # Prints "[1 2 3]"

## 3. Pandas

[Pandas](https://pandas.pydata.org/docs/) is a library for storing and manipulating tabular data. Such a table is called a dataframe. As you will learn in this course, most data can be represented in tabular form and pandas is therefore quite useful. Additionally, your results will often be in the form of tables; for this we encourage you to use pandas. This section gives a brief introduction to pandas. We recommend you look at the official _getting started_ tutorial: https://pandas.pydata.org/docs/getting_started/index.html#getting-started.

Documentation: https://pandas.pydata.org/docs/

Typically `pandas` is imported as `pd`:

In [None]:
import pandas as pd

## 3.1 Creating a DataFrame

You can construct a dataframe with `pd.DataFrame` (there are more ways). Oversimplified, this takes a dictionary type that maps columns to numpy vectors or lists.

In [None]:
# Build a dataframe from a dict: each key becomes a column name and each
# value (a list or a numpy array of equal length) becomes that column's data.
df = pd.DataFrame({
    'name': [
        'Snow White and the Seven Dwarfs', 
        'Pinocchio',
        'Fantasia',
        'The Reluctant Dragon',
        'Dumbo',
        'Bambi'
    ],
    'year': np.asarray([1937, 1940, 1940, 1941, 1941, 1942])
})
print(df)

It is possible to display pandas dataframes in a prettier format within a notebook:

In [None]:
# display() uses the notebook's rich rendering instead of the plain-text
# table that print() produces.
from IPython.display import display
display(df)

## 3.2 Dataframe subsets

There are many ways to extract or update a subset of a dataframe. Often you will see `df['name']`, but you probably want to avoid this method, as its behavior is not consistent when updating a dataframe. Instead, use `df.loc[]` or `df.iloc[]`. The former takes names (labels) while the latter takes integer positions.

In [None]:
# `.loc[rows, columns]`: the row selector `:` keeps every row.
df.loc[:, 'name']  # Extract a specific column

In [None]:
# Unlike list slices, label slices in `.loc` include BOTH endpoints, so this
# returns the rows labelled 1 and 2.
df.loc[1:2, 'name']  # Extract a specific column for a specific row range

In [None]:
# Assigning through `.loc` updates `df` in place. The slice 1:1 selects only
# the row labelled 1, so later cells see the modified name.
df.loc[1:1, 'name'] = 'Pinocchio (There are not strings on me)'
df

In [None]:
# The inner `.loc` builds a True/False value per row; the outer `.loc` keeps
# the rows where it is True.
df.loc[df.loc[:, 'year'] <= 1940, :] # Extract all columns, but select the rows with a boolean array

In [None]:
# Positions are zero-based, so 1 is the second column (`year` in the frame
# built above).
df.iloc[:, 1]  # Extract a specific column, but the column is specified using an index

## 3.3 Aggregation

Using `.groupby()` it is possible to split a dataframe into smaller dataframes; then, using `.apply()`, a custom function can be applied to each group. This way you can aggregate any dataframe as you choose.

In [None]:
# For every distinct `year`, the lambda receives the rows of that year and
# returns a one-entry Series holding the row count (shape[0]).
df.groupby(['year']) \
  .apply(lambda group_df: pd.Series({ 'num movies': group_df.shape[0] }))

Notice how the _year_ and _num movies_ are on different lines. That is because `year` is now an index column. The concept of indexes is a bit too advanced for what we will need. Generally, in this course, you will be better off avoiding advanced indexing. You can remove the index with `.reset_index()`.

In [None]:
# Same aggregation as before; reset_index() turns the `year` index back into
# an ordinary column.
df.groupby(['year']) \
  .apply(lambda group_df: pd.Series({ 'num movies': group_df.shape[0] })) \
  .reset_index()

## 3.4 Long and wide format

When dealing with tabular data, it is important to understand the concepts of _wide format_ and _long format_. While this can be challenging in the beginning, it will become very useful. Consider the following dataframe, which is in _wide format_:

In [None]:
# Wide-format table: a shared `x` column (100 points on [0, 2*pi]) plus one
# column per waveform. Each waveform is derived from the stored `x` column,
# so the grid is defined in exactly one place.
df_wave = pd.DataFrame({'x': np.linspace(0, 2*np.pi, 100)}).assign(**{
    'cos(x)': lambda grid: np.cos(grid['x']),
    'sin(x)': lambda grid: np.sin(grid['x']),
    'cos(2x)': lambda grid: np.cos(2*grid['x']),
    'sin(2x)': lambda grid: np.sin(2*grid['x']),
})
display(df_wave)

The above dataframe is in _wide format_ because `cos(x)`, `sin(x)`, `cos(2x)`, and `sin(2x)` each exist as a separate column. The _long format_ alternative is to have one column indicate which waveform (column) each value belongs to. _Wide format_ can be transformed to _long format_ by using `.melt()`:

In [None]:
# `id_vars` stay as columns; every column in `value_vars` is stacked into
# rows, with its name stored in `fn` and its value in `output`.
df_wave.melt(id_vars=['x'], value_vars=['cos(x)', 'sin(x)', 'cos(2x)', 'sin(2x)'], var_name='fn', value_name='output')

The _long format_ can be very useful. For example, if we wanted to aggregate each column in the _wide format_, that would require us to apply a function to each column. However, in the _long format_ we can simply use `.groupby(['fn'])`.

## 4. Plotnine

Plotting in Python is commonly done with `matplotlib`. However, `matplotlib` should not be used directly. Instead, an abstraction like `seaborn` or `plotnine` should be used. In this course we advocate for [`plotnine`](https://plotnine.readthedocs.io/en/stable/). 

Plotnine is based on a so called _grammar of graphics_ and is essentially a python version of _ggplot2_ in R (a programming language). Often, if you can't find appropriate examples for plotnine you can easily find the examples for _ggplot2_.

While you will often see `from plotnine import *`, it is usually best to avoid this and do `import plotnine as p9` instead.

In [None]:
%matplotlib inline
from IPython.display import display
import plotnine as p9

## 4.1 Preparing the data

Plotnine uses dataframes from pandas and you typically want them in _long format_. Let's prepare our data:

In [None]:
# Evaluate every waveform on ONE shared grid, so that each column really is
# cos(k*x) / sin(k*x) for the `x` value stored in the same row. (Changing the
# linspace end point per column, as an earlier version did, silently plots a
# different frequency than the column label claims.)
wave_x = np.linspace(0, 2*np.pi, 100)
df_wave = pd.DataFrame({
    'x': wave_x,
    'cos(1x)': np.cos(1*wave_x),
    'sin(1x)': np.sin(1*wave_x),
    'cos(2x)': np.cos(2*wave_x),
    'sin(2x)': np.sin(2*wave_x),
    'cos(4x)': np.cos(4*wave_x),
    'sin(4x)': np.sin(4*wave_x),
    'cos(8x)': np.cos(8*wave_x),
    'sin(8x)': np.sin(8*wave_x),
})
# Long format: one row per (x, wave) pair; melt() stores the sample in the
# default `value` column.
df_wave_long = df_wave.melt(id_vars=['x'], var_name='wave')
# Split labels such as 'cos(4x)' into the function name ('cos', characters
# 0-2) and the wavenumber ('4', between the '(' and the trailing 'x)').
# Note that the wavenumber is kept as a string.
df_wave_long.loc[:, 'type'] = df_wave_long.loc[:, 'wave'].str.slice(0, 3)
df_wave_long.loc[:, 'wavenumber'] = df_wave_long.loc[:, 'wave'].str.slice(4, -2)
display(df_wave_long)

## 4.2 Plotting a single curve

It is often useful to first plot a single curve, and then gradually increase the complexity of your plot. If you run in to issues always try to simplify first. 

You start a plotting grammar by using `p9.ggplot(dataframe)`. This tells plotnine what data to plot. Everything after that is about how to plot it. In this case we write `p9.geom_point(p9.aes(x='x', y='value'))`, which tells plotnine to draw points using `x` as the x-axis and `value` as the y-axis.

In [None]:
# Keep only the cos(1x) samples so the first plot shows a single curve.
df_wave_subset = df_wave_long.loc[df_wave_long['wave'] == 'cos(1x)', :]
p = (p9.ggplot(df_wave_subset) +
     p9.geom_point(p9.aes(x='x', y='value'))
)
# Render through the notebook's rich display. What print(p) does depends on
# the plotnine version (newer releases only print a short text summary).
display(p)

Compared with using matplotlib directly, you get a lot of things for free in `plotnine`, such as the x- and y-axis labels. However, we can still specify these things ourselves. Notice that multiple grammatical instructions are combined using the `+` operator.

In [None]:
df_wave_subset = df_wave_long.loc[df_wave_long['wave'] == 'cos(1x)', :]
# The scale_* instructions set the axis titles; `breaks` are the tick
# positions and `labels` the text shown at those ticks.
p = (p9.ggplot(df_wave_subset) +
     p9.geom_point(p9.aes(x='x', y='value')) +
     p9.scale_x_continuous(name='x', breaks=[0, np.pi, 2*np.pi], labels=['0', 'π', '2π']) +
     p9.scale_y_continuous(name='y')
)
# display() renders the figure; print(p) is plotnine-version dependent.
display(p)

If we want to use a line instead of points, we can simply use `geom_line` instead of `geom_point`.

In [None]:
df_wave_subset = df_wave_long.loc[df_wave_long['wave'] == 'cos(1x)', :]
# Same plot as before, with geom_line connecting the samples instead of
# drawing them as separate points.
p = (p9.ggplot(df_wave_subset) +
     p9.geom_line(p9.aes(x='x', y='value')) +
     p9.scale_x_continuous(name='x', breaks=[0, np.pi, 2*np.pi], labels=['0', 'π', '2π']) +
     p9.scale_y_continuous(name='y')
)
# display() renders the figure; print(p) is plotnine-version dependent.
display(p)

Because the grammatical instructions are additive, if we want both lines and points we can simply use both `geom_line` and `geom_point`:

In [None]:
df_wave_subset = df_wave_long.loc[df_wave_long['wave'] == 'cos(1x)', :]
# Two geoms are added on top of each other: the line first, then the points.
p = (p9.ggplot(df_wave_subset) +
     p9.geom_line(p9.aes(x='x', y='value')) +
     p9.geom_point(p9.aes(x='x', y='value')) +
     p9.scale_x_continuous(name='x', breaks=[0, np.pi, 2*np.pi], labels=['0', 'π', '2π']) +
     p9.scale_y_continuous(name='y')
)
# display() renders the figure; print(p) is plotnine-version dependent.
display(p)

It is a little redundant to keep copying `p9.aes(x='x', y='value')`. If you want aesthetics to be shared among all `geom_*` instructions then you can simply add the information to `p9.ggplot` instead.

In [None]:
df_wave_subset = df_wave_long.loc[df_wave_long['wave'] == 'cos(1x)', :]
# Aesthetics passed to ggplot() are shared by every geom below, so the
# geoms no longer need their own aes(...).
p = (p9.ggplot(df_wave_subset, p9.aes(x='x', y='value')) +
     p9.geom_line() +
     p9.geom_point() +
     p9.scale_x_continuous(name='x', breaks=[0, np.pi, 2*np.pi], labels=['0', 'π', '2π']) +
     p9.scale_y_continuous(name='y')
)
# display() renders the figure; print(p) is plotnine-version dependent.
display(p)

## 4.3 Multiple lines and legends

The grammar of graphics makes it easy to show multiple lines/points in different colors or styles. You do this by simply including more data in the dataframe and adding additional aesthetics instructions.

In [None]:
# `wavenumber` holds strings, hence the comparison with '1' (not 1). This
# keeps both cos(1x) and sin(1x).
df_wave_subset = df_wave_long.loc[df_wave_long['wavenumber'] == '1', :]
# Mapping `wave` to color draws one line per waveform.
p = (p9.ggplot(df_wave_subset, p9.aes(x='x', y='value', color='wave')) +
     p9.geom_line() +
     p9.scale_x_continuous(name='x', breaks=[0, np.pi, 2*np.pi], labels=['0', 'π', '2π'])
)
# display() renders the figure; print(p) is plotnine-version dependent.
display(p)

Notice that the legend is automatically added. This is a major advantage over matplotlib, which is why we encourage you to use plotnine. However, if you forget to include the `color` aesthetic instruction it will look confusing because all the data is treated as a single line. 

In [None]:
df_wave_subset = df_wave_long.loc[df_wave_long['wavenumber'] == '1', :]
# Deliberately omits the `color` aesthetic to show the failure mode: without
# a grouping aesthetic, both waveforms are drawn as one line.
p = (p9.ggplot(df_wave_subset, p9.aes(x='x', y='value')) +
     p9.geom_line() +
     p9.scale_x_continuous(name='x', breaks=[0, np.pi, 2*np.pi], labels=['0', 'π', '2π'])
)
# display() renders the figure; print(p) is plotnine-version dependent.
display(p)

It is also possible to use linetype instead of colors.

In [None]:
df_wave_subset = df_wave_long.loc[df_wave_long['wavenumber'] == '1', :]
# Distinguish the waveforms by line style (solid, dashed, ...) instead of color.
p = (p9.ggplot(df_wave_subset, p9.aes(x='x', y='value', linetype='wave')) +
     p9.geom_line() +
     p9.scale_x_continuous(name='x', breaks=[0, np.pi, 2*np.pi], labels=['0', 'π', '2π'])
)
# display() renders the figure; print(p) is plotnine-version dependent.
display(p)

You can even use both linetype and color. This is an advantage for those who are colorblind. Again, notice that the correct legend is automatically generated.

In [None]:
df_wave_subset = df_wave_long.loc[df_wave_long['wavenumber'] == '1', :]
# The same column is mapped to two aesthetics at once: color and line style.
p = (p9.ggplot(df_wave_subset, p9.aes(x='x', y='value', color='wave', linetype='wave')) +
     p9.geom_line() +
     p9.scale_x_continuous(name='x', breaks=[0, np.pi, 2*np.pi], labels=['0', 'π', '2π'])
)
# display() renders the figure; print(p) is plotnine-version dependent.
display(p)

## 4.4 Subplotting (facets)

When we add more data it can get confusing.

In [None]:
# All waveforms in a single panel: intentionally crowded, to motivate the
# facets introduced next.
p = (p9.ggplot(df_wave_long, p9.aes(x='x', y='value', color='wave')) +
     p9.geom_line() +
     p9.scale_x_continuous(name='x', breaks=[0, np.pi, 2*np.pi], labels=['0', 'π', '2π'])
)
# display() renders the figure; print(p) is plotnine-version dependent.
display(p)

To reduce the confusion we can use facets (these are known as subplots in classical matplotlib terminology). This is done with `facet_grid` or `facet_wrap`. Notice how the x- and y-axes are shared among the subplots. You can set `scales='free'` to avoid this behavior, but a shared axis is typically desired. 

In [None]:
# facet_wrap('wavenumber') draws one panel per distinct wavenumber value.
p = (p9.ggplot(df_wave_long, p9.aes(x='x', y='value', color='wave')) +
     p9.geom_line() +
     p9.facet_wrap('wavenumber') +
     p9.scale_x_continuous(name='x', breaks=[0, np.pi, 2*np.pi], labels=['0', 'π', '2π'])
)
# display() renders the figure; print(p) is plotnine-version dependent.
display(p)

`facet_wrap` lays the subplots out like text: left to right, continuing on a new row when the current one is full (this is called wrapping). You can specify the number of rows or columns with `nrow` or `ncol`.

In [None]:
# ncol=3 wraps the panels after three columns.
p = (p9.ggplot(df_wave_long, p9.aes(x='x', y='value', color='wave')) +
     p9.geom_line() +
     p9.facet_wrap('wavenumber', ncol=3) +
     p9.scale_x_continuous(name='x', breaks=[0, np.pi, 2*np.pi], labels=['0', 'π', '2π'])
)
# display() renders the figure; print(p) is plotnine-version dependent.
display(p)

Alternatively, `facet_grid` allows you to specify two axes of subplots.

In [None]:
# The formula 'rows ~ columns' puts one `type` per row of panels and one
# `wavenumber` per column of panels.
p = (p9.ggplot(df_wave_long, p9.aes(x='x', y='value', color='wave')) +
     p9.geom_line() +
     p9.facet_grid('type ~ wavenumber') +
     p9.scale_x_continuous(name='x', breaks=[0, np.pi, 2*np.pi], labels=['0', 'π', '2π'])
)
# display() renders the figure; print(p) is plotnine-version dependent.
display(p)

## 5. Practice Problems

Use these problems to test your abilities.

## 5.1 Problem 1 - Basic

Determine if the list is monotonic, return the answer as a boolean.

In [None]:
def problem_1(data):
    """Tests if the data is monotonically increasing or decreasing

    Args:
        data (List[int]): List of integers

    Returns:
        bool: True if data is monotonically increasing or decreasing, False otherwise
    """

    # Exercise stub: replace the `...` placeholder with your implementation.
    # NOTE(review): the statement does not say whether repeated values
    # (e.g. [1, 1, 2]) count as monotonic; check the public tests.
    ...

In [None]:
# Run the grader tests registered as "question 1" (presumably the tests for
# problem_1; verify against the tests directory).
grader.check("question 1")

## 5.2 Problem 2 - numpy

Return the maximum number and the index of that number from an array of numbers.

In [None]:
def problem_2(data):
    """Identify the maximum value and its index

    Args:
        data (np.ndarray): Data as a numpy array with 1 dimension

    Returns:
        The maximum value,
        The index of the maximum value
    """

    # Exercise stub: replace the `...` placeholder with your implementation.
    ...

In [None]:
# Run the grader tests registered as "question 2" (presumably the tests for
# problem_2; verify against the tests directory).
grader.check("question 2")

## 5.3 Problem 3 - pandas

This is a table of the top 20 most populated cities.

In [None]:
# Wide-format table: one row per city, one population column per year.
# The year column names are strings ('2022', '2021'), not integers.
df_city_pop = pd.DataFrame([
    { 'city': 'Tokyo', 'country': 'Japan', '2022': 37_274_000, '2021': 37_339_804 },
    { 'city': 'Delhi', 'country': 'India', '2022': 32_065_760, '2021': 31_181_376 },
    { 'city': 'Shanghai', 'country': 'China', '2022': 28_516_904, '2021': 27_795_702 },
    { 'city': 'Dhaka', 'country': 'Bangladesh', '2022': 22_478_116, '2021': 21_741_090 },
    { 'city': 'Sao Paulo', 'country': 'Brazil', '2022': 22_429_800, '2021': 22_237_472 },
    { 'city': 'Mexico City', 'country': 'Mexico', '2022': 22_085_140, '2021': 21_918_936 },
    { 'city': 'Cairo', 'country': 'Egypt', '2022': 21_750_020, '2021': 21_322_750 },
    { 'city': 'Beijing', 'country': 'China', '2022': 21_333_332, '2021': 20_896_820 },
    { 'city': 'Mumbai', 'country': 'India', '2022': 20_961_472, '2021': 20_667_656 },
    { 'city': 'Osaka', 'country': 'Japan', '2022': 19_059_856, '2021': 19_110_616 },
    { 'city': 'Chongqing', 'country': 'China', '2022': 16_874_740, '2021': 16_382_376 },
    { 'city': 'Karachi', 'country': 'Pakistan', '2022': 16_839_950, '2021': 16_459_472 },
    { 'city': 'Istanbul', 'country': 'Turkey', '2022': 15_636_243, '2021': 15_415_197 },
    # This row previously read city 'Kinshasa<TAB>DR' / country 'Congo': a
    # tab-separated paste that was split in the wrong place.
    # NOTE(review): if any grader test hard-codes 'Congo' as a country,
    # update it together with this fix.
    { 'city': 'Kinshasa', 'country': 'DR Congo', '2022': 15_628_085, '2021': 14_970_460 },
    { 'city': 'Lagos', 'country': 'Nigeria', '2022': 15_387_639, '2021': 14_862_111 },
    { 'city': 'Buenos Aires', 'country': 'Argentina', '2022': 15_369_919, '2021': 15_257_673 },
    { 'city': 'Kolkata', 'country': 'India', '2022': 15_133_888, '2021': 14_974_073 },
    { 'city': 'Manila', 'country': 'Philippines', '2022': 14_406_059, '2021': 14_158_573 },
    { 'city': 'Tianjin', 'country': 'China', '2022': 14_011_828, '2021': 13_794_450 },
    { 'city': 'Guangzhou', 'country': 'China', '2022': 13_964_637, '2021': 13_635_397 },
])
display(df_city_pop)

### 5.3.a - long format

In [None]:
def problem_3a(df):
    """Converts the table to long format, such that the columns are city, country, year, and population

    Args:
        df (pd.DataFrame): DataFrame with the columns city, country, 2022, and 2021. The year columns hold the population for that year

    Returns:
        pd.DataFrame: DataFrame with the columns city, country, year, and population
    """

    # Exercise stub: replace the `...` placeholder with your implementation.
    ...

In [None]:
# Run the grader tests registered as "question 3a" (presumably the tests for
# problem_3a; verify against the tests directory).
grader.check("question 3a")

### 5.3.b - aggregation

In [None]:
def problem_3b(df):
    """Sum the population for each country and year, return in long format

    Hint: Expand on the solution from problem_3a

    Args:
        df (pd.DataFrame): DataFrame with the columns city, country, 2022, and 2021. The year columns hold the population for that year

    Returns:
        pd.DataFrame: DataFrame with the columns country, year, and population
    """

    # Exercise stub: replace the `...` placeholder with your implementation.
    ...

In [None]:
# Run the grader tests registered as "question 3b" (presumably the tests for
# problem_3b; verify against the tests directory).
grader.check("question 3b")

### 5.3.c - Long to wide

In [None]:
def problem_3c(df):
    """Sum the population for each country and year, return in wide format

    Hint: Expand on the solution from problem_3b
    Hint: Look at pandas.DataFrame.pivot

    Args:
        df (pd.DataFrame): DataFrame with the columns city, country, 2022, and 2021. The year columns hold the population for that year

    Returns:
        pd.DataFrame: DataFrame with the columns country, 2022, 2021
    """

    # Exercise stub: replace the `...` placeholder with your implementation.
    ...

In [None]:
# Run the grader tests registered as "question 3c" (presumably the tests for
# problem_3c; verify against the tests directory).
grader.check("question 3c")

## 5.4 Problem 4 - plotting

In [None]:
def iris_dataset():
    """Load the iris measurements as a DataFrame with a `variant` column.

    Returns:
        The feature frame from scikit-learn's `load_iris(as_frame=True)` with
        one extra column, `variant`, holding the class name looked up from
        each row's target code.
    """
    # Imported locally, as in the original cell, so scikit-learn is only
    # needed when this helper is actually called.
    from sklearn.datasets import load_iris

    iris = load_iris(as_frame=True)
    # Index the array of class names with the per-row target codes.
    variant_names = iris.target_names[iris['target']]
    return iris['data'].assign(variant=variant_names)

# Preview the dataset used in the plotting questions below.
display(iris_dataset())

<!-- BEGIN QUESTION -->

### question 4.4.a

Plot the sepal width on the x-axis and the petal width on the y-axis as a scatter plot, and color the dots using the `variant` column.

In [None]:
df = iris_dataset()
# Exercise stub: build your plot in place of `...` and assign it to `p`.
...
# display() renders the figure; print(p) is plotnine-version dependent.
display(p)

Your plot should look like this:

<img src="https://drive.google.com/uc?id=1FGEDO9IdaG1VyDqrPrMEnMqq10hyTCVY">

<!-- END QUESTION -->

<!-- BEGIN QUESTION -->

### question 4.4.b

Plot the function $\sin(x)\cos(y)\,x^2$ from $0$ to $2\pi$ for both $x$ and $y$, as a heatmap. Hint: use the method `geom_raster`.

In [None]:
# Sample sin(x)*cos(y)*x^2 on a regular grid over [0, 2*pi] x [0, 2*pi].
# np.linspace is called without `num`, so each axis gets its default number
# of points.
x_44b, y_44b = np.meshgrid(np.linspace(0,2*np.pi), np.linspace(0,2*np.pi))
# Flatten the grid to long format (one row per grid point), then derive `z`
# from the stored coordinates.
df_44b = pd.DataFrame({'x': x_44b.ravel(), 'y': y_44b.ravel()}).assign(
    z=lambda grid: np.sin(grid['x']) * np.cos(grid['y']) * grid['x']**2
)

In [None]:
# Exercise stub: build your plot in place of `...` and assign it to `p`.
...
# display() renders the figure; print(p) is plotnine-version dependent.
display(p)

Your plot should look like this:

<img src="https://drive.google.com/uc?id=1fGjuCg4JkoihXP18dXxIx32O2c0V9y4B">

<!-- END QUESTION -->

