## Numpy

### NumPy is a Python library used for working with arrays and that also has multiple statistical built-in function
### It is always import as np


In [39]:
# Make your imports
import math

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd

<details><summary markdown='span'>View solution</summary>

```python
import math

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
```
</details>

### Create a numpy array using the given list


In [3]:
arr = [1, 2, 3, 4, 5, 6]
np_arr = np.array([1, 2, 3, 4, 5, 6])
type(np_arr)

# Use a list containing strings and a list containing different data types
np_arr_str = np.array(["str1","str2","str3"])
np_arr_random = np.array([1,"str",True])

<details><summary markdown='span'>View solution</summary>

```python
np_arr = np.array([1, 2, 5, 7, 8, 10])
type(np_arr)

# Such as in a list, you can store multiple types of data in a numpy array (str, int, float, bool, object, ...)
np_arr_str = np.array(["I", "Love", "Numpy", "Arrays"])
np_arr_random = np.array(["I", 5, True, 1.6])
```
</details>

### You can also create n-D arrays (matrix)

In [24]:
arr_2d = [[1, 2, 3], [4, 5, 6]]
np_arr_2D = np.array([[1,2,3],[4,5,6]])
np_arr_2D.shape

(2, 3)

<details><summary markdown='span'>View solution</summary>

```python
np_arr_2D = np.array([[1, 2, 3], [4, 5, 6]])
```
</details>

### Create a numpy array using a DataFrame column

In [25]:
# Import the data in a pandas dataFrame and retrieve the Weekly_Sales column in a variable
df = pd.read_csv("data/data.csv")
print(df)
weekly_sales = df["Weekly_Sales"]


        Store  Dept        Date  Weekly_Sales  IsHoliday
0           1     1  05/02/2010      24924.50      False
1           1     1  12/02/2010      46039.49       True
2           1     1  19/02/2010      41595.55      False
3           1     1  26/02/2010      19403.54      False
4           1     1  05/03/2010      21827.90      False
...       ...   ...         ...           ...        ...
421565     45    98  28/09/2012        508.37      False
421566     45    98  05/10/2012        628.10      False
421567     45    98  12/10/2012       1061.02      False
421568     45    98  19/10/2012        760.01      False
421569     45    98  26/10/2012       1076.80      False

[421570 rows x 5 columns]


<details><summary markdown='span'>View solution</summary>

```python
df = pd.read_csv("data/data.csv")
weekly_sales = df["Weekly_Sales"]
```
</details>

In [26]:
# Creation of the np array
np_ws = df["Weekly_Sales"]
np_ws

0         24924.50
1         46039.49
2         41595.55
3         19403.54
4         21827.90
            ...   
421565      508.37
421566      628.10
421567     1061.02
421568      760.01
421569     1076.80
Name: Weekly_Sales, Length: 421570, dtype: float64

<details><summary markdown='span'>View solution</summary>

```python
np_ws = np.array(weekly_sales)
np_ws
```
</details>

In [27]:
# print out the shape of the numpy array
np_ws.shape

(421570,)

In [36]:
# Retrieve the first and the last element using indexing (same way than in a classic list)
first_elem = np_ws[0]
last_elem = np_ws[1]

# Retrieve the first half of the array
first_half = np_ws[len(np_ws)/2]


210785


<details><summary markdown='span'>View solution</summary>

```python
# Retrieve the first and the last element using indexing (same way than in a classic list)
first_elem = np_ws[0]
last_elem = np_ws[-1]

# Retrieve the first half of the array
first_half = np_ws[:len(np_ws)//2]
```
</details>

### Reshaping: Convert 1-D array into a 2-D array
#### Use the [reshape](https://numpy.org/doc/stable/reference/generated/numpy.ndarray.reshape.html) method to convert our np list into a 2-D array

In [40]:
np_arr_3D = np_ws.reshape(2,len(np_ws)//2)
np_arr_3D.shape

AttributeError: 'Series' object has no attribute 'reshape'

<details><summary markdown='span'>View solution</summary>

```python
np_arr_3D = np_ws.reshape(2, len(np_ws)//2)
np_arr_3D.shape
```
</details>

#### Using reshape and broadcasting to multiply the following arrays together

In [51]:
import numpy as np
arr1 = np.array([[ 0.0,  0.0,  0.0],
                 [10.0, 10.0, 10.0],
                 [20.0, 20.0, 20.0],
                 [30.0, 30.0, 30.0]])
arr2 = np.array([1.0, 1.0, 3.0, 4.0])

print(arr1.shape)
print(arr2.shape)
print(len(arr2))
arr2=arr2.reshape(len(arr2),1)
print(arr2.shape)
# modify arr2 there

arr1 * arr2  # Currently returns ValueError: operands could not be broadcast together with shapes (4,3) (4,) 

(4, 3)
(4,)
4
(4, 1)


array([[  0.,   0.,   0.],
       [ 10.,  10.,  10.],
       [ 60.,  45.,  60.],
       [120., 120., 120.]])

<details><summary markdown='span'>View solution</summary>

```python
arr1 = np.array([[ 0.0,  0.0,  0.0],
                 [10.0, 10.0, 10.0],
                 [20.0, 20.0, 20.0],
                 [30.0, 30.0, 30.0]])
arr2 = np.array([1.0, 2.0, 3.0, 4.0])

arr2 = arr2.reshape(len(arr2),1)
arr1 * arr2 
```
</details>

### Where: The search method of numpy arrays
#### [where](https://numpy.org/doc/stable/reference/generated/numpy.where.html) allows you to use masks (filter) to locate specific element in a numpy array

In [53]:
# Find all sales higher than 20 000
#filtered_arr = 
np_ws[np.where(np_ws > 20000)]

KeyError: 'key of type tuple not found and not a MultiIndex'

<details><summary markdown='span'>View solution</summary>

```python
# Find all sales higher than 20 000
np_ws[np.where(np_ws > 20000)]
```
</details>

### Vectorization: One of the main strength of numpy arrays
#### When numpy arrays are used, you can make multiple arithmetic operations on each values in one single line

In [54]:
# Try to add the np array with itself
add_np_arr = np_ws + np_ws
# Try to substract
sub_np_arr = np_ws - np_ws

# Divide
div_np_arr = np_ws - np_ws

# Multiply
mul_np_arr = np_ws * np_ws

<details><summary markdown='span'>View solution</summary>

```python
# Try to add the np array with itself
add_np_arr = np_ws + np_ws

# Try to substract
sub_np_arr = np_ws - np_ws

# Divide
div_np_arr = np_ws / np_ws

# Multiply
mul_np_arr = np_ws * np_ws
```
</details>

### You can also make power and modulo


In [55]:
# make power of 2 of the list
pow_2_np_arr = np_ws ** 2

# Make modulo 4 of all elements in the np array
mod_np_arr = np_ws % 4

<details><summary markdown='span'>View solution</summary>

```python
# make power of 2 of the list
pow_2_np_arr = np_ws ** 2

# Make modulo 4 of all elements in the np array
mod_np_arr = np_ws % 4
```
</details>

#### There are multiple other function available in the numpy module (unique, sum, round, logs, ...)

## Statistics
#### Such as in excel, you can use numpy to determine multiple helpful statistical values. Calculate the asked statistics of the weekly sale array. Check out [Numpy stats page](https://numpy.org/doc/stable/reference/routines.statistics.html) for the documentation

We will ask you to first implement some basic statistics function, then we will compare them to their numpy version

In [56]:
def mean(arr):
    return sum(arr) / len(arr)
    pass

round(mean(np_ws), 1) == round(np.mean(np_ws), 1)

True

<details><summary markdown='span'>View solution</summary>

```python
def mean(arr):
    return float(sum(arr)) / len(arr)

round(mean(np_ws), 1) == round(np.mean(np_ws), 1)
```
</details>

In [57]:
def var(arr):
    x = abs(arr - arr.mean())**2
    return mean(x)
    pass

round(var(np_ws), 1) == round(np.var(np_ws), 1)

True

<details><summary markdown='span'>View solution</summary>

```python
def var(arr):
    x = abs(arr - arr.mean())**2
    return mean(x)

round(var(np_ws), 1) == round(np.var(np_ws), 1)
```
</details>

In [59]:
def std(arr):
    return np.sqrt(var(arr))
    pass

round(std(np_ws), 1) == round(np.std(np_ws), 1)

True

<details><summary markdown='span'>View solution</summary>

```python
def std(arr):
    return np.sqrt(var(arr))

round(std(np_ws), 1) == round(np.std(np_ws), 1)
```
</details>

In [58]:
def median(arr):
    return sorted(arr)[(len(arr) + 1) // 2]
    pass

round(median(np_ws), 1) == round(np.median(np_ws), 1)

True

<details><summary markdown='span'>View solution</summary>

```python
def median(arr):
    return sorted(arr)[(len(arr) + 1) // 2]

round(median(np_ws), 1) == round(np.median(np_ws), 1)
```
</details>

In [60]:
def max_val(arr):
    actual = -1
    for element in arr:
        if element > actual:
            actual = element
    return actual
    pass

round(max_val(np_ws), 1) == round(np.max(np_ws), 1)

True

<details><summary markdown='span'>View solution</summary>

```python
def max_val(arr):
    actual = -1
    for element in arr:
        if element > actual:
            actual = element
    return actual

round(max_val(np_ws), 1) == round(np.max(np_ws), 1)
```
</details>

There are a lot of other Statistics methods in numpy, try to find them all in the documentation !

In [None]:
median_np = ...
max_np = ...
min_np = ...
range_np = ...
quantile_25_np = ...
quantile_75_np = ...

<details><summary markdown='span'>View solution</summary>

```python
mean_np = np.mean(np_ws)
var_np = np.var(np_ws)
std_np = np.std(np_ws)
median_np = np.median(np_ws)
max_np = np.max(np_ws)
min_np = np.min(np_ws)
range_np = np.ptp(np_ws)
quantile_25_np = np.quantile(np_ws, 0.25)
quantile_75_np = np.quantile(np_ws, 0.75)
```
</details>