In [1]:
%matplotlib inline
#%load_ext fortranmagic

import sys; sys.path.append('..')

import pandas as pd
import numpy as np
import matplotlib as mpl
import matplotlib.pyplot as plt
import seaborn as sns

mpl.rc('figure', figsize=(12, 7))

ran_the_first_cell = True

jan2017 = pd.to_datetime(['2017-01-03 00:00:00+00:00',
 '2017-01-04 00:00:00+00:00',
 '2017-01-05 00:00:00+00:00',
 '2017-01-06 00:00:00+00:00',
 '2017-01-09 00:00:00+00:00',
 '2017-01-10 00:00:00+00:00',
 '2017-01-11 00:00:00+00:00',
 '2017-01-12 00:00:00+00:00',
 '2017-01-13 00:00:00+00:00',
 '2017-01-17 00:00:00+00:00',
 '2017-01-18 00:00:00+00:00',
 '2017-01-19 00:00:00+00:00',
 '2017-01-20 00:00:00+00:00',
 '2017-01-23 00:00:00+00:00',
 '2017-01-24 00:00:00+00:00',
 '2017-01-25 00:00:00+00:00',
 '2017-01-26 00:00:00+00:00',
 '2017-01-27 00:00:00+00:00',
 '2017-01-30 00:00:00+00:00',
 '2017-01-31 00:00:00+00:00',
 '2017-02-01 00:00:00+00:00'])
calendar = jan2017.values.astype('datetime64[D]')

event_dates = pd.to_datetime(['2017-01-06 00:00:00+00:00', 
                             '2017-01-07 00:00:00+00:00', 
                             '2017-01-08 00:00:00+00:00']).values.astype('datetime64[D]')
event_values = np.array([10, 15, 20])

<center>
  <h1>The PyData Toolbox</h1>
  <h3>Scott Sanderson (Twitter: @scottbsanderson, GitHub: ssanderson)</h3>
  <h3><a href="https://github.com/ssanderson/pydata-toolbox">https://github.com/ssanderson/pydata-toolbox</a></h3>
</center>

# About Me:

<img src="https://github.com/ssanderson/pydata-toolbox/blob/master/notebooks/images/me.jpg?raw=1" alt="Drawing" style="width: 300px;"/>

- Senior Engineer at [Quantopian](https://www.quantopian.com)
- Background in Mathematics and Philosophy
- **Twitter:** [@scottbsanderson](https://twitter.com/scottbsanderson)
- **GitHub:** [ssanderson](https://github.com/ssanderson)

## Outline

- Built-in Data Structures
- Numpy `array`
- Pandas `Series`/`DataFrame`
- Plotting and "Real-World" Analyses

# Data Structures

> Rule 5. Data dominates. If you've chosen the right data structures and organized things well, the algorithms
will almost always be self-evident. Data structures, not algorithms, are central to programming.

- *Notes on Programming in C*, by Rob Pike.

# Lists

In [2]:
assert ran_the_first_cell, "Oh noes!"

In [3]:
l = [1, 'two', 3.0, 4, 5.0, "six"]
l

[1, 'two', 3.0, 4, 5.0, 'six']

In [4]:
milista = [True, 4, "Luis Miguel",4.5]

In [5]:
# Lists can be indexed like C-style arrays.
first = l[0]
second = l[1]
print("first:", first)
print("second:", second)

first: 1
second: two


In [6]:
primero = milista[0]
segundo = milista[1]
print("primero: ",primero)
print("segundo: ",segundo)

primero:  True
segundo:  4


In [7]:
# Negative indexing gives elements relative to the end of the list.
last = l[-1]
penultimate = l[-2]
print("last:", last)
print("second to last:", penultimate)

last: six
second to last: 5.0


In [8]:
ultimo = milista[-1]
antepenultimo = milista[-3]
print("ultimo: ", ultimo)
print("antepenultimo: ", antepenultimo)

ultimo:  4.5
antepenultimo:  4


In [9]:
# Lists can also be sliced, which makes a copy of elements between 
# start (inclusive) and stop (exclusive)
sublist = l[1:3]
sublist

['two', 3.0]

In [10]:
sublista = milista[1:4]
sublista

[4, 'Luis Miguel', 4.5]

In [11]:
# l[:N] is equivalent to l[0:N].
first_three = l[:3]
first_three

[1, 'two', 3.0]

In [12]:
primeros_dos = milista[:2]
primeros_dos

[True, 4]

In [13]:
# l[3:] is equivalent to l[3:len(l)].
after_three = l[3:]
after_three

[4, 5.0, 'six']

In [14]:
despues_de_dos = milista[2:]
despues_de_dos

['Luis Miguel', 4.5]

In [15]:
# There's also a third parameter, "step", which gets every Nth element.
l = ['a', 'b', 'c', 'd', 'e', 'f', 'g','h']
l[1:7:2]

['b', 'd', 'f']

In [16]:
milista = [3,1,4,1,5,9,2,6,5,3,5,9]
milista[2:8:2]

[4, 5, 2]

In [17]:
# This is a cute way to reverse a list.
l[::-1]

['h', 'g', 'f', 'e', 'd', 'c', 'b', 'a']

In [18]:
milista[::-1]

[9, 5, 3, 5, 6, 2, 9, 5, 1, 4, 1, 3]

In [19]:
# Lists can be grown efficiently (in O(1) amortized time).
l = [1, 2, 3, 4, 5]
print("Before:", l)
l.append('six')
print("After:", l)

Before: [1, 2, 3, 4, 5]
After: [1, 2, 3, 4, 5, 'six']


In [20]:
milista = [3,1,4,1,5,9,2,6,5,3,5,9]
print("Antes:", milista)
milista.append('pi')
print("Después:", milista)

Antes: [3, 1, 4, 1, 5, 9, 2, 6, 5, 3, 5, 9]
Después: [3, 1, 4, 1, 5, 9, 2, 6, 5, 3, 5, 9, 'pi']


In [21]:
# Comprehensions let us perform elementwise computations.
l = [1, 2, 3, 4, 5]
[x * 2 for x in l]

[2, 4, 6, 8, 10]

In [22]:
milista = ['l','u','i','s','m','i','g','u','e','l']
[x.upper() for x in milista]

['L', 'U', 'I', 'S', 'M', 'I', 'G', 'U', 'E', 'L']

## Review: Python Lists

- Zero-indexed sequence of arbitrary Python values.
- Slicing syntax: `l[start:stop:step]` copies elements at regular intervals from `start` to `stop`.
- Efficient (`O(1)`) appends and removes from end.
- Comprehension syntax: `[f(x) for x in l if cond(x)]`.

# Dictionaries

In [23]:
# Dictionaries are key-value mappings.
philosophers = {'David': 'Hume', 'Immanuel': 'Kant', 'Bertrand': 'Russell'}
philosophers

{'Bertrand': 'Russell', 'David': 'Hume', 'Immanuel': 'Kant'}

In [24]:
cantantes = {'Santiago':'Cruz', 'Ricardo':'Montaner', 'Luis Miguel':'Gallego'}
cantantes

{'Luis Miguel': 'Gallego', 'Ricardo': 'Montaner', 'Santiago': 'Cruz'}

In [25]:
# Like lists, dictionaries are size-mutable.
philosophers['Ludwig'] = 'Wittgenstein'
philosophers

{'Bertrand': 'Russell',
 'David': 'Hume',
 'Immanuel': 'Kant',
 'Ludwig': 'Wittgenstein'}

In [26]:
cantantes['Natalia']='Jiménez'
cantantes

{'Luis Miguel': 'Gallego',
 'Natalia': 'Jiménez',
 'Ricardo': 'Montaner',
 'Santiago': 'Cruz'}

In [27]:
del philosophers['David']
philosophers

{'Bertrand': 'Russell', 'Immanuel': 'Kant', 'Ludwig': 'Wittgenstein'}

In [28]:
del cantantes['Santiago']
cantantes

{'Luis Miguel': 'Gallego', 'Natalia': 'Jiménez', 'Ricardo': 'Montaner'}

In [29]:
# No slicing.
#philosophers['Bertrand':'Immanuel']

In [30]:

#cantantes['Luis Miguel':'Natalia']

## Review: Python Dictionaries

- Unordered key-value mapping from (almost) arbitrary keys to arbitrary values.
- Efficient (`O(1)`) lookup, insertion, and deletion.
- No slicing (would require a notion of order).

<center><img src="https://github.com/ssanderson/pydata-toolbox/blob/master/notebooks/images/pacino.gif?raw=1" alt="Drawing" style="width: 100%;"/></center>


In [31]:
# Suppose we have some matrices...
# NOTE(review): `a` is 4x3 but `b` is 2x4, so the inner dimensions do not
# match for a true matrix product. matmul() loops k over len(b), so only the
# first two columns of `a` contribute to matmul(a, b).
a = [[1, 2, 3],
     [2, 3, 4],
     [5, 6, 7],
     [1, 1, 1]]

b = [[1, 2, 3, 4],
     [2, 3, 4, 5]]

In [32]:
m1 = [[45,54],
      [36,27]]
m2 = [[25,45],
      [15,5]]

In [33]:
def matmul(A, B):
    """Multiply matrix A by matrix B using plain Python lists.

    Both arguments are sequences of rows. The result is a new list of
    ``len(A)`` rows, each with ``len(B[0])`` entries.

    No dimension validation is performed: the inner index runs over
    ``len(B)``, so only the first ``len(B)`` entries of each row of ``A``
    are read.
    """
    n_rows = len(A)
    n_cols = len(B[0])
    n_inner = len(B)

    out = []
    for i in range(n_rows):
        out_row = []
        for j in range(n_cols):
            # Accumulate one output cell, starting from integer zero.
            cell = 0
            for k in range(n_inner):
                cell += A[i][k] * B[k][j]
            out_row.append(cell)
        out.append(out_row)
    return out

In [34]:
def multiplicarmatrices(M1, M2):
    """Multiply matrix M1 by matrix M2, both given as lists of rows.

    Returns a new list of ``len(M1)`` rows with ``len(M2[0])`` entries each.
    The inner index runs over ``len(M2)``; no shape checks are made.
    """
    num_rows = len(M1)
    num_cols = len(M2[0])
    inner_indices = range(len(M2))

    # Pre-fill the result with integer zeros, then accumulate in place.
    producto = [[0] * num_cols for _ in range(num_rows)]
    for i, result_row in enumerate(producto):
        for j in range(num_cols):
            for k in inner_indices:
                result_row[j] += M1[i][k] * M2[k][j]
    return producto

<center><img src="https://github.com/ssanderson/pydata-toolbox/blob/master/notebooks/images/gross.gif?raw=1" alt="Drawing" style="width: 50%;"/></center>


In [35]:
%%time

matmul(a, b)

CPU times: user 31 µs, sys: 7 µs, total: 38 µs
Wall time: 42.4 µs


[[5, 8, 11, 14], [8, 13, 18, 23], [17, 28, 39, 50], [3, 5, 7, 9]]

In [36]:
%%time
multiplicarmatrices(m1,m2)

CPU times: user 24 µs, sys: 0 ns, total: 24 µs
Wall time: 28.4 µs


[[1935, 2295], [1305, 1755]]

In [37]:
import random
def random_matrix(m, n):
    """Return an m-by-n list of lists filled with ``random.random()`` floats.

    Values are drawn row by row, left to right.
    """
    return [[random.random() for _ in range(n)] for _ in range(m)]

randm = random_matrix(2, 3)
randm

[[0.333850251346936, 0.2371972339129761, 0.2396321677456289],
 [0.5230687464600385, 0.0006847988901972357, 0.9754356405651626]]

In [38]:
def matriz_aleatoria(m, n):
    """Return an m-by-n list of lists of random integers.

    Each entry is ``int(random.random() * 100)``, i.e. a random float scaled
    by 100 and truncated to an int. Values are drawn row by row.
    """
    rows = []
    for _ in range(m):
        current_row = []
        for _ in range(n):
            current_row.append(int(random.random() * 100))
        rows.append(current_row)
    return rows

randm = matriz_aleatoria(4, 3)
randm

[[91, 91, 99], [75, 30, 99], [73, 82, 97], [12, 55, 97]]

In [39]:
%%time
randa = random_matrix(600, 100)
randb = random_matrix(100, 600)
x = matmul(randa, randb)

CPU times: user 10.7 s, sys: 65.8 ms, total: 10.8 s
Wall time: 10.9 s


In [40]:
%%time
aleat1 = matriz_aleatoria(600,100)
aleat2 = matriz_aleatoria(100,600)
multiplicarmatrices(aleat1,aleat2)

CPU times: user 9.69 s, sys: 49.2 ms, total: 9.73 s
Wall time: 9.76 s


In [41]:
# Maybe that's not that bad?  Let's try a simpler case.
def python_dot_product(xs, ys):
    """Return the dot product of two iterables of numbers.

    Elements are paired with ``zip``, so extra items in the longer input are
    ignored. Two empty inputs give ``0``.
    """
    pairwise_products = (left * right for left, right in zip(xs, ys))
    return sum(pairwise_products)

In [42]:
def producto_punto(xs, ys):
    """Return the dot product of ``xs`` and ``ys``.

    Pairs are formed with ``zip`` (the shorter input sets the length) and the
    pairwise products are added up with the built-in ``sum``.
    """
    return sum(map(lambda pair: pair[0] * pair[1], zip(xs, ys)))

In [43]:
pip install -U fortran-magic

Requirement already up-to-date: fortran-magic in /usr/local/lib/python3.7/dist-packages (0.7)


In [44]:
%reload_ext fortranmagic

  self._lib_dir = os.path.join(get_ipython_cache_dir(), 'fortran')


In [45]:
%%fortran
subroutine fortran_dot_product(xs, ys, result)
    ! Compute the dot product of two double-precision vectors.
    !
    !   xs, ys : input rank-1 arrays (read only).
    !   result : output scalar, set to sum(xs * ys).
    !
    ! NOTE(review): nothing here checks that xs and ys have the same
    ! length; presumably callers pass equal-length arrays - confirm.
    double precision, intent(in) :: xs(:)
    double precision, intent(in) :: ys(:)
    double precision, intent(out) :: result
    
    ! Elementwise product followed by a sum over all elements.
    result = sum(xs * ys)
end

In [46]:
list_data = [float(i) for i in range(100000)]
array_data = np.array(list_data)

In [47]:
lista_datos = [float(i) for i in range(1000000)]
arreglo_datos = np.array(lista_datos)

In [48]:
%%time
python_dot_product(list_data, list_data)

CPU times: user 10.7 ms, sys: 992 µs, total: 11.7 ms
Wall time: 12.4 ms


333328333350000.0

In [49]:
%%time
producto_punto(arreglo_datos,arreglo_datos)

CPU times: user 486 ms, sys: 2.08 ms, total: 488 ms
Wall time: 489 ms


3.3333283333312755e+17

In [50]:
%%time
fortran_dot_product(array_data, array_data)

CPU times: user 185 µs, sys: 5 µs, total: 190 µs
Wall time: 198 µs


333328333350000.0

<center><img src="https://github.com/ssanderson/pydata-toolbox/blob/master/notebooks/images/sloth.gif?raw=1" alt="Drawing" style="width: 1080px;"/></center>


## Why is the Python Version so Much Slower?

In [51]:
# Dynamic typing.
def mul_elemwise(xs, ys):
    """Multiply two iterables element by element and return a list.

    Each pair is combined with ``*``, so the result type depends on the
    operand types (e.g. ``int * str`` repeats the string).
    """
    products = []
    for left, right in zip(xs, ys):
        products.append(left * right)
    return products

mul_elemwise([1, 2, 3, 4], [1, 2 + 0j, 3.0, 'four'])
#[type(x) for x in _]

[1, (4+0j), 9.0, 'fourfourfourfour']

In [52]:
def multiplicacion_inteligente(xs, ys):
    """Return the list of pairwise products of ``xs`` and ``ys``.

    Pairs come from ``zip``; each is combined with ``*``, whatever that
    means for the operand types involved.
    """
    return list(map(lambda pair: pair[0] * pair[1], zip(xs, ys)))

multiplicacion_inteligente([0, -3,-453, 4], ['hola', 2.9, 3+2j, 'four'])

['', -8.7, (-1359-906j), 'fourfourfourfour']

In [53]:
# Interpretation overhead.
source_code = 'a + b * c'
bytecode = compile(source_code, '', 'eval')
import dis; dis.dis(bytecode)
bytecode

  1           0 LOAD_NAME                0 (a)
              2 LOAD_NAME                1 (b)
              4 LOAD_NAME                2 (c)
              6 BINARY_MULTIPLY
              8 BINARY_ADD
             10 RETURN_VALUE


<code object <module> at 0x7fbdd08669c0, file "", line 1>

In [54]:
source_code = '5*3+a'
bytecode = compile(source_code, '', 'eval')
import dis; dis.dis(bytecode)
bytecode

  1           0 LOAD_CONST               0 (15)
              2 LOAD_NAME                0 (a)
              4 BINARY_ADD
              6 RETURN_VALUE


<code object <module> at 0x7fbdd447bae0, file "", line 1>

## Why is the Python Version so Slow?
- Dynamic typing means that every single operation requires dispatching on the input type.
- Having an interpreter means that every instruction is fetched and dispatched at runtime.
- Other overheads:
  - Arbitrary-size integers.
  - Reference-counted garbage collection.

> This is the paradox that we have to work with when we're doing scientific or numerically-intensive Python. What makes Python fast for development -- this high-level, interpreted, and dynamically-typed aspect of the language -- is exactly what makes it slow for code execution.

- Jake VanderPlas, [*Losing Your Loops: Fast Numerical Computing with NumPy*](https://www.youtube.com/watch?v=EEUXKG97YRw)

# What Do We Do?

<center><img src="https://github.com/ssanderson/pydata-toolbox/blob/master/notebooks/images/runaway.gif?raw=1" alt="Drawing" style="width: 50%;"/></center>

<center><img src="https://github.com/ssanderson/pydata-toolbox/blob/master/notebooks/images/thisisfine.gif?raw=1" alt="Drawing" style="width: 1080px;"/></center>

- Python is slow for numerical computation because it performs dynamic dispatch on every operation we perform...

- ...but often, we just want to do the same thing over and over in a loop!

- If we don't need Python's dynamism, we don't want to pay (much) for it.

- **Idea:** Dispatch **once per operation** instead of **once per element**.

In [55]:
import numpy as np

data = np.array([1, 2, 3, 4])
data

array([1, 2, 3, 4])

In [56]:
datos = np.array([3,14,15,29])

In [57]:
data + data

array([2, 4, 6, 8])

In [58]:
%%time
# Naive dot product
(array_data * array_data).sum()

CPU times: user 604 µs, sys: 0 ns, total: 604 µs
Wall time: 615 µs


333328333350000.0

In [59]:
%%time 
(datos * datos).sum()

CPU times: user 69 µs, sys: 2 µs, total: 71 µs
Wall time: 75.6 µs


1271

In [60]:
%%time
# Built-in dot product.
array_data.dot(array_data)

CPU times: user 1.11 ms, sys: 4.03 ms, total: 5.14 ms
Wall time: 4.64 ms


333328333350000.0

In [61]:
%%time
fortran_dot_product(array_data, array_data)

CPU times: user 196 µs, sys: 6 µs, total: 202 µs
Wall time: 210 µs


333328333350000.0

In [62]:
# Numpy won't allow us to write a string into an int array.
#data[0] = "foo"

In [63]:
#datos[1] = "pi"

In [64]:
# We also can't grow an array once it's created.
#data.append(3)

In [65]:
#datos.append("pi")

In [66]:
# We **can** reshape an array though.
two_by_two = data.reshape(2, 2)
two_by_two

array([[1, 2],
       [3, 4]])

In [67]:
datos.reshape(2,2)

array([[ 3, 14],
       [15, 29]])

Numpy arrays are:

- Fixed-type

- Size-immutable

- Multi-dimensional

- Fast\*

\* If you use them correctly.

# What's in an Array?

In [68]:
arr = np.array([1, 2, 3, 4, 5, 6], dtype='int16').reshape(2, 3)
print("Array:\n", arr, sep='')
print("===========")
print("DType:", arr.dtype)
print("Shape:", arr.shape)
print("Strides:", arr.strides)
print("Data:", arr.data.tobytes())

Array:
[[1 2 3]
 [4 5 6]]
DType: int16
Shape: (2, 3)
Strides: (6, 2)
Data: b'\x01\x00\x02\x00\x03\x00\x04\x00\x05\x00\x06\x00'


In [69]:
arreglo = np.array([3, 14, 15, 29, 32, 61], dtype='int32').reshape(3, 2)
print("Array:\n", arreglo, sep='')
print("===========")
print("DType:", arreglo.dtype)
print("Shape:", arreglo.shape)
print("Strides:", arreglo.strides)
print("Data:", arreglo.data.tobytes())

Array:
[[ 3 14]
 [15 29]
 [32 61]]
DType: int32
Shape: (3, 2)
Strides: (8, 4)
Data: b'\x03\x00\x00\x00\x0e\x00\x00\x00\x0f\x00\x00\x00\x1d\x00\x00\x00 \x00\x00\x00=\x00\x00\x00'


# Core Operations

- Vectorized **ufuncs** for elementwise operations.
- Fancy indexing and masking for selection and filtering.
- Aggregations across axes.
- Broadcasting

# UFuncs

UFuncs (universal functions) are functions that operate elementwise on one or more arrays.

In [70]:
data = np.arange(15).reshape(3, 5)
data

array([[ 0,  1,  2,  3,  4],
       [ 5,  6,  7,  8,  9],
       [10, 11, 12, 13, 14]])

In [71]:
datos = np.arange(25).reshape(5, 5)
datos

array([[ 0,  1,  2,  3,  4],
       [ 5,  6,  7,  8,  9],
       [10, 11, 12, 13, 14],
       [15, 16, 17, 18, 19],
       [20, 21, 22, 23, 24]])

In [72]:
# Binary operators.
data * data

array([[  0,   1,   4,   9,  16],
       [ 25,  36,  49,  64,  81],
       [100, 121, 144, 169, 196]])

In [73]:
datos * datos

array([[  0,   1,   4,   9,  16],
       [ 25,  36,  49,  64,  81],
       [100, 121, 144, 169, 196],
       [225, 256, 289, 324, 361],
       [400, 441, 484, 529, 576]])

In [74]:
# Unary functions.
np.sqrt(data)

array([[0.        , 1.        , 1.41421356, 1.73205081, 2.        ],
       [2.23606798, 2.44948974, 2.64575131, 2.82842712, 3.        ],
       [3.16227766, 3.31662479, 3.46410162, 3.60555128, 3.74165739]])

In [75]:
np.sqrt(datos)

array([[0.        , 1.        , 1.41421356, 1.73205081, 2.        ],
       [2.23606798, 2.44948974, 2.64575131, 2.82842712, 3.        ],
       [3.16227766, 3.31662479, 3.46410162, 3.60555128, 3.74165739],
       [3.87298335, 4.        , 4.12310563, 4.24264069, 4.35889894],
       [4.47213595, 4.58257569, 4.69041576, 4.79583152, 4.89897949]])

In [76]:
# Comparison operations
(data % 3) == 0

array([[ True, False, False,  True, False],
       [False,  True, False, False,  True],
       [False, False,  True, False, False]])

In [77]:
(datos % 5) == 1

array([[False,  True, False, False, False],
       [False,  True, False, False, False],
       [False,  True, False, False, False],
       [False,  True, False, False, False],
       [False,  True, False, False, False]])

In [78]:
# Boolean combinators.
((data % 2) == 0) & ((data % 3) == 0)

array([[ True, False, False, False, False],
       [False,  True, False, False, False],
       [False, False,  True, False, False]])

In [79]:

((datos % 5) == 1) | ((datos % 5) == 3)

array([[False,  True, False,  True, False],
       [False,  True, False,  True, False],
       [False,  True, False,  True, False],
       [False,  True, False,  True, False],
       [False,  True, False,  True, False]])

In [80]:
# as of python 3.5, @ is matrix-multiply
data @ data.T

array([[ 30,  80, 130],
       [ 80, 255, 430],
       [130, 430, 730]])

In [81]:
datos @ datos.T

array([[  30,   80,  130,  180,  230],
       [  80,  255,  430,  605,  780],
       [ 130,  430,  730, 1030, 1330],
       [ 180,  605, 1030, 1455, 1880],
       [ 230,  780, 1330, 1880, 2430]])

# UFuncs Review

- UFuncs provide efficient elementwise operations applied across one or more arrays.
- Arithmetic Operators (`+`, `*`, `/`)
- Comparisons (`==`, `>`, `!=`)
- Boolean Operators (`&`, `|`, `^`)
- Trigonometric Functions (`sin`, `cos`)
- Transcendental Functions (`exp`, `log`)

# Selections

We often want to perform an operation on just a subset of our data.

In [82]:
sines = np.sin(np.linspace(0, 3.14, 10))
cosines = np.cos(np.linspace(0, 3.14, 10))
sines

array([0.        , 0.34185385, 0.64251645, 0.86575984, 0.98468459,
       0.98496101, 0.8665558 , 0.64373604, 0.34335012, 0.00159265])

In [83]:
senos = np.sin(np.linspace(90, 270, 35))
cosenos = np.cos(np.linspace(90, 270, 35))
senos

array([ 0.89399666,  0.86559533,  0.05723908, -0.80269312, -0.93934918,
       -0.22959355,  0.68704007,  0.9846081 ,  0.39498335, -0.55054583,
       -0.99999917, -0.54839142,  0.3973509 ,  0.9850555 ,  0.68516418,
       -0.23210243, -0.9402304 , -0.80115264,  0.05981319,  0.86688364,
        0.89283832,  0.11429047, -0.76724016, -0.95743996, -0.28492715,
        0.64432264,  0.99299789,  0.44692064, -0.50185974, -0.99843347,
       -0.5953569 ,  0.34417305,  0.9735818 ,  0.72573315, -0.17604595])

In [84]:
# Slicing works with the same semantics as Python lists.
sines[0]

0.0

In [85]:
senos[-1]

-0.1760459464712114

In [86]:
sines[:3]  # First three elements  

array([0.        , 0.34185385, 0.64251645])

In [87]:
senos[:10]

array([ 0.89399666,  0.86559533,  0.05723908, -0.80269312, -0.93934918,
       -0.22959355,  0.68704007,  0.9846081 ,  0.39498335, -0.55054583])

In [88]:
sines[5:]  # Elements from 5 on.

array([0.98496101, 0.8665558 , 0.64373604, 0.34335012, 0.00159265])

In [89]:
senos[25:]

array([ 0.64432264,  0.99299789,  0.44692064, -0.50185974, -0.99843347,
       -0.5953569 ,  0.34417305,  0.9735818 ,  0.72573315, -0.17604595])

In [90]:
sines[::2]  # Every other element.

array([0.        , 0.64251645, 0.98468459, 0.8665558 , 0.34335012])

In [91]:
senos[::7]

array([ 0.89399666,  0.9846081 ,  0.68516418,  0.11429047, -0.50185974])

In [92]:
# More interesting: we can index with boolean arrays to filter by a predicate.
print("sines:\n", sines)
print("sines > 0.5:\n", sines > 0.5)
print("sines[sines > 0.5]:\n", sines[sines > 0.5])

sines:
 [0.         0.34185385 0.64251645 0.86575984 0.98468459 0.98496101
 0.8665558  0.64373604 0.34335012 0.00159265]
sines > 0.5:
 [False False  True  True  True  True  True  True False False]
sines[sines > 0.5]:
 [0.64251645 0.86575984 0.98468459 0.98496101 0.8665558  0.64373604]


In [93]:
print("senos:\n", senos)
print("senos < 0:\n", senos < 0)
print("senos[senos < 0]:\n", senos[senos < 0])

senos:
 [ 0.89399666  0.86559533  0.05723908 -0.80269312 -0.93934918 -0.22959355
  0.68704007  0.9846081   0.39498335 -0.55054583 -0.99999917 -0.54839142
  0.3973509   0.9850555   0.68516418 -0.23210243 -0.9402304  -0.80115264
  0.05981319  0.86688364  0.89283832  0.11429047 -0.76724016 -0.95743996
 -0.28492715  0.64432264  0.99299789  0.44692064 -0.50185974 -0.99843347
 -0.5953569   0.34417305  0.9735818   0.72573315 -0.17604595]
senos < 0:
 [False False False  True  True  True False False False  True  True  True
 False False False  True  True  True False False False False  True  True
  True False False False  True  True  True False False False  True]
senos[senos < 0]:
 [-0.80269312 -0.93934918 -0.22959355 -0.55054583 -0.99999917 -0.54839142
 -0.23210243 -0.9402304  -0.80115264 -0.76724016 -0.95743996 -0.28492715
 -0.50185974 -0.99843347 -0.5953569  -0.17604595]


In [94]:
# We index with lists/arrays of integers to select values at those indices.
print(sines)
sines[[0, 4, 7]]

[0.         0.34185385 0.64251645 0.86575984 0.98468459 0.98496101
 0.8665558  0.64373604 0.34335012 0.00159265]


array([0.        , 0.98468459, 0.64373604])

In [95]:
print(senos)
senos[[-1,-15,-17]]

[ 0.89399666  0.86559533  0.05723908 -0.80269312 -0.93934918 -0.22959355
  0.68704007  0.9846081   0.39498335 -0.55054583 -0.99999917 -0.54839142
  0.3973509   0.9850555   0.68516418 -0.23210243 -0.9402304  -0.80115264
  0.05981319  0.86688364  0.89283832  0.11429047 -0.76724016 -0.95743996
 -0.28492715  0.64432264  0.99299789  0.44692064 -0.50185974 -0.99843347
 -0.5953569   0.34417305  0.9735818   0.72573315 -0.17604595]


array([-0.17604595,  0.89283832,  0.05981319])

In [96]:
# Index arrays are often used for sorting one or more arrays.
unsorted_data = np.array([1, 3, 2, 12, -1, 5, 2])

In [97]:
datos_desordenados = np.array([3,14,15,2,9,1,6,-5,3,4])

In [98]:
sort_indices = np.argsort(unsorted_data)
sort_indices

array([4, 0, 2, 6, 1, 5, 3])

In [99]:
indices_ordenados = np.argsort(datos_desordenados)
indices_ordenados

array([7, 5, 3, 0, 8, 9, 6, 4, 1, 2])

In [100]:
unsorted_data[sort_indices]

array([-1,  1,  2,  2,  3,  5, 12])

In [101]:
datos_desordenados[indices_ordenados]

array([-5,  1,  2,  3,  3,  4,  6,  9, 14, 15])

In [102]:
market_caps = np.array([12, 6, 10, 5, 6])  # Presumably in dollars?
assets = np.array(['A', 'B', 'C', 'D', 'E'])

In [103]:
precio = np.array([12, 6, 10])  # Presumably in dollars?
mercado = np.array(['MELI', 'AAPL', 'TSLA'])

In [104]:
# Sort assets by market cap by using the permutation that would sort market caps on ``assets``.
sort_by_mcap = np.argsort(market_caps)
assets[sort_by_mcap]

array(['D', 'B', 'E', 'C', 'A'], dtype='<U1')

In [105]:
mercado_ordenado = np.argsort(precio)
mercado[mercado_ordenado]

array(['AAPL', 'TSLA', 'MELI'], dtype='<U4')

In [106]:
# Indexers are also useful for aligning data.
print("Dates:\n", repr(event_dates))
print("Values:\n", repr(event_values))
print("Calendar:\n", repr(calendar))

Dates:
 array(['2017-01-06', '2017-01-07', '2017-01-08'], dtype='datetime64[D]')
Values:
 array([10, 15, 20])
Calendar:
 array(['2017-01-03', '2017-01-04', '2017-01-05', '2017-01-06',
       '2017-01-09', '2017-01-10', '2017-01-11', '2017-01-12',
       '2017-01-13', '2017-01-17', '2017-01-18', '2017-01-19',
       '2017-01-20', '2017-01-23', '2017-01-24', '2017-01-25',
       '2017-01-26', '2017-01-27', '2017-01-30', '2017-01-31',
       '2017-02-01'], dtype='datetime64[D]')


In [107]:
print("Fechas:\n", repr(event_dates))
print("Valores:\n", repr(event_values))
print("Calendario:\n", repr(calendar))

Fechas:
 array(['2017-01-06', '2017-01-07', '2017-01-08'], dtype='datetime64[D]')
Valores:
 array([10, 15, 20])
Calendario:
 array(['2017-01-03', '2017-01-04', '2017-01-05', '2017-01-06',
       '2017-01-09', '2017-01-10', '2017-01-11', '2017-01-12',
       '2017-01-13', '2017-01-17', '2017-01-18', '2017-01-19',
       '2017-01-20', '2017-01-23', '2017-01-24', '2017-01-25',
       '2017-01-26', '2017-01-27', '2017-01-30', '2017-01-31',
       '2017-02-01'], dtype='datetime64[D]')


In [108]:
print("Raw Dates:", event_dates)
print("Indices:", calendar.searchsorted(event_dates))
print("Forward-Filled Dates:", calendar[calendar.searchsorted(event_dates)])

Raw Dates: ['2017-01-06' '2017-01-07' '2017-01-08']
Indices: [3 4 4]
Forward-Filled Dates: ['2017-01-06' '2017-01-09' '2017-01-09']


In [109]:
print("Fechas en bruto:", event_dates)
print("Índices:", calendar.searchsorted(event_dates))
print("Fechas:", calendar[calendar.searchsorted(event_dates)])

Fechas en bruto: ['2017-01-06' '2017-01-07' '2017-01-08']
Índices: [3 4 4]
Fechas: ['2017-01-06' '2017-01-09' '2017-01-09']


On multi-dimensional arrays, we can slice along each axis independently.

In [110]:
data = np.arange(25).reshape(5, 5)
data

array([[ 0,  1,  2,  3,  4],
       [ 5,  6,  7,  8,  9],
       [10, 11, 12, 13, 14],
       [15, 16, 17, 18, 19],
       [20, 21, 22, 23, 24]])

In [111]:
datos = np.arange(9).reshape(3,3)
datos

array([[0, 1, 2],
       [3, 4, 5],
       [6, 7, 8]])

In [112]:
data[:2, :2]  # First two rows and first two columns.

array([[0, 1],
       [5, 6]])

In [113]:
data[:2, [0, -1]]  # First two rows, first and last columns.

array([[0, 4],
       [5, 9]])

In [114]:
datos[1:,1:]

array([[4, 5],
       [7, 8]])

In [115]:
data[(data[:, 0] % 2) == 0]  # Rows where the first column is divisible by two.

array([[ 0,  1,  2,  3,  4],
       [10, 11, 12, 13, 14],
       [20, 21, 22, 23, 24]])

In [116]:
datos[(datos[0, ] % 3) == 0]  

array([[0, 1, 2]])

# Selections Review

- Indexing with an integer removes a dimension.
- Slicing operations work on Numpy arrays the same way they do on lists.
- Indexing with a boolean array filters to True locations.
- Indexing with an integer array selects indices along an axis.
- Multidimensional arrays can apply selections independently along different axes.

## Reductions

Functions that reduce an array to a scalar.

$Var(X) = \frac{1}{N}\sum_{i=1}^N (x_i - \bar{x})^2$

In [117]:
def variance(x):
    """Return the mean squared deviation of ``x`` from its mean.

    ``x`` must provide ``.mean()`` and support arithmetic (e.g. a numpy
    array). The sum of squared deviations is divided by ``len(x)``.
    """
    deviations = x - x.mean()
    sum_of_squares = (deviations ** 2).sum()
    return sum_of_squares / len(x)

In [118]:

def redc(x):
    """Reduce ``x`` to a scalar: sum of squared deviations over ``len(x)``.

    Uses the ``mean`` and ``sum`` reductions of ``x``.
    """
    squared_deviations = (x - x.mean()) ** 2
    total = squared_deviations.sum()
    return total / len(x)

In [119]:
variance(np.random.standard_normal(1000))

0.9870910500249881

In [120]:

redc(np.array([0,1,2,3,4,5]))

2.9166666666666665

- `sum()` and `mean()` are both **reductions**.

- In the simplest case, we use these to reduce an entire array into a single value...

In [121]:
data = np.arange(30)
data.mean()

14.5

In [122]:
datos = np.arange(22)
datos.mean()

10.5

- ...but we can do more interesting things with multi-dimensional arrays.

In [123]:
data = np.arange(30).reshape(3, 10)
data

array([[ 0,  1,  2,  3,  4,  5,  6,  7,  8,  9],
       [10, 11, 12, 13, 14, 15, 16, 17, 18, 19],
       [20, 21, 22, 23, 24, 25, 26, 27, 28, 29]])

In [124]:
datos = np.arange(22).reshape(2,11)
datos

array([[ 0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10],
       [11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21]])

In [125]:
data.mean()

14.5

In [126]:
datos.mean()

10.5

In [127]:
data.mean(axis=0)

array([10., 11., 12., 13., 14., 15., 16., 17., 18., 19.])

In [128]:

datos.mean(axis=0)

array([ 5.5,  6.5,  7.5,  8.5,  9.5, 10.5, 11.5, 12.5, 13.5, 14.5, 15.5])

In [129]:
data.mean(axis=1)

array([ 4.5, 14.5, 24.5])

In [130]:
datos.mean(axis=1)

array([ 5., 16.])

## Reductions Review

- Reductions allow us to perform efficient aggregations over arrays.
- We can do aggregations over a single axis to collapse a single dimension.
- Many built-in reductions (`mean`, `sum`, `min`, `max`, `median`, ...).

# Broadcasting

In [131]:
row = np.array([1, 2, 3, 4])
column = np.array([[1], [2], [3]])
print("Row:\n", row, sep='')
print("Column:\n", column, sep='')

Row:
[1 2 3 4]
Column:
[[1]
 [2]
 [3]]


In [132]:
fila = np.array([3, 1, 4, 2])
columna = np.array([[0], [-2], [50]])
print("Fila:\n", fila, sep='\n')
print("Columna:\n", columna, sep='\n')

Fila:

[3 1 4 2]
Columna:

[[ 0]
 [-2]
 [50]]


In [133]:
row + column

array([[2, 3, 4, 5],
       [3, 4, 5, 6],
       [4, 5, 6, 7]])

In [134]:
fila + columna

array([[ 3,  1,  4,  2],
       [ 1, -1,  2,  0],
       [53, 51, 54, 52]])

<center><img src="https://github.com/ssanderson/pydata-toolbox/blob/master/notebooks/images/broadcasting.png?raw=1" alt="Drawing" style="width: 60%;"/></center>

<h5>Source: http://www.scipy-lectures.org/_images/numpy_broadcasting.png</h5>

In [135]:
# Broadcasting is particularly useful in conjunction with reductions.
print("Data:\n", data, sep='')
print("Mean:\n", data.mean(axis=0), sep='')
print("Data - Mean:\n", data - data.mean(axis=0), sep='')

Data:
[[ 0  1  2  3  4  5  6  7  8  9]
 [10 11 12 13 14 15 16 17 18 19]
 [20 21 22 23 24 25 26 27 28 29]]
Mean:
[10. 11. 12. 13. 14. 15. 16. 17. 18. 19.]
Data - Mean:
[[-10. -10. -10. -10. -10. -10. -10. -10. -10. -10.]
 [  0.   0.   0.   0.   0.   0.   0.   0.   0.   0.]
 [ 10.  10.  10.  10.  10.  10.  10.  10.  10.  10.]]


In [136]:
print("Datos:\n", datos, sep='')
print("Media:\n", datos.mean(axis=0), sep='')
print("Datos - Media:\n", datos - datos.mean(axis=0), sep='')

Datos:
[[ 0  1  2  3  4  5  6  7  8  9 10]
 [11 12 13 14 15 16 17 18 19 20 21]]
Media:
[ 5.5  6.5  7.5  8.5  9.5 10.5 11.5 12.5 13.5 14.5 15.5]
Datos - Media:
[[-5.5 -5.5 -5.5 -5.5 -5.5 -5.5 -5.5 -5.5 -5.5 -5.5 -5.5]
 [ 5.5  5.5  5.5  5.5  5.5  5.5  5.5  5.5  5.5  5.5  5.5]]


# Broadcasting Review

- Numpy operations can work on arrays of different dimensions as long as the arrays' shapes are still "compatible".
- Broadcasting works by "tiling" the smaller array along the missing dimension.
- The result of a broadcasted operation is always at least as large in each dimension as the largest array in that dimension.

# Numpy Review

- Numerical algorithms are slow in pure Python because the overhead of dynamic dispatch dominates our runtime.

- Numpy solves this problem by:
  1. Imposing additional restrictions on the contents of arrays.
  2. Moving the inner loops of our algorithms into compiled C code.

- Using Numpy effectively often requires reworking an algorithm to use vectorized operations instead of for-loops, but the resulting operations are usually simpler, clearer, and faster than the pure Python equivalent.

<center><img src="https://github.com/ssanderson/pydata-toolbox/blob/master/notebooks/images/unicorn.jpg?raw=1" alt="Drawing" style="width: 75%;"/></center>

Numpy is great for many things, but...

- Sometimes our data is equipped with a natural set of **labels**:
  - Dates/Times
  - Stock Tickers
  - Field Names (e.g. Open/High/Low/Close)

- Sometimes we have **more than one type of data** that we want to keep grouped together.
  - Tables with a mix of real-valued and categorical data.

- Sometimes we have **missing** data, which we need to ignore, fill, or otherwise work around.

<center><img src="https://github.com/ssanderson/pydata-toolbox/blob/master/notebooks/images/panda-wrangling.gif?raw=1" alt="Drawing" style="width: 75%;"/></center>

<center><img src="https://github.com/ssanderson/pydata-toolbox/blob/master/notebooks/images/pandas_logo.png?raw=1" alt="Drawing" style="width: 75%;"/></center>


Pandas extends Numpy with more complex data structures:

- `Series`: 1-dimensional, homogeneously-typed, labelled array.
- `DataFrame`: 2-dimensional, semi-homogeneous, labelled table.

Pandas also provides many utilities for: 
- Input/Output
- Data Cleaning
- Rolling Algorithms
- Plotting

# Selection in Pandas

In [137]:
s = pd.Series(index=['a', 'b', 'c', 'd', 'e'], data=[1, 2, 3, 4, 5])
s

a    1
b    2
c    3
d    4
e    5
dtype: int64

In [138]:
miserie = pd.Series(index=['l','u','i','s','m','i'], data = [1,2,3,4,5,6])
miserie

l    1
u    2
i    3
s    4
m    5
i    6
dtype: int64

In [139]:
# There are two pieces to a Series: the index and the values.
print("The index is:", s.index)
print("The values are:", s.values)

The index is: Index(['a', 'b', 'c', 'd', 'e'], dtype='object')
The values are: [1 2 3 4 5]


In [140]:

print("índices:", miserie.index)
print("valores:", miserie.values)

índices: Index(['l', 'u', 'i', 's', 'm', 'i'], dtype='object')
valores: [1 2 3 4 5 6]


In [141]:
# We can look up values out of a Series by position...
s.iloc[0]

1

In [142]:
miserie.iloc[-1]

6

In [143]:
# ... or by label.
s.loc['a']

1

In [144]:
miserie.loc['m']

5

In [145]:
# Slicing works as expected...
s.iloc[:2]

a    1
b    2
dtype: int64

In [146]:
miserie.iloc[:3]

l    1
u    2
i    3
dtype: int64

In [147]:
# ...but it works with labels too!
s.loc[:'c']

a    1
b    2
c    3
dtype: int64

In [148]:
miserie.loc[:'m']

l    1
u    2
i    3
s    4
m    5
dtype: int64

In [149]:
# Fancy indexing works the same as in numpy.
s.iloc[[0, -1]]

a    1
e    5
dtype: int64

In [150]:
miserie.iloc[[-3,-2]]

s    4
m    5
dtype: int64

In [151]:
# As does boolean masking.
s.loc[s > 2]

c    3
d    4
e    5
dtype: int64

In [152]:
miserie.loc[miserie < 4]

l    1
u    2
i    3
dtype: int64

In [153]:
# Element-wise operations are aligned by index.
other_s = pd.Series({'a': 10.0, 'c': 20.0, 'd': 30.0, 'z': 40.0})
other_s

a    10.0
c    20.0
d    30.0
z    40.0
dtype: float64

In [154]:

otraserie = pd.Series({'pi':3.14,'e':2.71,'m':0})
otraserie

pi    3.14
e     2.71
m     0.00
dtype: float64

In [155]:
s + other_s

a    11.0
b     NaN
c    23.0
d    34.0
e     NaN
z     NaN
dtype: float64

In [156]:
miserie + otraserie

e     NaN
i     NaN
i     NaN
l     NaN
m     5.0
pi    NaN
s     NaN
u     NaN
dtype: float64

In [157]:
# We can fill in missing values with fillna().
(s + other_s).fillna(0.0)

a    11.0
b     0.0
c    23.0
d    34.0
e     0.0
z     0.0
dtype: float64

In [158]:
# Same idea with a different fill value: every NaN becomes 0.1.
(miserie + otraserie).fillna(0.1)

e     0.1
i     0.1
i     0.1
l     0.1
m     5.0
pi    0.1
s     0.1
u     0.1
dtype: float64

In [159]:
# Most real datasets are read in from an external file format.
# The 'Date' column is parsed as datetimes and used as the index.
# NOTE(review): 'AAPL.csv' is a relative path, so the file has to be in the
# kernel's working directory; the saved run of this cell ended in
# FileNotFoundError, which is why the cells below have no output.
aapl = pd.read_csv('AAPL.csv', parse_dates=['Date'], index_col='Date')
aapl.head()

FileNotFoundError: ignored

In [None]:
# Slicing generalizes to two dimensions as you'd expect:
# the first slice selects rows, the second selects columns.
aapl.iloc[:2, :2]

In [None]:
# Label-based 2-D selection: a range of dates for the rows and two named columns.
aapl.loc[pd.Timestamp('2010-02-01'):pd.Timestamp('2010-02-04'), ['Close', 'Volume']]

# Rolling Operations

<center><img src="https://github.com/ssanderson/pydata-toolbox/blob/master/notebooks/images/rolling.gif?raw=1" alt="Drawing" style="width: 75%;"/></center>

In [None]:
# Plot the mean of 'Close' and 'Adj Close' over a rolling window of 5 rows.
aapl.rolling(5)[['Close', 'Adj Close']].mean().plot();

In [None]:
# Drop `Volume`, since it's way bigger than everything else, then plot the
# maximum of each remaining column per 2-week bin.
aapl.drop('Volume', axis=1).resample('2W').max().plot();

In [None]:
# Exponentially-weighted (span=30 rows) standard deviation of the
# row-over-row percentage change in Close.
aapl['Close'].pct_change().ewm(span=30).std().plot();

# "Real World" Data

In [None]:
# `read_avocadata` comes from the repository's `demos` package. The notebook's
# first cell appends '..' to sys.path, which is what makes `demos` importable
# when the notebook is run from its own directory. With this import commented
# out, the call below fails with NameError on a fresh kernel.
from demos.avocados import read_avocadata

# NOTE(review): the two arguments are presumably the first and last year to
# load -- confirm against demos/avocados.py.
avocados = read_avocadata('2014', '2016')
avocados.head()

In [None]:
# Unlike numpy arrays, pandas DataFrames can have a different dtype for each column.
# `.dtypes` lists them, one entry per column.
avocados.dtypes

In [None]:
# What's the regional average price of a HASS avocado every day?
hass = avocados[avocados.Variety == 'HASS']
hass.groupby(['Date', 'Region'])['Weighted Avg Price'].mean().unstack().ffill().plot();

In [None]:
def _organic_spread(group):
    """Return the organic-minus-non-organic price difference for one group.

    Parameters
    ----------
    group : DataFrame
        The columns belonging to one group (one region, as called from
        ``organic_spread_by_region``). Its columns must carry a level named
        'Organic' whose values are truthy for the organic column.

    Returns
    -------
    Series
        Indexed like ``group``. It is all zeros when the group does not have
        exactly two columns; otherwise it is the organic column minus the
        non-organic column (assuming exactly one column of each kind).
    """
    if len(group.columns) != 2:
        # Without two columns there is no pair to compare: report a zero spread.
        return pd.Series(index=group.index, data=0.0)

    is_organic = group.columns.get_level_values('Organic').values.astype(bool)
    # Squeeze the column axis only. A bare squeeze() would also collapse the
    # row axis when the group has a single row, turning the result into a
    # scalar instead of a Series indexed like `group`.
    organics = group.loc[:, is_organic].squeeze(axis=1)
    non_organics = group.loc[:, ~is_organic].squeeze(axis=1)
    return organics - non_organics

def organic_spread_by_region(df):
    """Compute, for each region, the organic vs. non-organic price spread.

    Parameters
    ----------
    df : DataFrame
        Must provide 'Date', 'Region', 'Organic' and 'Weighted Avg Price'
        columns.

    Returns
    -------
    The result of applying ``_organic_spread`` to each region's group of
    columns.
    """
    # One price per (Date, Region, Organic) combination.
    prices = df.set_index(['Date', 'Region', 'Organic'])['Weighted Avg Price']

    # Dates as rows, (Region, Organic) pairs as columns; gaps are filled with
    # the last known price.
    wide = prices.unstack(level=['Region', 'Organic']).ffill()

    # NOTE(review): groupby(..., axis=1) is deprecated in recent pandas
    # releases -- confirm against the pandas version this notebook targets.
    per_region = wide.groupby(level='Region', axis=1)
    return per_region.apply(_organic_spread)

In [None]:
# Draw the spread, then title the current axes and anchor the legend at the
# axes' upper-right corner.
organic_spread_by_region(hass).plot()
plt.title("Daily Regional Organic Spread")
plt.legend(bbox_to_anchor=(1, 1));

In [None]:
# Pairwise correlation between the columns of the spread table.
spread_correlation = organic_spread_by_region(hass).corr()
spread_correlation

In [None]:
# Clustered heatmap of the correlation matrix, with each cell annotated.
# (seaborn is already imported as `sns` in the notebook's first cell.)
grid = sns.clustermap(spread_correlation, annot=True)

# Address the heatmap axes by name rather than by its position in the
# figure's axes list, which depends on the order seaborn creates its axes in.
ax = grid.ax_heatmap
ax.set_xticklabels(ax.get_xticklabels(), rotation=45);

# Pandas Review

- Pandas extends numpy with more complex data structures and algorithms.
- If you understand numpy, you understand 90% of pandas.
- `groupby`, `set_index`, and `unstack` are powerful tools for working with categorical data.
- Avocado prices are surprisingly interesting :)

# Thanks!