In [None]:
%matplotlib inline
!pip install -U fortran-magic
%load_ext fortranmagic

import sys; sys.path.append('..')

import pandas as pd
import numpy as np
import matplotlib as mpl
import matplotlib.pyplot as plt
import seaborn as sns

mpl.rc('figure', figsize=(12, 7))

ran_the_first_cell = True

# Hand-written list of UTC midnights running from 2017-01-03 to 2017-02-01.
# Weekend dates and 2017-01-16 are absent from the list.
jan2017 = pd.to_datetime(['2017-01-03 00:00:00+00:00',
 '2017-01-04 00:00:00+00:00',
 '2017-01-05 00:00:00+00:00',
 '2017-01-06 00:00:00+00:00',
 '2017-01-09 00:00:00+00:00',
 '2017-01-10 00:00:00+00:00',
 '2017-01-11 00:00:00+00:00',
 '2017-01-12 00:00:00+00:00',
 '2017-01-13 00:00:00+00:00',
 '2017-01-17 00:00:00+00:00',
 '2017-01-18 00:00:00+00:00',
 '2017-01-19 00:00:00+00:00',
 '2017-01-20 00:00:00+00:00',
 '2017-01-23 00:00:00+00:00',
 '2017-01-24 00:00:00+00:00',
 '2017-01-25 00:00:00+00:00',
 '2017-01-26 00:00:00+00:00',
 '2017-01-27 00:00:00+00:00',
 '2017-01-30 00:00:00+00:00',
 '2017-01-31 00:00:00+00:00',
 '2017-02-01 00:00:00+00:00'])
# The same dates as a plain NumPy array at day resolution.
calendar = jan2017.values.astype('datetime64[D]')

# Three consecutive dates starting at 2017-01-06; 01-07 and 01-08 do not appear
# in `calendar` above.
event_dates = pd.to_datetime(['2017-01-06 00:00:00+00:00', 
                             '2017-01-07 00:00:00+00:00', 
                             '2017-01-08 00:00:00+00:00']).values.astype('datetime64[D]')
# Three values, presumably one per entry of event_dates (same length).
event_values = np.array([10, 15, 20])

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


  self._lib_dir = os.path.join(get_ipython_cache_dir(), 'fortran')


<center>
  <h1>The PyData Toolbox</h1>
  <h3>Scott Sanderson (Twitter: @scottbsanderson, GitHub: ssanderson)</h3>
  <h3><a href="https://github.com/ssanderson/pydata-toolbox">https://github.com/ssanderson/pydata-toolbox</a></h3>
</center>

# About Me:

<img src="https://raw.githubusercontent.com/ssanderson/pydata-toolbox/master/notebooks/images/me.jpg" alt="Drawing" style="width: 300px;"/>

- Senior Engineer at [Quantopian](https://www.quantopian.com)
- Background in Mathematics and Philosophy
- **Twitter:** [@scottbsanderson](https://twitter.com/scottbsanderson)
- **GitHub:** [ssanderson](https://github.com/ssanderson)

## Outline

- Built-in Data Structures
- Numpy `array`
- Pandas `Series`/`DataFrame`
- Plotting and "Real-World" Analyses

# Data Structures

> Rule 5. Data dominates. If you've chosen the right data structures and organized things well, the algorithms
will almost always be self-evident. Data structures, not algorithms, are central to programming.

- *Notes on Programming in C*, by Rob Pike.

# Lists

In [None]:
assert ran_the_first_cell, "Oh noes!"

In [None]:
l = [1, 'two', 3.0, 4, 5.0, "six"]
l

[1, 'two', 3.0, 4, 5.0, 'six']

**My own example 1**

In [None]:
Lista = [1,2,"juanita", "dilan",3]
Lista
juanita = Lista[2]
dilan = Lista[3]
print("est 1:", juanita)
print("est 2:", dilan)



est 1: juanita
est 2: dilan


In [None]:
# Lists can be indexed like C-style arrays.
first = l[0]
second = l[1]
print("first:", first)
print("second:", second)

first: 1
second: two


**My own example 2**

In [None]:
primero = Lista[0]
segundo = Lista[1]
print("uno:",primero)
print("dos:",segundo)

uno: 1
dos: 2


In [None]:
# Negative indexing gives elements relative to the end of the list.
last = l[-1]
penultimate = l[-2]
print("last:", last)
print("second to last:", penultimate)

last: six
second to last: 5.0


**My own example 3**

In [None]:
final = Lista[-1]
antes = Lista[-2]
print("last:", final)
print("before:",antes)

last: 3
before: dilan


In [None]:
# Lists can also be sliced, which makes a copy of elements between 
# start (inclusive) and stop (exclusive)
sublist = l[1:3]
sublist

['two', 3.0]

**My own example 4**

In [None]:
grupo_1 = Lista[0:3]
grupo_2 = Lista[3:]
print("grupo 1:", grupo_1)
print("grupo 2:", grupo_2)

grupo 1: [1, 2, 'juanita']
grupo 2: ['dilan', 3]


In [None]:
# l[:N] is equivalent to l[0:N].
first_three = l[:3]
first_three

[1, 'two', 3.0]

**My own example 5**

In [None]:
tres = Lista[:3]
print("primeros en lista:", tres)

primeros en lista: [1, 2, 'juanita']


In [None]:
# l[3:] is equivalent to l[3:len(l)].
after_three = l[3:]
after_three

[4, 5.0, 'six']

**My own example 6**

In [None]:
dos = Lista[3:]
print("ultimos en lista:", dos)

ultimos en lista: ['dilan', 3]


In [None]:
# There's also a third parameter, "step", which gets every Nth element.
l = ['a', 'b', 'c', 'd', 'e', 'f', 'g','h']
l[1:7:2]

['b', 'd', 'f']

**My own example 7**

In [None]:
metodos = ["newton","raphson","minimos","cuadrados","solver","excel","mathlab"]
print(metodos[0:6:2])

['newton', 'minimos', 'solver']


In [None]:
# This is a cute way to reverse a list.
l[::-1]

['h', 'g', 'f', 'e', 'd', 'c', 'b', 'a']

**My own example 8**

In [None]:
metodos[::-1]

['mathlab', 'excel', 'solver', 'cuadrados', 'minimos', 'raphson', 'newton']

In [None]:
# Lists can be grown efficiently (in O(1) amortized time).
l = [1, 2, 3, 4, 5]
print("Before:", l)
l.append('six')
print("After:", l)

Before: [1, 2, 3, 4, 5]
After: [1, 2, 3, 4, 5, 'six']


In [None]:
biblia = [1,2,3,4,5,6]
print("primeros capitulos",biblia)
biblia.append(7)
print("antiguo testamento:", biblia)

primeros capitulos [1, 2, 3, 4, 5, 6]
antiguo testamento: [1, 2, 3, 4, 5, 6, 7]


In [None]:
# Comprehensions let us perform elementwise computations.
l = [1, 2, 3, 4, 5]
[x * 2 for x in l]

[2, 4, 6, 8, 10]

**My own example 10**

In [None]:
multi_3 = [2,4,6,8,9]
[x*3 for x in multi_3]


[6, 12, 18, 24, 27]

## Review: Python Lists

- Zero-indexed sequence of arbitrary Python values.
- Slicing syntax: `l[start:stop:step]` copies elements at regular intervals from `start` to `stop`.
- Efficient (`O(1)`) appends and removes from end.
- Comprehension syntax: `[f(x) for x in l if cond(x)]`.

# Dictionaries

In [None]:
# Dictionaries are key-value mappings.
philosophers = {'David': 'Hume', 'Immanuel': 'Kant', 'Bertrand': 'Russell'}
philosophers

{'David': 'Hume', 'Immanuel': 'Kant', 'Bertrand': 'Russell'}

**My own example 11**

In [None]:
Harry_p = {"ron":"hermione","draco":"dobby","snape":"sirius"}
print(Harry_p)

{'ron': 'hermione', 'draco': 'dobby', 'snape': 'sirius'}


In [None]:
# Like lists, dictionaries are size-mutable.
philosophers['Ludwig'] = 'Wittgenstein'
philosophers

{'David': 'Hume',
 'Immanuel': 'Kant',
 'Bertrand': 'Russell',
 'Ludwig': 'Wittgenstein'}

**My own example 12**

In [None]:
Harry_p["bellatrix"] = "harry"
print(Harry_p)

{'ron': 'hermione', 'draco': 'dobby', 'snape': 'sirius', 'bellatrix': 'harry'}


In [None]:
del philosophers['David']
philosophers

{'Immanuel': 'Kant', 'Bertrand': 'Russell', 'Ludwig': 'Wittgenstein'}

**My own example 13**

In [None]:
del Harry_p["bellatrix"]
print(Harry_p)

{'ron': 'hermione', 'draco': 'dobby', 'snape': 'sirius'}


In [None]:
# Dictionaries do not support d[a:b] slicing; taking the first N items with
# itertools.islice over .items() is a workaround.
import itertools
philosophers = {"David": "Hume", "Immanuel": "Kant", "Bertrand": "Russell"}
dict(itertools.islice(philosophers.items(), 1))

{'David': 'Hume'}

**My own example 14**

In [None]:
import itertools
Harry_p = {"ron":"hermione","draco":"dobby","snape":"sirius"}
dict(itertools.islice(Harry_p.items(),2))

{'ron': 'hermione', 'draco': 'dobby'}

## Review: Python Dictionaries

- Key-value mapping from (almost) arbitrary keys to arbitrary values (historically unordered; since Python 3.7 iteration follows insertion order).
- Efficient (`O(1)`) lookup, insertion, and deletion.
- No slicing (would require a notion of order).

<center><img src="https://raw.githubusercontent.com/ssanderson/pydata-toolbox/master/notebooks/images/pacino.gif" alt="Drawing" style="width: 100%;"/></center>


In [None]:
# Suppose we have some matrices...
a = [[1, 2, 3],
     [2, 3, 4],
     [5, 6, 7],
     [1, 1, 1]]

b = [[1, 2, 3, 4],
     [2, 3, 4, 5]]

In [None]:
def matmul(A, B):
    """Multiply matrix A by matrix B, both given as lists of row lists.

    The result has ``len(A)`` rows and ``len(B[0])`` columns. The summation
    index runs over ``len(B)`` (the number of rows of B); the column count of
    A is never inspected, so columns of A beyond ``len(B)`` are ignored.
    """
    n_cols = len(B[0])
    n_inner = len(B)
    product = []
    for a_row in A:
        out_row = [0] * n_cols
        for j in range(n_cols):
            for k in range(n_inner):
                out_row[j] += a_row[k] * B[k][j]
        product.append(out_row)
    return product

<center><img src="https://raw.githubusercontent.com/ssanderson/pydata-toolbox/master/notebooks/images/gross.gif" alt="Drawing" style="width: 50%;"/></center>


In [None]:
%%time

matmul(a, b)

CPU times: user 42 µs, sys: 8 µs, total: 50 µs
Wall time: 54.6 µs


[[5, 8, 11, 14], [8, 13, 18, 23], [17, 28, 39, 50], [3, 5, 7, 9]]

**My own example 15**

In [None]:
m1 = [[1,2,3],
      [4,5,6]]
m2 = [[7,8,9],
      [10,11,12]]
matmul(m1, m2)
  

[[27, 30, 33], [78, 87, 96]]

**My own example 16 - Replacing len(B) (# of rows of B) with len(A[0]) (# of columns of A) in the Python matmul(A, B)**

In [None]:
def matmul(A, B):
    """Multiply matrix A by matrix B, both given as lists of row lists.

    The summation index runs over ``len(A[0])`` (the number of columns of A).
    No compatibility check is made: if B has fewer rows than A has columns,
    indexing ``B[k]`` raises IndexError.
    """
    width = len(B[0])
    product = [[0] * width for _ in A]
    for i, a_row in enumerate(A):
        target = product[i]
        for j in range(width):
            for k in range(len(A[0])):
                target[j] += a_row[k] * B[k][j]
    return product

In [None]:
import random
def matriz(m, n):
    """Return an m-by-n matrix (a list of m lists of length n) filled with
    values drawn from ``random.random()``, generated row by row."""
    return [[random.random() for _ in range(n)] for _ in range(m)]

m = matriz(3, 3)
m

[[0.17590320133983506, 0.43564418276487205, 0.8510276668322655],
 [0.6614458203222926, 0.6325068035270801, 0.3863939672437614],
 [0.979060228650388, 0.302215597959016, 0.5907128013840717]]

**My own example 17 - Verifying the error in the Python matmul(A, B) with the original matrices when changing len(B) (# of rows of B) to len(A[0]) (# of columns of A)**

In [None]:
import random
r = matriz(2,2)
print(r)
%time
ra = matriz(2, 3)
rb = matriz(3,2)
e = matmul(ra, rb)
print(e)

[[0.709850013875704, 0.6787437531853681], [0.929407461123102, 0.49626861559637325]]
CPU times: user 4 µs, sys: 1 µs, total: 5 µs
Wall time: 8.58 µs
[[0.9393319322138486, 1.1596891870505104], [0.23023567403632889, 0.474092680939502]]


**My own example 18 - Checking the matrix multiplication compatibility condition len(A[0]) == len(B)**

In [None]:
def matmul(A, B):
    """Multiply matrix A by matrix B, both given as lists of row lists.

    If the number of columns of A differs from the number of rows of B, no
    exception is raised: a (Spanish) explanatory message is returned as a
    string instead of a matrix, so callers must check the type of the result.
    """
    inner = len(A[0])
    # Guard clause: incompatible shapes yield the message string.
    if inner != len(B):
        return 'No es posible multiplicar las matrices ya que, el número de columnas de A es diferente al número de filas de B'

    width = len(B[0])
    product = []
    for a_row in A:
        out_row = []
        for j in range(width):
            total = 0
            for k in range(inner):
                total += a_row[k] * B[k][j]
            out_row.append(total)
        product.append(out_row)
    return product

In [None]:
import random
#r = matriz(2,2)
#print(r)
%time
ra = matriz(3,2)
rb = matriz(2,3)
e = matmul(ra, rb)
print(e)

CPU times: user 4 µs, sys: 1 µs, total: 5 µs
Wall time: 7.63 µs
[[0.7143774118810629, 0.5346947352250949, 0.6407340496148386], [0.4732266134336496, 0.2866927503623971, 0.3205701694135017], [1.1236087771469125, 0.7506431382687506, 0.8687535794358869]]


**My own example 19 - Verifying the error in the Python matmul(A, B) when checking the matrix multiplication compatibility condition len(A[0]) == len(B)**

In [None]:
%time
ra = matriz(3,2)
rb = matriz(3,3)
e = matmul(ra, rb)
print(e)

CPU times: user 3 µs, sys: 0 ns, total: 3 µs
Wall time: 7.39 µs
No es posible multiplicar las matrices ya que, el número de columnas de A es diferente al número de filas de B


**My own example 20 - Defining A and B that are compatible for multiplication**

In [None]:
A=matriz(4,5)
B=matriz(5,6)

**My own example 21 - Running the correct Python matrix multiplication code with matrices whose dimensions are compatible for multiplication.**

In [None]:
m=matmul(A,B)
print(m)

[[0.855031737273622, 0.6958502223503967, 1.0485978115422254, 0.8452676738241904, 0.6513818059462282, 0.7149369732112779], [0.7034038868552861, 0.4508438756387884, 0.5822134047388681, 0.6757902267990392, 0.4958834829957527, 0.8059641079345872], [0.8009890484286395, 0.5905383517858133, 0.7191325200534138, 0.6728126318585346, 0.40211438871449473, 0.6426278224129982], [1.7194867758270027, 1.0408841705800982, 1.4000635461902766, 1.715255307788202, 1.2662075625807931, 1.484070904434014]]



**My own example 22 - Running matmul(randa, randb) 10 times, with randa and randb random matrices of 600 x 100 and 100 x 600, and calculating the average execution time**

In [None]:
import time

N_RUNS = 10  # number of timed repetitions of the pure-Python multiplication
times = []   # elapsed seconds of each run
for run in range(N_RUNS):
    # Build the inputs before starting the clock so that only matmul() is timed.
    ra = matriz(600, 100)
    rb = matriz(100, 600)
    st = time.perf_counter()
    matmul(ra, rb)
    et = time.perf_counter()
    times.append(et - st)

**My own example 23 - Creating the average execution time data frame and adding Python's average execution time**

In [None]:
import numpy as np
import pandas as pd

# Average of the pure-Python timings collected in `times`.
mn_time1 = np.mean(times)
# Build the results table in one step, with a numeric 'Mean time' column and
# one row per language; later cells add rows with df.loc[<language>] = <seconds>.
df = pd.DataFrame({'Mean time': [mn_time1]}, index=['Python'])
print(df)

        Mean time
Python  11.691242


**My own example 24 - Running the randa and randb multiplication 10 times as NumPy arrays and adding NumPy's average execution time**

In [None]:
times2 = []  # elapsed seconds of each NumPy multiplication
for run in range(10):
    # Generate the inputs outside the timed region: creating the random
    # arrays is not part of the multiplication being measured.
    ra = np.random.rand(600, 100)
    rb = np.random.rand(100, 600)
    # perf_counter measures elapsed wall-clock time; process_time would add up
    # the CPU time of every thread the process uses.
    st = time.perf_counter()
    np.matmul(ra, rb)
    et = time.perf_counter()
    times2.append(et - st)

In [None]:
mn_time2=np.mean(times2)
df.loc['Numpy']=mn_time2
print(df)

        Mean time
Python  11.691242
Numpy    0.010081


In [None]:
%%time
# Uses the notebook's own `matriz` helper to build the random inputs
# (no function named `random_matrix` is defined in this notebook).
randa = matriz(600, 100)
randb = matriz(100, 600)
x = matmul(randa, randb)

NameError: ignored

In [None]:
# Maybe that's not that bad?  Let's try a simpler case.
def python_dot_product(xs, ys):
    """Return the dot product of two sequences: the sum of pairwise products.

    Pairs are formed with zip(), so iteration stops at the shorter input.
    """
    products = (a * b for a, b in zip(xs, ys))
    return sum(products)

In [None]:
%%fortran
! Dot product of two double-precision vectors.
!   xs, ys : input vectors (assumed-shape, rank 1)
!   result : output scalar, the sum of the elementwise products of xs and ys
! The notebook calls this as fortran_dot_product(xs, ys) and uses the returned
! value, i.e. the intent(out) argument comes back as the Python result.
subroutine fortran_dot_product(xs, ys, result)
    double precision, intent(in) :: xs(:)
    double precision, intent(in) :: ys(:)
    double precision, intent(out) :: result
    
    ! Elementwise product followed by a sum reduction.
    result = sum(xs * ys)
end

In [None]:
list_data = [float(i) for i in range(100000)]
array_data = np.array(list_data)

In [None]:
%%time
python_dot_product(list_data, list_data)

CPU times: user 11.2 ms, sys: 11 µs, total: 11.2 ms
Wall time: 12.2 ms


333328333350000.0

In [None]:
%%time
fortran_dot_product(array_data, array_data)

CPU times: user 209 µs, sys: 0 ns, total: 209 µs
Wall time: 219 µs


333328333350000.0

<center><img src="https://raw.githubusercontent.com/ssanderson/pydata-toolbox/master/notebooks/images/sloth.gif" alt="Drawing" style="width: 1080px;"/></center>


**My own example 25 - Defining A (2x2) and B (2x2)**

In [None]:
A = np.array([[4, 3], [2, 1]])
B = np.array([[5, 6], [7, 8]])
print(A)
print(B)

[[4 3]
 [2 1]]
[[5 6]
 [7 8]]


**My own example 26 - Defining Fortran subroutine matmul(A,B) for 2x2 matrices**

In [None]:
%%fortran
! Multiply two fixed-size 2x2 double-precision matrices: C = A * B.
!   A, B : 2x2 input matrices
!   C    : 2x2 output matrix
! NOTE(review): the name is the same as the notebook's earlier Python function
! `matmul` (and as the Fortran intrinsic MATMUL); after this cell runs, the
! notebook calls matmul(A, B) and gets this routine's result.
subroutine matmul(A, B, C)
  implicit None
  real(8), intent(in) :: A(2, 2), B(2, 2)
  real(8), intent(out) :: C(2, 2)
  integer :: i, j, k

  ! Zero the accumulator before summing into it.
  C = 0.0

  ! Triple loop: C(i,j) = sum over k of A(i,k) * B(k,j).
  do i = 1, 2
    do j = 1, 2
      do k = 1, 2
        C(i, j) = C(i, j) + A(i, k) * B(k, j)
      end do
    end do
  end do
end subroutine matmul


**My own example 27 -Run Fortran subroutine matmul(A,B) with a and b 2x2 matrices**

In [None]:
C = matmul(A,B)

print("Matrix A =")
print(A)
print("Matrix B = ")
print(B)
print("Matrix C = ")
print(C)

Matrix A =
[[4 3]
 [2 1]]
Matrix B = 
[[5 6]
 [7 8]]
Matrix C = 
[[41. 48.]
 [17. 20.]]


**My own example 28 - Defining Fortran subroutine matmul(A,B) for 600x100 and 100x600 matrices**

In [None]:
%%fortran
! Multiply a 600x100 matrix by a 100x600 matrix in double precision: C = A * B.
!   A : 600x100 input matrix
!   B : 100x600 input matrix
!   C : 600x600 output matrix
subroutine matmul(A,B,C)
  implicit none
  real*8,intent(in) :: A(600,100), B(100,600)
  real*8,intent(out) :: C(600,600)
  integer :: i, j, k

  ! C is intent(out), so its contents are undefined on entry; it must be
  ! zeroed before the loop accumulates into it.
  C = 0.0d0

  do i = 1, 600
      do j = 1, 600
          do k = 1, 100
              C(i,j) = C(i,j) + A(i,k) * B(k,j)
          end do
      end do
  end do
end subroutine matmul

**My own example 29 -Run Fortran subroutine matmul(A,B) with 600x100 and 100x600 matrices**

In [None]:
A = np.random.rand(600, 100)
B = np.random.rand(100, 600)
C = matmul(A, B)

print("Matrix A =")
print(A[:5, :5])
print("Matrix B = ")
print(B[:5, :5])
print("Matrix C = ")
print(C[:5, :5])

Matrix A =
[[0.34708549 0.37411391 0.6488977  0.86990497 0.23519173]
 [0.86408287 0.33548762 0.5804884  0.13861287 0.35616514]
 [0.00165717 0.14973303 0.76295811 0.00914351 0.85361528]
 [0.0130042  0.06408967 0.90005704 0.33594515 0.08465435]
 [0.2143942  0.44829492 0.80214502 0.50221711 0.75349209]]
Matrix B = 
[[8.80971449e-01 8.58296947e-01 3.33014324e-01 4.25579829e-01
  8.54483118e-01]
 [4.77605293e-01 1.59007767e-01 9.44192114e-01 3.12011377e-01
  5.44482685e-01]
 [8.07732705e-01 4.51572665e-01 3.11572342e-01 3.07501811e-01
  4.77686643e-01]
 [8.28105342e-01 6.04052717e-01 2.46933193e-01 5.21264506e-01
  5.86022585e-01]
 [5.04899489e-01 7.74023567e-01 8.17154376e-04 2.25735130e-02
  9.90005651e-01]]
Matrix C = 
[[25.85042875 28.29183335 26.92051428 29.25742727 25.88277137]
 [25.15941879 26.05927232 25.70377039 28.57932657 25.42369735]
 [24.79041856 27.84812479 28.31625733 31.07354459 24.27561984]
 [22.83322051 24.25327041 24.60836171 27.86184543 22.93196282]
 [19.61208424 21.5633

**My own example 30 - Running 10 times the  Fortran subroutine matmul(A,B) with 600x100 and 100x600 matrices and adding Fortran magic average execution time to the data frame**

In [None]:
%%fortran
! Multiply a 600x100 matrix by a 100x600 matrix: C = A * B.
!   A : 600x100 input matrix
!   B : 100x600 input matrix
!   C : 600x600 output matrix
! The arguments are double precision (real(8)) to match the float64 arrays the
! notebook passes in; with default `real` the inputs would be handled in
! single precision.
subroutine matmul(A,B,C)
  implicit none
  real(8), intent(in) :: A(600,100), B(100,600)
  real(8), intent(out) :: C(600,600)
  integer :: i, j, k

  ! Zero the accumulator before summing into it.
  C = 0.0d0

  ! Fortran arrays are column-major, so the innermost loop runs over the first
  ! index (i) to walk memory contiguously. For every C(i,j) the terms are still
  ! added in increasing k, so the result is the same as with an i-j-k order.
  do j=1,size(B,2)
    do k=1,size(B,1)
      do i=1,size(A,1)
        C(i,j) = C(i,j) + A(i,k)*B(k,j)
      end do
    end do
  end do

end subroutine matmul

In [None]:
# Random 600x100 and 100x600 inputs for the Fortran-magic matmul.
n, m = 600, 100
A = np.random.rand(n, m)
B = np.random.rand(m, n)

# NOTE: this rebinds `times`, which an earlier cell used for the pure-Python timings.
times = []
for i in range(10):
    # -o returns the timing result object, -q suppresses printing, -r 1 uses a
    # single repeat; .best is the best per-call time from that measurement.
    result = %timeit -r 1 -o -q matmul(A, B)
    times.append(result.best)

# Mean of the ten measurements; a later cell stores it in the data frame as 'Fortran'.
avg_time = np.mean(times)
print(avg_time)

**My own example 31 - Creating a  Fortran program that mutiplies 10 times A(600x100) and  B (100x600) matrices**

In [None]:
%%writefile matmul_10times.f90

program matmul_10times

! Multiplies a random 600x100 matrix A by a random 100x600 matrix B n_runs
! times and prints the average wall-clock time of one multiplication, plus a
! checksum of the last product (instead of dumping all 360,000 elements).

implicit none

integer, parameter :: n = 600, m = 100, n_runs = 10
real*8 :: A(n,m), B(m,n), C(n,n)
integer :: i, j, k
integer(8) :: t_start, t_end, t_rate

call random_number(A)
call random_number(B)

call system_clock(t_start, t_rate)
do i = 1, n_runs
  C = 0.0d0
  ! Column-wise update: C(:,j) and A(:,k) are contiguous in Fortran's
  ! column-major layout. Each element still accumulates its terms in
  ! increasing k.
  do j = 1, n
    do k = 1, m
      C(:,j) = C(:,j) + A(:,k) * B(k,j)
    end do
  end do
end do
call system_clock(t_end)

write(*,*) "Average time per multiplication (s):"
write(*,*) real(t_end - t_start, 8) / real(t_rate, 8) / n_runs
! Printing a value derived from C also keeps the compiler from discarding the loop.
write(*,*) "Checksum (sum of all elements) of the last multiplication:"
write(*,*) sum(C)

end program matmul_10times

**My own example 32 - Running the Fortran program that mutiplies 10 times A(600x100) and  B (100x600) matrices**

In [None]:
!gfortran -o matmul_10times matmul_10times.f90

In [None]:
!./matmul_10times

**My own example 33 - Adding Fortran average execution time to the data frame**

In [None]:
df.loc['Fortran']=avg_time
print(df)

**My own example 34 - Creating a c program that mutiplies 10 times A(600x100) and  B (100x600) matrices**

In [None]:
%%writefile matrixc.c
#include <stdio.h>
#include <stdlib.h>
#include <time.h>

#define ROWS_A 600
#define COLS_A 100
#define ROWS_B 100
#define COLS_B 600

/*
 * Compute C = A * B and report the average time of one multiplication.
 *
 * The product is computed num_runs times between two clock() readings, and
 * the elapsed CPU time is divided by num_runs, so *average_time is the mean
 * number of seconds per multiplication. C holds the product on return.
 */
void multiply_matrices(double A[ROWS_A][COLS_A], double B[ROWS_B][COLS_B], double C[ROWS_A][COLS_B], double *average_time) {
    const int num_runs = 10;  /* timed repetitions used for the average */
    int run, i, j, k;
    double sum;
    clock_t start_time, end_time;

    start_time = clock();
    for (run = 0; run < num_runs; run++) {
        for (i = 0; i < ROWS_A; i++) {
            for (j = 0; j < COLS_B; j++) {
                sum = 0.0;
                for (k = 0; k < COLS_A; k++) {
                    sum += A[i][k] * B[k][j];
                }
                C[i][j] = sum;
            }
        }
    }
    end_time = clock();

    /* Divide by the number of runs actually performed. */
    *average_time = (double)(end_time - start_time) / CLOCKS_PER_SEC / num_runs;
}

/*
 * Fill A and B with pseudo-random values in [0, 1], run multiply_matrices and
 * print the reported time (seconds, six decimals, no trailing newline) so the
 * notebook can parse it with float().
 */
int main() {
    int i, j;
    /* Static storage: the three matrices take about 3.8 MB in total, which is
     * too much to place safely on the stack as automatic variables. */
    static double A[ROWS_A][COLS_A];
    static double B[ROWS_B][COLS_B];
    static double C[ROWS_A][COLS_B];
    double average_time;

    srand(time(NULL));
    for (i = 0; i < ROWS_A; i++) {
        for (j = 0; j < COLS_A; j++) {
            A[i][j] = (double)rand() / RAND_MAX;
        }
    }
    for (i = 0; i < ROWS_B; i++) {
        for (j = 0; j < COLS_B; j++) {
            B[i][j] = (double)rand() / RAND_MAX;
        }
    }

    multiply_matrices(A, B, C, &average_time);

    printf("%.6f", average_time);

    return 0;
}


**My own example 35 - Running the c program that mutiplies 10 times A(600x100) and  B (100x600) matrices**

In [None]:
!gcc -o matrixc matrixc.c

In [None]:
!./matrixc

**My own example 21 - Adding c average execution time to the data frame**

In [None]:
#since the parameters are random everytime it runs it generates a diferent time here and in the running of the program
import subprocess

output = subprocess.check_output('./matrixc')

C_avgtime = output.decode('utf-8')

C_avgtime_float = float(C_avgtime)

print(C_avgtime_float)


In [None]:
df.loc['C'] = C_avgtime_float
print(df)

**My own example 22 - Creating a C++ program that mutiplies 10 times A(600x100) and  B (100x600) matrices**

In [None]:
%%writefile cpp.cpp
#include <iostream>
#include <ctime>
#include <cstdlib>

using namespace std;

const int ROWS_A = 600;
const int COLS_A = 100;
const int ROWS_B = COLS_A;
const int COLS_B = ROWS_A;
const int NUM_ITERATIONS = 10;

// Multiply a random ROWS_A x COLS_A matrix by a random ROWS_B x COLS_B matrix
// NUM_ITERATIONS times and print the AVERAGE time of one multiplication in
// seconds (no trailing newline, so the notebook can parse it with float()).
int main() {
    double** A = new double*[ROWS_A];
    for (int i = 0; i < ROWS_A; i++) {
        A[i] = new double[COLS_A];
    }

    double** B = new double*[ROWS_B];
    for (int i = 0; i < ROWS_B; i++) {
        B[i] = new double[COLS_B];
    }

    // The result matrix is allocated once, outside the timed region, so that
    // only the arithmetic is measured.
    double** C = new double*[ROWS_A];
    for (int i = 0; i < ROWS_A; i++) {
        C[i] = new double[COLS_B];
    }

    srand(time(NULL));
    for (int i = 0; i < ROWS_A; i++) {
        for (int j = 0; j < COLS_A; j++) {
            A[i][j] = (double)rand() / RAND_MAX;
        }
    }
    for (int i = 0; i < ROWS_B; i++) {
        for (int j = 0; j < COLS_B; j++) {
            B[i][j] = (double)rand() / RAND_MAX;
        }
    }

    clock_t start_time = clock();
    for (int k = 0; k < NUM_ITERATIONS; k++) {
        for (int i = 0; i < ROWS_A; i++) {
            for (int j = 0; j < COLS_B; j++) {
                double sum = 0.0;
                for (int l = 0; l < COLS_A; l++) {
                    sum += A[i][l] * B[l][j];
                }
                C[i][j] = sum;
            }
        }
    }
    clock_t end_time = clock();

    // Report the mean per multiplication, not the total over all iterations.
    double average_time = (double)(end_time - start_time) / CLOCKS_PER_SEC / NUM_ITERATIONS;
    cout << average_time;

    for (int i = 0; i < ROWS_A; i++) {
        delete[] C[i];
    }
    delete[] C;
    for (int i = 0; i < ROWS_A; i++) {
        delete[] A[i];
    }
    delete[] A;
    for (int i = 0; i < ROWS_B; i++) {
        delete[] B[i];
    }
    delete[] B;

    return 0;
}

**My own example 23 - Running the C++ program that mutiplies 10 times A(600x100) and  B (100x600) matrices**

In [None]:
!g++ -o cpp cpp.cpp

In [None]:
!./cpp

3.51177

**My own example 24 - Adding C++ average execution time to the data frame**

In [None]:
#since the parameters are random everytime it runs it generates a diferent time here and in the running of the program
# since it's running the program to calculate it
import subprocess

output = subprocess.check_output('./cpp')

Cpp_avgtime = output.decode('utf-8')

Cpp_avgtime_float = float(Cpp_avgtime)

print(Cpp_avgtime_float)

2.42927


In [None]:
df.loc['C++'] = Cpp_avgtime_float
print(df)

         Mean time
Python   11.691242
Numpy     0.010081
Fortran   0.098068
C         0.015856
C++       2.429270


**My own example 25 - Creating a Java program that mutiplies 10 times A(600x100) and  B (100x600) matrices**

In [None]:
%%writefile matrixj.java
import java.util.Random;

/**
 * Benchmark: multiplies a random 600x100 int matrix by a random 100x600 int
 * matrix NUM_RUNS times and prints the average time of one multiplication in
 * seconds (no trailing newline, so the notebook can parse it with float()).
 *
 * NOTE(review): the matrices here hold ints, while the C and C++ programs in
 * this notebook multiply doubles -- keep that in mind when comparing timings.
 */
public class matrixj {
    /** Number of timed repetitions used to compute the average. */
    private static final int NUM_RUNS = 10;

    public static void main(String[] args) {
        int[][] A = new int[600][100];
        int[][] B = new int[100][600];
        Random random = new Random();
        fillRandom(A, random);
        fillRandom(B, random);

        // System.nanoTime() is the monotonic, high-resolution clock intended
        // for measuring elapsed time; currentTimeMillis() is a wall clock with
        // millisecond granularity.
        long totalNanos = 0;
        for (int run = 0; run < NUM_RUNS; run++) {
            long startTime = System.nanoTime();
            int[][] C = multiplyMatrices(A, B);
            long endTime = System.nanoTime();
            totalNanos += (endTime - startTime);
        }
        double averageSeconds = totalNanos / 1e9 / NUM_RUNS;

        System.out.print(averageSeconds);
    }

    /** Fills every cell of the matrix with a random int in [0, 100). */
    private static void fillRandom(int[][] matrix, Random random) {
        for (int i = 0; i < matrix.length; i++) {
            for (int j = 0; j < matrix[0].length; j++) {
                matrix[i][j] = random.nextInt(100);
            }
        }
    }

    /**
     * Returns the matrix product A * B.
     *
     * @param A left operand, m1 x n1
     * @param B right operand, m2 x n2
     * @return a new m1 x n2 matrix
     * @throws IllegalArgumentException if the number of columns of A differs
     *         from the number of rows of B
     */
    public static int[][] multiplyMatrices(int[][] A, int[][] B) {
        int m1 = A.length;
        int n1 = A[0].length;
        int m2 = B.length;
        int n2 = B[0].length;
        if (n1 != m2) {
            throw new IllegalArgumentException("The number of columns of the first matrix must match the number of rows of the second matrix");
        }
        int[][] C = new int[m1][n2];
        for (int i = 0; i < m1; i++) {
            for (int j = 0; j < n2; j++) {
                int sum = 0;
                for (int k = 0; k < n1; k++) {
                    sum += A[i][k] * B[k][j];
                }
                C[i][j] = sum;
            }
        }
        return C;
    }
}

Overwriting matrixj.java


**My own example 26 - Running the Java program that mutiplies 10 times A(600x100) and  B (100x600) matrices**

In [None]:
!javac matrixj.java

In [None]:
!java matrixj

0.0663

**My own example 27 - Adding Java average execution time to the data frame**

In [None]:
import subprocess

command = ['java', 'matrixj']
output_bytes = subprocess.check_output(command)

java_avgtime = output_bytes.decode('utf-8')

java_avgtime_float = float(java_avgtime)

print(java_avgtime_float)

0.0577


In [None]:
df.loc['Java'] = java_avgtime_float
print(df)

         Mean time
Python   11.691242
Numpy     0.010081
Fortran   0.098068
C         0.015856
C++       2.429270
Java      0.057700


**My own example 28 - Creating a Javascript program that mutiplies 10 times A(600x100) and  B (100x600) matrices**

In [None]:
%%javascript

// Returns the matrix product A * B as a new array of rows, or `false` when
// the number of columns of A does not equal the number of rows of B.
function multiplyMatrices(A, B) {
  var aRows = A.length;
  var aCols = A[0].length;
  var bRows = B.length;
  var bCols = B[0].length;

  if (aCols != bRows) return false;

  var product = [];
  for (var r = 0; r < aRows; r++) {
    var outRow = [];
    for (var c = 0; c < bCols; c++) {
      var acc = 0;
      for (var t = 0; t < bRows; t++) acc += A[r][t] * B[t][c];
      outRow[c] = acc;
    }
    product[r] = outRow;
  }

  return product;
}

// Builds a rows x cols matrix (array of row arrays) of random integers 0..9.
function createMatrix(rows, cols) {
  var grid = [];
  for (var r = 0; r < rows; r++) {
    var line = [];
    for (var c = 0; c < cols; c++) {
      line.push(Math.floor(Math.random() * 10));
    }
    grid.push(line);
  }
  return grid;
}

// Multiplies a random 600x100 matrix by a random 100x600 matrix n times and
// returns the average duration of one multiplication, in seconds.
function computeAverageTime(n) {
  var left = createMatrix(600, 100);
  var right = createMatrix(100, 600);

  var startedAt = Date.now();  // milliseconds since the epoch
  var run = 0;
  while (run < n) {
    multiplyMatrices(left, right);
    run++;
  }
  var elapsedMs = Date.now() - startedAt;

  // total ms -> ms per run -> seconds per run
  return elapsedMs / n / 1000;
}

// Average time (seconds) of one multiplication over 10 runs.
var averageTime = computeAverageTime(10);

// Show the value by appending a new <div> to the page body.
var resultElement = document.createElement('div');

resultElement.innerHTML = averageTime;
document.body.appendChild(resultElement);

// Read the value back as text and also write it to the browser console.
var htmlResult = resultElement.innerHTML;
console.log(htmlResult);

<IPython.core.display.Javascript object>

**My own example 29 - Adding Javascript average execution time to the data frame**

In [None]:
# NOTE(review): 0.1391 is a hard-coded literal, presumably copied by hand from
# the JavaScript cell's browser output; re-running that cell does not update it.
df.loc['Javascript'] = 0.1391
print(df)

            Mean time
Python      11.691242
Numpy        0.010081
Fortran      0.098068
C            0.015856
C++          2.429270
Java         0.057700
Javascript   0.139100


**My own example 30 - Finding the minimum average execution time in the data frame**

In [None]:
min_val = df['Mean time'].min()
min_label = df.loc[df['Mean time'] == min_val].index[0]
print("Minimum value is {} {}".format(min_val, min_label))

Minimum value is 0.01008116240000021 Numpy


**My own example 31 - Sorting the data frame by average execution time**

In [None]:
df_sorted = df.sort_values(by='Mean time', ascending=True)

print(df_sorted)

            Mean time
Numpy        0.010081
C            0.015856
Java         0.057700
Fortran      0.098068
Javascript   0.139100
C++          2.429270
Python      11.691242


## Why is the Python Version so Much Slower?

In [None]:
# Dynamic typing.
def mul_elemwise(xs, ys):
    """Return a list of the pairwise products ``x * y`` of xs and ys.

    Pairs are formed with zip(), so the result is as long as the shorter
    input; ``*`` is whatever the operand types define.
    """
    products = []
    for left, right in zip(xs, ys):
        products.append(left * right)
    return products

mul_elemwise([1, 2, 3, 4], [1, 2 + 0j, 3.0, 'four'])
#[type(x) for x in _]

[1, (4+0j), 9.0, 'fourfourfourfour']

In [None]:
# Interpretation overhead.
source_code = 'a + b * c'
bytecode = compile(source_code, '', 'eval')
import dis; dis.dis(bytecode)

  1           0 LOAD_NAME                0 (a)
              2 LOAD_NAME                1 (b)
              4 LOAD_NAME                2 (c)
              6 BINARY_MULTIPLY
              8 BINARY_ADD
             10 RETURN_VALUE


## Why is the Python Version so Slow?
- Dynamic typing means that every single operation requires dispatching on the input type.
- Having an interpreter means that every instruction is fetched and dispatched at runtime.
- Other overheads:
  - Arbitrary-size integers.
  - Reference-counted garbage collection.

> This is the paradox that we have to work with when we're doing scientific or numerically-intensive Python. What makes Python fast for development -- this high-level, interpreted, and dynamically-typed aspect of the language -- is exactly what makes it slow for code execution.

- Jake VanderPlas, [*Losing Your Loops: Fast Numerical Computing with NumPy*](https://www.youtube.com/watch?v=EEUXKG97YRw)

# What Do We Do?

<center><img src="https://raw.githubusercontent.com/ssanderson/pydata-toolbox/master/notebooks/images/runaway.gif" alt="Drawing" style="width: 50%;"/></center>

<center><img src="https://raw.githubusercontent.com/ssanderson/pydata-toolbox/master/notebooks/images/thisisfine.gif" alt="Drawing" style="width: 1080px;"/></center>

- Python is slow for numerical computation because it performs dynamic dispatch on every operation we perform...

- ...but often, we just want to do the same thing over and over in a loop!

- If we don't need Python's dynamism, we don't want to pay (much) for it.

- **Idea:** Dispatch **once per operation** instead of **once per element**.

In [None]:
import numpy as np

data = np.array([1, 2, 3, 4])
data

array([1, 2, 3, 4])

In [None]:
data + data

array([2, 4, 6, 8])

In [None]:
%%time
# Naive dot product
(array_data * array_data).sum()

CPU times: user 905 µs, sys: 997 µs, total: 1.9 ms
Wall time: 1.85 ms


333328333350000.0

In [None]:
%%time
# Built-in dot product.
array_data.dot(array_data)

CPU times: user 2.75 ms, sys: 0 ns, total: 2.75 ms
Wall time: 3.22 ms


333328333350000.0

In [None]:
%%time
fortran_dot_product(array_data, array_data)

CPU times: user 177 µs, sys: 2 µs, total: 179 µs
Wall time: 187 µs


333328333350000.0

In [None]:
# Numpy won't allow us to write a string into an int array.
data[0] = "foo"

ValueError: ignored

In [None]:
# We also can't grow an array once it's created.
data.append(3)

In [None]:
# We **can** reshape an array though.
two_by_two = data.reshape(2, 2)
two_by_two

Numpy arrays are:

- Fixed-type

- Size-immutable

- Multi-dimensional

- Fast\*

\* If you use them correctly.

# What's in an Array?

In [None]:
arr = np.array([1, 2, 3, 4, 5, 6], dtype='int16').reshape(2, 3)
print("Array:\n", arr, sep='')
print("===========")
print("DType:", arr.dtype)
print("Shape:", arr.shape)
print("Strides:", arr.strides)
print("Data:", arr.data.tobytes())

# Core Operations

- Vectorized **ufuncs** for elementwise operations.
- Fancy indexing and masking for selection and filtering.
- Aggregations across axes.
- Broadcasting

# UFuncs

UFuncs (universal functions) are functions that operate elementwise on one or more arrays.

In [None]:
data = np.arange(15).reshape(3, 5)
data

In [None]:
# Binary operators.
data * data

In [None]:
# Unary functions.
np.sqrt(data)

In [None]:
# Comparison operations
(data % 3) == 0

In [None]:
# Boolean combinators.
((data % 2) == 0) & ((data % 3) == 0)

In [None]:
# as of python 3.5, @ is matrix-multiply
data @ data.T

# UFuncs Review

- UFuncs provide efficient elementwise operations applied across one or more arrays.
- Arithmetic Operators (`+`, `*`, `/`)
- Comparisons (`==`, `>`, `!=`)
- Boolean Operators (`&`, `|`, `^`)
- Trigonometric Functions (`sin`, `cos`)
- Transcendental Functions (`exp`, `log`)

# Selections

We often want to perform an operation on just a subset of our data.

In [None]:
sines = np.sin(np.linspace(0, 3.14, 10))
cosines = np.cos(np.linspace(0, 3.14, 10))
sines

In [None]:
# Slicing works with the same semantics as Python lists.
sines[0]

In [None]:
sines[:3]  # First three elements  

In [None]:
sines[5:]  # Elements from 5 on.

In [None]:
sines[::2]  # Every other element.

In [None]:
# More interesting: we can index with boolean arrays to filter by a predicate.
print("sines:\n", sines)
print("sines > 0.5:\n", sines > 0.5)
print("sines[sines > 0.5]:\n", sines[sines > 0.5])

In [None]:
# We index with lists/arrays of integers to select values at those indices.
print(sines)
sines[[0, 4, 7]]

In [None]:
# Index arrays are often used for sorting one or more arrays.
unsorted_data = np.array([1, 3, 2, 12, -1, 5, 2])

In [None]:
sort_indices = np.argsort(unsorted_data)
sort_indices

In [None]:
unsorted_data[sort_indices]

In [None]:
market_caps = np.array([12, 6, 10, 5, 6])  # Presumably in dollars?
assets = np.array(['A', 'B', 'C', 'D', 'E'])

In [None]:
# Sort assets by market cap by using the permutation that would sort market caps on ``assets``.
sort_by_mcap = np.argsort(market_caps)
assets[sort_by_mcap]

In [None]:
# Indexers are also useful for aligning data.
print("Dates:\n", repr(event_dates))
print("Values:\n", repr(event_values))
print("Calendar:\n", repr(calendar))

In [None]:
print("Raw Dates:", event_dates)
print("Indices:", calendar.searchsorted(event_dates))
print("Forward-Filled Dates:", calendar[calendar.searchsorted(event_dates)])

On multi-dimensional arrays, we can slice along each axis independently.

In [None]:
data = np.arange(25).reshape(5, 5)
data

In [None]:
data[:2, :2]  # First two rows and first two columns.

In [None]:
data[:2, [0, -1]]  # First two rows, first and last columns.

In [None]:
data[(data[:, 0] % 2) == 0]  # Rows where the first column is divisible by two.

# Selections Review

- Indexing with an integer removes a dimension.
- Slicing operations work on Numpy arrays the same way they do on lists.
- Indexing with a boolean array filters to True locations.
- Indexing with an integer array selects indices along an axis.
- Multidimensional arrays can apply selections independently along different axes.

## Reductions

Functions that reduce an array to a scalar.

$Var(X) = \frac{1}{N}\sum_{i=1}^N (x_i - \bar{x})^2$

In [None]:
def variance(x):
    """Return the population variance of ``x``.

    This is the mean squared deviation from the mean:
    ``sum((x_i - mean(x)) ** 2) / N``, where ``N`` is the total number of
    elements in ``x``.

    ``x`` must expose ``mean()`` and ``size`` (e.g. a numpy array); arrays of
    any dimensionality are reduced over all of their elements.
    """
    # Divide by ``x.size`` rather than ``len(x)``: ``len`` only counts the first
    # axis, so it would under-count the elements of a multi-dimensional array.
    return ((x - x.mean()) ** 2).sum() / x.size

In [None]:
variance(np.random.standard_normal(1000))

- `sum()` and `mean()` are both **reductions**.

- In the simplest case, we use these to reduce an entire array into a single value...

In [None]:
data = np.arange(30)
data.mean()

- ...but we can do more interesting things with multi-dimensional arrays.

In [None]:
data = np.arange(30).reshape(3, 10)
data

In [None]:
data.mean()

In [None]:
data.mean(axis=0)

In [None]:
data.mean(axis=1)

## Reductions Review

- Reductions allow us to perform efficient aggregations over arrays.
- We can do aggregations over a single axis to collapse a single dimension.
- Many built-in reductions (`mean`, `sum`, `min`, `max`, `median`, ...).

# Broadcasting

In [None]:
row = np.array([1, 2, 3, 4])
column = np.array([[1], [2], [3]])
print("Row:\n", row, sep='')
print("Column:\n", column, sep='')

In [None]:
row + column

<center><img src="https://raw.githubusercontent.com/ssanderson/pydata-toolbox/master/notebooks/images/broadcasting.png" alt="Drawing" style="width: 60%;"/></center>

<h5>Source: http://www.scipy-lectures.org/_images/numpy_broadcasting.png</h5>

In [None]:
# Broadcasting is particularly useful in conjunction with reductions:
# the per-column means are subtracted from every row of ``data``.
column_means = data.mean(axis=0)
print("Data:\n", data, sep='')
print("Mean:\n", column_means, sep='')
print("Data - Mean:\n", data - column_means, sep='')

# Broadcasting Review

- Numpy operations can work on arrays of different dimensions as long as the arrays' shapes are still "compatible".
- Broadcasting works by "tiling" the smaller array along the missing dimension.
- The result of a broadcasted operation is always at least as large in each dimension as the largest array in that dimension.

# Numpy Review

- Numerical algorithms are slow in pure Python because the overhead of dynamic dispatch dominates our runtime.

- Numpy solves this problem by:
  1. Imposing additional restrictions on the contents of arrays.
  2. Moving the inner loops of our algorithms into compiled C code.

- Using Numpy effectively often requires reworking an algorithm to use vectorized operations instead of for-loops, but the resulting operations are usually simpler, clearer, and faster than the pure Python equivalent.

<center><img src="https://raw.githubusercontent.com/ssanderson/pydata-toolbox/master/notebooks/images/unicorn.jpg" alt="Drawing" style="width: 75%;"/></center>

Numpy is great for many things, but...

- Sometimes our data is equipped with a natural set of **labels**:
  - Dates/Times
  - Stock Tickers
  - Field Names (e.g. Open/High/Low/Close)

- Sometimes we have **more than one type of data** that we want to keep grouped together.
  - Tables with a mix of real-valued and categorical data.

- Sometimes we have **missing** data, which we need to ignore, fill, or otherwise work around.

<center><img src="https://raw.githubusercontent.com/ssanderson/pydata-toolbox/master/notebooks/images/panda-wrangling.gif" alt="Drawing" style="width: 75%;"/></center>

<center><img src="https://raw.githubusercontent.com/ssanderson/pydata-toolbox/master/notebooks/images/pandas_logo.png" alt="Drawing" style="width: 75%;"/></center>


Pandas extends Numpy with more complex data structures:

- `Series`: 1-dimensional, homogeneously-typed, labelled array.
- `DataFrame`: 2-dimensional, semi-homogeneous, labelled table.

Pandas also provides many utilities for: 
- Input/Output
- Data Cleaning
- Rolling Algorithms
- Plotting

# Selection in Pandas

In [None]:
s = pd.Series(index=['a', 'b', 'c', 'd', 'e'], data=[1, 2, 3, 4, 5])
s

In [None]:
# There are two pieces to a Series: the index and the values.
print("The index is:", s.index)
print("The values are:", s.values)

In [None]:
# We can look up values out of a Series by position...
s.iloc[0]

In [None]:
# ... or by label.
s.loc['a']

In [None]:
# Slicing works as expected...
s.iloc[:2]

In [None]:
# ...but it works with labels too!
s.loc[:'c']

In [None]:
# Fancy indexing works the same as in numpy.
s.iloc[[0, -1]]

In [None]:
# As does boolean masking.
s.loc[s > 2]

In [None]:
# Element-wise operations are aligned by index.
other_s = pd.Series({'a': 10.0, 'c': 20.0, 'd': 30.0, 'z': 40.0})
other_s

In [None]:
s + other_s

In [None]:
# We can fill in missing values with fillna().
(s + other_s).fillna(0.0)

In [None]:
# Most real datasets are read in from an external file format.
aapl = pd.read_csv('AAPL.csv', parse_dates=['Date'], index_col='Date')
aapl.head()

In [None]:
# Slicing generalizes to two dimensions as you'd expect:
aapl.iloc[:2, :2]

In [None]:
aapl.loc[pd.Timestamp('2010-02-01'):pd.Timestamp('2010-02-04'), ['Close', 'Volume']]

# Rolling Operations

<center><img src="https://raw.githubusercontent.com/ssanderson/pydata-toolbox/master/notebooks/images/rolling.gif" alt="Drawing" style="width: 75%;"/></center>

In [None]:
aapl.rolling(5)[['Close', 'Adj Close']].mean().plot();

In [None]:
# Drop `Volume`, since it's way bigger than everything else.
aapl.drop('Volume', axis=1).resample('2W').max().plot();

In [None]:
# 30-day rolling exponentially-weighted stddev of returns.
aapl['Close'].pct_change().ewm(span=30).std().plot();

# "Real World" Data

In [None]:
from google.colab import files

# Opens Colab's file-upload dialog and keeps whatever ``files.upload()`` returns.
# NOTE(review): ``demos`` is not referenced again in this part of the notebook; presumably the
# upload only serves to place supporting files (e.g. the ``avocados`` module) in the runtime — confirm.
demos=files.upload()

**My own example...**


In [None]:
import io
# Upload the CSV for the custom example; the following cell reads the
# 'produccion.csv' entry out of ``expo``.
expo=files.upload()

In [None]:
# Parse the uploaded 'produccion.csv' bytes into a DataFrame.
df2 = pd.read_csv(io.BytesIO(expo['produccion.csv']))
# Display only the first rows (with the rich notebook repr) instead of
# printing the entire frame as plain text.
df2.head()

In [None]:
from avocados import read_avocadata

avocados = read_avocadata('2014', '2016')
avocados.head()

In [None]:
# Unlike numpy arrays, pandas DataFrames can have a different dtype for each column.
avocados.dtypes

**My own example...**

In [None]:
# Give the columns with spaces / embedded newlines names that are easier to reference.
# Labels absent from the frame are ignored by ``rename``, so re-running this cell is harmless.
column_renames = {
    'Área Sembrada\n(ha)': 'Área_Sembrada\n(ha)',
    'Área Cosechada\n(ha)': 'Área_Cosechada\n(ha)',
    'CICLO DE CULTIVO': 'CICLO_DE_CULTIVO',
    'GRUPO \nDE CULTIVO': 'Grupo',
    'SUBGRUPO \nDE CULTIVO': 'Subgrupo',
}
df2 = df2.rename(columns=column_renames)
df2.dtypes

In [None]:
# What's the regional average price of a HASS avocado every day?
hass = avocados[avocados.Variety == 'HASS']
hass.groupby(['Date', 'Region'])['Weighted Avg Price'].mean().unstack().ffill().plot();

**My own example...**

In [None]:
# Crop production in Antioquia: mean production (t) per year, one line per crop group.
# The crop-group column is referenced as 'Grupo' because the earlier rename cell
# replaced 'GRUPO \nDE CULTIVO' with that name; the old label would raise a KeyError.
antioquia = df2[df2.DEPARTAMENTO == 'ANTIOQUIA']
(
    antioquia
    .groupby(['AÑO', 'Grupo'])['Producción\n(t)']
    .mean()
    .unstack()
    .ffill()
    .plot(title='Producción de cultivos en Antioquia', figsize=(20, 10))
);

In [None]:
# Crop production in Cundinamarca: mean production (t) per year, one line per crop group.
# The crop-group column is referenced as 'Grupo' because the earlier rename cell
# replaced 'GRUPO \nDE CULTIVO' with that name; the old label would raise a KeyError.
cund = df2[df2.DEPARTAMENTO == 'CUNDINAMARCA']
(
    cund
    .groupby(['AÑO', 'Grupo'])['Producción\n(t)']
    .mean()
    .unstack()
    .ffill()
    .plot(title='Producción de cultivos en Cundinamarca', figsize=(20, 10))
);

In [None]:
# Crop production in Boyacá: mean production (t) per year, one line per crop group.
# The crop-group column is referenced as 'Grupo' because the earlier rename cell
# replaced 'GRUPO \nDE CULTIVO' with that name; the old label would raise a KeyError.
boy = df2[df2.DEPARTAMENTO == 'BOYACA']
(
    boy
    .groupby(['AÑO', 'Grupo'])['Producción\n(t)']
    .mean()
    .unstack()
    .ffill()
    .plot(title='Producción de cultivos en Boyacá', figsize=(20, 10))
);

In [None]:
df2.describe()

In [None]:
def _organic_spread(group):
    """Return the organic minus non-organic price series for one region.

    ``group`` holds a single region's price columns, indexed by date; its
    columns carry an ``Organic`` level marking which column is the organic
    series. A region that does not have exactly two columns has no pair to
    compare, so its spread is reported as 0.0 for every date.
    """
    if len(group.columns) != 2:
        return pd.Series(index=group.index, data=0.0)

    is_organic = group.columns.get_level_values('Organic').values.astype(bool)
    # ``squeeze(axis=1)`` collapses only the column axis, so the result stays
    # a Series even when the frame happens to contain a single row.
    organics = group.loc[:, is_organic].squeeze(axis=1)
    non_organics = group.loc[:, ~is_organic].squeeze(axis=1)
    return organics - non_organics


def organic_spread_by_region(df):
    """What's the difference between the price of an organic
    and non-organic avocado within each region?

    ``df`` needs the columns ``Date``, ``Region``, ``Organic`` and
    ``Weighted Avg Price``. Returns a DataFrame indexed by date with one
    column per region holding the organic minus non-organic price
    (0.0 where a region lacks one of the two series).
    """
    # One column per (Region, Organic) pair, forward-filling missing dates.
    prices = (
        df
        .set_index(['Date', 'Region', 'Organic'])
         ['Weighted Avg Price']
        .unstack(level=['Region', 'Organic'])
        .ffill()
    )
    # ``groupby(..., axis=1)`` is deprecated in recent pandas releases, so
    # group the rows of the transposed frame instead and flip each group
    # (and the final result) back to dates-as-rows.
    return (
        prices.T
        .groupby(level='Region')
        .apply(lambda region_prices: _organic_spread(region_prices.T))
        .T
    )

In [None]:
spread_correlation = organic_spread_by_region(hass).corr()
spread_correlation

In [None]:
import seaborn as sns
grid = sns.clustermap(spread_correlation, annot=True)
# Take the heatmap axes from the grid's own attribute rather than relying on
# its position inside ``fig.axes``, which depends on how the grid is laid out.
ax = grid.ax_heatmap
ax.set_xticklabels(ax.get_xticklabels(), rotation=45);

**My own example...**

In [None]:
# Production of aromatic, culinary and medicinal plants, broken down by subgroup.
# Relies on the 'Grupo' / 'Subgrupo' column names introduced by the earlier rename of ``df2``.
prod_princ=df2[df2.Grupo == 'PLANTAS AROMATICAS, CONDIMENTARIAS Y MEDICINALES'] 
graph=sns.violinplot(data= prod_princ, x='Subgrupo', y='Producción\n(t)')

# Pandas Review

- Pandas extends numpy with more complex data structures and algorithms.
- If you understand numpy, you understand 90% of pandas.
- `groupby`, `set_index`, and `unstack` are powerful tools for working with categorical data.
- Avocado prices are surprisingly interesting :)

# Thanks!