#    Introduction to NumPy
Learning NumPy!

In [1]:
import matplotlib.pyplot as plt
import numpy as np

## Differences betweens lists and NumPy arrays

* An array's size is immutable. You cannot append, insert or remove elements, like you can with a list. 
* All of an arrays element must be the same data type
* A Numpy array behaves in a Pythonic fashion. You can `len(my_array)` just like you would assume. 

In [2]:
gpas_as_list = [4.0, 3.286, 3.5]

In [3]:
# can have elements appended to it 
gpas_as_list.append(4.0)
# can have multiple datatypes in it
gpas_as_list.insert(1, "whatevs")
# can have items removed
gpas_as_list.pop(1)

'whatevs'

In [4]:
gpas_as_list

[4.0, 3.286, 3.5, 4.0]

In [5]:
gpas = np.array(gpas_as_list)

In [6]:
?gpas

In [7]:
gpas.dtype

dtype('float64')

In [8]:
study_minutes = np.zeros(100, np.uint16)
study_minutes

array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], dtype=uint16)

In [9]:
%whos

Variable        Type       Data/Info
------------------------------------
gpas            ndarray    4: 4 elems, type `float64`, 32 bytes
gpas_as_list    list       n=4
np              module     <module 'numpy' from 'C:\<...>ges\\numpy\\__init__.py'>
plt             module     <module 'matplotlib.pyplo<...>\\matplotlib\\pyplot.py'>
study_minutes   ndarray    100: 100 elems, type `uint16`, 200 bytes


In [10]:
students_gpas = np.array([
    [4.0, 3.286, 3.5, 4.0],
    [3.2, 3.8, 4.0, 4.0],
    [3.96, 3.92, 4.0, 4.0]
], np.float16)
students_gpas

array([[4.   , 3.285, 3.5  , 4.   ],
       [3.2  , 3.8  , 4.   , 4.   ],
       [3.96 , 3.92 , 4.   , 4.   ]], dtype=float16)

In [11]:
students_gpas.ndim

2

In [12]:
students_gpas.shape

(3, 4)

In [13]:
np.info(students_gpas)

class:  ndarray
shape:  (3, 4)
strides:  (8, 2)
itemsize:  2
aligned:  True
contiguous:  True
fortran:  False
data pointer: 0x1394d703dd0
byteorder:  little
byteswap:  False
type: float16


In [14]:
students_gpas[1]

array([3.2, 3.8, 4. , 4. ], dtype=float16)

In [15]:
students_gpas[1][2]

4.0

## About data types

* By choosing the proper data type you can greatly reduce the size required to store objects
* Data types are maintained by wrapping values in a scaler representation
* np.zeros() is a handy way to create an empty array filled with zero's

In [16]:
study_minutes = np.zeros(100, np.uint16)
study_minutes[0] = 150
first_day_minutes = study_minutes[0]
print(first_day_minutes)
type(first_day_minutes)

150


numpy.uint16

In [17]:
study_minutes[1] = 60
study_minutes

array([150,  60,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
         0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
         0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
         0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
         0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
         0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
         0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
         0,   0,   0,   0,   0,   0,   0,   0,   0], dtype=uint16)

In [18]:
study_minutes[2:6] = [80, 30, 60, 90]
study_minutes

array([150,  60,  80,  30,  60,  90,   0,   0,   0,   0,   0,   0,   0,
         0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
         0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
         0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
         0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
         0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
         0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
         0,   0,   0,   0,   0,   0,   0,   0,   0], dtype=uint16)

# Multidimensional Arrays
* The data structure is actually called `ndarry`, representing any **n**umber of __d__imensions
* Arrays can have multiple dimensions, you can declare them on creation
* Dimensions help define what each element in the array represents. A two dimensional array is just an array of arrays
* **Rank** defines how many dimensions an array contains
* **Shape** defines the length of each of the arrays dimensions
* Each dimension is also referred to as an **axis** and are zero-indexed. Multiple of these are referred to as **axes**
* A 2D array is also known as a matrix

# Creation
* You can create but bound grouping of values using the `np.random` function
    * `RandomState` is the same as `set.seed()` from R 
* You can append a row in a couple of ways: 
    * You can use the `np.append` method if the new row is the same shape
    * You can create / reassign a new array by including the existing array as part of the iterable in creatoin

# Indexing
* You can use an indexing shortcut by seperating dimensions with a comma
* You can index using a list or a `np.array()`. Values will be pulled out at the specific index. This is known as fancy indexing.
    * Resulting array shape matches the index array layout. Be careful to distinguish between the tuple shortcut and fancy indexing
    

In [25]:
study_minutes = np.array([
    study_minutes,
    np.zeros(100), np.uint16
])


In [20]:
study_minutes[1][0] # This is equal to study_minutes(1, 0)
study_minutes

array([array([150,  60,  80,  30,  60,  90,   0,   0,   0,   0,   0,   0,   0,
         0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
         0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
         0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
         0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
         0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
         0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
         0,   0,   0,   0,   0,   0,   0,   0,   0], dtype=uint16),
       array([0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 

In [24]:
rand = np.random.RandomState(42)
fake_log = rand.randint(30, 180, size = 100, dtype = np.uint16)
fake_log, study_minutes

(array([132, 122, 128,  44, 136, 129, 101,  95,  50, 132, 151,  64, 104,
        175, 117, 146, 139, 129, 133, 176,  98, 160, 179,  99,  82, 142,
         31, 106, 117,  56,  98,  67, 121, 159,  81, 170,  31,  50,  49,
         87, 179,  51, 116, 177, 118,  78, 171, 117,  88, 123, 102,  44,
         79,  31, 108,  80,  59, 137,  84,  93, 155, 160,  67,  80, 166,
        164,  70,  50, 102, 113,  47, 131, 161, 118,  82,  89,  81,  43,
         81,  38, 119,  52,  82,  31, 159,  57, 113,  71, 121, 140,  91,
         70,  37, 106,  64, 127, 110,  58,  93,  79], dtype=uint16),
 array([array([150,  60,  80,  30,  60,  90,   0,   0,   0,   0,   0,   0,   0,
          0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
          0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
          0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
          0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
          0,   0,   0,   0,   0,   0,   0,   0, 

In [23]:
study_minutes = np.append(study_minutes, [fake_log], axis=0)


ValueError: all the input arrays must have same number of dimensions

# Boolean Array Indexing
* You can create a boolean array by using comparison operators on an array
    * You can use boolean arrays for fancy indexing
    * Boolean arrays cane be compared by using bitwise operators (`&`, `|`)
        * Do not use the `and` keyword
        * Remember the order of operation when combining
* Even though boolean indexing returns a new array, you can update the existing array using a boolean index

In [None]:
# subsetting like in R
fake_log[fake_log < 60]

In [None]:
# can create boolean indexers using arrays to subset for multiple conditions
np.array([True, False, True]) & np.array([False, True, True])

In [None]:
# Boolean array indexing
fake_log[np.logical_and(fake_log < 60 , fake_log >= 0)] = 10
fake_log

# Slicing

* Works a lot like normal list slicing
* You can use commas to seperate each dimension slice
* Always returns a data view and not a copy
* You can access the base property by the `ndarray.base()` method

In [None]:
# slicing a list returns a copy
fruit = ["apple", "banana", "cherry", "durian"]
fruit[1:3]
copied = fruit[:]
copied[3] = "cheese"
copied

In [None]:
practice = np.arange(42)
practice.shape = (7, 6)
practice

In [None]:
practice[2]

In [None]:
practice[2,1]

In [None]:
# any slicing of ndarray returns a view and not a copy!
not_copied = practice[:]
not_copied

# how to check if it is a copy or a view
practice.base is None, not_copied.base is None

In [None]:
practice_view = practice.reshape(3,14)
practice, practice_view

In [None]:
# alt + enter on this to search NumPy for documentation
# np.lookfor("flat")
# the ".T" dot method is transpose

# Linear algebra
* There is a module for linear algebra, `linalg`
* You can solve a system of equations using `.solve()`
    * You can create a 2 dimensional matrix and a target row vector and solve for each variable column
    * You can double check the answer using the inner product or dot
* You can use `@` to produce the dot product of two arrays

In [None]:
orders = np.array([
    [2,0,0,0],
    [4,1,2,2],
    [0,1,0,1],
    [6,0,1,2]
])
totals = np.array([3, 20.50, 10, 14.25])
price = np.linalg.solve(orders, totals)
price

# Universal Functions
* ufuncs are commonly needed vectorised functions
    * Vectorised functions allow you to operate element by element without using a loop
* The standard math and comparison operate have been overloaded so that they can all make use for vectorisation
* Values can be broadcasted, or stretched, to be applied to the ufuncs

# Common Routines
* Common mathematical routines are exposed so the formula can be abstracted away
    * `mean` is a statistical routine used to calculate the average
* Reduction functions take a dimension and collapse it into a single value
    * These functions define an axis parameter, and you should remember that the function works across the dimension
    

In [None]:
# A ● B = C
orders @ price == totals

In [None]:
a, b = np.split(np.arange(1, 11), 2)
a, b

In [None]:
students_gpas, students_gpas.mean(axis=1) # 1 for rows, 0 for columns

In [None]:

study_minutes[np.logical_and(study_minutes > 0)]
