# CHAPTER 11 - NUMPY AND PANDAS

In [1]:
import numpy as np
import pandas as pd

## NUMPY

### Description

- Ref: https://docs.scipy.org/doc/numpy/user/whatisnumpy.html
- Ref: https://s3.amazonaws.com/assets.datacamp.com/blog_assets/Numpy_Python_Cheat_Sheet.pdf
- NumPy provides a multidimensional array object. 
- Each object comes with an assortment of routines for fast operations on arrays, including mathematical, logical, shape manipulation, sorting, selecting, I/O, discrete Fourier transforms, basic linear algebra, basic statistical operations, random simulation and much more.
- Basically, you can use NumPy to create ndarray (N-dimensional array) and easily manipulate the array. It is super fast because it is written in C.

### How is it different from Python arrays?

- Python lists can be modified -- you can add and remove elements. NumPy arrays have a fixed size at creation. 
- Python lists can contain different data types. NumPy arrays can only have one data type. If you put in mixed types, they become a string. 
- NumPy arrays come prepackaged with advanced mathematical operations. The operations are super fast even on large numbers of data and they use less memory.

### Why use NumPy

- Most data analysis programs use NumPy to manipulate data. They might take in data as standard Python lists, but they convert it to a NumPy array and manipulate the data using NumPy routines and output the transformed data as a NumPy array. 
- NumPy data array is the main data type used in most scientific and mathematical Python-based packages.

In [2]:
# Square a list using Python
squared_values = []
for number in range(10):
    squared_values.append(number*number)

print(squared_values)

[0, 1, 4, 9, 16, 25, 36, 49, 64, 81]


In [3]:
vector = np.array(range(10))
scalar = 5
print(vector * scalar)
print(vector * vector)

[ 0  5 10 15 20 25 30 35 40 45]
[ 0  1  4  9 16 25 36 49 64 81]


### Numpy Basics

- NumPy arrays can be a 1-D array, called a vector, or a 2-D array, called a matrix 

#### NumPy casting -- covert Python list to a NumPy array

In [4]:
my_list = [1, 2, 3]
print(my_list)

my_vector = np.array(my_list)
print(my_vector)

my_matrix = np.array([my_list, my_list])
np.hstack([my_list, my_list])

my_nested_list = [[1, 2, 3], [4, 5, 6], [7, 8, 9]]
my_matrix = np.array(my_nested_list)
print(my_matrix)

my_list1 = [[1,2], [3,4]]
my_list2 = [[5,6], [7,8]]  
np.hstack([my_list1, my_list2])   
np.vstack([my_list1, my_list2])

[1, 2, 3]
[1 2 3]
[[1 2 3]
 [4 5 6]
 [7 8 9]]


array([[1, 2],
       [3, 4],
       [5, 6],
       [7, 8]])

#### NumPy creating arrays

In [5]:
my_list = range(10)

## Create array using arange
print(np.arange(10))
print(np.arange(0, 10))
print(np.arange(0, 10, 2))

## Create array of zeros
print(np.zeros(3))
print(np.zeros((3,3)))

## Create array of ones
print(np.ones(3))
print(np.ones((3,3)))
print(np.ones(3)*4)

## Create evenly spaced vector
### Example use case: when you have Y values for a plot but need to generate X values
### *** Includes both start an end 
# np.arange(start, end(not included), step size)
# np.linspace(start, end(included), number_of_points)
print(np.linspace(0, 10, 5))
print(np.linspace(1900, 2000, 11))

[0 1 2 3 4 5 6 7 8 9]
[0 1 2 3 4 5 6 7 8 9]
[0 2 4 6 8]
[0. 0. 0.]
[[0. 0. 0.]
 [0. 0. 0.]
 [0. 0. 0.]]
[1. 1. 1.]
[[1. 1. 1.]
 [1. 1. 1.]
 [1. 1. 1.]]
[4. 4. 4.]
[ 0.   2.5  5.   7.5 10. ]
[1900. 1910. 1920. 1930. 1940. 1950. 1960. 1970. 1980. 1990. 2000.]


#### Create an identify matrix

In [6]:
np.eye(3)

array([[1., 0., 0.],
       [0., 1., 0.],
       [0., 0., 1.]])

#### Creating an empty array

In [7]:
np.empty((2,3))

array([[4.9e-324, 9.9e-324, 1.5e-323],
       [4.9e-324, 9.9e-324, 1.5e-323]])

#### Creating Random Numbers
https://docs.scipy.org/doc/numpy-1.14.0/reference/routines.random.html

In [8]:
# Uniform distribution 
print(np.random.rand(3))
print(np.random.rand(3,3))

# Normal distribution
print(np.random.randn(3))
print(np.random.randn(3,3))


# Random integers
# np.random.randint(start, end(not_included), size)

print(np.random.randint(1,101))
print(np.random.randint(1,101,5))

[0.82381461 0.08481409 0.24901753]
[[0.98388329 0.69364026 0.68804098]
 [0.15836186 0.61717897 0.37239912]
 [0.06996152 0.53067113 0.48564033]]
[0.37579885 1.01775925 0.48686442]
[[-0.45121331 -0.28814867  0.26173505]
 [-1.75966337  1.28575948  0.64130561]
 [-0.90438864 -0.70048992 -0.96333942]]
66
[49  6 38 44 80]


#### Reshaping arrays

In [9]:
vector = np.arange(1,10)
print(vector.reshape(3,3))

vector = np.arange(1,13)
print(vector.reshape(3,4))

[[1 2 3]
 [4 5 6]
 [7 8 9]]
[[ 1  2  3  4]
 [ 5  6  7  8]
 [ 9 10 11 12]]


#### Basic array operations

In [11]:
vector = np.random.randint(1,50,25)
print(vector)

# Min
print(vector.min())

# Max
print(vector.max())

# get location of min value
index = vector.argmin()
print(index)

# get location of max value
index = vector.argmax()
print(index)

# get shape
print(vector.shape)

my_matrix = vector.reshape(5, 5)
print(my_matrix.shape)

[ 1 15 45 18 37 40 24 35 20  3 34 47  3  7  5 11  4 10 18 26 35  5 13  8
 39]
1
47
0
11
(25,)
(5, 5)


#### Indexing a 1-D array -- vector

In [12]:
vector = np.array(range(10))
# vector[index]
# vector [start:end]
# vector [:end]
# vector [start:]
# vector [start, end, step]

print(vector[3])
print(vector[3:8])
print(vector[:5])
print(vector[5:])
print(vector[3:9:2])
print(vector[-1])

3
[3 4 5 6 7]
[0 1 2 3 4]
[5 6 7 8 9]
[3 5 7]
9


#### Setting multiple values at once -- Broadcasting

- There are two main features of NumPy arrays
  - Broadcasting -- set multiple values at once
  - Vectorization -- no need for explicit looping -- example, vector multiplication or squaring

In [13]:
vector[3:6] = 12

In [14]:
print(vector)

[ 0  1  2 12 12 12  6  7  8  9]


- If you store a slice of an array in a new variable, changes in the new variable will be reflected in the original array. 

In [15]:
vector = np.array(range(10))
my_slice = vector[3:7]
my_slice[:] = 20
print(vector)

[ 0  1  2 20 20 20 20  7  8  9]


- Copy the array if you need a copy

In [16]:
vector = np.array(range(10))
my_slice_copy = vector[3:7].copy()
print(vector)

[0 1 2 3 4 5 6 7 8 9]


#### Indexing a 2-D array -- Matrix

In [17]:
matrix = np.array(range(1,10)).reshape((3,3))
print(matrix)

print(matrix[0,0])
print(matrix[0][0]) 
print(matrix[2,2])  
print(matrix[2][2]) 

print(matrix[:,2]) # Grab the third column
print(matrix[1,:]) # Grab the second row
print(matrix[:2]) # grab the first two rows, all columns
print(matrix[:2,:]) # grab the first two rows, all columns
print(matrix[:,1:]) # grab all the rows, but columns starting from 1

[[1 2 3]
 [4 5 6]
 [7 8 9]]
1
1
9
9
[3 6 9]
[4 5 6]
[[1 2 3]
 [4 5 6]]
[[1 2 3]
 [4 5 6]]
[[2 3]
 [5 6]
 [8 9]]


#### Conditional selection

In [18]:
vector = np.arange(10)
gt2 = vector > 2 # create condition
lt8 = vector < 8 # create condition

selected_gt2 = vector[gt2] # apply condition to select
selected_lt8 = vector[lt8] # apply condition to select

print(vector[vector>2])
print(vector[vector<8])

cond = (vector>2) & (vector<7)
print(vector[cond])

cond = (vector>=2) & (vector<=7)
print(vector[cond])

[3 4 5 6 7 8 9]
[0 1 2 3 4 5 6 7]
[3 4 5 6]
[2 3 4 5 6 7]


#### Array operations -- Basic

In [20]:
vector = np.arange(10)

print(vector + vector)
print(vector - vector)
print(vector * vector)
print(vector / vector)# problem!!! return `nan` 
print(vector + 10)
print(vector - 10)
print(vector * 10)
print(vector / 10)

[ 0  2  4  6  8 10 12 14 16 18]
[0 0 0 0 0 0 0 0 0 0]
[ 0  1  4  9 16 25 36 49 64 81]
[nan  1.  1.  1.  1.  1.  1.  1.  1.  1.]
[10 11 12 13 14 15 16 17 18 19]
[-10  -9  -8  -7  -6  -5  -4  -3  -2  -1]
[ 0 10 20 30 40 50 60 70 80 90]
[0.  0.1 0.2 0.3 0.4 0.5 0.6 0.7 0.8 0.9]


  print(vector / vector)# problem!!! return `nan`


#### Array operations -- Advanced

- Ref: https://docs.scipy.org/doc/numpy/reference/ufuncs.html#math-operations
- https://stackoverflow.com/questions/25773245/ambiguity-in-pandas-dataframe-numpy-array-axis-definition/43413031

In [22]:
vector = np.arange(10)
print(np.max(vector))
print(np.min(vector))
print(np.sqrt(vector))
print(np.log(vector))

print(sum(vector<5))
import math
vector = np.arange(1,11) * math.pi 
print(np.sin(vector))
vector = np.arange(0,math.pi+math.pi/4,math.pi/4)
print(np.sin(vector))

matrix = np.random.rand(5,5)
print(np.floor(matrix*1000)/1000)
print(np.round(matrix*1000)/1000)
print(np.ceil(matrix*1000)/1000)

matrix = np.arange(1,10).reshape(3,3)
print(matrix.sum(axis=1)) 
print(matrix.sum(axis=0)) 
print(matrix.cumsum()) 
print(matrix.cumprod()) 

print(matrix.min(axis=1)) 
print(matrix.min(axis=0)) 

print(matrix.max(axis=1)) 
print(matrix.max(axis=0)) 

matrix = np.array([1,2,3]*3).reshape(3,3)
print(np.unique(matrix.reshape(3,3)))

9
0
[0.         1.         1.41421356 1.73205081 2.         2.23606798
 2.44948974 2.64575131 2.82842712 3.        ]
[      -inf 0.         0.69314718 1.09861229 1.38629436 1.60943791
 1.79175947 1.94591015 2.07944154 2.19722458]
5
[ 1.22464680e-16 -2.44929360e-16  3.67394040e-16 -4.89858720e-16
  6.12323400e-16 -7.34788079e-16  8.57252759e-16 -9.79717439e-16
  1.10218212e-15 -1.22464680e-15]
[0.00000000e+00 7.07106781e-01 1.00000000e+00 7.07106781e-01
 1.22464680e-16]
[[0.028 0.156 0.357 0.297 0.111]
 [0.275 0.953 0.304 0.787 0.974]
 [0.596 0.877 0.668 0.991 0.806]
 [0.659 0.858 0.866 0.557 0.618]
 [0.568 0.289 0.13  0.348 0.15 ]]
[[0.029 0.157 0.358 0.298 0.111]
 [0.275 0.953 0.305 0.788 0.975]
 [0.597 0.877 0.668 0.991 0.806]
 [0.659 0.859 0.867 0.558 0.619]
 [0.568 0.289 0.131 0.349 0.151]]
[[0.029 0.157 0.358 0.298 0.112]
 [0.276 0.954 0.305 0.788 0.975]
 [0.597 0.878 0.669 0.992 0.807]
 [0.66  0.859 0.867 0.558 0.619]
 [0.569 0.29  0.131 0.349 0.151]]
[ 6 15 24]
[12 15 18]
[ 1  3

  print(np.log(vector))


## PANDAS

Refer - https://mkzia.github.io/eas503-notes/np_pd/np_pd.html

## MACHINE LEARNING

Refer - https://mkzia.github.io/eas503-notes/machine_learning.html