# Math with Numpy and Pandas

In [None]:
import numpy as np
import pandas as pd
import time
import math

<img src="https://drive.google.com/uc?export=view&id=17vrPmO6pUyio8yLpFSrSVu7jdR4Aeyj9" width="600"/>


## Creating Arrays

In [None]:
# 1D array
x = np.array([1,2,3,4,5,6,7])
x

array([1, 2, 3, 4, 5, 6, 7])

In [None]:
# shape
x.shape

(7,)

In [None]:
# 2D array
y = np.array([(1,2),(3,4)])
y

array([[1, 2],
       [3, 4]])

In [None]:
y.shape

(2, 2)

In [None]:
# 3D array

w = np.array([((1,1,1), (1,1,1), (1,1,1)),
              ((2,2,2), (2,2,2), (2,2,2))])
w

array([[[1, 1, 1],
        [1, 1, 1],
        [1, 1, 1]],

       [[2, 2, 2],
        [2, 2, 2],
        [2, 2, 2]]])

In [None]:
w.shape

(2, 3, 3)

In [None]:
# zeros
so_many_zeros = np.zeros(shape = (2,2))
so_many_zeros

array([[0., 0.],
       [0., 0.]])

In [None]:
# ones
so_many_ones = np.ones(shape = (2,2))
so_many_ones

array([[1., 1.],
       [1., 1.]])

In [None]:
# range
z = np.arange(0,10,2)
z

array([0, 2, 4, 6, 8])

In [None]:
# sequence
a = np.linspace(0,10,20)
a

array([ 0.        ,  0.52631579,  1.05263158,  1.57894737,  2.10526316,
        2.63157895,  3.15789474,  3.68421053,  4.21052632,  4.73684211,
        5.26315789,  5.78947368,  6.31578947,  6.84210526,  7.36842105,
        7.89473684,  8.42105263,  8.94736842,  9.47368421, 10.        ])

In [None]:
# random
np.random.random((10,10))

array([[9.40748268e-01, 3.18637108e-02, 6.73022763e-01, 5.66319900e-01,
        9.47577797e-02, 6.06725341e-01, 9.18601551e-01, 8.59268650e-04,
        4.39108436e-01, 6.81504065e-01],
       [4.02297637e-01, 8.93449851e-01, 1.33778500e-01, 7.36339154e-01,
        8.17732238e-01, 8.00922757e-01, 7.84765571e-01, 7.31773754e-01,
        1.80564749e-01, 8.91177236e-02],
       [9.33879319e-01, 2.91880403e-01, 1.23430227e-01, 2.84669881e-02,
        1.85491934e-01, 5.22722348e-01, 5.52792895e-01, 1.89359074e-01,
        6.96174843e-01, 5.74059438e-01],
       [1.63982488e-01, 9.96098933e-01, 4.91878853e-01, 2.96353995e-01,
        5.47362018e-01, 3.45274806e-01, 6.87088344e-01, 4.25976126e-01,
        8.78101744e-01, 2.35824240e-01],
       [9.13098523e-01, 8.02370597e-01, 6.59184485e-01, 9.50076094e-01,
        9.89674292e-01, 7.90504686e-01, 3.08744271e-01, 6.80396934e-01,
        5.76319883e-02, 9.27393897e-02],
       [8.07986990e-01, 8.99553245e-01, 8.72553383e-01, 3.00762114e-01,
   

## Review of Vectorization

In [None]:
# create arrays
x = np.array([2,2,2,2])
y = np.array([1,2,3,4])

In [None]:
# addition
x + y

array([3, 4, 5, 6])

In [None]:
# multiplication
x*y

array([2, 4, 6, 8])

In [None]:
# masking: elementwise multiplication by a 0/1 array keeps the entries of
# `out` where mask is 1 and zeroes the entries where mask is 0

mask = np.array([1,0,1,1,0,0,0,0])
out = np.array([10,1,4,5,6,7,8,8])

mask * out

array([10,  0,  4,  5,  0,  0,  0,  0])

In [None]:
# division
y/x

array([0.5, 1. , 1.5, 2. ])

In [None]:
# comparison
z = np.array([-2,1,0,4,-3,7])
z > 0

array([False,  True, False,  True, False,  True])

In [None]:
# dot product/linear combinations
gradeweights = np.array([0.2,0.2,0.2,0.2,0.2])
grades = np.array([90,80,70,100,75])

final_grade = gradeweights.dot(grades)
final_grade

83.0

## Distance and Norms

Norms are a way of measuring the *size* of a vector. An $L_p$ norm raises the absolute value of each element of the vector to the $p$th power, takes the **sum** of those values, and then takes the $p$th root of that sum.

$$L_p = \sqrt[p]{\sum_{i=1}^{n} |x_i|^p}$$

<img src="https://drive.google.com/uc?export=view&id=17wpdRjx0b4eWGZXuo9V2Tz9h8WMBmnSL" width="300"/>


In [None]:
# size/norm of array
x = np.array([4,3])

np.linalg.norm(x)

5.0

A common norm you may know: Euclidean Distance.

**Euclidean Distance** is measured as:

$$\text{dist} = \sqrt{\sum_{i=1}^n (\color{red}{x_i-y_i})^2}$$


Notice, this is just the $L_2$ Norm of the difference between $\mathbf{x}$ and $\mathbf{y}$. That's because Euclidean Distance is measuring the *size* of the vector that goes from $\mathbf{x}$ to $\mathbf{y}$.

<img src="https://drive.google.com/uc?export=view&id=1iVoTY7hS4x7IiS-_weewV3QtDYOYWJYH" width="300"/>


In [None]:
# distance
y = np.array([1,4])

np.linalg.norm(x-y)

3.1622776601683795

## Mean, SD, Covariance with Numpy

In [None]:
# create arrays
x = np.array([-2.16,-0.43,1.99,1.84,-1.44,-2.37,1.61,2.93,-0.29,-3.17])
y = np.array([-5.02,1.65,4.67,1.32,-6.1,-1.03,7.6,7.91,-3.89,0.24])

In [None]:
# mean
np.mean(x)

-0.14900000000000002

In [None]:
# standard deviation
np.std(x)

2.0242205907459794

In [None]:
# std by hand: square root of the mean squared deviation from the mean
# (divides by len(x), not len(x) - 1)
np.sqrt(np.sum((x - np.mean(x))**2)/len(x))

2.0242205907459794

In [None]:
# correlation matrix
np.corrcoef(x,y)

array([[1.        , 0.73704025],
       [0.73704025, 1.        ]])

## LeetCode/Interview Questions with Numpy


### Simple Linear Regression from Scratch with Numpy

We know that in a simple linear regression model with one predictor, the coefficient is:

$$b = \rho*\frac{s_y}{s_x} = \frac{cov(x,y)}{var(x)}$$

and the intercept will be:

$$a = \bar{y} - b*\bar{x}$$

where $\bar{y}$ is the mean of the outcome, $y$ and $\bar{x}$ is the mean of the predictor, $x$.

In [None]:
# slope: correlation of x and y scaled by the ratio of their standard deviations
b = np.corrcoef(x,y)[0,1] * (np.std(y)/np.std(x))
b

1.7072893046902855

In [None]:
# intercept: mean of y minus the slope times the mean of x
a = np.mean(y) - b*np.mean(x)
a

0.9893861063988525

### Prime Check

We'll write a function checking if a number is prime. To be efficient, we'll only check values $\leq \sqrt{n}$. This is because a number $n$ cannot have a pair of factors where *both* are $\gt \sqrt{n}$ (their product would be larger than $n$). Thus, if $n$ has a factor other than 1 and itself, at least one such factor must be between 2 and $\sqrt{n}$ (inclusive).

You might remember this from CPSC 230!



In [None]:
n = 97

In [None]:
def is_prime(n):
  """Return True if the integer n is prime, using trial division.

  Only candidate divisors from 2 through floor(sqrt(n)) are tried: if n has
  a divisor other than 1 and itself, at least one such divisor is no larger
  than sqrt(n).

  Parameters
  ----------
  n : int
    The number to test.

  Returns
  -------
  bool
    True if n is prime, False otherwise (including for n < 2).
  """
  # 0, 1 and negative numbers are not prime. Without this guard the loop
  # below would be empty for 0 and 1 (wrongly reporting them as prime) and
  # math.sqrt would raise ValueError for negative input.
  if n < 2:
    return False

  for div in range(2, int(math.sqrt(n)) + 1):
    if n % div == 0:
      return False
  return True

is_prime(n)

True

In [None]:
def is_prime(n):
  """Return True if the integer n is prime, using a vectorized check.

  Builds the array of candidate divisors 2..floor(sqrt(n)) and tests them
  all at once with the modulo operator.

  Parameters
  ----------
  n : int
    The number to test.

  Returns
  -------
  bool
    True if n is prime, False otherwise (including for n < 2).
  """
  # 0, 1 and negative numbers are not prime. Without this guard the
  # candidate array is empty for 0 and 1 (so np.any is False and they would
  # be reported as prime), and np.sqrt of a negative number yields nan.
  if n < 2:
    return False

  check = np.arange(2, int(np.sqrt(n)) + 1)
  # n is prime exactly when no candidate divides it evenly
  return not np.any(n % check == 0)
is_prime(n)

True

### Mice Eating Cheese
From [leetcode](https://leetcode.com/problems/mice-and-cheese/description/)

There are two mice and `n` different types of cheese, each type of cheese should be eaten by exactly one mouse.

A point of the cheese with index `i` (0-indexed) is:
- `reward1[i]` if the first mouse eats it.
- `reward2[i]` if the second mouse eats it.

You are given a positive integer array `reward1`, a positive integer array `reward2`, and a non-negative integer `k`.

Return the maximum points the mice can achieve if the first mouse eats exactly `k` types of cheese.


(HINT: `np.argpartition(list,-k)[-k:]` returns the indices of the k *largest* values in `list`)



In [None]:
def max_reward(reward1, reward2, k):
  """Maximum total points when the first mouse eats exactly k cheeses.

  Start from the second mouse eating everything (sum of reward2). Handing
  cheese i to the first mouse instead changes the total by
  reward1[i] - reward2[i], so the best choice is to give the first mouse
  the k cheeses with the largest differences.

  Parameters
  ----------
  reward1 : array-like of int
    Points for each cheese if the first mouse eats it.
  reward2 : array-like of int
    Points for each cheese if the second mouse eats it; same length as
    reward1.
  k : int
    Number of cheeses the first mouse must eat (0 <= k <= len(reward1)).

  Returns
  -------
  numpy integer
    The maximum achievable total.
  """
  # accept plain Python lists as well as numpy arrays
  reward1 = np.asarray(reward1)
  reward2 = np.asarray(reward2)

  # k == 0 must be handled separately: slicing with [-0:] below would
  # select every element instead of none.
  if k == 0:
    return(np.sum(reward2))

  diff = reward1 - reward2 # positive means first mouse better
  mouse1_eats = np.argpartition(diff,-k)[-k:] # indices of k largest differences

  # everything goes to mouse 2, then add the gain for each cheese switched to mouse 1
  reward = np.sum(reward2) + np.sum(diff[mouse1_eats])
  return(reward)

In [None]:
x = np.array([1,1])
y = np.array([1,1])
k = 2

In [None]:
max_reward(x,y,k)

2