# A Quick Introduction to Numerical Data Manipulation with Python and NumPy

In [1]:
import numpy as np
import pandas as pd

## 1. DataTypes and attributes

**NOTE:** It's important to remember that the main type in NumPy is `ndarray`. Even seemingly different kinds of arrays are still `ndarray`s, which means an operation you perform on one array will generally work on another.

In [2]:
# A one-dimensional ndarray built from a plain Python list.
a1 = np.array([1, 2, 3])
a1

array([1, 2, 3])

In [3]:
# type: check which Python type np.array() handed back
type(a1)

numpy.ndarray

In [16]:
# A two-dimensional array; mixing ints and floats yields a float array.
a2 = np.array(
    [
        [1, 2.2, 3.0],
        [4, 5, 6],
    ]
)
a2

array([[1. , 2.2, 3. ],
       [4. , 5. , 6. ]])

In [17]:
# A three-dimensional array: two blocks, each with three rows of three values.
a3 = np.array(
    [
        [[1, 2, 3], [4, 5, 6], [7, 8, 9]],
        [[10, 11, 12], [13, 14, 15], [16, 17, 18]],
    ]
)
a3

array([[[ 1,  2,  3],
        [ 4,  5,  6],
        [ 7,  8,  9]],

       [[10, 11, 12],
        [13, 14, 15],
        [16, 17, 18]]])

In [18]:
# shape: the length of each array along every dimension, one array per line
print(a1.shape, a2.shape, a3.shape, sep="\n")

(3,)
(2, 3)
(2, 3, 3)


In [19]:
# ndim: the number of dimensions of each array
tuple(arr.ndim for arr in (a1, a2, a3))

(1, 2, 3)

In [22]:
# dtype: the element type stored in each array
(
    a1.dtype,
    a2.dtype,
    a3.dtype,
)

(dtype('int32'), dtype('float64'), dtype('int32'))

In [23]:
# size: the total number of elements in each array
tuple(arr.size for arr in (a1, a2, a3))

(3, 6, 18)

In [28]:
# pandas DataFrame and NumPy arrays
# Wrapping the 2-D array a2 in a DataFrame keeps its rows and values; pandas
# supplies default integer labels for the rows and columns.
# (pandas is imported as pd in the imports cell at the top of the notebook.)

df = pd.DataFrame(a2)
df

Unnamed: 0,0,1,2
0,1.0,2.2,3.0
1,4.0,5.0,6.0


In [30]:
# Same data as before, but with explicit column labels instead of 0, 1, 2.
df2 = pd.DataFrame(data=a2, columns=["a", "b", "c"])
df2

Unnamed: 0,a,b,c
0,1.0,2.2,3.0
1,4.0,5.0,6.0


## 2. Creating arrays

In [31]:
# np.array(): build an array directly from existing Python values.
sample_array = np.array([1, 2, 3])
sample_array

array([1, 2, 3])

In [36]:
# np.ones(): an array of the requested shape filled with 1.0
ones = np.ones(shape=(5, 2))
ones

array([[1., 1.],
       [1., 1.],
       [1., 1.],
       [1., 1.],
       [1., 1.]])

In [39]:
# np.zeros(): an array of the requested shape filled with 0.0
zeros = np.zeros(shape=(2, 6))
zeros

array([[0., 0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0., 0.]])

In [40]:
# dtype with zeros or ones: pass dtype to get integers instead of floats
zerosInt = np.zeros(shape=(2, 3), dtype=int)
zerosInt

array([[0, 0, 0],
       [0, 0, 0]])

In [46]:
# range array: consecutive integers from 0 up to, but not including, 10
range_array = np.arange(0, 10)
range_array

array([0, 1, 2, 3, 4, 5, 6, 7, 8, 9])

In [55]:
# random: a 2x2 array of integers drawn from 2 (inclusive) to 10 (exclusive)
random_array = np.random.randint(low=2, high=10, size=(2, 2))
random_array

array([[8, 7],
       [8, 6]])

In [59]:
# A 5x3 array of random floats.
random_array2 = np.random.random(size=(5, 3))
random_array2

array([[0.99029286, 0.11903301, 0.15641022],
       [0.89232523, 0.12370648, 0.67631235],
       [0.7889151 , 0.95963116, 0.38115774],
       [0.00533503, 0.29141823, 0.25678661],
       [0.16615362, 0.00658664, 0.47260477]])

In [64]:
# np.random.rand() takes the dimensions as separate arguments, not as a tuple.
random_array3 = np.random.rand(2, 10)
random_array3

array([[0.68935284, 0.34026086, 0.7729689 , 0.45441185, 0.23160959,
        0.67329425, 0.44730489, 0.16790836, 0.66270911, 0.79307365],
       [0.3442448 , 0.77788684, 0.65950012, 0.19868662, 0.3509101 ,
        0.03073763, 0.06660428, 0.93638856, 0.62408913, 0.68014432]])

NumPy generates pseudo-random numbers: they look random, but they are actually predetermined by the generator's starting state (its seed).

For reproducibility, you might want the random numbers you generate to be the same every time you run an experiment.

To do this, you can use `np.random.seed()`.

In [90]:
# seed: fix the generator's starting state so the draw below is repeatable
np.random.seed(1)

# A 2x4 array of integers from 0 (inclusive) to 100 (exclusive).
random_array4 = np.random.randint(low=0, high=100, size=(2, 4))
random_array4


array([[37, 12, 72,  9],
       [75,  5, 79, 64]])

In [96]:
# A different seed gives a different, but equally repeatable, sequence.
np.random.seed(3)

# Note: this rebinds random_array4, now as a 2x3 array of floats.
random_array4 = np.random.random(size=(2, 3))
random_array4

array([[0.5507979 , 0.70814782, 0.29090474],
       [0.51082761, 0.89294695, 0.89629309]])

## 3. Viewing arrays and matrices (indexing)

In [101]:
# np.unique: first build a small integer array that is likely to hold repeats
intArray = np.random.randint(low=0, high=10, size=(2, 3))
intArray

array([[7, 8, 1],
       [6, 2, 2]])

In [102]:
# np.unique: the distinct values of intArray, returned sorted as a 1-D array
np.unique(intArray)

array([1, 2, 6, 7, 8])

In [111]:
# indexing: show the whole array, then pick out single elements by position
print(a1)
print("first:", a1[0])  # position 0 is the first element
print("last:", a1[-1])  # negative positions count back from the end

[1 2 3]
first: 1
last: 3


In [106]:
# first row: a single index on a 2-D array selects one whole row
a2[0]

array([1. , 2.2, 3. ])

In [108]:
# first column: ":" keeps every row, 0 picks the first value from each
a2[:,0]

array([1., 4.])

In [115]:
# slicing: print the shape and full contents of a3 before slicing it
print(a3.shape, a3, sep="\n")

(2, 3, 3)
[[[ 1  2  3]
  [ 4  5  6]
  [ 7  8  9]]

 [[10 11 12]
  [13 14 15]
  [16 17 18]]]


In [117]:
# slicing: keep both blocks, but only the first two rows and first two
# columns of each
a3[:,:2,:2]

array([[[ 1,  2],
        [ 4,  5]],

       [[10, 11],
        [13, 14]]])

## 4. Manipulating and comparing arrays

- Arithmetic
    - `+`, `-`, `*`, `/`, `//`, `**`, `%`
    - np.exp()
    - np.log()
    - Dot product - np.dot()
    - Broadcasting
- Aggregation
    - np.sum() - much faster than Python's built-in sum() on large arrays; on tiny inputs such as the 3-element timing demo below, call overhead dominates
    - np.mean()
    - np.std()
    - np.var()
    - np.min()
    - np.max()
    - np.argmin() - find index of minimum value
    - np.argmax() - find index of maximum value
    - These work on all ndarray's
        - a4.min(axis=0) -- you can use axis as well
- Reshaping
    - np.reshape()
- Transposing
    - a3.T
- Comparison operators
    - `>`
    - `<`
    - `<=`
    - `>=`
    - `x != 3`
    - `x == 3`
    - `np.sum(x > 3)`

### Arithmetic

In [120]:
# + : element-wise addition (the integer array plus float ones gives floats)
arr1 = np.array([1, 2, 3])
arr2 = np.ones(shape=3)

result = arr1 + arr2
result

array([2., 3., 4.])

In [121]:
# - : element-wise subtraction
arr1 = np.array([1, 2, 3])
arr2 = np.ones(shape=3)

result = arr1 - arr2
result

array([0., 1., 2.])

In [122]:
# * : element-wise multiplication (not a matrix product)
arr1 = np.array([1, 2, 3])
arr2 = np.ones(shape=3)

result = arr1 * arr2
result

array([1., 2., 3.])

In [123]:
# / : element-wise true division; here the ones are divided by arr1
arr1 = np.array([1, 2, 3])
arr2 = np.ones(shape=3)

result = arr2 / arr1
result

array([1.        , 0.5       , 0.33333333])

In [126]:
# // : element-wise floor division (the quotient is rounded down)
arr1 = np.array([1, 2, 3])
arr2 = np.array([2, 23, 12])

result = arr2 // arr1
result

array([ 2, 11,  4], dtype=int32)

In [130]:
# % : element-wise remainder
arr1 = np.array([10, 24, 323])
arr2 = 9  # a plain scalar here, applied to every element of arr1

result = arr1 % arr2
result

array([1, 6, 8], dtype=int32)

In [134]:
# ** : element-wise exponentiation
arr1 = np.array([2, 4, 10])
arr2 = 2  # a plain scalar here: every element of arr1 is squared

result = arr1 ** arr2
result

array([  4,  16, 100], dtype=int32)

In [132]:
# broadcasting: the length-3 array a1 is multiplied against each row of the
# 2x3 array a2.
# Note: this rebinds a1 and a2; a2 previously held the 2x3 float array
# created in section 1.
a1 = np.array([1, 2, 3])
a2 = np.array(
    [
        [2, 2, 1],
        [3, 5, 8],
    ]
)

a1 * a2

array([[ 2,  4,  3],
       [ 3, 10, 24]])

In [135]:
# log: natural logarithm of every element
a1 = np.array([1, 2, 3])
np.log(a1)

array([0.        , 0.69314718, 1.09861229])

In [136]:
# exp: e raised to the power of each element of a1
np.exp(a1)

array([ 2.71828183,  7.3890561 , 20.08553692])

### Aggregation

In [137]:
# sum() : Python's built-in function, applied to a plain list
L = [1, 2, 3]
sum(L)

6

In [138]:
# np.sum() : NumPy's function, applied to an ndarray
arr = np.array([1, 2, 3])
np.sum(arr)

6

In [139]:
# difference between sum and np.sum
# NOTE(review): L and arr hold only three elements, and in the timings
# recorded for this cell the built-in sum() comes out ahead. To show NumPy's
# speed advantage, rerun the comparison on a much larger array —
# TODO confirm with e.g. one million values.

%timeit sum(L) # Python sum()
%timeit np.sum(arr) # NumPy np.sum()

120 ns ± 5.08 ns per loop (mean ± std. dev. of 7 runs, 10000000 loops each)
3.28 µs ± 118 ns per loop (mean ± std. dev. of 7 runs, 100000 loops each)


In [141]:
# mean: the average over every element of the 2x6 array
arr2 = np.array(
    [
        [1, 2, 3, 4, 12, 42],
        [12, 12, 4, 23, 1, 2],
    ]
)
np.mean(arr2)

9.833333333333334

In [142]:
# min: the smallest value across every element of arr2 (no axis given)

np.min(arr2)

1

In [143]:
# max: the largest value across every element of arr2 (no axis given)

np.max(arr2)

42

**What's mean?**

Mean is the same as average. You can find the average of a set of numbers by adding them up and dividing the total by how many there are.

**What's standard deviation?**

Standard deviation is a measure of how spread out numbers are.

**What's variance?**

The variance is the average of the squared differences from the mean.

In [146]:
# std : standard deviation over every element of arr2

np.std(arr2)

11.603399884899636

In [147]:
# variance over every element of arr2

np.var(arr2)

134.63888888888889

In [148]:
# sqrt: square root of every element
arr3 = np.array([1, 2, 3])
np.sqrt(arr3)

array([1.        , 1.41421356, 1.73205081])

### Reshaping

### Transposing