Note: Normally, Python has to check the type of every element individually, which is slow. For fast data analysis, it is best to tell Python in advance what type the elements are, e.g. by using NumPy arrays.

In [1]:
import numpy as np

# A plain nested list: each inner list becomes one row of the array
l = [
    [1, 2, 3],
    [3, 6, 9],
    [2, 4, 6],
]
a = np.array(l)  # build a NumPy array from the nested list
print(a)

[[1 2 3]
 [3 6 9]
 [2 4 6]]


In [2]:
# .shape holds the array's size along each dimension, as a tuple
a.shape

(3, 3)

In [4]:
print(a.dtype) # element type shared by every entry of the array

print(a[1,2]) # one comma-separated index per dimension: row 1, column 2

print(a[1, 1:3]) # slices work per dimension: row 1, columns 1 and 2
print(a[:, 1]) # a bare ':' selects everything along that dimension: column 1

int64
9
[6 9]
[2 6 4]


In [5]:
# Assigning to a slice overwrites those elements of the existing array,
# e.g. replace the whole first column:
a[:, 0] = [0, 9, 8]
print(a)

[[0 2 3]
 [9 6 9]
 [8 4 6]]


In [6]:
# Ask for an array of zeros with an integer element type.
# Use the builtin `int` (or an explicit width such as np.int64):
# the old alias `np.int` was deprecated in NumPy 1.20 and removed in 1.24,
# so `dtype=np.int` now raises AttributeError.
n = 1000
my_int_array = np.zeros(n, dtype=int)
my_int_array.dtype

dtype('int64')

In [7]:
# Ask arange for floating-point values instead of the default integers.
# Use the builtin `float` (or an explicit width such as np.float64):
# the old alias `np.float` was deprecated in NumPy 1.20 and removed in 1.24.
d = np.arange(5, dtype=float)
print(d)

[0. 1. 2. 3. 4.]


In [8]:
# Values 100..199, so position i holds 100 + i
arr = np.arange(100, 200)

# Positions to pick out; a negative position counts back from the end
select = [5, 25, 50, 75, -5]

# A list (or array) of integers can be used as an index
print(arr[select])

[105 125 150 175 195]


In [12]:
arr = np.arange(10, 20)

# A comparison is applied element by element and yields a boolean array
div_by_3 = (arr % 3) == 0
print(div_by_3)

# A boolean array used as an index keeps only the True positions
print(arr[div_by_3])

[False False  True False False  True False False  True False]
[12 15 18]


In [14]:
# Arithmetic between arrays (and scalars) is applied element by element
a = np.arange(4.0)
print(a)

b = 23.4 * a
print(b)

# One array can be divided by another of the same shape
c = b / (a + 1)
print(c)

[0. 1. 2. 3.]
[ 0.  23.4 46.8 70.2]
[ 0.   11.7  15.6  17.55]


### Going between arrays of different dimensions

In [19]:
print(arr)

# Drop the first of the 10 elements, then arrange the remaining 9 as 3x3
b = arr[1:].reshape(3, 3)
print(b)

b_2 = (b % 2) == 0
b_3 = (b % 3) == 0
# Elementwise AND: True where the value is divisible by both 2 and 3
b_2_3 = b_2 & b_3
print(b_2_3)

# nonzero() returns the indices of the True entries,
# as one index array per dimension
i_2_3 = b_2_3.nonzero()
print(i_2_3)

# The first array, [0, 2], holds the first-dimension (row) indices
# The second array, [1, 1], holds the second-dimension (column) indices
# i.e. in b, 12 is at [0, 1] and 18 is at [2, 1]

[10 11 12 13 14 15 16 17 18 19]
[[11 12 13]
 [14 15 16]
 [17 18 19]]
[[False  True False]
 [False False False]
 [False  True False]]
(array([0, 2]), array([1, 1]))


### Other methods

In [22]:
# Reduction methods collapse the whole array to a single number
print(arr.sum()) # sum of all elements
print(arr.mean()) # arithmetic mean
print(arr.std()) # standard deviation (population form, i.e. divides by N)
print(arr.max()) # largest value
print(arr.min()) # smallest value

145
14.5
2.8722813232690143
19
10


In [24]:
# True only if every element is True
print(div_by_3.all())
# True if at least one element is True
print(div_by_3.any())
# each True counts as 1, so the sum is the number of True elements
print(div_by_3.sum())
print(div_by_3.nonzero())  # indices of the True elements; note that a
                            # singleton tuple is returned, for
                            # consistency with the N-dim case

False
True
3
(array([2, 5, 8]),)


### Sorting

In [28]:
# The .sort() method reorders the array it is called on (in place)
arr = np.array([4.5, 2.3, 6.7, 1.2, 1.8, 1.5])
print(arr)
arr.sort()
print(arr)

# .argsort() leaves the array alone and instead returns the indices
# that would put it into sorted order
x = np.array([4.5, 2.3, 6.7, 1.2, 1.8, 1.5])
print(x)
s = x.argsort()
print(s)
print(x[s])  # indexing with s gives a sorted copy of x
print(x)     # ...while x itself is unchanged

[4.5 2.3 6.7 1.2 1.8 1.5]
[1.2 1.5 1.8 2.3 4.5 6.7]
[4.5 2.3 6.7 1.2 1.8 1.5]
[3 5 4 1 0 2]
[1.2 1.5 1.8 2.3 4.5 6.7]
[4.5 2.3 6.7 1.2 1.8 1.5]


## Matrices

In [32]:
# Ordinary NumPy arrays double as matrices (2-D) and vectors (1-D)
a = np.array([[1, 0], [0, 1]])
b = np.array([[4, 1], [2, 2]])
c = np.array([1, 2])

# Matrix multiplication: np.matmul and the @ operator give the same result
print(np.matmul(a, b))
print(a @ b)
print(a @ c)  # matrix times vector

# Transpose is the .T attribute (capital T)
print(b.T)

# Determinant
print(np.linalg.det(b))

# eig returns the eigenvalues together with the eigenvectors
print(np.linalg.eig(b))

[[4 1]
 [2 2]]
[[4 1]
 [2 2]]
[1 2]
[[4 2]
 [1 2]]
6.0
(array([4.73205081, 1.26794919]), array([[ 0.80689822, -0.34372377],
       [ 0.59069049,  0.9390708 ]]))


## Array functions (not methods)

In [34]:
# Most array methods also exist as a function in the np namespace, e.g.
print(arr.sum())
print(np.sum(arr))

# Array functions often return a result, leaving the array unchanged
# Array methods often perform the operation in-place
a = np.array([23, 7, 80])
s = np.sort(a) # function form: s is a sorted copy, a keeps its order

print(a, s)

a.sort() # method form: a itself is now reordered
print(a)

18.0
18.0
[23  7 80] [ 7 23 80]
[ 7 23 80]


In [37]:
# Many array functions and methods accept an `axis` argument; the
# operation is then applied along that one direction only
a = np.array([
    [19, 18, 17],
    [16, 15, 14],
    [13, 12, 11],
])
print(a.sum())        # no axis: total over every element
print(a.sum(axis=0))  # first index varies: one total per column
print(a.sum(axis=1))  # second index varies: one total per row

print(np.sort(a))          # default is the last axis: each row is sorted
print(np.sort(a, axis=0))  # axis=0: each column is sorted

135
[48 45 42]
[54 45 36]
[[17 18 19]
 [14 15 16]
 [11 12 13]]
[[13 12 11]
 [16 15 14]
 [19 18 17]]


## Random numbers

In [41]:
# Fix the seed so that the "random" numbers are reproducible
np.random.seed(12345)
print(np.random.uniform())  # a single draw, default range

print(np.random.uniform(low=-1, high=1, size=3))  # three draws between -1 and 1

# 100 normally distributed values; the sample mean and standard deviation
# should come out close to loc and scale
r = np.random.normal(3.0, 1.3, 100)
print(r.mean(), r.std())

# A large Poisson sample: its variance should be close to its mean
p = np.random.poisson(lam=123, size=(1024, 1024))
print(p.shape, p.mean(), p.std()**2)

0.9296160928171479
[-0.36724889 -0.63216238 -0.59087944]
3.0139425563996376 1.3635966442456968
(1024, 1024) 123.0230770111084 122.99512823341865


## Numpy - recarray

Arrays usually have a homogeneous type, but arrays of different types can be combined using a recarray.

Better to use Pandas or Astropy tables (see later)

In [42]:
x = np.arange(0, 100)
y = np.sqrt(x)
# Truncate the square roots to integers. Use the builtin `int`:
# the alias `np.int` was deprecated in NumPy 1.20 and removed in 1.24,
# so `astype(np.int)` now raises AttributeError.
z = y.astype(int)
# Bundle the three columns (integer, float, integer) into one record
# array; each column is then available as an attribute
r = np.rec.array((x, y, z), names=('x', 'y', 'z'))
print(r.x)
print(r.y)
print(r.z)

[ 0  1  2  3  4  5  6  7  8  9 10 11 12 13 14 15 16 17 18 19 20 21 22 23
 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47
 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71
 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95
 96 97 98 99]
[0.         1.         1.41421356 1.73205081 2.         2.23606798
 2.44948974 2.64575131 2.82842712 3.         3.16227766 3.31662479
 3.46410162 3.60555128 3.74165739 3.87298335 4.         4.12310563
 4.24264069 4.35889894 4.47213595 4.58257569 4.69041576 4.79583152
 4.89897949 5.         5.09901951 5.19615242 5.29150262 5.38516481
 5.47722558 5.56776436 5.65685425 5.74456265 5.83095189 5.91607978
 6.         6.08276253 6.164414   6.244998   6.32455532 6.40312424
 6.4807407  6.55743852 6.63324958 6.70820393 6.78232998 6.8556546
 6.92820323 7.         7.07106781 7.14142843 7.21110255 7.28010989
 7.34846923 7.41619849 7.48331477 7.54983444 7.61577311 7.68114575
 7.74596669 7.81024968 7.

## Numpy - loading and saving data
Again, there are better ways to do this than using NumPy - it is better to use Pandas or Astropy.

In [None]:
# Write the record array as text: one line per record, columns x, y, z
np.savetxt('mydata', r, fmt="%6i %12.6f %6i")

# Read the file back as a plain 2d array of floats
data = np.genfromtxt('mydata')

# Read the same file back with named columns and per-column types.
# np.recfromtxt was removed from the main NumPy namespace in NumPy 2.0;
# genfromtxt with dtype=None plus a recarray view is the equivalent and
# keeps attribute access (data.x, data.y, data.z).
data = np.genfromtxt('mydata', dtype=None, names=('x', 'y', 'z')).view(np.recarray)

# for .csv files, pass delimiter=',' to np.genfromtxt

# Numpy - using arrays wisely
- Array operations are implemented in C or Fortran
    - Optimised algorithms (i.e. fast!)
- Python loops (i.e. for i in a:) are much slower
    - Avoid loops where possible; use array operations instead
- Also produces shorter code, often much more readable

By default, Numpy arrays are loaded into your RAM. If you are using very large data sets, you may hit the RAM limit of your computer. 


## Saving memory

Save memory by using lower precision where possible

```python 
# The default integer type is 64-bit, which is unnecessary for 8-digit
# numbers; int32 needs half the memory per element
d = np.arange(10000000, dtype=np.int32)
# float32 likewise needs half the memory of a 64-bit float.
# NOTE: float32 carries only about 7 significant digits, so check that
# the reduced precision is acceptable for values as large as these
d = np.arange(1e8, dtype=np.float32)
```

Save memory by performing operations in place where possible
```python
a = np.arange(10000000) # 1e7 elements * 64 bits / 8 = 80 MB
b = np.random.normal(0, 1000, 10000000) # also ~80 MB
a = a + b # builds a new array: requires an additional 80 MB of memory
a += b # in place: no more memory required and faster
a = np.sqrt(a) # builds a new array: requires an extra 80 MB of memory
np.sqrt(a, out=a) # in place: no more memory required
```

- Use sparse arrays (provided by SciPy, see later)
- Use a solution which keeps data on disk (np.memmap, PyTables)
- Change your algorithm