# Advanced NumPy

In [None]:
from __future__ import division
from numpy.random import randn
from pandas import Series
import numpy as np
np.set_printoptions(precision=4)
import sys

## ndarray object internals

### NumPy dtype hierarchy

In [7]:
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = 'all'


In [None]:
ints = np.ones(10, dtype=np.uint16)
floats = np.ones(10, dtype=np.float32)
np.issubdtype(ints.dtype, np.integer)
np.issubdtype(floats.dtype, np.floating)

In [None]:
np.float64.mro()

## Advanced array manipulation

### Reshaping arrays

In [8]:
import numpy as np

In [11]:
arr = np.arange(8)  # 0~7까지 1차원 배열
arr
arr.ndim

array([0, 1, 2, 3, 4, 5, 6, 7])

1

In [18]:
arr.reshape((4, 2)) 
arr.ndim       
arr.reshape((4, 2)).ndim         # arr.dim(arr.reshape((4, 2)))  xxx

array([[0, 1],
       [2, 3],
       [4, 5],
       [6, 7]])

1

2

In [19]:
arr.reshape((4, 2)).reshape((2, 4))

array([[0, 1, 2, 3],
       [4, 5, 6, 7]])

In [20]:
arr = np.arange(15)   
arr.reshape((5, -1))  # 칼럼이 3개가 생김   # 로우 5개, 나머지는 그냥 두기 (-1)
                     # 2차원

array([[ 0,  1,  2],
       [ 3,  4,  5],
       [ 6,  7,  8],
       [ 9, 10, 11],
       [12, 13, 14]])

In [27]:
arr.reshape((1,-1,5))                #열 5개   # 3차원
arr.reshape((1,5,-1))                #행 5개 
arr.reshape((1,5,-1)).shape

array([[[ 0,  1,  2,  3,  4],
        [ 5,  6,  7,  8,  9],
        [10, 11, 12, 13, 14]]])

array([[[ 0,  1,  2],
        [ 3,  4,  5],
        [ 6,  7,  8],
        [ 9, 10, 11],
        [12, 13, 14]]])

(1, 5, 3)

In [29]:
arr.reshape((-1,5,1))
arr.reshape((-1,5,1)).shape

array([[[ 0],
        [ 1],
        [ 2],
        [ 3],
        [ 4]],

       [[ 5],
        [ 6],
        [ 7],
        [ 8],
        [ 9]],

       [[10],
        [11],
        [12],
        [13],
        [14]]])

(3, 5, 1)

In [25]:
x = np.arange(4);x
y = x.reshape((1,4));y      

array([0, 1, 2, 3])

array([[0, 1, 2, 3]])

In [23]:
other_arr = np.ones((3, 5))
other_arr.shape
arr
arr.reshape(other_arr.shape)

(3, 5)

array([ 0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14])

array([[ 0,  1,  2,  3,  4],
       [ 5,  6,  7,  8,  9],
       [10, 11, 12, 13, 14]])

In [32]:
arr = np.arange(15).reshape((5, 3))
arr
arr.ravel()    # Flatten to 1-D. ravel() returns a view of the original data when possible rather than a copy.

array([[ 0,  1,  2],
       [ 3,  4,  5],
       [ 6,  7,  8],
       [ 9, 10, 11],
       [12, 13, 14]])

array([ 0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14])

In [33]:
arr.flatten()      # Same values as ravel(), but flatten() always returns a copy of the data.

array([ 0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14])

### C vs. Fortran order

In [35]:
arr = np.arange(12).reshape((3, 4))      # 3x4 array that the next lines flatten
arr
arr.ravel()       # default C (row-major) order: read across each row
arr.ravel('F')     # Fortran (column-major) order: read down each column

array([[ 0,  1,  2,  3],
       [ 4,  5,  6,  7],
       [ 8,  9, 10, 11]])

array([ 0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11])

array([ 0,  4,  8,  1,  5,  9,  2,  6, 10,  3,  7, 11])

### Concatenating and splitting arrays

In [37]:
arr1 = np.array([[1, 2, 3], [4, 5, 6]])
arr2 = np.array([[7, 8, 9], [10, 11, 12]])
arr1
arr2

array([[1, 2, 3],
       [4, 5, 6]])

array([[ 7,  8,  9],
       [10, 11, 12]])

In [36]:
np.concatenate([arr1, arr2], axis=0)  # 밑으로    
np.concatenate([arr1, arr2], axis=1)  # 우측으로   

array([[ 1,  2,  3],
       [ 4,  5,  6],
       [ 7,  8,  9],
       [10, 11, 12]])

array([[ 1,  2,  3,  7,  8,  9],
       [ 4,  5,  6, 10, 11, 12]])

In [38]:
np.vstack((arr1, arr2))  # axis = 0                # 위의 것을 써도 되는데, 단순 붙이기만 할거면 stack쓸것
np.hstack((arr1, arr2))   # axis = 1       # concatenate 가 옵션이 더 많음

array([[ 1,  2,  3],
       [ 4,  5,  6],
       [ 7,  8,  9],
       [10, 11, 12]])

array([[ 1,  2,  3,  7,  8,  9],
       [ 4,  5,  6, 10, 11, 12]])

In [None]:
from numpy.random import randn
arr = randn(5, 2)
arr

In [39]:
first, second, third = np.split(arr, [1, 3])          # 처음부터 1번까지
first
second
third
# 로우로 짤랐는데, 
# 칼럼으로 짜르고 싶다면?

array([[ 1.07899571, -1.01547928],
       [ 1.76086988, -0.04847285],
       [-1.26413497,  1.0222526 ],
       [-0.72956006,  0.80637474],
       [ 0.23974508,  0.03633131]])

array([[ 1.07899571, -1.01547928]])

array([[ 1.76086988, -0.04847285],
       [-1.26413497,  1.0222526 ]])

array([[-0.72956006,  0.80637474],
       [ 0.23974508,  0.03633131]])

In [40]:
arr_t  =arr.T;arr_t

array([[ 1.07899571,  1.76086988, -1.26413497, -0.72956006,  0.23974508],
       [-1.01547928, -0.04847285,  1.0222526 ,  0.80637474,  0.03633131]])

In [42]:
np.split(arr_t, [2], axis=1)

[array([[ 1.07899571,  1.76086988],
        [-1.01547928, -0.04847285]]),
 array([[-1.26413497, -0.72956006,  0.23974508],
        [ 1.0222526 ,  0.80637474,  0.03633131]])]

In [43]:
### 샘 메모 참고

#### Stacking helpers: `r_` and `c_`

In [45]:
arr = np.arange(6)            # 1-D array
arr1 = arr.reshape((3, 2))    # reshaped to 2-D, shape (3, 2)
arr2 = randn(3, 2)            # random 2-D array, shape (3, 2)
np.r_[arr1, arr2]             # r_ is used with [] (not a call): stacks arr1 and arr2 vertically (row-wise)
np.c_[np.r_[arr1, arr2], arr] # c_ is also used with []: appends arr on the right as a new column
                              # the 1-D arr ends up as the third column of the (6, 3) result

array([[ 0.        ,  1.        ],
       [ 2.        ,  3.        ],
       [ 4.        ,  5.        ],
       [ 0.8566842 ,  0.39990102],
       [-0.22197415,  1.62370651],
       [ 0.62634727,  1.09358294]])

array([[ 0.        ,  1.        ,  0.        ],
       [ 2.        ,  3.        ,  1.        ],
       [ 4.        ,  5.        ,  2.        ],
       [ 0.8566842 ,  0.39990102,  3.        ],
       [-0.22197415,  1.62370651,  4.        ],
       [ 0.62634727,  1.09358294,  5.        ]])

In [48]:
np.c_[1:6, -10:-5]
     # 1차원 배열(리스트) 2개   => 배열로 만들어줌 colum
np.r_[1:6, -10:-5]

type(np.c_[1:6, -10:-5])
type(np.r_[1:6, -10:-5])

array([[  1, -10],
       [  2,  -9],
       [  3,  -8],
       [  4,  -7],
       [  5,  -6]])

array([  1,   2,   3,   4,   5, -10,  -9,  -8,  -7,  -6])

numpy.ndarray

numpy.ndarray

### Repeating elements: tile and repeat

In [None]:
# 한번 따로 해볼 것!!

In [None]:
arr = np.arange(3)
arr.repeat(3)

In [None]:
arr.repeat([2, 3, 4])

In [None]:
arr = randn(2, 2)
arr
arr.repeat(2, axis=0)

In [None]:
arr.repeat([2, 3], axis=0)
arr.repeat([2, 3], axis=1)

In [None]:
arr
np.tile(arr, 2)

In [None]:
arr
np.tile(arr, (2, 1))
np.tile(arr, (3, 2))

### Fancy indexing equivalents: take and put

In [None]:
arr = np.arange(10) * 100
inds = [7, 1, 2, 6]
arr[inds]

In [None]:
arr.take(inds)
arr.put(inds, 42)
arr
arr.put(inds, [40, 41, 42, 43])
arr

In [None]:
inds = [2, 0, 2, 1]
arr = randn(2, 4)
arr
arr.take(inds, axis=1)

## Broadcasting

In [49]:
arr = np.arange(5)
arr
arr * 4
# ndarray * scalar    => 형이 다른 걸 어떻게 하겠는가?
  # 차원 있는 것과 없는 것.... 
      # 차원이 적은 쪽에 차원을 추가해줌.

array([0, 1, 2, 3, 4])

array([ 0,  4,  8, 12, 16])

In [62]:
arr = randn(4, 3)
arr
arr.mean(0)   # mean along axis 0: one mean per column, shape (3,)
np.mean(arr)  # mean over all elements (a scalar)
demeaned = arr - arr.mean(0) # shapes: (4, 3) 2-D minus (3,) 1-D
    # the (3,) operand is treated as (1, 3) ...
    # ... and stretched to (4, 3) before the element-wise subtraction
demeaned
demeaned.mean(0)

array([[-0.79780537,  1.05492176,  0.01813451],
       [ 0.74098007, -0.34665558,  1.3130299 ],
       [-1.7065946 ,  0.19242648, -0.88753311],
       [-1.05843866,  2.11922304, -0.81507266]])

array([-0.70546464,  0.75497893, -0.09286034])

-0.014448684632752729

array([[-0.09234073,  0.29994284,  0.11099485],
       [ 1.44644471, -1.10163451,  1.40589024],
       [-1.00112996, -0.56255245, -0.79467276],
       [-0.35297402,  1.36424412, -0.72221232]])

array([-5.55111512e-17,  0.00000000e+00, -2.77555756e-17])

In [66]:
arr
row_means = arr.mean(1)   # mean of each row; row_means is 1-D with shape (4,)
row_means.reshape((4, 1))               # reshape into a 2-D column, shape (4, 1)
         # (4, 3) - (4, 1): broadcasting stretches the column to (4, 3)
demeaned = arr - row_means.reshape((4, 1))
demeaned
demeaned.mean(1)  # shape (4,); every row mean is now ~0

array([[-0.79780537,  1.05492176,  0.01813451],
       [ 0.74098007, -0.34665558,  1.3130299 ],
       [-1.7065946 ,  0.19242648, -0.88753311],
       [-1.05843866,  2.11922304, -0.81507266]])

array([[ 0.0917503 ],
       [ 0.56911813],
       [-0.80056708],
       [ 0.08190391]])

array([[-0.88955567,  0.96317146, -0.07361579],
       [ 0.17186194, -0.91577371,  0.74391177],
       [-0.90602753,  0.99299356, -0.08696603],
       [-1.14034256,  2.03731914, -0.89697657]])

array([-1.38777878e-17,  3.70074342e-17, -1.11022302e-16, -7.40148683e-17])

### Broadcasting over other axes

In [67]:
arr - arr.mean(1)    # Fails on purpose: shapes (4, 3) and (4,) cannot be broadcast together.

ValueError: operands could not be broadcast together with shapes (4,3) (4,) 

In [69]:
arr - arr.mean(1).reshape((4, 1))

array([[-0.88955567,  0.96317146, -0.07361579],
       [ 0.17186194, -0.91577371,  0.74391177],
       [-0.90602753,  0.99299356, -0.08696603],
       [-1.14034256,  2.03731914, -0.89697657]])

In [74]:
arr = np.zeros((4, 4))
arr  # zero matrix
arr.shape
arr_3d = arr[:, np.newaxis, :] # np.newaxis explicitly inserts a new length-1 axis
arr_3d    # now 3-D
arr_3d.shape
arr[:, : ,np.newaxis]    # new axis placed last instead -> shape (4, 4, 1); arr itself is not modified
arr

array([[0., 0., 0., 0.],
       [0., 0., 0., 0.],
       [0., 0., 0., 0.],
       [0., 0., 0., 0.]])

(4, 4)

array([[[0., 0., 0., 0.]],

       [[0., 0., 0., 0.]],

       [[0., 0., 0., 0.]],

       [[0., 0., 0., 0.]]])

(4, 1, 4)

array([[[0.],
        [0.],
        [0.],
        [0.]],

       [[0.],
        [0.],
        [0.],
        [0.]],

       [[0.],
        [0.],
        [0.],
        [0.]],

       [[0.],
        [0.],
        [0.],
        [0.]]])

array([[0., 0., 0., 0.],
       [0., 0., 0., 0.],
       [0., 0., 0., 0.],
       [0., 0., 0., 0.]])

In [76]:
arr_1d = np.random.normal(size=3)
arr_1d
arr_1d[:, np.newaxis]
arr_1d[np.newaxis, :]

array([ 1.09501103,  0.36278269, -0.15178492])

array([[ 1.09501103],
       [ 0.36278269],
       [-0.15178492]])

array([[ 1.09501103,  0.36278269, -0.15178492]])

In [80]:
#----------------------------------------------------------
arr = randn(3, 4, 5)   # 3차원
arr

array([[[-0.61736905,  0.7704303 ,  0.88088465, -0.83410613,
         -0.34991525],
        [ 1.13989042, -0.57112513,  0.40408617, -2.5316769 ,
         -0.17695552],
        [-0.93654999,  0.1525438 , -0.22838529,  1.65228736,
          0.23256456],
        [-0.16377181, -0.6799177 ,  0.73672147,  1.88212219,
          0.03422823]],

       [[ 1.92888914, -1.73486063,  0.04645938, -0.20783981,
         -0.07912764],
        [-1.42696705, -0.52182027,  1.16994154, -2.36584377,
          0.7567333 ],
        [-1.32773582,  1.57911363, -0.98734778,  1.29574666,
         -0.63869192],
        [ 0.40802086, -1.01879412, -1.35827293, -0.15134611,
          1.47893653]],

       [[ 1.1021942 , -0.01736087, -1.07349986, -0.54856566,
          0.090116  ],
        [-0.39558864,  1.941713  , -1.04007359,  1.39729914,
         -0.28471822],
        [-0.6058195 ,  0.41513239, -0.44618091,  0.24692827,
          0.43119386],
        [-0.51965233,  0.01738861,  1.66886371, -0.60305359,
          0

In [83]:
depth_means = arr.mean(2)   # 3차원이 2차원으로 줄게됨.
depth_means

array([[-0.03001509, -0.34715619,  0.17449209,  0.36187648],
       [-0.00929591, -0.47759125, -0.01578305, -0.12829115],
       [-0.08942324,  0.32372634,  0.00825082,  0.27783972]])

In [84]:
demeaned = arr - depth_means[:, :, np.newaxis]       # (3, 4, 5) - (3, 4) cannot broadcast directly; np.newaxis makes the means (3, 4, 1)
demeaned.mean(2) # averaging over axis 2 (length 5) leaves shape (3, 4); every entry is ~0
#----------------------------------------------------------

array([[-4.44089210e-17,  0.00000000e+00,  1.66533454e-17,
         1.11022302e-17],
       [-3.33066907e-17,  4.44089210e-17, -4.44089210e-17,
         0.00000000e+00],
       [ 5.55111512e-18,  0.00000000e+00,  1.11022302e-17,
        -2.22044605e-17]])

In [None]:
def demean_axis(arr, axis=0):
    """Subtract the mean along `axis` from `arr` using broadcasting.

    Parameters
    ----------
    arr : ndarray
        Input array of any dimensionality.
    axis : int, default 0
        Axis along which the mean is computed and removed.

    Returns
    -------
    ndarray
        Array with the same shape as `arr` whose mean along `axis` is ~0.
    """
    means = arr.mean(axis)

    # Generalizes things like [:, :, np.newaxis] to N dimensions: keep every
    # remaining axis of `means` and re-insert a length-1 axis at `axis`.
    indexer = [slice(None)] * arr.ndim
    indexer[axis] = np.newaxis
    # Index with a tuple: a list of slices/None is interpreted as an advanced
    # (array-like) index by current NumPy releases and raises instead of
    # performing basic slicing.
    return arr - means[tuple(indexer)]

### Setting array values by broadcasting

In [85]:
arr = np.zeros((4, 3))
arr

array([[0., 0., 0.],
       [0., 0., 0.],
       [0., 0., 0.],
       [0., 0., 0.]])

In [86]:
arr[:] = 5   # 모든 행렬의 값이 5로 바뀜.
arr

array([[5., 5., 5.],
       [5., 5., 5.],
       [5., 5., 5.],
       [5., 5., 5.]])

In [None]:
col = np.array([1.28, -0.42, 0.44, 1.6])   # col 이라는 1차원 배열을 생성.
col

In [87]:
#  arr  =  col
#  4x3      4
arr[:] = col[:, np.newaxis]
        #  4x1           => 4행 1열 2차원 배열이 되고, 복사가 됨.
arr

array([ 1.28, -0.42,  0.44,  1.6 ])

array([[ 1.28,  1.28,  1.28],
       [-0.42, -0.42, -0.42],
       [ 0.44,  0.44,  0.44],
       [ 1.6 ,  1.6 ,  1.6 ]])

In [88]:
# arr is (4, 3)
arr[:2] = [[-1.37], [0.509]]
   # rows 0 and 1, all columns
#   target (2, 3)  <-  value (2, 1)

arr 
# arr[:2] selects the same elements as arr[:2, :], shape (2, 3)
# Assignment broadcasts too: the value's length-1 axis is stretched to match the (2, 3) target.

array([[-1.37 , -1.37 , -1.37 ],
       [ 0.509,  0.509,  0.509],
       [ 0.44 ,  0.44 ,  0.44 ],
       [ 1.6  ,  1.6  ,  1.6  ]])

## Advanced ufunc usage

### Ufunc instance methods

In [None]:
arr = np.arange(10)
np.add.reduce(arr)
arr.sum()

In [None]:
np.random.seed(12346)

In [None]:
arr = randn(5, 5)
arr[::2].sort(1) # sort a few rows
arr[:, :-1] < arr[:, 1:]
np.logical_and.reduce(arr[:, :-1] < arr[:, 1:], axis=1)

In [None]:
arr = np.arange(15).reshape((3, 5))
np.add.accumulate(arr, axis=1)

In [None]:
arr = np.arange(3).repeat([1, 2, 2])
arr
np.multiply.outer(arr, np.arange(5))

In [None]:
result = np.subtract.outer(randn(3, 4), randn(5))
result.shape

In [None]:
arr = np.arange(10)
np.add.reduceat(arr, [0, 5, 8])

In [None]:
arr = np.multiply.outer(np.arange(4), np.arange(5))
arr
np.add.reduceat(arr, [0, 2, 4], axis=1)

### Custom ufuncs

In [None]:
def add_elements(x, y):
    """Return ``x + y``; a plain Python function to be wrapped as a ufunc."""
    total = x + y
    return total
add_them = np.frompyfunc(add_elements, 2, 1)
add_them(np.arange(8), np.arange(8))

In [None]:
add_them = np.vectorize(add_elements, otypes=[np.float64])
add_them(np.arange(8), np.arange(8))

In [None]:
arr = randn(10000)
%timeit add_them(arr, arr)
%timeit np.add(arr, arr)

## Structured and record arrays

In [None]:
dtype = [('x', np.float64), ('y', np.int32)]
sarr = np.array([(1.5, 6), (np.pi, -2)], dtype=dtype)
sarr

In [None]:
sarr[0]
sarr[0]['y']

In [None]:
sarr['x']

### Nested dtypes and multidimensional fields

In [None]:
dtype = [('x', np.int64, 3), ('y', np.int32)]
arr = np.zeros(4, dtype=dtype)
arr

In [None]:
arr[0]['x']

In [None]:
arr['x']

In [None]:
dtype = [('x', [('a', 'f8'), ('b', 'f4')]), ('y', np.int32)]
data = np.array([((1, 2), 5), ((3, 4), 6)], dtype=dtype)
data['x']
data['y']
data['x']['a']

### Why use structured arrays?

### Structured array manipulations: numpy.lib.recfunctions

## More about sorting

In [None]:
arr = randn(6)
arr.sort()
arr

In [None]:
arr = randn(3, 5)
arr
arr[:, 0].sort()  # Sort first column values in-place
arr

In [None]:
arr = randn(5)
arr
np.sort(arr)
arr

In [None]:
arr = randn(3, 5)
arr
arr.sort(axis=1)
arr

In [None]:
arr[:, ::-1]

### Indirect sorts: argsort and lexsort

In [None]:
values = np.array([5, 0, 1, 3, 2])
indexer = values.argsort()
indexer
values[indexer]

In [None]:
arr = randn(3, 5)
arr[0] = values
arr
arr[:, arr[0].argsort()]

In [None]:
first_name = np.array(['Bob', 'Jane', 'Steve', 'Bill', 'Barbara'])
last_name = np.array(['Jones', 'Arnold', 'Arnold', 'Jones', 'Walters'])
# lexsort treats the LAST key as the primary one: sort by last_name,
# breaking ties with first_name.
sorter = np.lexsort((first_name, last_name))
# zip() is lazy on Python 3; materialize it so the cell displays the sorted
# (last, first) pairs instead of a bare zip-object repr.
sorted_names = list(zip(last_name[sorter], first_name[sorter]))
sorted_names

### Alternate sort algorithms

In [None]:
values = np.array(['2:first', '2:second', '1:first', '1:second', '1:third'])
key = np.array([2, 2, 1, 1, 1])
indexer = key.argsort(kind='mergesort')
indexer
values.take(indexer)

### numpy.searchsorted: Finding elements in a sorted array

In [None]:
arr = np.array([0, 1, 7, 12, 15])
arr.searchsorted(9)

In [None]:
arr.searchsorted([0, 8, 11, 16])

In [None]:
arr = np.array([0, 0, 0, 1, 1, 1, 1])
arr.searchsorted([0, 1])
arr.searchsorted([0, 1], side='right')

In [None]:
data = np.floor(np.random.uniform(0, 10000, size=50))
bins = np.array([0, 100, 1000, 5000, 10000])
data

In [None]:
labels = bins.searchsorted(data)
labels

In [None]:
Series(data).groupby(labels).mean()

In [None]:
np.digitize(data, bins)

## NumPy matrix class

In [None]:
X =  np.array([[ 8.82768214,  3.82222409, -1.14276475,  2.04411587],
               [ 3.82222409,  6.75272284,  0.83909108,  2.08293758],
               [-1.14276475,  0.83909108,  5.01690521,  0.79573241],
               [ 2.04411587,  2.08293758,  0.79573241,  6.24095859]])
X[:, 0]  # one-dimensional
y = X[:, :1]  # two-dimensional by slicing
X
y

In [None]:
np.dot(y.T, np.dot(X, y))

In [None]:
Xm = np.matrix(X)
ym = Xm[:, 0]
Xm
ym
ym.T * Xm * ym

In [None]:
Xm.I * X

## Advanced array input and output

### Memory-mapped files

In [None]:
mmap = np.memmap('mymmap', dtype='float64', mode='w+', shape=(10000, 10000))
mmap

In [None]:
section = mmap[:5]

In [None]:
section[:] = np.random.randn(5, 10000)
mmap.flush()
mmap
del mmap

In [None]:
mmap = np.memmap('mymmap', dtype='float64', shape=(10000, 10000))
mmap

In [None]:
%xdel mmap
!rm mymmap

### HDF5 and other array storage options

## Performance tips

### The importance of contiguous memory

In [None]:
arr_c = np.ones((1000, 1000), order='C')
arr_f = np.ones((1000, 1000), order='F')
arr_c.flags
arr_f.flags
arr_f.flags.f_contiguous

In [None]:
%timeit arr_c.sum(1)
%timeit arr_f.sum(1)

In [None]:
arr_f.copy('C').flags

In [None]:
arr_c[:50].flags.contiguous
arr_c[:, :50].flags

In [None]:
%xdel arr_c
%xdel arr_f
%cd ..

## Other speed options: Cython, f2py, C

```cython
from numpy cimport ndarray, float64_t

def sum_elements(ndarray[float64_t] arr):
    """Return the sum of the elements of a float64 array via an explicit typed loop."""
    cdef Py_ssize_t idx
    cdef Py_ssize_t size = len(arr)
    cdef float64_t total = 0

    # Accumulate element by element using the typed index and accumulator.
    for idx in range(size):
        total += arr[idx]

    return total
```