In [3]:
import numpy as np

While much of the heavy lifting is handled by higher-level functions in pandas, we may at some point need to write a data algorithm that is not found in one of the existing libraries.

# Reshaping Arrays
In many cases, you can convert an array from one shape to another without copying any data.
To do this, pass a tuple indicating the new shape to the `reshape` array instance method.

In [4]:
# Build a one-dimensional array holding the integers 0 through 7
arr = np.arange(0, 8)
arr

array([0, 1, 2, 3, 4, 5, 6, 7])

In [5]:
# C order (row major) is the default; it is spelled out here for clarity
arr.reshape((4, 2), order="C")

array([[0, 1],
       [2, 3],
       [4, 5],
       [6, 7]])

In [6]:
# A multidimensional array can be reshaped as well: 4x2 -> 2x4
four_by_two = arr.reshape((4, 2))
four_by_two.reshape((2, 4))

array([[0, 1, 2, 3],
       [4, 5, 6, 7]])

In [7]:
# One of the passed shape dimensions can be -1; the size of that dimension is then inferred from the data
arr = np.arange(0, 15)
arr.reshape((5, -1))

array([[ 0,  1,  2],
       [ 3,  4,  5],
       [ 6,  7,  8],
       [ 9, 10, 11],
       [12, 13, 14]])

In [8]:
# Since an array's shape attribute is a tuple, it can be passed to reshape, too
other_arr = np.ones((3,5))
print(other_arr.shape)

arr.reshape(other_arr.shape)
# The opposite operation, going from a higher dimension to one dimension, is typically known as flattening or raveling

(3, 5)


array([[ 0,  1,  2,  3,  4],
       [ 5,  6,  7,  8,  9],
       [10, 11, 12, 13, 14]])

In [9]:
# ravel flattens to one dimension; it does not produce a copy of the underlying values if the values in the result were contiguous in the original array
print(arr.ravel())
# flatten behaves like ravel except that it always returns a copy of the data
print(arr.flatten())

[ 0  1  2  3  4  5  6  7  8  9 10 11 12 13 14]
[ 0  1  2  3  4  5  6  7  8  9 10 11 12 13 14]


# C Vs FORTRAN Order
- NumPy is able to adapt to many different layouts of your data in memory. 
- By default, NumPy arrays are created in row major order. 
- Spatially this means that if you have a two-dimensional array of data, the items in each row of the array are stored in adjacent memory locations.
- The alternative to row major ordering is column major order, which means that values within each column of data are stored in adjacent memory locations.
- For historical reasons, row and column major order are also known as C and FORTRAN order respectively.

- Functions like reshape and ravel accept an order argument indicating the order in
which to use the data in the array. This is usually set to 'C' or 'F' (there are also
less commonly used options 'A' and 'K'; see the NumPy documentation).

In [10]:
# 3x4 array holding 0..11, filled row by row (C order)
arr = np.reshape(np.arange(12), (3, 4))
arr

array([[ 0,  1,  2,  3],
       [ 4,  5,  6,  7],
       [ 8,  9, 10, 11]])

In [11]:
# Default C order: read each row in turn
arr.ravel(order="C")

array([ 0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11])

In [12]:
# FORTRAN order: read down each column in turn
arr.ravel(order="F")

array([ 0,  4,  8,  1,  5,  9,  2,  6, 10,  3,  7, 11])

The key difference between C and FORTRAN order is the way in which the dimensions are walked: <br>
- *C/row major order* <br>
        Traverse higher dimensions first (e.g., axis 1 before advancing on axis 0). <br>
- *FORTRAN/column major order* <br>
        Traverse higher dimensions last (e.g., axis 0 before advancing on axis 1). <br>

# Concatenating and Splitting Arrays

In [13]:
# np.concatenate takes a sequence (list, tuple, etc.) of arrays and joins them in order along the given axis
arr1 = np.array([[1, 2, 3], [4, 5, 6]])
arr2 = np.array([[7, 8, 9], [10, 11, 12]])

# axis=0 stacks the rows; axis=1 joins the columns side by side
for axis in (0, 1):
    print(np.concatenate((arr1, arr2), axis=axis))

[[ 1  2  3]
 [ 4  5  6]
 [ 7  8  9]
 [10 11 12]]
[[ 1  2  3  7  8  9]
 [ 4  5  6 10 11 12]]


In [14]:
# Convenience functions cover the common kinds of concatenation:
# vstack joins along axis 0 (rows) and hstack joins along axis 1 (columns)
print(np.vstack((arr1, arr2)))
print(np.hstack((arr1, arr2)))

[[ 1  2  3]
 [ 4  5  6]
 [ 7  8  9]
 [10 11 12]]
[[ 1  2  3  7  8  9]
 [ 4  5  6 10 11 12]]


In [18]:
# Five rows, two columns of standard-normal samples (unseeded, so values differ between runs)
arr = np.random.standard_normal(size=(5, 2))
print(arr)

[[-0.08605115 -0.08966722]
 [-0.34608889 -0.43415816]
 [-1.2331784   0.85316658]
 [-0.30950346 -0.78962223]
 [ 0.04910021 -2.21699944]]


In [20]:
# split, on the other hand, slices an array into multiple arrays along an axis (axis 0 by default)
first, second, third = np.split(arr, [1,3])
print(first)
# The value [1, 3] passed to np.split gives the indices at which to split the array into pieces:
# first = arr[:1], second = arr[1:3], third = arr[3:]

[[-0.08605115 -0.08966722]]


In [16]:
# second holds arr[1:3], the rows between split indices 1 and 3
print(second)

[[-0.34608889 -0.43415816]
 [-1.2331784   0.85316658]]


In [17]:
# third holds arr[3:], the rows from split index 3 to the end
print(third)

[[-0.30950346 -0.78962223]
 [ 0.04910021 -2.21699944]]


### Stacking helpers: r_ and c_
There are two special objects in the NumPy namespace, `r_` and `c_`, that make stacking arrays more concise:

In [21]:
arr = np.arange(0, 6)
arr1 = arr.reshape((3, 2))
arr2 = np.random.standard_normal(size=(3, 2))

# r_ stacks its operands row-wise (along the first axis)
np.r_[arr1, arr2]

array([[ 0.        ,  1.        ],
       [ 2.        ,  3.        ],
       [ 4.        ,  5.        ],
       [-0.91921524, -0.54640651],
       [ 1.6196499 , -1.70313843],
       [-1.67144347,  2.21345668]])

In [23]:
# c_ stacks column-wise: arr is appended as a third column to the row-stacked result
np.c_[np.r_[arr1, arr2], arr]

array([[ 0.        ,  1.        ,  0.        ],
       [ 2.        ,  3.        ,  1.        ],
       [ 4.        ,  5.        ,  2.        ],
       [-0.91921524, -0.54640651,  3.        ],
       [ 1.6196499 , -1.70313843,  4.        ],
       [-1.67144347,  2.21345668,  5.        ]])

These objects can additionally translate slices into arrays:

In [25]:
# Each slice is expanded into a range of integers and becomes one column of the result
np.c_[1:6, -10:-5]
# see the np.r_ and np.c_ docstrings for more

array([[  1, -10],
       [  2,  -9],
       [  3,  -8],
       [  4,  -7],
       [  5,  -6]])

### Repeating Elements: tile and repeat
Two useful tools for repeating or replicating arrays to produce larger arrays are the `repeat` and `tile` functions.

In [26]:
# repeat replicates each element of an array a given number of times, producing a larger array
arr = np.arange(0, 3)
arr.repeat(repeats=3)

array([0, 0, 0, 1, 1, 1, 2, 2, 2])

In [27]:
# If you pass an integer, each element is repeated that number of times.
# If you pass an array of integers, each element can be repeated a different number of times.
arr.repeat([2, 3, 4])

array([0, 0, 1, 1, 1, 2, 2, 2, 2])

In [28]:
# Multidimensional arrays can have their elements repeated along a particular axis;
# start from a 2x2 array of standard-normal samples
arr = np.random.standard_normal(size=(2, 2))
print(arr)

[[1.15738492 0.27427883]
 [2.0925324  0.02595351]]


In [29]:
# Repeat every row twice (axis 0)
arr.repeat(repeats=2, axis=0)

array([[1.15738492, 0.27427883],
       [1.15738492, 0.27427883],
       [2.0925324 , 0.02595351],
       [2.0925324 , 0.02595351]])

In [32]:
# Repeat every column twice (axis 1)
arr.repeat(repeats=2, axis=1)

array([[1.15738492, 1.15738492, 0.27427883, 0.27427883],
       [2.0925324 , 2.0925324 , 0.02595351, 0.02595351]])

In [33]:
# Likewise, an array of integers repeats each slice of a multidimensional array a different number of times:
# here the first row appears twice and the second row three times
arr.repeat(repeats=[2, 3], axis=0)

array([[1.15738492, 0.27427883],
       [1.15738492, 0.27427883],
       [2.0925324 , 0.02595351],
       [2.0925324 , 0.02595351],
       [2.0925324 , 0.02595351]])

In [34]:
# Same idea along axis 1: the first column appears twice and the second column three times
arr.repeat(repeats=[2, 3], axis=1)

array([[1.15738492, 1.15738492, 0.27427883, 0.27427883, 0.27427883],
       [2.0925324 , 2.0925324 , 0.02595351, 0.02595351, 0.02595351]])

In [37]:
# tile is a shortcut for stacking copies of an array along an axis
arr

array([[1.15738492, 0.27427883],
       [2.0925324 , 0.02595351]])

In [38]:
# A scalar reps lays two copies of arr side by side
np.tile(arr, reps=2)

array([[1.15738492, 0.27427883, 1.15738492, 0.27427883],
       [2.0925324 , 0.02595351, 2.0925324 , 0.02595351]])

In [39]:
# The second argument is the number of tiles; with a scalar, the copies are laid side by side, so each row gets longer (as in the previous cell).
# The second argument to tile can also be a tuple indicating the layout of the “tiling”
arr

array([[1.15738492, 0.27427883],
       [2.0925324 , 0.02595351]])

In [40]:
# Layout (2, 1): two copies stacked vertically, one copy across
np.tile(arr, reps=(2, 1))

array([[1.15738492, 0.27427883],
       [2.0925324 , 0.02595351],
       [1.15738492, 0.27427883],
       [2.0925324 , 0.02595351]])

In [41]:
# Layout (3, 2): three copies down and two copies across
np.tile(arr, reps=(3, 2))

array([[1.15738492, 0.27427883, 1.15738492, 0.27427883],
       [2.0925324 , 0.02595351, 2.0925324 , 0.02595351],
       [1.15738492, 0.27427883, 1.15738492, 0.27427883],
       [2.0925324 , 0.02595351, 2.0925324 , 0.02595351],
       [1.15738492, 0.27427883, 1.15738492, 0.27427883],
       [2.0925324 , 0.02595351, 2.0925324 , 0.02595351]])

### Fancy Indexing Equivalents: take and put

In [44]:
# One way to get and set subsets of arrays is fancy indexing with integer arrays
arr = 100 * np.arange(10)
inds = [7, 1, 2, 6]  # positions to select, in this order
arr[inds]

array([700, 100, 200, 600])

In [45]:
# Alternative ndarray methods are useful in the special case of making a selection on a single axis only
arr.take(indices=inds)

array([700, 100, 200, 600])

In [47]:
# put assigns in place: every position listed in inds receives the value 42
arr.put(indices=inds, values=42)
arr

array([  0,  42,  42, 300, 400, 500,  42,  42, 800, 900])

In [48]:
# With a sequence of values, each position in inds receives the value at the matching position
arr.put(indices=inds, values=[40, 41, 42, 43])
arr

array([  0,  41,  42, 300, 400, 500,  43,  40, 800, 900])

In [49]:
# To use take along other axes, pass the axis keyword
arr = np.random.standard_normal(size=(2, 4))
inds = [2, 0, 2, 1]
arr

array([[-0.26815737,  0.81651256,  1.41193647,  1.02506074],
       [ 1.40131289,  0.78250371, -1.26958218,  0.82520194]])

In [51]:
# Pick columns 2, 0, 2, 1 (in that order) from every row
arr.take(indices=inds, axis=1)
# put does not accept an axis argument; it indexes into the flattened (one-dimensional, C order) version of the array.

array([[ 1.41193647, -0.26815737,  1.41193647,  0.81651256],
       [-1.26958218,  1.40131289, -1.26958218,  0.78250371]])

## Summary of Differences

| Feature                | `numpy.put()`                             | `numpy.take()`                             |
|------------------------|-------------------------------------------|--------------------------------------------|
| **Purpose**            | Replaces elements in an existing array    | Retrieves elements from an existing array   |
| **Modification**       | Modifies the original array                | Returns a new array without modifying      |
| **Return Value**       | None (modifies in place)                  | New ndarray with selected elements         |
| **Flattening Behavior**| Operates on a flattened version            | Can operate on specified axis              |
| **Handling Indices**   | Supports `mode` (`'raise'`, `'wrap'`, `'clip'`) for out-of-bounds indices | Also supports `mode` (`'raise'`, `'wrap'`, `'clip'`); the default `'raise'` raises IndexError for out-of-bounds |
